Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * Copyright (C) 2002 Sistina Software (UK) Limited. | |
373a392b | 3 | * Copyright (C) 2006 Red Hat GmbH |
1da177e4 LT |
4 | * |
5 | * This file is released under the GPL. | |
6 | * | |
7 | * Kcopyd provides a simple interface for copying an area of one | |
8 | * block-device to one or more other block-devices, with an asynchronous | |
9 | * completion notification. | |
10 | */ | |
11 | ||
eb69aca5 | 12 | #include <linux/types.h> |
60063497 | 13 | #include <linux/atomic.h> |
1da177e4 | 14 | #include <linux/blkdev.h> |
1da177e4 LT |
15 | #include <linux/fs.h> |
16 | #include <linux/init.h> | |
17 | #include <linux/list.h> | |
18 | #include <linux/mempool.h> | |
19 | #include <linux/module.h> | |
20 | #include <linux/pagemap.h> | |
21 | #include <linux/slab.h> | |
22 | #include <linux/vmalloc.h> | |
23 | #include <linux/workqueue.h> | |
48c9c27b | 24 | #include <linux/mutex.h> |
df5d2e90 | 25 | #include <linux/delay.h> |
586e80e6 | 26 | #include <linux/device-mapper.h> |
a765e20e | 27 | #include <linux/dm-kcopyd.h> |
1da177e4 | 28 | |
4cc96131 | 29 | #include "dm-core.h" |
1da177e4 | 30 | |
c6ea41fb MP |
31 | #define SPLIT_COUNT 8 |
32 | #define MIN_JOBS 8 | |
c663e040 NT |
33 | |
34 | #define DEFAULT_SUB_JOB_SIZE_KB 512 | |
35 | #define MAX_SUB_JOB_SIZE_KB 1024 | |
36 | ||
37 | static unsigned kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB; | |
38 | ||
39 | module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR); | |
40 | MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients"); | |
41 | ||
42 | static unsigned dm_get_kcopyd_subjob_size(void) | |
43 | { | |
44 | unsigned sub_job_size_kb; | |
45 | ||
46 | sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb, | |
47 | DEFAULT_SUB_JOB_SIZE_KB, | |
48 | MAX_SUB_JOB_SIZE_KB); | |
49 | ||
50 | return sub_job_size_kb << 1; | |
51 | } | |
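The module parameter is expressed in kilobytes, while kcopyd works in 512-byte sectors; the `<< 1` above is the KB-to-sector conversion (1 KB = 2 sectors). Below is a minimal userspace sketch of the sizing logic, assuming `__dm_get_module_param()` substitutes the default for an unset value and caps values above the maximum (its role as used here; the clamp is an assumption, not a copy of the kernel helper):

```c
/* Userspace sketch of the sub-job sizing; clamp_subjob_kb() models the
 * clamping that __dm_get_module_param() is used for here (assumption,
 * not a copy of the kernel helper). */
#include <stdio.h>

static unsigned clamp_subjob_kb(unsigned param, unsigned def, unsigned max)
{
	if (!param)		/* unset: fall back to the default */
		param = def;
	else if (param > max)	/* cap runaway values */
		param = max;
	return param;
}

int main(void)
{
	unsigned candidates[] = { 0, 256, 4096 };	/* module param values */

	for (int i = 0; i < 3; i++) {
		unsigned kb = clamp_subjob_kb(candidates[i], 512, 1024);

		/* 1 KB = 2 sectors of 512 bytes, hence the << 1 */
		printf("param=%u KB -> sub_job_size=%u sectors\n",
		       candidates[i], kb << 1);
	}
	return 0;	/* prints 1024, 512 and 2048 sectors */
}
```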
c6ea41fb | 52 | |
1da177e4 LT |
53 | /*----------------------------------------------------------------- |
54 | * Each kcopyd client has its own little pool of preallocated | |
55 | * pages for kcopyd io. | |
56 | *---------------------------------------------------------------*/ | |
eb69aca5 | 57 | struct dm_kcopyd_client { |
1da177e4 | 58 | struct page_list *pages; |
d0471458 MP |
59 | unsigned nr_reserved_pages; |
60 | unsigned nr_free_pages; | |
c663e040 | 61 | unsigned sub_job_size; |
138728dc | 62 | |
373a392b MB |
63 | struct dm_io_client *io_client; |
64 | ||
138728dc | 65 | wait_queue_head_t destroyq; |
8c0cbc2f | 66 | |
6f1c819c | 67 | mempool_t job_pool; |
08d8757a | 68 | |
8c0cbc2f MP |
69 | struct workqueue_struct *kcopyd_wq; |
70 | struct work_struct kcopyd_work; | |
71 | ||
df5d2e90 MP |
72 | struct dm_kcopyd_throttle *throttle; |
73 | ||
72d711c8 MS |
74 | atomic_t nr_jobs; |
75 | ||
8c0cbc2f | 76 | /* |
d7e6b8df | 77 | * We maintain four lists of jobs: |
8c0cbc2f MP |
78 | * |
79 | * i) jobs waiting for pages | |
80 | * ii) jobs that have pages, and are waiting for the io to be issued. | |
d7e6b8df NT |
81 | * iii) jobs that don't need to do any IO and just run a callback |
82 | * iv) jobs that have completed. | |
8c0cbc2f | 83 | * |
d7e6b8df | 84 | * All four of these are protected by job_lock. |
8c0cbc2f MP |
85 | */ |
86 | spinlock_t job_lock; | |
d7e6b8df | 87 | struct list_head callback_jobs; |
8c0cbc2f MP |
88 | struct list_head complete_jobs; |
89 | struct list_head io_jobs; | |
90 | struct list_head pages_jobs; | |
1da177e4 LT |
91 | }; |
92 | ||
7f069653 MP |
93 | static struct page_list zero_page_list; |
94 | ||
df5d2e90 MP |
95 | static DEFINE_SPINLOCK(throttle_spinlock); |
96 | ||
97 | /* | |
98 | * IO/IDLE accounting slowly decays after each (1 << ACCOUNT_INTERVAL_SHIFT) | |
99 | * period: when total_period >= (1 << ACCOUNT_INTERVAL_SHIFT), the counters | |
100 | * are divided by 2. | |
101 | */ | |
102 | #define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ | |
103 | ||
104 | /* | |
105 | * Sleep this number of milliseconds. | |
106 | * | |
107 | * The value was decided experimentally. | |
108 | * Smaller values seem to cause an increased copy rate above the limit. | |
109 | * The reason for this is unknown, but it may be due to jiffies rounding | |
110 | * errors or the disk's internal read/write cache. | |
111 | */ | |
112 | #define SLEEP_MSEC 100 | |
113 | ||
114 | /* | |
115 | * Maximum number of sleep events. There is a theoretical livelock if | |
116 | * multiple kcopyd clients do work simultaneously; this limit avoids it. | |
117 | */ | |
118 | #define MAX_SLEEPS 10 | |
119 | ||
120 | static void io_job_start(struct dm_kcopyd_throttle *t) | |
121 | { | |
122 | unsigned throttle, now, difference; | |
123 | int slept = 0, skew; | |
124 | ||
125 | if (unlikely(!t)) | |
126 | return; | |
127 | ||
128 | try_again: | |
129 | spin_lock_irq(&throttle_spinlock); | |
130 | ||
6aa7de05 | 131 | throttle = READ_ONCE(t->throttle); |
df5d2e90 MP |
132 | |
133 | if (likely(throttle >= 100)) | |
134 | goto skip_limit; | |
135 | ||
136 | now = jiffies; | |
137 | difference = now - t->last_jiffies; | |
138 | t->last_jiffies = now; | |
139 | if (t->num_io_jobs) | |
140 | t->io_period += difference; | |
141 | t->total_period += difference; | |
142 | ||
143 | /* | |
144 | * Maintain sane values if we got a temporary overflow. | |
145 | */ | |
146 | if (unlikely(t->io_period > t->total_period)) | |
147 | t->io_period = t->total_period; | |
148 | ||
149 | if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) { | |
150 | int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT); | |
151 | t->total_period >>= shift; | |
152 | t->io_period >>= shift; | |
153 | } | |
154 | ||
155 | skew = t->io_period - throttle * t->total_period / 100; | |
156 | ||
157 | if (unlikely(skew > 0) && slept < MAX_SLEEPS) { | |
158 | slept++; | |
159 | spin_unlock_irq(&throttle_spinlock); | |
160 | msleep(SLEEP_MSEC); | |
161 | goto try_again; | |
162 | } | |
163 | ||
164 | skip_limit: | |
165 | t->num_io_jobs++; | |
166 | ||
167 | spin_unlock_irq(&throttle_spinlock); | |
168 | } | |
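`io_job_start()` enforces the throttle by comparing the share of time spent with io in flight (`io_period / total_period`) against the configured percentage; a positive `skew` means the client is over budget and sleeps before retrying. A standalone sketch of the same arithmetic, with made-up counter values:

```c
/* Standalone illustration of the skew test in io_job_start(); the
 * counter values are made up for demonstration. */
#include <stdio.h>

int main(void)
{
	unsigned throttle = 50;		/* permit io 50% of the time */
	unsigned io_period = 60;	/* jiffies spent with io in flight */
	unsigned total_period = 100;	/* jiffies in the accounting window */

	/* Same expression as the driver: positive means over the limit. */
	int skew = io_period - throttle * total_period / 100;

	if (skew > 0)
		printf("skew=%d: over budget, sleep and retry\n", skew);
	else
		printf("skew=%d: under budget, start the io\n", skew);

	/* The decay step: when total_period reaches the accounting
	 * interval, both counters are shifted right so history fades. */
	io_period >>= 1;
	total_period >>= 1;
	printf("after decay: io_period=%u total_period=%u\n",
	       io_period, total_period);
	return 0;
}
```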
169 | ||
170 | static void io_job_finish(struct dm_kcopyd_throttle *t) | |
171 | { | |
172 | unsigned long flags; | |
173 | ||
174 | if (unlikely(!t)) | |
175 | return; | |
176 | ||
177 | spin_lock_irqsave(&throttle_spinlock, flags); | |
178 | ||
179 | t->num_io_jobs--; | |
180 | ||
6aa7de05 | 181 | if (likely(READ_ONCE(t->throttle) >= 100)) |
df5d2e90 MP |
182 | goto skip_limit; |
183 | ||
184 | if (!t->num_io_jobs) { | |
185 | unsigned now, difference; | |
186 | ||
187 | now = jiffies; | |
188 | difference = now - t->last_jiffies; | |
189 | t->last_jiffies = now; | |
190 | ||
191 | t->io_period += difference; | |
192 | t->total_period += difference; | |
193 | ||
194 | /* | |
195 | * Maintain sane values if we got a temporary overflow. | |
196 | */ | |
197 | if (unlikely(t->io_period > t->total_period)) | |
198 | t->io_period = t->total_period; | |
199 | } | |
200 | ||
201 | skip_limit: | |
202 | spin_unlock_irqrestore(&throttle_spinlock, flags); | |
203 | } | |
204 | ||
205 | ||
8c0cbc2f MP |
206 | static void wake(struct dm_kcopyd_client *kc) |
207 | { | |
208 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); | |
209 | } | |
210 | ||
d0471458 MP |
211 | /* |
212 | * Obtain one page for the use of kcopyd. | |
213 | */ | |
f99b55ee | 214 | static struct page_list *alloc_pl(gfp_t gfp) |
1da177e4 LT |
215 | { |
216 | struct page_list *pl; | |
217 | ||
f99b55ee | 218 | pl = kmalloc(sizeof(*pl), gfp); |
1da177e4 LT |
219 | if (!pl) |
220 | return NULL; | |
221 | ||
f99b55ee | 222 | pl->page = alloc_page(gfp); |
1da177e4 LT |
223 | if (!pl->page) { |
224 | kfree(pl); | |
225 | return NULL; | |
226 | } | |
227 | ||
228 | return pl; | |
229 | } | |
230 | ||
231 | static void free_pl(struct page_list *pl) | |
232 | { | |
233 | __free_page(pl->page); | |
234 | kfree(pl); | |
235 | } | |
236 | ||
d0471458 MP |
237 | /* |
238 | * Add the provided pages to a client's free page list, releasing | |
239 | * back to the system any beyond the reserved_pages limit. | |
240 | */ | |
241 | static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) | |
1da177e4 | 242 | { |
d0471458 | 243 | struct page_list *next; |
1da177e4 | 244 | |
d0471458 MP |
245 | do { |
246 | next = pl->next; | |
1da177e4 | 247 | |
d0471458 MP |
248 | if (kc->nr_free_pages >= kc->nr_reserved_pages) |
249 | free_pl(pl); | |
250 | else { | |
251 | pl->next = kc->pages; | |
252 | kc->pages = pl; | |
253 | kc->nr_free_pages++; | |
254 | } | |
1da177e4 | 255 | |
d0471458 MP |
256 | pl = next; |
257 | } while (pl); | |
1da177e4 LT |
258 | } |
259 | ||
d0471458 MP |
260 | static int kcopyd_get_pages(struct dm_kcopyd_client *kc, |
261 | unsigned int nr, struct page_list **pages) | |
1da177e4 | 262 | { |
d0471458 | 263 | struct page_list *pl; |
1da177e4 | 264 | |
d0471458 MP |
265 | *pages = NULL; |
266 | ||
267 | do { | |
d0164adc | 268 | pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); |
d0471458 MP |
269 | if (unlikely(!pl)) { |
270 | /* Use reserved pages */ | |
271 | pl = kc->pages; | |
272 | if (unlikely(!pl)) | |
273 | goto out_of_memory; | |
274 | kc->pages = pl->next; | |
275 | kc->nr_free_pages--; | |
276 | } | |
277 | pl->next = *pages; | |
278 | *pages = pl; | |
279 | } while (--nr); | |
280 | ||
281 | return 0; | |
1da177e4 | 282 | |
d0471458 MP |
283 | out_of_memory: |
284 | if (*pages) | |
285 | kcopyd_put_pages(kc, *pages); | |
286 | return -ENOMEM; | |
1da177e4 LT |
287 | } |
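The allocation strategy above is worth noting: `kcopyd_get_pages()` first attempts a cheap, non-blocking page allocation (`__GFP_NORETRY`, no warnings) and only falls back to the client's preallocated reserve when the system is short on memory, so a copy running in the writeback path can still make progress. A userspace sketch of the same try-fast-then-reserve pattern (the reserve list here is illustrative, not the kernel structure):

```c
/* Illustrative fallback allocator: try the "expensive" source first,
 * fall back to a preallocated reserve, fail only when both are empty. */
#include <stdlib.h>
#include <stdio.h>

struct node { struct node *next; };

static struct node *reserve;	/* stand-in for kc->pages */

static struct node *get_node(int allow_fresh)
{
	struct node *n = allow_fresh ? malloc(sizeof(*n)) : NULL;

	if (!n && reserve) {	/* fall back to the reserve list */
		n = reserve;
		reserve = n->next;
	}
	return n;		/* NULL only if both sources failed */
}

int main(void)
{
	struct node pre = { .next = NULL };

	reserve = &pre;		/* one reserved node */
	printf("fresh alloc:  %p\n", (void *)get_node(1));
	printf("from reserve: %p\n", (void *)get_node(0));
	printf("exhausted:    %p\n", (void *)get_node(0));
	return 0;
}
```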
288 | ||
289 | /* | |
290 | * These three functions resize the page pool. | |
291 | */ | |
292 | static void drop_pages(struct page_list *pl) | |
293 | { | |
294 | struct page_list *next; | |
295 | ||
296 | while (pl) { | |
297 | next = pl->next; | |
298 | free_pl(pl); | |
299 | pl = next; | |
300 | } | |
301 | } | |
302 | ||
d0471458 MP |
303 | /* |
304 | * Allocate and reserve nr_pages for the use of a specific client. | |
305 | */ | |
306 | static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) | |
1da177e4 | 307 | { |
d0471458 | 308 | unsigned i; |
1da177e4 LT |
309 | struct page_list *pl = NULL, *next; |
310 | ||
d0471458 | 311 | for (i = 0; i < nr_pages; i++) { |
f99b55ee | 312 | next = alloc_pl(GFP_KERNEL); |
1da177e4 LT |
313 | if (!next) { |
314 | if (pl) | |
315 | drop_pages(pl); | |
316 | return -ENOMEM; | |
317 | } | |
318 | next->next = pl; | |
319 | pl = next; | |
320 | } | |
321 | ||
d0471458 | 322 | kc->nr_reserved_pages += nr_pages; |
1da177e4 | 323 | kcopyd_put_pages(kc, pl); |
d0471458 | 324 | |
1da177e4 LT |
325 | return 0; |
326 | } | |
327 | ||
eb69aca5 | 328 | static void client_free_pages(struct dm_kcopyd_client *kc) |
1da177e4 | 329 | { |
d0471458 | 330 | BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); |
1da177e4 LT |
331 | drop_pages(kc->pages); |
332 | kc->pages = NULL; | |
d0471458 | 333 | kc->nr_free_pages = kc->nr_reserved_pages = 0; |
1da177e4 LT |
334 | } |
335 | ||
336 | /*----------------------------------------------------------------- | |
337 | * kcopyd_jobs need to be allocated by the *clients* of kcopyd; | |
338 | * for this reason we use a mempool to prevent the client from | |
339 | * ever having to do io (which could cause a deadlock). | |
340 | *---------------------------------------------------------------*/ | |
341 | struct kcopyd_job { | |
eb69aca5 | 342 | struct dm_kcopyd_client *kc; |
1da177e4 LT |
343 | struct list_head list; |
344 | unsigned long flags; | |
345 | ||
346 | /* | |
347 | * Error state of the job. | |
348 | */ | |
349 | int read_err; | |
4cdc1d1f | 350 | unsigned long write_err; |
1da177e4 LT |
351 | |
352 | /* | |
353 | * Either READ or WRITE | |
354 | */ | |
355 | int rw; | |
22a1ceb1 | 356 | struct dm_io_region source; |
1da177e4 LT |
357 | |
358 | /* | |
359 | * The destinations for the transfer. | |
360 | */ | |
361 | unsigned int num_dests; | |
eb69aca5 | 362 | struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; |
1da177e4 | 363 | |
1da177e4 LT |
364 | struct page_list *pages; |
365 | ||
366 | /* | |
367 | * Set this to ensure you are notified when the job has | |
368 | * completed. 'context' is for callback to use. | |
369 | */ | |
eb69aca5 | 370 | dm_kcopyd_notify_fn fn; |
1da177e4 LT |
371 | void *context; |
372 | ||
373 | /* | |
374 | * These fields are only used if the job has been split | |
375 | * into more manageable parts. | |
376 | */ | |
def5b5b2 | 377 | struct mutex lock; |
1da177e4 LT |
378 | atomic_t sub_jobs; |
379 | sector_t progress; | |
b73c67c2 | 380 | sector_t write_offset; |
1da177e4 | 381 | |
c6ea41fb MP |
382 | struct kcopyd_job *master_job; |
383 | }; | |
1da177e4 | 384 | |
e18b890b | 385 | static struct kmem_cache *_job_cache; |
1da177e4 | 386 | |
945fa4d2 | 387 | int __init dm_kcopyd_init(void) |
1da177e4 | 388 | { |
c6ea41fb MP |
389 | _job_cache = kmem_cache_create("kcopyd_job", |
390 | sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), | |
391 | __alignof__(struct kcopyd_job), 0, NULL); | |
1da177e4 LT |
392 | if (!_job_cache) |
393 | return -ENOMEM; | |
394 | ||
7f069653 MP |
395 | zero_page_list.next = &zero_page_list; |
396 | zero_page_list.page = ZERO_PAGE(0); | |
397 | ||
1da177e4 LT |
398 | return 0; |
399 | } | |
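Note the slab object size above: each `_job_cache` object is an array of `SPLIT_COUNT + 1` jobs, not a single job, so one mempool allocation hands back a master job at index 0 with its eight sub jobs contiguously behind it; `split_job()` below indexes them as `master_job[i + 1]`. A minimal sketch of that layout, with the struct reduced to what the example needs:

```c
/* Sketch of the master-plus-subjobs array layout behind _job_cache. */
#include <stdio.h>

#define SPLIT_COUNT 8

struct kcopyd_job_demo {
	struct kcopyd_job_demo *master_job;
};

int main(void)
{
	/* One "mempool object": master at [0], subs at [1..SPLIT_COUNT]. */
	struct kcopyd_job_demo jobs[SPLIT_COUNT + 1];
	struct kcopyd_job_demo *master = &jobs[0];

	master->master_job = master;	/* the master points at itself */
	for (int i = 0; i < SPLIT_COUNT; i++)
		master[i + 1].master_job = master;	/* as in split_job() */

	printf("sub 3's master is job[0]? %s\n",
	       jobs[3].master_job == master ? "yes" : "no");
	return 0;
}
```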
400 | ||
945fa4d2 | 401 | void dm_kcopyd_exit(void) |
1da177e4 | 402 | { |
1da177e4 | 403 | kmem_cache_destroy(_job_cache); |
1da177e4 LT |
404 | _job_cache = NULL; |
405 | } | |
406 | ||
407 | /* | |
408 | * Functions to push and pop a job onto the head of a given job | |
409 | * list. | |
410 | */ | |
b73c67c2 DLM |
411 | static struct kcopyd_job *pop_io_job(struct list_head *jobs, |
412 | struct dm_kcopyd_client *kc) | |
413 | { | |
414 | struct kcopyd_job *job; | |
415 | ||
416 | /* | |
417 | * For I/O jobs, pop any read, any write without a sequential write | |
418 | * constraint, and any sequential write that is at the right position. | |
419 | */ | |
420 | list_for_each_entry(job, jobs, list) { | |
421 | if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) { | |
422 | list_del(&job->list); | |
423 | return job; | |
424 | } | |
425 | ||
426 | if (job->write_offset == job->master_job->write_offset) { | |
427 | job->master_job->write_offset += job->source.count; | |
428 | list_del(&job->list); | |
429 | return job; | |
430 | } | |
431 | } | |
432 | ||
433 | return NULL; | |
434 | } | |
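`pop_io_job()` implements the sequential-write constraint: the master job's `write_offset` tracks the next sector that may be issued, and a sub job is only popped when its own offset matches, which then advances the master. A compact sketch of that selection rule with hypothetical offsets:

```c
/* Userspace sketch of the sequential-write selection in pop_io_job().
 * Three sub jobs with write offsets 16, 0 and 8; only the one matching
 * the master's write_offset may be issued, which then advances it. */
#include <stdio.h>

struct sub { unsigned offset, count; };

int main(void)
{
	struct sub subs[] = { { 16, 8 }, { 0, 8 }, { 8, 8 } };
	unsigned master_write_offset = 0;

	/* All three get issued in sequential order despite list order. */
	for (int issued = 0; issued < 3; ) {
		for (int i = 0; i < 3; i++) {
			if (subs[i].offset == master_write_offset) {
				printf("issue sub at offset %u\n",
				       subs[i].offset);
				master_write_offset += subs[i].count;
				issued++;
			}
		}
	}
	return 0;	/* prints offsets 0, 8, 16 */
}
```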
435 | ||
8c0cbc2f MP |
436 | static struct kcopyd_job *pop(struct list_head *jobs, |
437 | struct dm_kcopyd_client *kc) | |
1da177e4 LT |
438 | { |
439 | struct kcopyd_job *job = NULL; | |
440 | unsigned long flags; | |
441 | ||
8c0cbc2f | 442 | spin_lock_irqsave(&kc->job_lock, flags); |
1da177e4 LT |
443 | |
444 | if (!list_empty(jobs)) { | |
b73c67c2 DLM |
445 | if (jobs == &kc->io_jobs) |
446 | job = pop_io_job(jobs, kc); | |
447 | else { | |
448 | job = list_entry(jobs->next, struct kcopyd_job, list); | |
449 | list_del(&job->list); | |
450 | } | |
1da177e4 | 451 | } |
8c0cbc2f | 452 | spin_unlock_irqrestore(&kc->job_lock, flags); |
1da177e4 LT |
453 | |
454 | return job; | |
455 | } | |
456 | ||
028867ac | 457 | static void push(struct list_head *jobs, struct kcopyd_job *job) |
1da177e4 LT |
458 | { |
459 | unsigned long flags; | |
8c0cbc2f | 460 | struct dm_kcopyd_client *kc = job->kc; |
1da177e4 | 461 | |
8c0cbc2f | 462 | spin_lock_irqsave(&kc->job_lock, flags); |
1da177e4 | 463 | list_add_tail(&job->list, jobs); |
8c0cbc2f | 464 | spin_unlock_irqrestore(&kc->job_lock, flags); |
1da177e4 LT |
465 | } |
466 | ||
b673c3a8 KI |
467 | |
468 | static void push_head(struct list_head *jobs, struct kcopyd_job *job) | |
469 | { | |
470 | unsigned long flags; | |
471 | struct dm_kcopyd_client *kc = job->kc; | |
472 | ||
473 | spin_lock_irqsave(&kc->job_lock, flags); | |
474 | list_add(&job->list, jobs); | |
475 | spin_unlock_irqrestore(&kc->job_lock, flags); | |
476 | } | |
477 | ||
1da177e4 LT |
478 | /* |
479 | * These three functions process one item from the corresponding | |
480 | * job list. | |
481 | * | |
482 | * They return: | |
483 | * < 0: error | |
484 | * 0: success | |
485 | * > 0: can't process yet. | |
486 | */ | |
487 | static int run_complete_job(struct kcopyd_job *job) | |
488 | { | |
489 | void *context = job->context; | |
490 | int read_err = job->read_err; | |
4cdc1d1f | 491 | unsigned long write_err = job->write_err; |
eb69aca5 HM |
492 | dm_kcopyd_notify_fn fn = job->fn; |
493 | struct dm_kcopyd_client *kc = job->kc; | |
1da177e4 | 494 | |
7f069653 | 495 | if (job->pages && job->pages != &zero_page_list) |
73830857 | 496 | kcopyd_put_pages(kc, job->pages); |
c6ea41fb MP |
497 | /* |
498 | * If this is the master job, the sub jobs have already | |
499 | * completed so we can free everything. | |
500 | */ | |
d5ffebdd MS |
501 | if (job->master_job == job) { |
502 | mutex_destroy(&job->lock); | |
6f1c819c | 503 | mempool_free(job, &kc->job_pool); |
d5ffebdd | 504 | } |
1da177e4 | 505 | fn(read_err, write_err, context); |
138728dc AK |
506 | |
507 | if (atomic_dec_and_test(&kc->nr_jobs)) | |
508 | wake_up(&kc->destroyq); | |
509 | ||
784c9a29 JP |
510 | cond_resched(); |
511 | ||
1da177e4 LT |
512 | return 0; |
513 | } | |
514 | ||
515 | static void complete_io(unsigned long error, void *context) | |
516 | { | |
517 | struct kcopyd_job *job = (struct kcopyd_job *) context; | |
8c0cbc2f | 518 | struct dm_kcopyd_client *kc = job->kc; |
1da177e4 | 519 | |
df5d2e90 MP |
520 | io_job_finish(kc->throttle); |
521 | ||
1da177e4 | 522 | if (error) { |
51111666 | 523 | if (op_is_write(job->rw)) |
ce503f59 | 524 | job->write_err |= error; |
1da177e4 LT |
525 | else |
526 | job->read_err = 1; | |
527 | ||
eb69aca5 | 528 | if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { |
8c0cbc2f MP |
529 | push(&kc->complete_jobs, job); |
530 | wake(kc); | |
1da177e4 LT |
531 | return; |
532 | } | |
533 | } | |
534 | ||
51111666 | 535 | if (op_is_write(job->rw)) |
8c0cbc2f | 536 | push(&kc->complete_jobs, job); |
1da177e4 LT |
537 | |
538 | else { | |
539 | job->rw = WRITE; | |
8c0cbc2f | 540 | push(&kc->io_jobs, job); |
1da177e4 LT |
541 | } |
542 | ||
8c0cbc2f | 543 | wake(kc); |
1da177e4 LT |
544 | } |
545 | ||
546 | /* | |
547 | * Issue the io for a particular job: either the read from the source | |
548 | * region or the writes to all destination regions. | |
549 | */ | |
550 | static int run_io_job(struct kcopyd_job *job) | |
551 | { | |
552 | int r; | |
373a392b | 553 | struct dm_io_request io_req = { |
e6047149 MC |
554 | .bi_op = job->rw, |
555 | .bi_op_flags = 0, | |
373a392b MB |
556 | .mem.type = DM_IO_PAGE_LIST, |
557 | .mem.ptr.pl = job->pages, | |
4622afb3 | 558 | .mem.offset = 0, |
373a392b MB |
559 | .notify.fn = complete_io, |
560 | .notify.context = job, | |
561 | .client = job->kc->io_client, | |
562 | }; | |
1da177e4 | 563 | |
b73c67c2 DLM |
564 | /* |
565 | * If we need to write sequentially and some reads or writes failed, | |
566 | * no point in continuing. | |
567 | */ | |
568 | if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && | |
569 | job->master_job->write_err) | |
570 | return -EIO; | |
571 | ||
df5d2e90 MP |
572 | io_job_start(job->kc->throttle); |
573 | ||
7eaceacc | 574 | if (job->rw == READ) |
373a392b | 575 | r = dm_io(&io_req, 1, &job->source, NULL); |
721a9602 | 576 | else |
373a392b | 577 | r = dm_io(&io_req, job->num_dests, job->dests, NULL); |
1da177e4 LT |
578 | |
579 | return r; | |
580 | } | |
581 | ||
582 | static int run_pages_job(struct kcopyd_job *job) | |
583 | { | |
584 | int r; | |
5bf45a3d | 585 | unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); |
1da177e4 | 586 | |
5bf45a3d | 587 | r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); |
1da177e4 LT |
588 | if (!r) { |
589 | /* this job is ready for io */ | |
8c0cbc2f | 590 | push(&job->kc->io_jobs, job); |
1da177e4 LT |
591 | return 0; |
592 | } | |
593 | ||
594 | if (r == -ENOMEM) | |
595 | /* can't complete now */ | |
596 | return 1; | |
597 | ||
598 | return r; | |
599 | } | |
600 | ||
601 | /* | |
602 | * Run through a list for as long as possible. Returns the count | |
603 | * of successful jobs. | |
604 | */ | |
8c0cbc2f MP |
605 | static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, |
606 | int (*fn) (struct kcopyd_job *)) | |
1da177e4 LT |
607 | { |
608 | struct kcopyd_job *job; | |
609 | int r, count = 0; | |
610 | ||
8c0cbc2f | 611 | while ((job = pop(jobs, kc))) { |
1da177e4 LT |
612 | |
613 | r = fn(job); | |
614 | ||
615 | if (r < 0) { | |
616 | /* error this rogue job */ | |
51111666 | 617 | if (op_is_write(job->rw)) |
4cdc1d1f | 618 | job->write_err = (unsigned long) -1L; |
1da177e4 LT |
619 | else |
620 | job->read_err = 1; | |
8c0cbc2f | 621 | push(&kc->complete_jobs, job); |
1da177e4 LT |
622 | break; |
623 | } | |
624 | ||
625 | if (r > 0) { | |
626 | /* | |
627 | * We couldn't service this job ATM, so | |
628 | * push this job back onto the list. | |
629 | */ | |
b673c3a8 | 630 | push_head(jobs, job); |
1da177e4 LT |
631 | break; |
632 | } | |
633 | ||
634 | count++; | |
635 | } | |
636 | ||
637 | return count; | |
638 | } | |
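A standalone model of the return-code contract described above `run_complete_job()`: a negative result errors the job, zero consumes it, and a positive result pushes the job back on the head of the list and ends the pass:

```c
/* Minimal model of the process_jobs() return-code contract:
 * 0 = done, >0 = can't service yet (requeue at head, stop the pass),
 * <0 = error the job. */
#include <stdio.h>

static int fake_fn(int job)
{
	if (job == 3)
		return 1;	/* e.g. no pages available yet */
	return 0;
}

int main(void)
{
	int jobs[] = { 1, 2, 3, 4 };
	int count = 0;

	for (int i = 0; i < 4; i++) {
		int r = fake_fn(jobs[i]);

		if (r < 0) {
			printf("job %d errored\n", jobs[i]);
			break;
		}
		if (r > 0) {
			/* would be pushed back on the head of the list */
			printf("job %d deferred\n", jobs[i]);
			break;
		}
		count++;
	}
	printf("%d jobs completed this pass\n", count);
	return 0;	/* prints: job 3 deferred, 2 jobs completed */
}
```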
639 | ||
640 | /* | |
641 | * kcopyd does this every time it's woken up. | |
642 | */ | |
8c0cbc2f | 643 | static void do_work(struct work_struct *work) |
1da177e4 | 644 | { |
8c0cbc2f MP |
645 | struct dm_kcopyd_client *kc = container_of(work, |
646 | struct dm_kcopyd_client, kcopyd_work); | |
7eaceacc | 647 | struct blk_plug plug; |
d7e6b8df | 648 | unsigned long flags; |
8c0cbc2f | 649 | |
1da177e4 LT |
650 | /* |
651 | * The order that these are called is *very* important. | |
652 | * Complete jobs can free some pages for pages jobs. | |
653 | * Pages jobs, when successful, will jump onto the io jobs | |
654 | * list. io jobs call wake when they complete and it all | |
655 | * starts again. | |
656 | */ | |
d7e6b8df NT |
657 | spin_lock_irqsave(&kc->job_lock, flags); |
658 | list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); | |
659 | spin_unlock_irqrestore(&kc->job_lock, flags); | |
660 | ||
7eaceacc | 661 | blk_start_plug(&plug); |
8c0cbc2f MP |
662 | process_jobs(&kc->complete_jobs, kc, run_complete_job); |
663 | process_jobs(&kc->pages_jobs, kc, run_pages_job); | |
664 | process_jobs(&kc->io_jobs, kc, run_io_job); | |
7eaceacc | 665 | blk_finish_plug(&plug); |
1da177e4 LT |
666 | } |
667 | ||
668 | /* | |
669 | * If we are copying a small region, we just dispatch a single job | |
670 | * to do the copy; otherwise the io has to be split up into many | |
671 | * jobs. | |
672 | */ | |
673 | static void dispatch_job(struct kcopyd_job *job) | |
674 | { | |
8c0cbc2f MP |
675 | struct dm_kcopyd_client *kc = job->kc; |
676 | atomic_inc(&kc->nr_jobs); | |
9ca170a3 | 677 | if (unlikely(!job->source.count)) |
d7e6b8df | 678 | push(&kc->callback_jobs, job); |
7f069653 MP |
679 | else if (job->pages == &zero_page_list) |
680 | push(&kc->io_jobs, job); | |
9ca170a3 MP |
681 | else |
682 | push(&kc->pages_jobs, job); | |
8c0cbc2f | 683 | wake(kc); |
1da177e4 LT |
684 | } |
685 | ||
4cdc1d1f AK |
686 | static void segment_complete(int read_err, unsigned long write_err, |
687 | void *context) | |
1da177e4 LT |
688 | { |
689 | /* FIXME: tidy this function */ | |
690 | sector_t progress = 0; | |
691 | sector_t count = 0; | |
c6ea41fb MP |
692 | struct kcopyd_job *sub_job = (struct kcopyd_job *) context; |
693 | struct kcopyd_job *job = sub_job->master_job; | |
73830857 | 694 | struct dm_kcopyd_client *kc = job->kc; |
1da177e4 | 695 | |
def5b5b2 | 696 | mutex_lock(&job->lock); |
1da177e4 LT |
697 | |
698 | /* update the error */ | |
699 | if (read_err) | |
700 | job->read_err = 1; | |
701 | ||
702 | if (write_err) | |
ce503f59 | 703 | job->write_err |= write_err; |
1da177e4 LT |
704 | |
705 | /* | |
706 | * Only dispatch more work if there hasn't been an error. | |
707 | */ | |
708 | if ((!job->read_err && !job->write_err) || | |
eb69aca5 | 709 | test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) { |
1da177e4 LT |
710 | /* get the next chunk of work */ |
711 | progress = job->progress; | |
712 | count = job->source.count - progress; | |
713 | if (count) { | |
c663e040 NT |
714 | if (count > kc->sub_job_size) |
715 | count = kc->sub_job_size; | |
1da177e4 LT |
716 | |
717 | job->progress += count; | |
718 | } | |
719 | } | |
def5b5b2 | 720 | mutex_unlock(&job->lock); |
1da177e4 LT |
721 | |
722 | if (count) { | |
723 | int i; | |
1da177e4 LT |
724 | |
725 | *sub_job = *job; | |
b73c67c2 | 726 | sub_job->write_offset = progress; |
1da177e4 LT |
727 | sub_job->source.sector += progress; |
728 | sub_job->source.count = count; | |
729 | ||
730 | for (i = 0; i < job->num_dests; i++) { | |
731 | sub_job->dests[i].sector += progress; | |
732 | sub_job->dests[i].count = count; | |
733 | } | |
734 | ||
735 | sub_job->fn = segment_complete; | |
c6ea41fb | 736 | sub_job->context = sub_job; |
1da177e4 LT |
737 | dispatch_job(sub_job); |
738 | ||
739 | } else if (atomic_dec_and_test(&job->sub_jobs)) { | |
740 | ||
741 | /* | |
340cd444 MP |
742 | * Queue the completion callback to the kcopyd thread. |
743 | * | |
744 | * Some callers assume that all the completions are called | |
745 | * from a single thread and don't race with each other. | |
746 | * | |
747 | * We must not call the callback directly here because this | |
748 | * code may not be executing in the thread. | |
1da177e4 | 749 | */ |
340cd444 MP |
750 | push(&kc->complete_jobs, job); |
751 | wake(kc); | |
1da177e4 LT |
752 | } |
753 | } | |
754 | ||
755 | /* | |
c6ea41fb | 756 | * Create some sub jobs to share the work between them. |
1da177e4 | 757 | */ |
c6ea41fb | 758 | static void split_job(struct kcopyd_job *master_job) |
1da177e4 LT |
759 | { |
760 | int i; | |
761 | ||
c6ea41fb | 762 | atomic_inc(&master_job->kc->nr_jobs); |
340cd444 | 763 | |
c6ea41fb MP |
764 | atomic_set(&master_job->sub_jobs, SPLIT_COUNT); |
765 | for (i = 0; i < SPLIT_COUNT; i++) { | |
766 | master_job[i + 1].master_job = master_job; | |
767 | segment_complete(0, 0u, &master_job[i + 1]); | |
768 | } | |
1da177e4 LT |
769 | } |
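The chunking driven by `segment_complete()` can be seen in isolation: each completing sub job claims the next `min(remaining, sub_job_size)` slice at the master's `progress` cursor until the source region is exhausted. A runnable sketch with the default 512 KB (1024-sector) sub-job size:

```c
/* Sketch of the chunking performed by segment_complete(): carve a large
 * copy into sub_job_size pieces, tracking progress in sectors. */
#include <stdio.h>

int main(void)
{
	unsigned long long source_count = 2500;	/* sectors to copy */
	unsigned sub_job_size = 1024;		/* default: 512 KB */
	unsigned long long progress = 0;

	while (progress < source_count) {
		unsigned long long count = source_count - progress;

		if (count > sub_job_size)
			count = sub_job_size;
		printf("sub job: sector %llu, %llu sectors\n",
		       progress, count);
		progress += count;
	}
	return 0;	/* slices of 1024 + 1024 + 452 sectors */
}
```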
770 | ||
7209049d MS |
771 | void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, |
772 | unsigned int num_dests, struct dm_io_region *dests, | |
773 | unsigned int flags, dm_kcopyd_notify_fn fn, void *context) | |
1da177e4 LT |
774 | { |
775 | struct kcopyd_job *job; | |
70d6c400 | 776 | int i; |
1da177e4 LT |
777 | |
778 | /* | |
c6ea41fb MP |
779 | * Allocate an array of jobs consisting of one master job |
780 | * followed by SPLIT_COUNT sub jobs. | |
1da177e4 | 781 | */ |
6f1c819c | 782 | job = mempool_alloc(&kc->job_pool, GFP_NOIO); |
d5ffebdd | 783 | mutex_init(&job->lock); |
1da177e4 LT |
784 | |
785 | /* | |
786 | * set up for the read. | |
787 | */ | |
788 | job->kc = kc; | |
789 | job->flags = flags; | |
790 | job->read_err = 0; | |
791 | job->write_err = 0; | |
1da177e4 LT |
792 | |
793 | job->num_dests = num_dests; | |
794 | memcpy(&job->dests, dests, sizeof(*dests) * num_dests); | |
795 | ||
b73c67c2 DLM |
796 | /* |
797 | * If one of the destinations is a host-managed zoned block device, | |
798 | * we need to write sequentially. If one of the destinations is a | |
799 | * host-aware device, then leave it to the caller to choose what to do. | |
800 | */ | |
801 | if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) { | |
802 | for (i = 0; i < job->num_dests; i++) { | |
803 | if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) { | |
804 | set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags); | |
805 | break; | |
806 | } | |
807 | } | |
808 | } | |
809 | ||
810 | /* | |
811 | * If we need to write sequentially, errors cannot be ignored. | |
812 | */ | |
813 | if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && | |
814 | test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) | |
815 | clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags); | |
816 | ||
7f069653 MP |
817 | if (from) { |
818 | job->source = *from; | |
819 | job->pages = NULL; | |
820 | job->rw = READ; | |
821 | } else { | |
822 | memset(&job->source, 0, sizeof job->source); | |
823 | job->source.count = job->dests[0].count; | |
824 | job->pages = &zero_page_list; | |
70d6c400 MS |
825 | |
826 | /* | |
615ec946 | 827 | * Use WRITE ZEROES to optimize zeroing if all dests support it. |
70d6c400 | 828 | */ |
615ec946 | 829 | job->rw = REQ_OP_WRITE_ZEROES; |
70d6c400 | 830 | for (i = 0; i < job->num_dests; i++) |
615ec946 | 831 | if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) { |
70d6c400 MS |
832 | job->rw = WRITE; |
833 | break; | |
834 | } | |
7f069653 | 835 | } |
1da177e4 LT |
836 | |
837 | job->fn = fn; | |
838 | job->context = context; | |
c6ea41fb | 839 | job->master_job = job; |
b73c67c2 | 840 | job->write_offset = 0; |
1da177e4 | 841 | |
c663e040 | 842 | if (job->source.count <= kc->sub_job_size) |
1da177e4 | 843 | dispatch_job(job); |
1da177e4 | 844 | else { |
1da177e4 LT |
845 | job->progress = 0; |
846 | split_job(job); | |
847 | } | |
1da177e4 | 848 | } |
eb69aca5 | 849 | EXPORT_SYMBOL(dm_kcopyd_copy); |
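A hedged sketch of how a caller might drive the interface described in the file header: create a client, describe the source and destination regions, and let the callback fire from the kcopyd workqueue. `demo_copy()`, `bdev_src` and `bdev_dst` are hypothetical names; real callers keep the client around for many copies rather than creating one per copy:

```c
/* Hedged usage sketch for dm_kcopyd_copy(); the function and device
 * names are hypothetical, error handling is minimal. */
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/err.h>
#include <linux/printk.h>

static void copy_done(int read_err, unsigned long write_err, void *ctx)
{
	/* Runs asynchronously from the kcopyd workqueue. */
	if (read_err || write_err)
		pr_err("kcopyd copy failed\n");
	complete(ctx);	/* wake whoever is waiting, if anyone */
}

static int demo_copy(struct block_device *bdev_src,
		     struct block_device *bdev_dst,
		     struct completion *done)
{
	struct dm_kcopyd_client *kc;
	struct dm_io_region from, to;

	kc = dm_kcopyd_client_create(NULL);	/* NULL: no throttle */
	if (IS_ERR(kc))
		return PTR_ERR(kc);

	from.bdev = bdev_src;
	from.sector = 0;
	from.count = 2048;	/* copy the first 1 MiB */
	to = from;
	to.bdev = bdev_dst;

	dm_kcopyd_copy(kc, &from, 1, &to, 0, copy_done, done);
	return 0;	/* destroy the client once the copy completes */
}
```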
1da177e4 | 850 | |
7209049d MS |
851 | void dm_kcopyd_zero(struct dm_kcopyd_client *kc, |
852 | unsigned num_dests, struct dm_io_region *dests, | |
853 | unsigned flags, dm_kcopyd_notify_fn fn, void *context) | |
7f069653 | 854 | { |
7209049d | 855 | dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); |
7f069653 MP |
856 | } |
857 | EXPORT_SYMBOL(dm_kcopyd_zero); | |
858 | ||
a6e50b40 MP |
859 | void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, |
860 | dm_kcopyd_notify_fn fn, void *context) | |
861 | { | |
862 | struct kcopyd_job *job; | |
863 | ||
6f1c819c | 864 | job = mempool_alloc(&kc->job_pool, GFP_NOIO); |
a6e50b40 MP |
865 | |
866 | memset(job, 0, sizeof(struct kcopyd_job)); | |
867 | job->kc = kc; | |
868 | job->fn = fn; | |
869 | job->context = context; | |
d136f2ef | 870 | job->master_job = job; |
a6e50b40 MP |
871 | |
872 | atomic_inc(&kc->nr_jobs); | |
873 | ||
874 | return job; | |
875 | } | |
876 | EXPORT_SYMBOL(dm_kcopyd_prepare_callback); | |
877 | ||
878 | void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) | |
879 | { | |
880 | struct kcopyd_job *job = j; | |
881 | struct dm_kcopyd_client *kc = job->kc; | |
882 | ||
883 | job->read_err = read_err; | |
884 | job->write_err = write_err; | |
885 | ||
d7e6b8df | 886 | push(&kc->callback_jobs, job); |
a6e50b40 MP |
887 | wake(kc); |
888 | } | |
889 | EXPORT_SYMBOL(dm_kcopyd_do_callback); | |
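These two exports let a caller route an arbitrary callback through the kcopyd thread so it serializes with the completion callbacks of in-flight jobs. A hedged sketch (`my_notify` and `demo_deferred_callback` are hypothetical names):

```c
/* Hedged sketch: defer a callback to the kcopyd thread so it cannot
 * race with completion callbacks of in-flight jobs. */
#include <linux/dm-kcopyd.h>

static void my_notify(int read_err, unsigned long write_err, void *ctx)
{
	/* Invoked from the kcopyd workqueue, like copy completions. */
}

static void demo_deferred_callback(struct dm_kcopyd_client *kc)
{
	void *j = dm_kcopyd_prepare_callback(kc, my_notify, NULL);

	/* ... later, possibly from interrupt context ... */
	dm_kcopyd_do_callback(j, 0 /* read_err */, 0 /* write_err */);
}
```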
890 | ||
1da177e4 LT |
891 | /* |
892 | * Cancels a kcopyd job, e.g. someone might be deactivating a | |
893 | * mirror. | |
894 | */ | |
0b56306e | 895 | #if 0 |
1da177e4 LT |
896 | int kcopyd_cancel(struct kcopyd_job *job, int block) |
897 | { | |
898 | /* FIXME: finish */ | |
899 | return -1; | |
900 | } | |
0b56306e | 901 | #endif /* 0 */ |
1da177e4 LT |
902 | |
903 | /*----------------------------------------------------------------- | |
945fa4d2 | 904 | * Client setup |
1da177e4 | 905 | *---------------------------------------------------------------*/ |
df5d2e90 | 906 | struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) |
1da177e4 | 907 | { |
6f1c819c | 908 | int r; |
c663e040 | 909 | unsigned reserve_pages; |
eb69aca5 | 910 | struct dm_kcopyd_client *kc; |
1da177e4 | 911 | |
d3775354 | 912 | kc = kzalloc(sizeof(*kc), GFP_KERNEL); |
945fa4d2 | 913 | if (!kc) |
fa34ce73 | 914 | return ERR_PTR(-ENOMEM); |
1da177e4 | 915 | |
8c0cbc2f | 916 | spin_lock_init(&kc->job_lock); |
d7e6b8df | 917 | INIT_LIST_HEAD(&kc->callback_jobs); |
8c0cbc2f MP |
918 | INIT_LIST_HEAD(&kc->complete_jobs); |
919 | INIT_LIST_HEAD(&kc->io_jobs); | |
920 | INIT_LIST_HEAD(&kc->pages_jobs); | |
df5d2e90 | 921 | kc->throttle = throttle; |
8c0cbc2f | 922 | |
6f1c819c KO |
923 | r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache); |
924 | if (r) | |
945fa4d2 | 925 | goto bad_slab; |
08d8757a | 926 | |
8c0cbc2f | 927 | INIT_WORK(&kc->kcopyd_work, do_work); |
670368a8 | 928 | kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0); |
6f1c819c KO |
929 | if (!kc->kcopyd_wq) { |
930 | r = -ENOMEM; | |
945fa4d2 | 931 | goto bad_workqueue; |
6f1c819c | 932 | } |
8c0cbc2f | 933 | |
c663e040 NT |
934 | kc->sub_job_size = dm_get_kcopyd_subjob_size(); |
935 | reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE); | |
936 | ||
1da177e4 | 937 | kc->pages = NULL; |
d0471458 | 938 | kc->nr_reserved_pages = kc->nr_free_pages = 0; |
c663e040 | 939 | r = client_reserve_pages(kc, reserve_pages); |
945fa4d2 MP |
940 | if (r) |
941 | goto bad_client_pages; | |
1da177e4 | 942 | |
bda8efec | 943 | kc->io_client = dm_io_client_create(); |
373a392b MB |
944 | if (IS_ERR(kc->io_client)) { |
945 | r = PTR_ERR(kc->io_client); | |
945fa4d2 | 946 | goto bad_io_client; |
1da177e4 LT |
947 | } |
948 | ||
138728dc AK |
949 | init_waitqueue_head(&kc->destroyq); |
950 | atomic_set(&kc->nr_jobs, 0); | |
951 | ||
fa34ce73 | 952 | return kc; |
945fa4d2 MP |
953 | |
954 | bad_io_client: | |
955 | client_free_pages(kc); | |
956 | bad_client_pages: | |
957 | destroy_workqueue(kc->kcopyd_wq); | |
958 | bad_workqueue: | |
6f1c819c | 959 | mempool_exit(&kc->job_pool); |
945fa4d2 MP |
960 | bad_slab: |
961 | kfree(kc); | |
962 | ||
fa34ce73 | 963 | return ERR_PTR(r); |
1da177e4 | 964 | } |
eb69aca5 | 965 | EXPORT_SYMBOL(dm_kcopyd_client_create); |
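A worked instance of the `reserve_pages` arithmetic in `dm_kcopyd_client_create()`, assuming 4 KiB pages and the default 512 KB (1024-sector) sub-job size:

```c
/* Worked example of the reserve_pages computation, assuming PAGE_SIZE
 * is 4096 and the default sub-job size of 1024 sectors. */
#include <stdio.h>

#define SECTOR_SHIFT 9
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned sub_job_size = 1024;	/* sectors */
	unsigned page_size = 4096;
	unsigned reserve_pages =
		DIV_ROUND_UP(sub_job_size << SECTOR_SHIFT, page_size);

	/* 1024 sectors * 512 bytes = 524288 bytes -> 128 pages */
	printf("reserve %u pages per client\n", reserve_pages);
	return 0;
}
```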
1da177e4 | 966 | |
eb69aca5 | 967 | void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) |
1da177e4 | 968 | { |
138728dc AK |
969 | /* Wait for completion of all jobs submitted by this client. */ |
970 | wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); | |
971 | ||
d7e6b8df | 972 | BUG_ON(!list_empty(&kc->callback_jobs)); |
8c0cbc2f MP |
973 | BUG_ON(!list_empty(&kc->complete_jobs)); |
974 | BUG_ON(!list_empty(&kc->io_jobs)); | |
975 | BUG_ON(!list_empty(&kc->pages_jobs)); | |
976 | destroy_workqueue(kc->kcopyd_wq); | |
373a392b | 977 | dm_io_client_destroy(kc->io_client); |
1da177e4 | 978 | client_free_pages(kc); |
6f1c819c | 979 | mempool_exit(&kc->job_pool); |
1da177e4 | 980 | kfree(kc); |
1da177e4 | 981 | } |
eb69aca5 | 982 | EXPORT_SYMBOL(dm_kcopyd_client_destroy); |