// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Kcopyd provides a simple interface for copying an area of one
 * block-device to one or more other block-devices, with an asynchronous
 * completion notification.
 */

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/device-mapper.h>
#include <linux/dm-kcopyd.h>

#include "dm-core.h"

#define SPLIT_COUNT	8
#define MIN_JOBS	8

#define DEFAULT_SUB_JOB_SIZE_KB	512
#define MAX_SUB_JOB_SIZE_KB	1024

static unsigned int kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;

module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");

static unsigned int dm_get_kcopyd_subjob_size(void)
{
	unsigned int sub_job_size_kb;

	sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb,
						DEFAULT_SUB_JOB_SIZE_KB,
						MAX_SUB_JOB_SIZE_KB);

	return sub_job_size_kb << 1;
}
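
/*
 * Worked example (illustrative comment, not in the original source): the
 * module parameter is expressed in kilobytes; __dm_get_module_param()
 * substitutes DEFAULT_SUB_JOB_SIZE_KB when the value is 0 and caps it at
 * MAX_SUB_JOB_SIZE_KB.  The "<< 1" then converts kilobytes to 512-byte
 * sectors, so the default of 512 KB yields a sub-job size of 1024 sectors
 * and the maximum of 1024 KB yields 2048 sectors.
 */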

/*
 *----------------------------------------------------------------
 * Each kcopyd client has its own little pool of preallocated
 * pages for kcopyd io.
 *---------------------------------------------------------------
 */
struct dm_kcopyd_client {
	struct page_list *pages;
	unsigned int nr_reserved_pages;
	unsigned int nr_free_pages;
	unsigned int sub_job_size;

	struct dm_io_client *io_client;

	wait_queue_head_t destroyq;

	mempool_t job_pool;

	struct workqueue_struct *kcopyd_wq;
	struct work_struct kcopyd_work;

	struct dm_kcopyd_throttle *throttle;

	atomic_t nr_jobs;

	/*
	 * We maintain four lists of jobs:
	 *
	 * i)   jobs waiting for pages
	 * ii)  jobs that have pages, and are waiting for the io to be issued.
	 * iii) jobs that don't need to do any IO and just run a callback
	 * iv)  jobs that have completed.
	 *
	 * All four of these are protected by job_lock.
	 */
	spinlock_t job_lock;
	struct list_head callback_jobs;
	struct list_head complete_jobs;
	struct list_head io_jobs;
	struct list_head pages_jobs;
};

static struct page_list zero_page_list;

static DEFINE_SPINLOCK(throttle_spinlock);

/*
 * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
 * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
 * by 2.
 */
#define ACCOUNT_INTERVAL_SHIFT	SHIFT_HZ

/*
 * Sleep this number of microseconds.
 *
 * The value was decided experimentally.
 * Smaller values seem to cause an increased copy rate above the limit.
 * The reason for this is unknown but possibly due to jiffies rounding errors
 * or read/write cache inside the disk.
 */
#define SLEEP_USEC	100000

/*
 * Maximum number of sleep events. There is a theoretical livelock if more
 * kcopyd clients do work simultaneously which this limit avoids.
 */
#define MAX_SLEEPS	10

static void io_job_start(struct dm_kcopyd_throttle *t)
{
	unsigned int throttle, now, difference;
	int slept = 0, skew;

	if (unlikely(!t))
		return;

try_again:
	spin_lock_irq(&throttle_spinlock);

	throttle = READ_ONCE(t->throttle);

	if (likely(throttle >= 100))
		goto skip_limit;

	now = jiffies;
	difference = now - t->last_jiffies;
	t->last_jiffies = now;
	if (t->num_io_jobs)
		t->io_period += difference;
	t->total_period += difference;

	/*
	 * Maintain sane values if we got a temporary overflow.
	 */
	if (unlikely(t->io_period > t->total_period))
		t->io_period = t->total_period;

	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
		t->total_period >>= shift;
		t->io_period >>= shift;
	}

	skew = t->io_period - throttle * t->total_period / 100;

	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
		slept++;
		spin_unlock_irq(&throttle_spinlock);
		fsleep(SLEEP_USEC);
		goto try_again;
	}

skip_limit:
	t->num_io_jobs++;

	spin_unlock_irq(&throttle_spinlock);
}
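
/*
 * Worked example (illustrative) of the throttling decision above: with
 * t->throttle set to 20 (i.e. at most 20% of the time may be spent doing
 * I/O), total_period = 1000 jiffies and io_period = 300 jiffies, the skew
 * is 300 - 20 * 1000 / 100 = 100 > 0, so the caller sleeps for SLEEP_USEC
 * and re-evaluates, up to MAX_SLEEPS times, before starting the I/O.
 */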

static void io_job_finish(struct dm_kcopyd_throttle *t)
{
	unsigned long flags;

	if (unlikely(!t))
		return;

	spin_lock_irqsave(&throttle_spinlock, flags);

	t->num_io_jobs--;

	if (likely(READ_ONCE(t->throttle) >= 100))
		goto skip_limit;

	if (!t->num_io_jobs) {
		unsigned int now, difference;

		now = jiffies;
		difference = now - t->last_jiffies;
		t->last_jiffies = now;

		t->io_period += difference;
		t->total_period += difference;

		/*
		 * Maintain sane values if we got a temporary overflow.
		 */
		if (unlikely(t->io_period > t->total_period))
			t->io_period = t->total_period;
	}

skip_limit:
	spin_unlock_irqrestore(&throttle_spinlock, flags);
}


static void wake(struct dm_kcopyd_client *kc)
{
	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
}

/*
 * Obtain one page for the use of kcopyd.
 */
static struct page_list *alloc_pl(gfp_t gfp)
{
	struct page_list *pl;

	pl = kmalloc(sizeof(*pl), gfp);
	if (!pl)
		return NULL;

	pl->page = alloc_page(gfp | __GFP_HIGHMEM);
	if (!pl->page) {
		kfree(pl);
		return NULL;
	}

	return pl;
}

static void free_pl(struct page_list *pl)
{
	__free_page(pl->page);
	kfree(pl);
}

/*
 * Add the provided pages to a client's free page list, releasing
 * back to the system any beyond the reserved_pages limit.
 */
static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
{
	struct page_list *next;

	do {
		next = pl->next;

		if (kc->nr_free_pages >= kc->nr_reserved_pages)
			free_pl(pl);
		else {
			pl->next = kc->pages;
			kc->pages = pl;
			kc->nr_free_pages++;
		}

		pl = next;
	} while (pl);
}

static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
			    unsigned int nr, struct page_list **pages)
{
	struct page_list *pl;

	*pages = NULL;

	do {
		pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
		if (unlikely(!pl)) {
			/* Use reserved pages */
			pl = kc->pages;
			if (unlikely(!pl))
				goto out_of_memory;
			kc->pages = pl->next;
			kc->nr_free_pages--;
		}
		pl->next = *pages;
		*pages = pl;
	} while (--nr);

	return 0;

out_of_memory:
	if (*pages)
		kcopyd_put_pages(kc, *pages);
	return -ENOMEM;
}

/*
 * These three functions resize the page pool.
 */
static void drop_pages(struct page_list *pl)
{
	struct page_list *next;

	while (pl) {
		next = pl->next;
		free_pl(pl);
		pl = next;
	}
}

/*
 * Allocate and reserve nr_pages for the use of a specific client.
 */
static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned int nr_pages)
{
	unsigned int i;
	struct page_list *pl = NULL, *next;

	for (i = 0; i < nr_pages; i++) {
		next = alloc_pl(GFP_KERNEL);
		if (!next) {
			if (pl)
				drop_pages(pl);
			return -ENOMEM;
		}
		next->next = pl;
		pl = next;
	}

	kc->nr_reserved_pages += nr_pages;
	kcopyd_put_pages(kc, pl);

	return 0;
}

static void client_free_pages(struct dm_kcopyd_client *kc)
{
	BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);
	drop_pages(kc->pages);
	kc->pages = NULL;
	kc->nr_free_pages = kc->nr_reserved_pages = 0;
}

/*
 *---------------------------------------------------------------
 * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
 * for this reason we use a mempool to prevent the client from
 * ever having to do io (which could cause a deadlock).
 *---------------------------------------------------------------
 */
struct kcopyd_job {
	struct dm_kcopyd_client *kc;
	struct list_head list;
	unsigned int flags;

	/*
	 * Error state of the job.
	 */
	int read_err;
	unsigned long write_err;

	/*
	 * REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES.
	 */
	enum req_op op;
	struct dm_io_region source;

	/*
	 * The destinations for the transfer.
	 */
	unsigned int num_dests;
	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

	struct page_list *pages;

	/*
	 * Set this to ensure you are notified when the job has
	 * completed. 'context' is for callback to use.
	 */
	dm_kcopyd_notify_fn fn;
	void *context;

	/*
	 * These fields are only used if the job has been split
	 * into more manageable parts.
	 */
	struct mutex lock;
	atomic_t sub_jobs;
	sector_t progress;
	sector_t write_offset;

	struct kcopyd_job *master_job;
};

static struct kmem_cache *_job_cache;

int __init dm_kcopyd_init(void)
{
	_job_cache = kmem_cache_create("kcopyd_job",
				sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
				__alignof__(struct kcopyd_job), 0, NULL);
	if (!_job_cache)
		return -ENOMEM;

	zero_page_list.next = &zero_page_list;
	zero_page_list.page = ZERO_PAGE(0);

	return 0;
}

void dm_kcopyd_exit(void)
{
	kmem_cache_destroy(_job_cache);
	_job_cache = NULL;
}

/*
 * Functions to push and pop a job onto the head of a given job
 * list.
 */
static struct kcopyd_job *pop_io_job(struct list_head *jobs,
				     struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *job;

	/*
	 * For I/O jobs, pop any read, any write without sequential write
	 * constraint and sequential writes that are at the right position.
	 */
	list_for_each_entry(job, jobs, list) {
		if (job->op == REQ_OP_READ ||
		    !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
			list_del(&job->list);
			return job;
		}

		if (job->write_offset == job->master_job->write_offset) {
			job->master_job->write_offset += job->source.count;
			list_del(&job->list);
			return job;
		}
	}

	return NULL;
}
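
/*
 * Illustrative note on the sequential-write constraint above: when
 * DM_KCOPYD_WRITE_SEQ is set (e.g. for a host-managed zoned destination),
 * the sub-jobs' reads may finish in any order, but pop_io_job() only
 * dequeues the write whose write_offset matches the master job's current
 * write_offset.  The master's write_offset then advances by that sub-job's
 * size, so writes are issued strictly in ascending sector order.
 */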

static struct kcopyd_job *pop(struct list_head *jobs,
			      struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *job = NULL;

	spin_lock_irq(&kc->job_lock);

	if (!list_empty(jobs)) {
		if (jobs == &kc->io_jobs)
			job = pop_io_job(jobs, kc);
		else {
			job = list_entry(jobs->next, struct kcopyd_job, list);
			list_del(&job->list);
		}
	}
	spin_unlock_irq(&kc->job_lock);

	return job;
}

static void push(struct list_head *jobs, struct kcopyd_job *job)
{
	unsigned long flags;
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irqsave(&kc->job_lock, flags);
	list_add_tail(&job->list, jobs);
	spin_unlock_irqrestore(&kc->job_lock, flags);
}


static void push_head(struct list_head *jobs, struct kcopyd_job *job)
{
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irq(&kc->job_lock);
	list_add(&job->list, jobs);
	spin_unlock_irq(&kc->job_lock);
}

/*
 * These three functions process 1 item from the corresponding
 * job list.
 *
 * They return:
 * < 0: error
 *   0: success
 * > 0: can't process yet.
 */
static int run_complete_job(struct kcopyd_job *job)
{
	void *context = job->context;
	int read_err = job->read_err;
	unsigned long write_err = job->write_err;
	dm_kcopyd_notify_fn fn = job->fn;
	struct dm_kcopyd_client *kc = job->kc;

	if (job->pages && job->pages != &zero_page_list)
		kcopyd_put_pages(kc, job->pages);
	/*
	 * If this is the master job, the sub jobs have already
	 * completed so we can free everything.
	 */
	if (job->master_job == job) {
		mutex_destroy(&job->lock);
		mempool_free(job, &kc->job_pool);
	}
	fn(read_err, write_err, context);

	if (atomic_dec_and_test(&kc->nr_jobs))
		wake_up(&kc->destroyq);

	cond_resched();

	return 0;
}

static void complete_io(unsigned long error, void *context)
{
	struct kcopyd_job *job = (struct kcopyd_job *) context;
	struct dm_kcopyd_client *kc = job->kc;

	io_job_finish(kc->throttle);

	if (error) {
		if (op_is_write(job->op))
			job->write_err |= error;
		else
			job->read_err = 1;

		if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) {
			push(&kc->complete_jobs, job);
			wake(kc);
			return;
		}
	}

	if (op_is_write(job->op))
		push(&kc->complete_jobs, job);

	else {
		job->op = REQ_OP_WRITE;
		push(&kc->io_jobs, job);
	}

	wake(kc);
}
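
/*
 * Illustrative note: a copy job passes through dm_io() twice.  It is first
 * issued as REQ_OP_READ against the source; when that read completes,
 * complete_io() above flips the job to REQ_OP_WRITE and requeues it on
 * io_jobs, so the same pages are then written to every destination.  Only
 * after the write completes (or on error) does the job reach complete_jobs.
 */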

/*
 * Request io on as many buffer heads as we can currently get for
 * a particular job.
 */
static int run_io_job(struct kcopyd_job *job)
{
	int r;
	struct dm_io_request io_req = {
		.bi_opf = job->op,
		.mem.type = DM_IO_PAGE_LIST,
		.mem.ptr.pl = job->pages,
		.mem.offset = 0,
		.notify.fn = complete_io,
		.notify.context = job,
		.client = job->kc->io_client,
	};

	/*
	 * If we need to write sequentially and some reads or writes failed,
	 * no point in continuing.
	 */
	if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
	    job->master_job->write_err) {
		job->write_err = job->master_job->write_err;
		return -EIO;
	}

	io_job_start(job->kc->throttle);

	if (job->op == REQ_OP_READ)
		r = dm_io(&io_req, 1, &job->source, NULL);
	else
		r = dm_io(&io_req, job->num_dests, job->dests, NULL);

	return r;
}

static int run_pages_job(struct kcopyd_job *job)
{
	int r;
	unsigned int nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);

	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
	if (!r) {
		/* this job is ready for io */
		push(&job->kc->io_jobs, job);
		return 0;
	}

	if (r == -ENOMEM)
		/* can't complete now */
		return 1;

	return r;
}
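
/*
 * Worked example (illustrative, assuming 4 KiB pages): PAGE_SIZE >> 9 is
 * the number of 512-byte sectors per page, i.e. 8.  A sub-job covering the
 * default 1024 sectors therefore needs dm_div_up(1024, 8) = 128 pages,
 * which matches the reserve_pages computed in dm_kcopyd_client_create().
 */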

/*
 * Run through a list for as long as possible.  Returns the count
 * of successful jobs.
 */
static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
			int (*fn)(struct kcopyd_job *))
{
	struct kcopyd_job *job;
	int r, count = 0;

	while ((job = pop(jobs, kc))) {

		r = fn(job);

		if (r < 0) {
			/* error this rogue job */
			if (op_is_write(job->op))
				job->write_err = (unsigned long) -1L;
			else
				job->read_err = 1;
			push(&kc->complete_jobs, job);
			wake(kc);
			break;
		}

		if (r > 0) {
			/*
			 * We couldn't service this job ATM, so
			 * push this job back onto the list.
			 */
			push_head(jobs, job);
			break;
		}

		count++;
	}

	return count;
}

/*
 * kcopyd does this every time it's woken up.
 */
static void do_work(struct work_struct *work)
{
	struct dm_kcopyd_client *kc = container_of(work,
					struct dm_kcopyd_client, kcopyd_work);
	struct blk_plug plug;

	/*
	 * The order that these are called is *very* important.
	 * complete jobs can free some pages for pages jobs.
	 * Pages jobs when successful will jump onto the io jobs
	 * list.  io jobs call wake when they complete and it all
	 * starts again.
	 */
	spin_lock_irq(&kc->job_lock);
	list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
	spin_unlock_irq(&kc->job_lock);

	blk_start_plug(&plug);
	process_jobs(&kc->complete_jobs, kc, run_complete_job);
	process_jobs(&kc->pages_jobs, kc, run_pages_job);
	process_jobs(&kc->io_jobs, kc, run_io_job);
	blk_finish_plug(&plug);
}

/*
 * If we are copying a small region we just dispatch a single job
 * to do the copy, otherwise the io has to be split up into many
 * jobs.
 */
static void dispatch_job(struct kcopyd_job *job)
{
	struct dm_kcopyd_client *kc = job->kc;
	atomic_inc(&kc->nr_jobs);
	if (unlikely(!job->source.count))
		push(&kc->callback_jobs, job);
	else if (job->pages == &zero_page_list)
		push(&kc->io_jobs, job);
	else
		push(&kc->pages_jobs, job);
	wake(kc);
}
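
/*
 * Worked example (illustrative): with the default sub_job_size of 1024
 * sectors, a request to copy 10000 sectors is not dispatched as a single
 * job.  split_job() below fans it out into SPLIT_COUNT (8) sub-jobs that
 * run concurrently; each sub-job that finishes grabs the next chunk of at
 * most 1024 sectors via segment_complete() until job->progress reaches
 * 10000, and the master job's callback runs only after the last sub-job
 * completes.
 */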

static void segment_complete(int read_err, unsigned long write_err,
			     void *context)
{
	/* FIXME: tidy this function */
	sector_t progress = 0;
	sector_t count = 0;
	struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
	struct kcopyd_job *job = sub_job->master_job;
	struct dm_kcopyd_client *kc = job->kc;

	mutex_lock(&job->lock);

	/* update the error */
	if (read_err)
		job->read_err = 1;

	if (write_err)
		job->write_err |= write_err;

	/*
	 * Only dispatch more work if there hasn't been an error.
	 */
	if ((!job->read_err && !job->write_err) ||
	    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
		/* get the next chunk of work */
		progress = job->progress;
		count = job->source.count - progress;
		if (count) {
			if (count > kc->sub_job_size)
				count = kc->sub_job_size;

			job->progress += count;
		}
	}
	mutex_unlock(&job->lock);

	if (count) {
		int i;

		*sub_job = *job;
		sub_job->write_offset = progress;
		sub_job->source.sector += progress;
		sub_job->source.count = count;

		for (i = 0; i < job->num_dests; i++) {
			sub_job->dests[i].sector += progress;
			sub_job->dests[i].count = count;
		}

		sub_job->fn = segment_complete;
		sub_job->context = sub_job;
		dispatch_job(sub_job);

	} else if (atomic_dec_and_test(&job->sub_jobs)) {

		/*
		 * Queue the completion callback to the kcopyd thread.
		 *
		 * Some callers assume that all the completions are called
		 * from a single thread and don't race with each other.
		 *
		 * We must not call the callback directly here because this
		 * code may not be executing in the thread.
		 */
		push(&kc->complete_jobs, job);
		wake(kc);
	}
}

/*
 * Create some sub jobs to share the work between them.
 */
static void split_job(struct kcopyd_job *master_job)
{
	int i;

	atomic_inc(&master_job->kc->nr_jobs);

	atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
	for (i = 0; i < SPLIT_COUNT; i++) {
		master_job[i + 1].master_job = master_job;
		segment_complete(0, 0u, &master_job[i + 1]);
	}
}

void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		    unsigned int num_dests, struct dm_io_region *dests,
		    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;
	int i;

	/*
	 * Allocate an array of jobs consisting of one master job
	 * followed by SPLIT_COUNT sub jobs.
	 */
	job = mempool_alloc(&kc->job_pool, GFP_NOIO);
	mutex_init(&job->lock);

	/*
	 * set up for the read.
	 */
	job->kc = kc;
	job->flags = flags;
	job->read_err = 0;
	job->write_err = 0;

	job->num_dests = num_dests;
	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

	/*
	 * If one of the destinations is a host-managed zoned block device,
	 * we need to write sequentially. If one of the destinations is a
	 * host-aware device, then leave it to the caller to choose what to do.
	 */
	if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
		for (i = 0; i < job->num_dests; i++) {
			if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
				job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
				break;
			}
		}
	}

	/*
	 * If we need to write sequentially, errors cannot be ignored.
	 */
	if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
	    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
		job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);

	if (from) {
		job->source = *from;
		job->pages = NULL;
		job->op = REQ_OP_READ;
	} else {
		memset(&job->source, 0, sizeof job->source);
		job->source.count = job->dests[0].count;
		job->pages = &zero_page_list;

		/*
		 * Use WRITE ZEROES to optimize zeroing if all dests support it.
		 */
		job->op = REQ_OP_WRITE_ZEROES;
		for (i = 0; i < job->num_dests; i++)
			if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
				job->op = REQ_OP_WRITE;
				break;
			}
	}

	job->fn = fn;
	job->context = context;
	job->master_job = job;
	job->write_offset = 0;

	if (job->source.count <= kc->sub_job_size)
		dispatch_job(job);
	else {
		job->progress = 0;
		split_job(job);
	}
}
EXPORT_SYMBOL(dm_kcopyd_copy);
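
/*
 * Minimal usage sketch (illustrative only, not part of this file): a
 * hypothetical caller copying one region to a single destination and
 * waiting synchronously for the result.  The devices, offsets and function
 * names are placeholders; real users keep the client around for many copies.
 */
#if 0
static void example_copy_done(int read_err, unsigned long write_err,
			      void *context)
{
	struct completion *done = context;

	/* Runs from the kcopyd workqueue once all destinations are written. */
	if (read_err || write_err)
		pr_err("dm-kcopyd example: copy failed\n");
	complete(done);
}

static int example_copy(struct block_device *src_bdev,
			struct block_device *dst_bdev)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct dm_kcopyd_client *kc;
	struct dm_io_region from, to;

	kc = dm_kcopyd_client_create(NULL);	/* NULL: no throttling */
	if (IS_ERR(kc))
		return PTR_ERR(kc);

	from.bdev = src_bdev;
	from.sector = 0;
	from.count = 2048;		/* 1 MiB in 512-byte sectors */

	to.bdev = dst_bdev;
	to.sector = 0;
	to.count = from.count;

	dm_kcopyd_copy(kc, &from, 1, &to, 0, example_copy_done, &done);
	wait_for_completion(&done);

	dm_kcopyd_client_destroy(kc);
	return 0;
}
#endif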

void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
		    unsigned int num_dests, struct dm_io_region *dests,
		    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
	dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
}
EXPORT_SYMBOL(dm_kcopyd_zero);

void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
				 dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;

	job = mempool_alloc(&kc->job_pool, GFP_NOIO);

	memset(job, 0, sizeof(struct kcopyd_job));
	job->kc = kc;
	job->fn = fn;
	job->context = context;
	job->master_job = job;

	atomic_inc(&kc->nr_jobs);

	return job;
}
EXPORT_SYMBOL(dm_kcopyd_prepare_callback);

void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
{
	struct kcopyd_job *job = j;
	struct dm_kcopyd_client *kc = job->kc;

	job->read_err = read_err;
	job->write_err = write_err;

	push(&kc->callback_jobs, job);
	wake(kc);
}
EXPORT_SYMBOL(dm_kcopyd_do_callback);
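
/*
 * Usage sketch for the two helpers above (illustrative only, not part of
 * this file): queue a callback that runs from the kcopyd workqueue,
 * serialised with the client's copy completions, without issuing any I/O.
 * The function names are placeholders.
 */
#if 0
static void example_event_done(int read_err, unsigned long write_err,
			       void *context)
{
	/* Invoked from the kcopyd workqueue, like a normal copy callback. */
}

static void example_queue_event(struct dm_kcopyd_client *kc)
{
	void *token;

	/* GFP_NOIO allocation: may sleep, so prepare outside atomic context. */
	token = dm_kcopyd_prepare_callback(kc, example_event_done, NULL);

	/* ...later, e.g. from an I/O completion path... */
	dm_kcopyd_do_callback(token, 0, 0);
}
#endif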

/*
 * Cancels a kcopyd job, e.g. someone might be deactivating a
 * mirror.
 */
#if 0
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
	/* FIXME: finish */
	return -1;
}
#endif /* 0 */

/*
 *---------------------------------------------------------------
 * Client setup
 *---------------------------------------------------------------
 */
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
	int r;
	unsigned int reserve_pages;
	struct dm_kcopyd_client *kc;

	kc = kzalloc(sizeof(*kc), GFP_KERNEL);
	if (!kc)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kc->job_lock);
	INIT_LIST_HEAD(&kc->callback_jobs);
	INIT_LIST_HEAD(&kc->complete_jobs);
	INIT_LIST_HEAD(&kc->io_jobs);
	INIT_LIST_HEAD(&kc->pages_jobs);
	kc->throttle = throttle;

	r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
	if (r)
		goto bad_slab;

	INIT_WORK(&kc->kcopyd_work, do_work);
	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
	if (!kc->kcopyd_wq) {
		r = -ENOMEM;
		goto bad_workqueue;
	}

	kc->sub_job_size = dm_get_kcopyd_subjob_size();
	reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);

	kc->pages = NULL;
	kc->nr_reserved_pages = kc->nr_free_pages = 0;
	r = client_reserve_pages(kc, reserve_pages);
	if (r)
		goto bad_client_pages;

	kc->io_client = dm_io_client_create();
	if (IS_ERR(kc->io_client)) {
		r = PTR_ERR(kc->io_client);
		goto bad_io_client;
	}

	init_waitqueue_head(&kc->destroyq);
	atomic_set(&kc->nr_jobs, 0);

	return kc;

bad_io_client:
	client_free_pages(kc);
bad_client_pages:
	destroy_workqueue(kc->kcopyd_wq);
bad_workqueue:
	mempool_exit(&kc->job_pool);
bad_slab:
	kfree(kc);

	return ERR_PTR(r);
}
EXPORT_SYMBOL(dm_kcopyd_client_create);

void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
{
	/* Wait for completion of all jobs submitted by this client. */
	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));

	BUG_ON(!list_empty(&kc->callback_jobs));
	BUG_ON(!list_empty(&kc->complete_jobs));
	BUG_ON(!list_empty(&kc->io_jobs));
	BUG_ON(!list_empty(&kc->pages_jobs));
	destroy_workqueue(kc->kcopyd_wq);
	dm_io_client_destroy(kc->io_client);
	client_free_pages(kc);
	mempool_exit(&kc->job_pool);
	kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);

void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
{
	flush_workqueue(kc->kcopyd_wq);
}
EXPORT_SYMBOL(dm_kcopyd_client_flush);