Commit | Line | Data |
---|---|---|
3bd94003 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
1da177e4 LT |
2 | /* |
3 | * Copyright (C) 2002 Sistina Software (UK) Limited. | |
373a392b | 4 | * Copyright (C) 2006 Red Hat GmbH |
1da177e4 LT |
5 | * |
6 | * This file is released under the GPL. | |
7 | * | |
8 | * Kcopyd provides a simple interface for copying an area of one | |
9 | * block-device to one or more other block-devices, with an asynchronous | |
10 | * completion notification. | |
11 | */ | |
12 | ||
eb69aca5 | 13 | #include <linux/types.h> |
60063497 | 14 | #include <linux/atomic.h> |
1da177e4 | 15 | #include <linux/blkdev.h> |
1da177e4 LT |
16 | #include <linux/fs.h> |
17 | #include <linux/init.h> | |
18 | #include <linux/list.h> | |
19 | #include <linux/mempool.h> | |
20 | #include <linux/module.h> | |
21 | #include <linux/pagemap.h> | |
22 | #include <linux/slab.h> | |
23 | #include <linux/vmalloc.h> | |
24 | #include <linux/workqueue.h> | |
48c9c27b | 25 | #include <linux/mutex.h> |
df5d2e90 | 26 | #include <linux/delay.h> |
586e80e6 | 27 | #include <linux/device-mapper.h> |
a765e20e | 28 | #include <linux/dm-kcopyd.h> |
1da177e4 | 29 | |
4cc96131 | 30 | #include "dm-core.h" |
1da177e4 | 31 | |
c6ea41fb MP |
32 | #define SPLIT_COUNT 8 |
33 | #define MIN_JOBS 8 | |
c663e040 NT |
34 | |
35 | #define DEFAULT_SUB_JOB_SIZE_KB 512 | |
36 | #define MAX_SUB_JOB_SIZE_KB 1024 | |
37 | ||
86a3238c | 38 | static unsigned int kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB; |
c663e040 | 39 | |
6a808034 | 40 | module_param(kcopyd_subjob_size_kb, uint, 0644); |
c663e040 NT |
41 | MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients"); |
42 | ||
86a3238c | 43 | static unsigned int dm_get_kcopyd_subjob_size(void) |
c663e040 | 44 | { |
86a3238c | 45 | unsigned int sub_job_size_kb; |
c663e040 NT |
46 | |
47 | sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb, | |
48 | DEFAULT_SUB_JOB_SIZE_KB, | |
49 | MAX_SUB_JOB_SIZE_KB); | |
50 | ||
51 | return sub_job_size_kb << 1; | |
52 | } | |
c6ea41fb | 53 | |
/*
 *----------------------------------------------------------------
 * Each kcopyd client has its own little pool of preallocated
 * pages for kcopyd io.
 *---------------------------------------------------------------
 */
struct dm_kcopyd_client {
	/* Singly linked list of free pages held in reserve for this client. */
	struct page_list *pages;
	/* Cap on how many free pages are kept on 'pages'; extras are freed. */
	unsigned int nr_reserved_pages;
	/* Number of entries currently on 'pages'. */
	unsigned int nr_free_pages;
	/* Copies larger than this many sectors are split into sub jobs. */
	unsigned int sub_job_size;

	struct dm_io_client *io_client;

	/* Woken when nr_jobs drops to zero; client destruction waits on it. */
	wait_queue_head_t destroyq;

	/* Pool from which kcopyd_job arrays are allocated. */
	mempool_t job_pool;

	/* Workqueue and work item that run do_work() for this client. */
	struct workqueue_struct *kcopyd_wq;
	struct work_struct kcopyd_work;

	/* Optional throttle; the io accounting helpers do nothing if NULL. */
	struct dm_kcopyd_throttle *throttle;

	/* Count of jobs dispatched but not yet completed. */
	atomic_t nr_jobs;

	/*
	 * We maintain four lists of jobs:
	 *
	 * i)   jobs waiting for pages
	 * ii)  jobs that have pages, and are waiting for the io to be issued.
	 * iii) jobs that don't need to do any IO and just run a callback
	 * iv)  jobs that have completed.
	 *
	 * All four of these are protected by job_lock.
	 */
	spinlock_t job_lock;
	struct list_head callback_jobs;
	struct list_head complete_jobs;
	struct list_head io_jobs;
	struct list_head pages_jobs;
};
95 | ||
/*
 * Shared page list used by zeroing jobs. dm_kcopyd_init() points it at
 * the zero page and links it to itself.
 */
static struct page_list zero_page_list;

/* Taken around the throttle accounting in io_job_start()/io_job_finish(). */
static DEFINE_SPINLOCK(throttle_spinlock);
99 | ||
100 | /* | |
101 | * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period. | |
102 | * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided | |
103 | * by 2. | |
104 | */ | |
105 | #define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ | |
106 | ||
107 | /* | |
108 | * Sleep this number of microseconds. | |
109 | * | |
110 | * The value was decided experimentally. | |
111 | * Smaller values seem to cause an increased copy rate above the limit. | |
112 | * The reason for this is unknown but possibly due to jiffies rounding errors | |
113 | * or read/write cache inside the disk. | |
114 | */ | |
238d991f | 115 | #define SLEEP_USEC 100000 |
df5d2e90 MP |
116 | |
117 | /* | |
118 | * Maximum number of sleep events. There is a theoretical livelock if several | |
119 | * kcopyd clients do work simultaneously; this limit avoids it. | |
120 | */ | |
121 | #define MAX_SLEEPS 10 | |
122 | ||
123 | static void io_job_start(struct dm_kcopyd_throttle *t) | |
124 | { | |
86a3238c | 125 | unsigned int throttle, now, difference; |
df5d2e90 MP |
126 | int slept = 0, skew; |
127 | ||
128 | if (unlikely(!t)) | |
129 | return; | |
130 | ||
131 | try_again: | |
132 | spin_lock_irq(&throttle_spinlock); | |
133 | ||
6aa7de05 | 134 | throttle = READ_ONCE(t->throttle); |
df5d2e90 MP |
135 | |
136 | if (likely(throttle >= 100)) | |
137 | goto skip_limit; | |
138 | ||
139 | now = jiffies; | |
140 | difference = now - t->last_jiffies; | |
141 | t->last_jiffies = now; | |
142 | if (t->num_io_jobs) | |
143 | t->io_period += difference; | |
144 | t->total_period += difference; | |
145 | ||
146 | /* | |
147 | * Maintain sane values if we got a temporary overflow. | |
148 | */ | |
149 | if (unlikely(t->io_period > t->total_period)) | |
150 | t->io_period = t->total_period; | |
151 | ||
152 | if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) { | |
153 | int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT); | |
0ef0b471 | 154 | |
df5d2e90 MP |
155 | t->total_period >>= shift; |
156 | t->io_period >>= shift; | |
157 | } | |
158 | ||
159 | skew = t->io_period - throttle * t->total_period / 100; | |
160 | ||
161 | if (unlikely(skew > 0) && slept < MAX_SLEEPS) { | |
162 | slept++; | |
163 | spin_unlock_irq(&throttle_spinlock); | |
238d991f | 164 | fsleep(SLEEP_USEC); |
df5d2e90 MP |
165 | goto try_again; |
166 | } | |
167 | ||
168 | skip_limit: | |
169 | t->num_io_jobs++; | |
170 | ||
171 | spin_unlock_irq(&throttle_spinlock); | |
172 | } | |
173 | ||
174 | static void io_job_finish(struct dm_kcopyd_throttle *t) | |
175 | { | |
176 | unsigned long flags; | |
177 | ||
178 | if (unlikely(!t)) | |
179 | return; | |
180 | ||
181 | spin_lock_irqsave(&throttle_spinlock, flags); | |
182 | ||
183 | t->num_io_jobs--; | |
184 | ||
6aa7de05 | 185 | if (likely(READ_ONCE(t->throttle) >= 100)) |
df5d2e90 MP |
186 | goto skip_limit; |
187 | ||
188 | if (!t->num_io_jobs) { | |
86a3238c | 189 | unsigned int now, difference; |
df5d2e90 MP |
190 | |
191 | now = jiffies; | |
192 | difference = now - t->last_jiffies; | |
193 | t->last_jiffies = now; | |
194 | ||
195 | t->io_period += difference; | |
196 | t->total_period += difference; | |
197 | ||
198 | /* | |
199 | * Maintain sane values if we got a temporary overflow. | |
200 | */ | |
201 | if (unlikely(t->io_period > t->total_period)) | |
202 | t->io_period = t->total_period; | |
203 | } | |
204 | ||
205 | skip_limit: | |
206 | spin_unlock_irqrestore(&throttle_spinlock, flags); | |
207 | } | |
208 | ||
209 | ||
8c0cbc2f MP |
210 | static void wake(struct dm_kcopyd_client *kc) |
211 | { | |
212 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); | |
213 | } | |
214 | ||
d0471458 MP |
215 | /* |
216 | * Obtain one page for the use of kcopyd. | |
217 | */ | |
f99b55ee | 218 | static struct page_list *alloc_pl(gfp_t gfp) |
1da177e4 LT |
219 | { |
220 | struct page_list *pl; | |
221 | ||
f99b55ee | 222 | pl = kmalloc(sizeof(*pl), gfp); |
1da177e4 LT |
223 | if (!pl) |
224 | return NULL; | |
225 | ||
949d49ec | 226 | pl->page = alloc_page(gfp | __GFP_HIGHMEM); |
1da177e4 LT |
227 | if (!pl->page) { |
228 | kfree(pl); | |
229 | return NULL; | |
230 | } | |
231 | ||
232 | return pl; | |
233 | } | |
234 | ||
235 | static void free_pl(struct page_list *pl) | |
236 | { | |
237 | __free_page(pl->page); | |
238 | kfree(pl); | |
239 | } | |
240 | ||
d0471458 MP |
241 | /* |
242 | * Add the provided pages to a client's free page list, releasing | |
243 | * back to the system any beyond the reserved_pages limit. | |
244 | */ | |
245 | static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) | |
1da177e4 | 246 | { |
d0471458 | 247 | struct page_list *next; |
1da177e4 | 248 | |
d0471458 MP |
249 | do { |
250 | next = pl->next; | |
1da177e4 | 251 | |
d0471458 MP |
252 | if (kc->nr_free_pages >= kc->nr_reserved_pages) |
253 | free_pl(pl); | |
254 | else { | |
255 | pl->next = kc->pages; | |
256 | kc->pages = pl; | |
257 | kc->nr_free_pages++; | |
258 | } | |
1da177e4 | 259 | |
d0471458 MP |
260 | pl = next; |
261 | } while (pl); | |
1da177e4 LT |
262 | } |
263 | ||
d0471458 MP |
264 | static int kcopyd_get_pages(struct dm_kcopyd_client *kc, |
265 | unsigned int nr, struct page_list **pages) | |
1da177e4 | 266 | { |
d0471458 | 267 | struct page_list *pl; |
1da177e4 | 268 | |
d0471458 MP |
269 | *pages = NULL; |
270 | ||
271 | do { | |
d0164adc | 272 | pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); |
d0471458 MP |
273 | if (unlikely(!pl)) { |
274 | /* Use reserved pages */ | |
275 | pl = kc->pages; | |
276 | if (unlikely(!pl)) | |
277 | goto out_of_memory; | |
278 | kc->pages = pl->next; | |
279 | kc->nr_free_pages--; | |
280 | } | |
281 | pl->next = *pages; | |
282 | *pages = pl; | |
283 | } while (--nr); | |
284 | ||
285 | return 0; | |
1da177e4 | 286 | |
d0471458 MP |
287 | out_of_memory: |
288 | if (*pages) | |
289 | kcopyd_put_pages(kc, *pages); | |
290 | return -ENOMEM; | |
1da177e4 LT |
291 | } |
292 | ||
293 | /* | |
294 | * These three functions resize the page pool. | |
295 | */ | |
296 | static void drop_pages(struct page_list *pl) | |
297 | { | |
298 | struct page_list *next; | |
299 | ||
300 | while (pl) { | |
301 | next = pl->next; | |
302 | free_pl(pl); | |
303 | pl = next; | |
304 | } | |
305 | } | |
306 | ||
d0471458 MP |
307 | /* |
308 | * Allocate and reserve nr_pages for the use of a specific client. | |
309 | */ | |
86a3238c | 310 | static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned int nr_pages) |
1da177e4 | 311 | { |
86a3238c | 312 | unsigned int i; |
1da177e4 LT |
313 | struct page_list *pl = NULL, *next; |
314 | ||
d0471458 | 315 | for (i = 0; i < nr_pages; i++) { |
f99b55ee | 316 | next = alloc_pl(GFP_KERNEL); |
1da177e4 LT |
317 | if (!next) { |
318 | if (pl) | |
319 | drop_pages(pl); | |
320 | return -ENOMEM; | |
321 | } | |
322 | next->next = pl; | |
323 | pl = next; | |
324 | } | |
325 | ||
d0471458 | 326 | kc->nr_reserved_pages += nr_pages; |
1da177e4 | 327 | kcopyd_put_pages(kc, pl); |
d0471458 | 328 | |
1da177e4 LT |
329 | return 0; |
330 | } | |
331 | ||
eb69aca5 | 332 | static void client_free_pages(struct dm_kcopyd_client *kc) |
1da177e4 | 333 | { |
d0471458 | 334 | BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); |
1da177e4 LT |
335 | drop_pages(kc->pages); |
336 | kc->pages = NULL; | |
d0471458 | 337 | kc->nr_free_pages = kc->nr_reserved_pages = 0; |
1da177e4 LT |
338 | } |
339 | ||
/*
 *---------------------------------------------------------------
 * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
 * for this reason we use a mempool to prevent the client from
 * ever having to do io (which could cause a deadlock).
 *---------------------------------------------------------------
 */
struct kcopyd_job {
	/* Client this job belongs to. */
	struct dm_kcopyd_client *kc;
	/* Links the job onto one of the client's job lists. */
	struct list_head list;
	/* Bitmask tested with BIT(DM_KCOPYD_WRITE_SEQ) / BIT(DM_KCOPYD_IGNORE_ERROR). */
	unsigned int flags;

	/*
	 * Error state of the job.
	 */
	int read_err;
	unsigned long write_err;

	/*
	 * REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES.
	 */
	enum req_op op;
	struct dm_io_region source;

	/*
	 * The destinations for the transfer.
	 */
	unsigned int num_dests;
	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

	/* Pages backing the io; &zero_page_list for zeroing jobs. */
	struct page_list *pages;

	/*
	 * Set this to ensure you are notified when the job has
	 * completed.  'context' is for callback to use.
	 */
	dm_kcopyd_notify_fn fn;
	void *context;

	/*
	 * These fields are only used if the job has been split
	 * into more manageable parts.
	 */
	struct mutex lock;
	atomic_t sub_jobs;
	sector_t progress;
	sector_t write_offset;

	/* The job this one was split from; a master job points to itself. */
	struct kcopyd_job *master_job;
};
1da177e4 | 390 | |
e18b890b | 391 | static struct kmem_cache *_job_cache; |
1da177e4 | 392 | |
945fa4d2 | 393 | int __init dm_kcopyd_init(void) |
1da177e4 | 394 | { |
c6ea41fb MP |
395 | _job_cache = kmem_cache_create("kcopyd_job", |
396 | sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), | |
397 | __alignof__(struct kcopyd_job), 0, NULL); | |
1da177e4 LT |
398 | if (!_job_cache) |
399 | return -ENOMEM; | |
400 | ||
7f069653 MP |
401 | zero_page_list.next = &zero_page_list; |
402 | zero_page_list.page = ZERO_PAGE(0); | |
403 | ||
1da177e4 LT |
404 | return 0; |
405 | } | |
406 | ||
945fa4d2 | 407 | void dm_kcopyd_exit(void) |
1da177e4 | 408 | { |
1da177e4 | 409 | kmem_cache_destroy(_job_cache); |
1da177e4 LT |
410 | _job_cache = NULL; |
411 | } | |
412 | ||
413 | /* | |
414 | * Functions to push a job onto the tail (push_head(): the head) of | |
415 | * a given job list, and to pop a job from it. | |
416 | */ | |
b73c67c2 DLM |
417 | static struct kcopyd_job *pop_io_job(struct list_head *jobs, |
418 | struct dm_kcopyd_client *kc) | |
419 | { | |
420 | struct kcopyd_job *job; | |
421 | ||
422 | /* | |
423 | * For I/O jobs, pop any read, any write without sequential write | |
424 | * constraint and sequential writes that are at the right position. | |
425 | */ | |
426 | list_for_each_entry(job, jobs, list) { | |
71f7113d BVA |
427 | if (job->op == REQ_OP_READ || |
428 | !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { | |
b73c67c2 DLM |
429 | list_del(&job->list); |
430 | return job; | |
431 | } | |
432 | ||
433 | if (job->write_offset == job->master_job->write_offset) { | |
434 | job->master_job->write_offset += job->source.count; | |
435 | list_del(&job->list); | |
436 | return job; | |
437 | } | |
438 | } | |
439 | ||
440 | return NULL; | |
441 | } | |
442 | ||
8c0cbc2f MP |
443 | static struct kcopyd_job *pop(struct list_head *jobs, |
444 | struct dm_kcopyd_client *kc) | |
1da177e4 LT |
445 | { |
446 | struct kcopyd_job *job = NULL; | |
1da177e4 | 447 | |
6bcd658f | 448 | spin_lock_irq(&kc->job_lock); |
1da177e4 LT |
449 | |
450 | if (!list_empty(jobs)) { | |
b73c67c2 DLM |
451 | if (jobs == &kc->io_jobs) |
452 | job = pop_io_job(jobs, kc); | |
453 | else { | |
454 | job = list_entry(jobs->next, struct kcopyd_job, list); | |
455 | list_del(&job->list); | |
456 | } | |
1da177e4 | 457 | } |
6bcd658f | 458 | spin_unlock_irq(&kc->job_lock); |
1da177e4 LT |
459 | |
460 | return job; | |
461 | } | |
462 | ||
028867ac | 463 | static void push(struct list_head *jobs, struct kcopyd_job *job) |
1da177e4 LT |
464 | { |
465 | unsigned long flags; | |
8c0cbc2f | 466 | struct dm_kcopyd_client *kc = job->kc; |
1da177e4 | 467 | |
8c0cbc2f | 468 | spin_lock_irqsave(&kc->job_lock, flags); |
1da177e4 | 469 | list_add_tail(&job->list, jobs); |
8c0cbc2f | 470 | spin_unlock_irqrestore(&kc->job_lock, flags); |
1da177e4 LT |
471 | } |
472 | ||
b673c3a8 KI |
473 | |
474 | static void push_head(struct list_head *jobs, struct kcopyd_job *job) | |
475 | { | |
b673c3a8 KI |
476 | struct dm_kcopyd_client *kc = job->kc; |
477 | ||
6bcd658f | 478 | spin_lock_irq(&kc->job_lock); |
b673c3a8 | 479 | list_add(&job->list, jobs); |
6bcd658f | 480 | spin_unlock_irq(&kc->job_lock); |
b673c3a8 KI |
481 | } |
482 | ||
1da177e4 LT |
483 | /* |
484 | * These three functions process 1 item from the corresponding | |
485 | * job list. | |
486 | * | |
487 | * They return: | |
488 | * < 0: error | |
489 | * 0: success | |
490 | * > 0: can't process yet. | |
491 | */ | |
492 | static int run_complete_job(struct kcopyd_job *job) | |
493 | { | |
494 | void *context = job->context; | |
495 | int read_err = job->read_err; | |
4cdc1d1f | 496 | unsigned long write_err = job->write_err; |
eb69aca5 HM |
497 | dm_kcopyd_notify_fn fn = job->fn; |
498 | struct dm_kcopyd_client *kc = job->kc; | |
1da177e4 | 499 | |
7f069653 | 500 | if (job->pages && job->pages != &zero_page_list) |
73830857 | 501 | kcopyd_put_pages(kc, job->pages); |
c6ea41fb MP |
502 | /* |
503 | * If this is the master job, the sub jobs have already | |
504 | * completed so we can free everything. | |
505 | */ | |
d5ffebdd MS |
506 | if (job->master_job == job) { |
507 | mutex_destroy(&job->lock); | |
6f1c819c | 508 | mempool_free(job, &kc->job_pool); |
d5ffebdd | 509 | } |
1da177e4 | 510 | fn(read_err, write_err, context); |
138728dc AK |
511 | |
512 | if (atomic_dec_and_test(&kc->nr_jobs)) | |
513 | wake_up(&kc->destroyq); | |
514 | ||
784c9a29 JP |
515 | cond_resched(); |
516 | ||
1da177e4 LT |
517 | return 0; |
518 | } | |
519 | ||
520 | static void complete_io(unsigned long error, void *context) | |
521 | { | |
26cb62a2 | 522 | struct kcopyd_job *job = context; |
8c0cbc2f | 523 | struct dm_kcopyd_client *kc = job->kc; |
1da177e4 | 524 | |
df5d2e90 MP |
525 | io_job_finish(kc->throttle); |
526 | ||
1da177e4 | 527 | if (error) { |
71f7113d | 528 | if (op_is_write(job->op)) |
ce503f59 | 529 | job->write_err |= error; |
1da177e4 LT |
530 | else |
531 | job->read_err = 1; | |
532 | ||
db2351eb | 533 | if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) { |
8c0cbc2f MP |
534 | push(&kc->complete_jobs, job); |
535 | wake(kc); | |
1da177e4 LT |
536 | return; |
537 | } | |
538 | } | |
539 | ||
71f7113d | 540 | if (op_is_write(job->op)) |
8c0cbc2f | 541 | push(&kc->complete_jobs, job); |
1da177e4 LT |
542 | |
543 | else { | |
71f7113d | 544 | job->op = REQ_OP_WRITE; |
8c0cbc2f | 545 | push(&kc->io_jobs, job); |
1da177e4 LT |
546 | } |
547 | ||
8c0cbc2f | 548 | wake(kc); |
1da177e4 LT |
549 | } |
550 | ||
551 | /* | |
552 | * Request io on as many buffer heads as we can currently get for | |
553 | * a particular job. | |
554 | */ | |
555 | static int run_io_job(struct kcopyd_job *job) | |
556 | { | |
557 | int r; | |
373a392b | 558 | struct dm_io_request io_req = { |
71f7113d | 559 | .bi_opf = job->op, |
373a392b MB |
560 | .mem.type = DM_IO_PAGE_LIST, |
561 | .mem.ptr.pl = job->pages, | |
4622afb3 | 562 | .mem.offset = 0, |
373a392b MB |
563 | .notify.fn = complete_io, |
564 | .notify.context = job, | |
565 | .client = job->kc->io_client, | |
566 | }; | |
1da177e4 | 567 | |
b73c67c2 DLM |
568 | /* |
569 | * If we need to write sequentially and some reads or writes failed, | |
570 | * no point in continuing. | |
571 | */ | |
db2351eb | 572 | if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && |
d1fef414 DF |
573 | job->master_job->write_err) { |
574 | job->write_err = job->master_job->write_err; | |
b73c67c2 | 575 | return -EIO; |
d1fef414 | 576 | } |
b73c67c2 | 577 | |
df5d2e90 MP |
578 | io_job_start(job->kc->throttle); |
579 | ||
71f7113d | 580 | if (job->op == REQ_OP_READ) |
6e5f0f63 | 581 | r = dm_io(&io_req, 1, &job->source, NULL, IOPRIO_DEFAULT); |
721a9602 | 582 | else |
6e5f0f63 | 583 | r = dm_io(&io_req, job->num_dests, job->dests, NULL, IOPRIO_DEFAULT); |
1da177e4 LT |
584 | |
585 | return r; | |
586 | } | |
587 | ||
588 | static int run_pages_job(struct kcopyd_job *job) | |
589 | { | |
590 | int r; | |
86a3238c | 591 | unsigned int nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); |
1da177e4 | 592 | |
5bf45a3d | 593 | r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); |
1da177e4 LT |
594 | if (!r) { |
595 | /* this job is ready for io */ | |
8c0cbc2f | 596 | push(&job->kc->io_jobs, job); |
1da177e4 LT |
597 | return 0; |
598 | } | |
599 | ||
600 | if (r == -ENOMEM) | |
601 | /* can't complete now */ | |
602 | return 1; | |
603 | ||
604 | return r; | |
605 | } | |
606 | ||
607 | /* | |
608 | * Run through a list for as long as possible. Returns the count | |
609 | * of successful jobs. | |
610 | */ | |
8c0cbc2f | 611 | static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, |
8ca817c4 | 612 | int (*fn)(struct kcopyd_job *)) |
1da177e4 LT |
613 | { |
614 | struct kcopyd_job *job; | |
615 | int r, count = 0; | |
616 | ||
8c0cbc2f | 617 | while ((job = pop(jobs, kc))) { |
1da177e4 LT |
618 | |
619 | r = fn(job); | |
620 | ||
621 | if (r < 0) { | |
622 | /* error this rogue job */ | |
71f7113d | 623 | if (op_is_write(job->op)) |
4cdc1d1f | 624 | job->write_err = (unsigned long) -1L; |
1da177e4 LT |
625 | else |
626 | job->read_err = 1; | |
8c0cbc2f | 627 | push(&kc->complete_jobs, job); |
d1fef414 | 628 | wake(kc); |
1da177e4 LT |
629 | break; |
630 | } | |
631 | ||
632 | if (r > 0) { | |
633 | /* | |
634 | * We couldn't service this job ATM, so | |
635 | * push this job back onto the list. | |
636 | */ | |
b673c3a8 | 637 | push_head(jobs, job); |
1da177e4 LT |
638 | break; |
639 | } | |
640 | ||
641 | count++; | |
642 | } | |
643 | ||
644 | return count; | |
645 | } | |
646 | ||
647 | /* | |
648 | * kcopyd does this every time it's woken up. | |
649 | */ | |
8c0cbc2f | 650 | static void do_work(struct work_struct *work) |
1da177e4 | 651 | { |
8c0cbc2f MP |
652 | struct dm_kcopyd_client *kc = container_of(work, |
653 | struct dm_kcopyd_client, kcopyd_work); | |
7eaceacc | 654 | struct blk_plug plug; |
8c0cbc2f | 655 | |
1da177e4 LT |
656 | /* |
657 | * The order that these are called is *very* important. | |
658 | * complete jobs can free some pages for pages jobs. | |
659 | * Pages jobs when successful will jump onto the io jobs | |
660 | * list. io jobs call wake when they complete and it all | |
661 | * starts again. | |
662 | */ | |
6bcd658f | 663 | spin_lock_irq(&kc->job_lock); |
d7e6b8df | 664 | list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); |
6bcd658f | 665 | spin_unlock_irq(&kc->job_lock); |
d7e6b8df | 666 | |
7eaceacc | 667 | blk_start_plug(&plug); |
8c0cbc2f MP |
668 | process_jobs(&kc->complete_jobs, kc, run_complete_job); |
669 | process_jobs(&kc->pages_jobs, kc, run_pages_job); | |
670 | process_jobs(&kc->io_jobs, kc, run_io_job); | |
7eaceacc | 671 | blk_finish_plug(&plug); |
1da177e4 LT |
672 | } |
673 | ||
674 | /* | |
675 | * If we are copying a small region we just dispatch a single job | |
676 | * to do the copy, otherwise the io has to be split up into many | |
677 | * jobs. | |
678 | */ | |
679 | static void dispatch_job(struct kcopyd_job *job) | |
680 | { | |
8c0cbc2f | 681 | struct dm_kcopyd_client *kc = job->kc; |
0ef0b471 | 682 | |
8c0cbc2f | 683 | atomic_inc(&kc->nr_jobs); |
9ca170a3 | 684 | if (unlikely(!job->source.count)) |
d7e6b8df | 685 | push(&kc->callback_jobs, job); |
7f069653 MP |
686 | else if (job->pages == &zero_page_list) |
687 | push(&kc->io_jobs, job); | |
9ca170a3 MP |
688 | else |
689 | push(&kc->pages_jobs, job); | |
8c0cbc2f | 690 | wake(kc); |
1da177e4 LT |
691 | } |
692 | ||
4cdc1d1f AK |
693 | static void segment_complete(int read_err, unsigned long write_err, |
694 | void *context) | |
1da177e4 LT |
695 | { |
696 | /* FIXME: tidy this function */ | |
697 | sector_t progress = 0; | |
698 | sector_t count = 0; | |
26cb62a2 | 699 | struct kcopyd_job *sub_job = context; |
c6ea41fb | 700 | struct kcopyd_job *job = sub_job->master_job; |
73830857 | 701 | struct dm_kcopyd_client *kc = job->kc; |
1da177e4 | 702 | |
def5b5b2 | 703 | mutex_lock(&job->lock); |
1da177e4 LT |
704 | |
705 | /* update the error */ | |
706 | if (read_err) | |
707 | job->read_err = 1; | |
708 | ||
709 | if (write_err) | |
ce503f59 | 710 | job->write_err |= write_err; |
1da177e4 LT |
711 | |
712 | /* | |
713 | * Only dispatch more work if there hasn't been an error. | |
714 | */ | |
715 | if ((!job->read_err && !job->write_err) || | |
db2351eb | 716 | job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) { |
1da177e4 LT |
717 | /* get the next chunk of work */ |
718 | progress = job->progress; | |
719 | count = job->source.count - progress; | |
720 | if (count) { | |
c663e040 NT |
721 | if (count > kc->sub_job_size) |
722 | count = kc->sub_job_size; | |
1da177e4 LT |
723 | |
724 | job->progress += count; | |
725 | } | |
726 | } | |
def5b5b2 | 727 | mutex_unlock(&job->lock); |
1da177e4 LT |
728 | |
729 | if (count) { | |
730 | int i; | |
1da177e4 LT |
731 | |
732 | *sub_job = *job; | |
b73c67c2 | 733 | sub_job->write_offset = progress; |
1da177e4 LT |
734 | sub_job->source.sector += progress; |
735 | sub_job->source.count = count; | |
736 | ||
737 | for (i = 0; i < job->num_dests; i++) { | |
738 | sub_job->dests[i].sector += progress; | |
739 | sub_job->dests[i].count = count; | |
740 | } | |
741 | ||
742 | sub_job->fn = segment_complete; | |
c6ea41fb | 743 | sub_job->context = sub_job; |
1da177e4 LT |
744 | dispatch_job(sub_job); |
745 | ||
746 | } else if (atomic_dec_and_test(&job->sub_jobs)) { | |
747 | ||
748 | /* | |
340cd444 MP |
749 | * Queue the completion callback to the kcopyd thread. |
750 | * | |
751 | * Some callers assume that all the completions are called | |
752 | * from a single thread and don't race with each other. | |
753 | * | |
754 | * We must not call the callback directly here because this | |
755 | * code may not be executing in the thread. | |
1da177e4 | 756 | */ |
340cd444 MP |
757 | push(&kc->complete_jobs, job); |
758 | wake(kc); | |
1da177e4 LT |
759 | } |
760 | } | |
761 | ||
762 | /* | |
c6ea41fb | 763 | * Create some sub jobs to share the work between them. |
1da177e4 | 764 | */ |
c6ea41fb | 765 | static void split_job(struct kcopyd_job *master_job) |
1da177e4 LT |
766 | { |
767 | int i; | |
768 | ||
c6ea41fb | 769 | atomic_inc(&master_job->kc->nr_jobs); |
340cd444 | 770 | |
c6ea41fb MP |
771 | atomic_set(&master_job->sub_jobs, SPLIT_COUNT); |
772 | for (i = 0; i < SPLIT_COUNT; i++) { | |
773 | master_job[i + 1].master_job = master_job; | |
774 | segment_complete(0, 0u, &master_job[i + 1]); | |
775 | } | |
1da177e4 LT |
776 | } |
777 | ||
7209049d MS |
778 | void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, |
779 | unsigned int num_dests, struct dm_io_region *dests, | |
780 | unsigned int flags, dm_kcopyd_notify_fn fn, void *context) | |
1da177e4 LT |
781 | { |
782 | struct kcopyd_job *job; | |
70d6c400 | 783 | int i; |
1da177e4 LT |
784 | |
785 | /* | |
c6ea41fb MP |
786 | * Allocate an array of jobs consisting of one master job |
787 | * followed by SPLIT_COUNT sub jobs. | |
1da177e4 | 788 | */ |
6f1c819c | 789 | job = mempool_alloc(&kc->job_pool, GFP_NOIO); |
d5ffebdd | 790 | mutex_init(&job->lock); |
1da177e4 LT |
791 | |
792 | /* | |
793 | * set up for the read. | |
794 | */ | |
795 | job->kc = kc; | |
796 | job->flags = flags; | |
797 | job->read_err = 0; | |
798 | job->write_err = 0; | |
1da177e4 LT |
799 | |
800 | job->num_dests = num_dests; | |
801 | memcpy(&job->dests, dests, sizeof(*dests) * num_dests); | |
802 | ||
b73c67c2 DLM |
803 | /* |
804 | * If one of the destination is a host-managed zoned block device, | |
805 | * we need to write sequentially. If one of the destination is a | |
806 | * host-aware device, then leave it to the caller to choose what to do. | |
807 | */ | |
db2351eb | 808 | if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { |
b73c67c2 | 809 | for (i = 0; i < job->num_dests; i++) { |
7437bb73 | 810 | if (bdev_is_zoned(dests[i].bdev)) { |
db2351eb | 811 | job->flags |= BIT(DM_KCOPYD_WRITE_SEQ); |
b73c67c2 DLM |
812 | break; |
813 | } | |
814 | } | |
815 | } | |
816 | ||
817 | /* | |
818 | * If we need to write sequentially, errors cannot be ignored. | |
819 | */ | |
db2351eb MP |
820 | if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) && |
821 | job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) | |
822 | job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR); | |
b73c67c2 | 823 | |
7f069653 MP |
824 | if (from) { |
825 | job->source = *from; | |
826 | job->pages = NULL; | |
71f7113d | 827 | job->op = REQ_OP_READ; |
7f069653 | 828 | } else { |
8d1058fb | 829 | memset(&job->source, 0, sizeof(job->source)); |
7f069653 MP |
830 | job->source.count = job->dests[0].count; |
831 | job->pages = &zero_page_list; | |
70d6c400 MS |
832 | |
833 | /* | |
615ec946 | 834 | * Use WRITE ZEROES to optimize zeroing if all dests support it. |
70d6c400 | 835 | */ |
71f7113d | 836 | job->op = REQ_OP_WRITE_ZEROES; |
70d6c400 | 837 | for (i = 0; i < job->num_dests; i++) |
615ec946 | 838 | if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) { |
71f7113d | 839 | job->op = REQ_OP_WRITE; |
70d6c400 MS |
840 | break; |
841 | } | |
7f069653 | 842 | } |
1da177e4 LT |
843 | |
844 | job->fn = fn; | |
845 | job->context = context; | |
c6ea41fb | 846 | job->master_job = job; |
b73c67c2 | 847 | job->write_offset = 0; |
1da177e4 | 848 | |
c663e040 | 849 | if (job->source.count <= kc->sub_job_size) |
1da177e4 | 850 | dispatch_job(job); |
1da177e4 | 851 | else { |
1da177e4 LT |
852 | job->progress = 0; |
853 | split_job(job); | |
854 | } | |
1da177e4 | 855 | } |
eb69aca5 | 856 | EXPORT_SYMBOL(dm_kcopyd_copy); |
1da177e4 | 857 | |
7209049d | 858 | void dm_kcopyd_zero(struct dm_kcopyd_client *kc, |
86a3238c HM |
859 | unsigned int num_dests, struct dm_io_region *dests, |
860 | unsigned int flags, dm_kcopyd_notify_fn fn, void *context) | |
7f069653 | 861 | { |
7209049d | 862 | dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); |
7f069653 MP |
863 | } |
864 | EXPORT_SYMBOL(dm_kcopyd_zero); | |
865 | ||
a6e50b40 MP |
866 | void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, |
867 | dm_kcopyd_notify_fn fn, void *context) | |
868 | { | |
869 | struct kcopyd_job *job; | |
870 | ||
6f1c819c | 871 | job = mempool_alloc(&kc->job_pool, GFP_NOIO); |
a6e50b40 MP |
872 | |
873 | memset(job, 0, sizeof(struct kcopyd_job)); | |
874 | job->kc = kc; | |
875 | job->fn = fn; | |
876 | job->context = context; | |
d136f2ef | 877 | job->master_job = job; |
a6e50b40 MP |
878 | |
879 | atomic_inc(&kc->nr_jobs); | |
880 | ||
881 | return job; | |
882 | } | |
883 | EXPORT_SYMBOL(dm_kcopyd_prepare_callback); | |
884 | ||
885 | void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) | |
886 | { | |
887 | struct kcopyd_job *job = j; | |
888 | struct dm_kcopyd_client *kc = job->kc; | |
889 | ||
890 | job->read_err = read_err; | |
891 | job->write_err = write_err; | |
892 | ||
d7e6b8df | 893 | push(&kc->callback_jobs, job); |
a6e50b40 MP |
894 | wake(kc); |
895 | } | |
896 | EXPORT_SYMBOL(dm_kcopyd_do_callback); | |
897 | ||
1da177e4 LT |
898 | /* |
899 | * Cancels a kcopyd job, eg. someone might be deactivating a | |
900 | * mirror. | |
901 | */ | |
/* Compiled out: unfinished stub that would unconditionally return -1. */
#if 0
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
	/* FIXME: finish */
	return -1;
}
#endif  /*  0  */
1da177e4 | 909 | |
a4a82ce3 HM |
910 | /* |
911 | *--------------------------------------------------------------- | |
945fa4d2 | 912 | * Client setup |
a4a82ce3 HM |
913 | *--------------------------------------------------------------- |
914 | */ | |
df5d2e90 | 915 | struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) |
1da177e4 | 916 | { |
6f1c819c | 917 | int r; |
86a3238c | 918 | unsigned int reserve_pages; |
eb69aca5 | 919 | struct dm_kcopyd_client *kc; |
1da177e4 | 920 | |
d3775354 | 921 | kc = kzalloc(sizeof(*kc), GFP_KERNEL); |
945fa4d2 | 922 | if (!kc) |
fa34ce73 | 923 | return ERR_PTR(-ENOMEM); |
1da177e4 | 924 | |
8c0cbc2f | 925 | spin_lock_init(&kc->job_lock); |
d7e6b8df | 926 | INIT_LIST_HEAD(&kc->callback_jobs); |
8c0cbc2f MP |
927 | INIT_LIST_HEAD(&kc->complete_jobs); |
928 | INIT_LIST_HEAD(&kc->io_jobs); | |
929 | INIT_LIST_HEAD(&kc->pages_jobs); | |
df5d2e90 | 930 | kc->throttle = throttle; |
8c0cbc2f | 931 | |
6f1c819c KO |
932 | r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache); |
933 | if (r) | |
945fa4d2 | 934 | goto bad_slab; |
08d8757a | 935 | |
8c0cbc2f | 936 | INIT_WORK(&kc->kcopyd_work, do_work); |
670368a8 | 937 | kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0); |
6f1c819c KO |
938 | if (!kc->kcopyd_wq) { |
939 | r = -ENOMEM; | |
945fa4d2 | 940 | goto bad_workqueue; |
6f1c819c | 941 | } |
8c0cbc2f | 942 | |
c663e040 NT |
943 | kc->sub_job_size = dm_get_kcopyd_subjob_size(); |
944 | reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE); | |
945 | ||
1da177e4 | 946 | kc->pages = NULL; |
d0471458 | 947 | kc->nr_reserved_pages = kc->nr_free_pages = 0; |
c663e040 | 948 | r = client_reserve_pages(kc, reserve_pages); |
945fa4d2 MP |
949 | if (r) |
950 | goto bad_client_pages; | |
1da177e4 | 951 | |
bda8efec | 952 | kc->io_client = dm_io_client_create(); |
373a392b MB |
953 | if (IS_ERR(kc->io_client)) { |
954 | r = PTR_ERR(kc->io_client); | |
945fa4d2 | 955 | goto bad_io_client; |
1da177e4 LT |
956 | } |
957 | ||
138728dc AK |
958 | init_waitqueue_head(&kc->destroyq); |
959 | atomic_set(&kc->nr_jobs, 0); | |
960 | ||
fa34ce73 | 961 | return kc; |
945fa4d2 MP |
962 | |
963 | bad_io_client: | |
964 | client_free_pages(kc); | |
965 | bad_client_pages: | |
966 | destroy_workqueue(kc->kcopyd_wq); | |
967 | bad_workqueue: | |
6f1c819c | 968 | mempool_exit(&kc->job_pool); |
945fa4d2 MP |
969 | bad_slab: |
970 | kfree(kc); | |
971 | ||
fa34ce73 | 972 | return ERR_PTR(r); |
1da177e4 | 973 | } |
eb69aca5 | 974 | EXPORT_SYMBOL(dm_kcopyd_client_create); |
1da177e4 | 975 | |
eb69aca5 | 976 | void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) |
1da177e4 | 977 | { |
138728dc AK |
978 | /* Wait for completion of all jobs submitted by this client. */ |
979 | wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); | |
980 | ||
d7e6b8df | 981 | BUG_ON(!list_empty(&kc->callback_jobs)); |
8c0cbc2f MP |
982 | BUG_ON(!list_empty(&kc->complete_jobs)); |
983 | BUG_ON(!list_empty(&kc->io_jobs)); | |
984 | BUG_ON(!list_empty(&kc->pages_jobs)); | |
985 | destroy_workqueue(kc->kcopyd_wq); | |
373a392b | 986 | dm_io_client_destroy(kc->io_client); |
1da177e4 | 987 | client_free_pages(kc); |
6f1c819c | 988 | mempool_exit(&kc->job_pool); |
1da177e4 | 989 | kfree(kc); |
1da177e4 | 990 | } |
eb69aca5 | 991 | EXPORT_SYMBOL(dm_kcopyd_client_destroy); |
293128b1 MP |
992 | |
993 | void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc) | |
994 | { | |
995 | flush_workqueue(kc->kcopyd_wq); | |
996 | } | |
997 | EXPORT_SYMBOL(dm_kcopyd_client_flush); |