Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cafe5635 KO |
2 | /* |
3 | * background writeback - scan btree for dirty data and write it to the backing | |
4 | * device | |
5 | * | |
6 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | |
7 | * Copyright 2012 Google, Inc. | |
8 | */ | |
9 | ||
10 | #include "bcache.h" | |
11 | #include "btree.h" | |
12 | #include "debug.h" | |
279afbad | 13 | #include "writeback.h" |
cafe5635 | 14 | |
5e6926da | 15 | #include <linux/delay.h> |
5e6926da | 16 | #include <linux/kthread.h> |
e6017571 | 17 | #include <linux/sched/clock.h> |
c37511b8 KO |
18 | #include <trace/events/bcache.h> |
19 | ||
7a671d8e CL |
20 | static void update_gc_after_writeback(struct cache_set *c) |
21 | { | |
22 | if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || | |
23 | c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) | |
24 | return; | |
25 | ||
26 | c->gc_after_writeback |= BCH_DO_AUTO_GC; | |
27 | } | |
28 | ||
cafe5635 | 29 | /* Rate limiting */ |
616486ab | 30 | static uint64_t __calc_target_rate(struct cached_dev *dc) |
cafe5635 KO |
31 | { |
32 | struct cache_set *c = dc->disk.c; | |
616486ab ML |
33 | |
34 | /* | |
35 | * This is the size of the cache, minus the amount used for | |
36 | * flash-only devices | |
37 | */ | |
4a784266 | 38 | uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size - |
99a27d59 | 39 | atomic_long_read(&c->flash_dev_dirty_sectors); |
616486ab ML |
40 | |
41 | /* | |
42 | * Unfortunately there is no control of global dirty data. If the | |
43 | * user states that they want 10% dirty data in the cache, and has, | |
44 | * e.g., 5 backing volumes of equal size, we try to ensure each | |
45 | * backing volume uses about 2% of the cache for dirty data. | |
46 | */ | |
47 | uint32_t bdev_share = | |
cda25b82 | 48 | div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, |
616486ab ML |
49 | c->cached_dev_sectors); |
50 | ||
cafe5635 KO |
51 | uint64_t cache_dirty_target = |
52 | div_u64(cache_sectors * dc->writeback_percent, 100); | |
cafe5635 | 53 | |
616486ab ML |
54 | /* Ensure each backing dev gets at least one dirty share */ |
55 | if (bdev_share < 1) | |
56 | bdev_share = 1; | |
57 | ||
58 | return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT; | |
59 | } | |
60 | ||
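To make the fixed-point share arithmetic above concrete, here is a minimal user-space sketch. It assumes `WRITEBACK_SHARE_SHIFT` is 14 (its value in `writeback.h`, to my reading); the device sizes and the 10% target are illustrative, not taken from a real cache set.

```c
/* Sketch of __calc_target_rate()'s share math, user space only. */
#include <stdint.h>
#include <stdio.h>

#define WRITEBACK_SHARE_SHIFT 14	/* assumed, see writeback.h */

int main(void)
{
	uint64_t cache_sectors = 500ULL << 21;		/* 500 GiB cache, 512 B sectors */
	uint64_t bdev_sectors = 1000ULL << 21;		/* this backing device: 1000 GiB */
	uint64_t cached_dev_sectors = 5 * bdev_sectors;	/* five equal backing devices */
	unsigned int writeback_percent = 10;

	/* Fixed-point share of the cache: 1 << WRITEBACK_SHARE_SHIFT is 100%. */
	uint64_t bdev_share = (bdev_sectors << WRITEBACK_SHARE_SHIFT) /
			      cached_dev_sectors;
	if (bdev_share < 1)
		bdev_share = 1;		/* every device gets a minimal share */

	uint64_t cache_dirty_target = cache_sectors * writeback_percent / 100;
	uint64_t target = (cache_dirty_target * bdev_share) >>
			  WRITEBACK_SHARE_SHIFT;

	printf("per-device dirty target: %llu sectors\n",
	       (unsigned long long)target);
	return 0;
}
```

With five equal backing devices and a 10% dirty target, each device ends up targeting roughly 2% of the cache, as the comment above says.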
61 | static void __update_writeback_rate(struct cached_dev *dc) | |
62 | { | |
1d316e65 ML |
63 | /* |
64 | * PI controller: | |
65 | * Figures out the amount that should be written per second. | |
66 | * | |
67 | * First, the error (number of sectors that are dirty beyond our | |
68 | * target) is calculated. The error is accumulated (numerically | |
69 | * integrated). | |
70 | * | |
71 | * Then, the proportional value and integral value are scaled | |
72 | * based on configured values. These are stored as inverses to | |
73 | * avoid fixed point math and to make configuration easy-- e.g. | |
74 | * the default value of 40 for writeback_rate_p_term_inverse | |
75 | * attempts to write at a rate that would retire all the dirty | |
76 | * blocks in 40 seconds. | |
77 | * | |
78 | * The writeback_rate_i_inverse value of 10000 means that 1/10000th | |
79 | * of the error is accumulated in the integral term per second. | |
80 | * This acts as a slow, long-term average that is not subject to | |
81 | * variations in usage like the p term. | |
82 | */ | |
616486ab | 83 | int64_t target = __calc_target_rate(dc); |
279afbad | 84 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
1d316e65 ML |
85 | int64_t error = dirty - target; |
86 | int64_t proportional_scaled = | |
87 | div_s64(error, dc->writeback_rate_p_term_inverse); | |
e41166c5 ML |
88 | int64_t integral_scaled; |
89 | uint32_t new_rate; | |
1d316e65 | 90 | |
71dda2a5 | 91 | /* |
92 | * We need to consider the number of dirty buckets as well | |
93 | * when calculating proportional_scaled; otherwise we might end up | |
94 | * with an unreasonably small writeback rate in a highly fragmented | |
95 | * situation where very few dirty sectors consume a lot of dirty buckets. | |
96 | * The worst case is when the dirty buckets reach cutoff_writeback_sync | |
97 | * while the dirty data has still not reached the writeback percent; | |
98 | * the rate then stays at the minimum value, which leaves writes | |
99 | * stuck in non-writeback mode. | |
100 | */ | |
101 | struct cache_set *c = dc->disk.c; | |
102 | ||
103 | int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; | |
104 | ||
105 | if (dc->writeback_consider_fragment && | |
106 | c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { | |
107 | int64_t fragment = | |
108 | div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); | |
109 | int64_t fp_term; | |
110 | int64_t fps; | |
111 | ||
112 | if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { | |
62594f18 | 113 | fp_term = (int64_t)dc->writeback_rate_fp_term_low * |
71dda2a5 | 114 | (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); |
115 | } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { | |
62594f18 | 116 | fp_term = (int64_t)dc->writeback_rate_fp_term_mid * |
71dda2a5 | 117 | (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); |
118 | } else { | |
62594f18 | 119 | fp_term = (int64_t)dc->writeback_rate_fp_term_high * |
71dda2a5 | 120 | (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); |
121 | } | |
122 | fps = div_s64(dirty, dirty_buckets) * fp_term; | |
123 | if (fragment > 3 && fps > proportional_scaled) { | |
124 | /* Only overwrite the p term when fragment > 3 */ | |
125 | proportional_scaled = fps; | |
126 | } | |
127 | } | |
128 | ||
1d316e65 ML |
129 | if ((error < 0 && dc->writeback_rate_integral > 0) || |
130 | (error > 0 && time_before64(local_clock(), | |
131 | dc->writeback_rate.next + NSEC_PER_MSEC))) { | |
132 | /* | |
133 | * Only decrease the integral term if it's more than | |
134 | * zero. Only increase the integral term if the device | |
135 | * is keeping up. (Don't wind up the integral | |
136 | * ineffectively in either case). | |
137 | * | |
138 | * It's necessary to scale this by | |
139 | * writeback_rate_update_seconds to keep the integral | |
140 | * term dimensioned properly. | |
141 | */ | |
142 | dc->writeback_rate_integral += error * | |
143 | dc->writeback_rate_update_seconds; | |
144 | } | |
cafe5635 | 145 | |
1d316e65 ML |
146 | integral_scaled = div_s64(dc->writeback_rate_integral, |
147 | dc->writeback_rate_i_term_inverse); | |
cafe5635 | 148 | |
e41166c5 ML |
149 | new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), |
150 | dc->writeback_rate_minimum, NSEC_PER_SEC); | |
16749c23 | 151 | |
1d316e65 ML |
152 | dc->writeback_rate_proportional = proportional_scaled; |
153 | dc->writeback_rate_integral_scaled = integral_scaled; | |
ea8c5356 CL |
154 | dc->writeback_rate_change = new_rate - |
155 | atomic_long_read(&dc->writeback_rate.rate); | |
156 | atomic_long_set(&dc->writeback_rate.rate, new_rate); | |
cafe5635 | 157 | dc->writeback_rate_target = target; |
cafe5635 KO |
158 | } |
159 | ||
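As a rough stand-alone illustration of the PI controller described above, the sketch below iterates the proportional and integral terms with the default inverses (40 and 10000) that bch_cached_dev_writeback_init() sets at the end of this file. The 5-second update interval, the dirty/target figures, and the assumption that writeback retires exactly the computed rate are simplifications; the clamp to [writeback_rate_minimum, NSEC_PER_SEC] and the fragmentation boost are omitted.

```c
/* Toy iteration of the writeback PI controller, user space only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t target = 1000000;	/* desired dirty sectors, invented */
	int64_t dirty = 1200000;	/* current dirty sectors, invented */
	int64_t integral = 0;
	const int64_t p_inverse = 40, i_inverse = 10000, update_secs = 5;

	for (int tick = 0; tick < 6; tick++) {
		int64_t error = dirty - target;
		int64_t proportional = error / p_inverse;

		/* Accumulate error over the update interval, as the kernel
		 * does while the device is keeping up. */
		integral += error * update_secs;

		int64_t rate = proportional + integral / i_inverse;
		printf("tick %d: error=%lld rate=%lld sectors/s\n",
		       tick, (long long)error, (long long)rate);

		/* Pretend writeback retires exactly rate sectors/s. */
		dirty -= rate * update_secs;
	}
	return 0;
}
```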
d2d05b88 CL |
160 | static bool idle_counter_exceeded(struct cache_set *c) |
161 | { | |
162 | int counter, dev_nr; | |
163 | ||
164 | /* | |
165 | * If c->idle_counter overflows (idle for a really long time), | |
166 | * reset it to 0 and skip setting the maximum rate this time, | |
167 | * for code simplicity. | |
168 | */ | |
169 | counter = atomic_inc_return(&c->idle_counter); | |
170 | if (counter <= 0) { | |
171 | atomic_set(&c->idle_counter, 0); | |
172 | return false; | |
173 | } | |
174 | ||
175 | dev_nr = atomic_read(&c->attached_dev_nr); | |
176 | if (dev_nr == 0) | |
177 | return false; | |
178 | ||
179 | /* | |
180 | * c->idle_counter is increased by the writeback threads of all | |
181 | * attached backing devices; to represent a rough time period, | |
182 | * the counter should be divided by dev_nr. Otherwise the idle | |
183 | * time could not grow larger as more backing devices are | |
184 | * attached. | |
185 | * The following calculation is equivalent to checking | |
186 | * (counter / dev_nr) < (dev_nr * 6) | |
187 | */ | |
188 | if (counter < (dev_nr * dev_nr * 6)) | |
189 | return false; | |
190 | ||
191 | return true; | |
192 | } | |
193 | ||
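As a worked example of the threshold: with a single attached backing device the counter must reach 1 × 1 × 6 = 6 increments (roughly six calls to update_writeback_rate()) before the cache set counts as idle; with four devices it must reach 4 × 4 × 6 = 96.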
194 | /* | |
195 | * Idle_counter is increased every time update_writeback_rate() is | |
196 | * called. If all backing devices attached to the same cache set have | |
197 | * identical dc->writeback_rate_update_seconds values, it takes about 6 | |
198 | * rounds of update_writeback_rate() on each backing device before | |
199 | * c->at_max_writeback_rate is set to 1, and then the max writeback rate | |
200 | * is set for each dc->writeback_rate.rate. | |
201 | * To avoid the extra locking cost of counting the exact number of dirty | |
202 | * cached devices, c->attached_dev_nr is used to calculate the idle | |
203 | * threshold. It might be bigger if not all cached devices are in write- | |
204 | * back mode, but it still works well with limited extra rounds of | |
205 | * update_writeback_rate(). | |
206 | */ | |
ea8c5356 CL |
207 | static bool set_at_max_writeback_rate(struct cache_set *c, |
208 | struct cached_dev *dc) | |
209 | { | |
c5fcdedc CL |
210 | /* Don't set the max writeback rate if it is disabled */ | |
211 | if (!c->idle_max_writeback_rate_enabled) | |
212 | return false; | |
213 | ||
141df8bb CL |
214 | /* Don't set max writeback rate if gc is running */ |
215 | if (!c->gc_mark_valid) | |
216 | return false; | |
d2d05b88 CL |
217 | |
218 | if (!idle_counter_exceeded(c)) | |
ea8c5356 CL |
219 | return false; |
220 | ||
221 | if (atomic_read(&c->at_max_writeback_rate) != 1) | |
222 | atomic_set(&c->at_max_writeback_rate, 1); | |
223 | ||
224 | atomic_long_set(&dc->writeback_rate.rate, INT_MAX); | |
225 | ||
226 | /* keep writeback_rate_target at its existing value */ | |
227 | dc->writeback_rate_proportional = 0; | |
228 | dc->writeback_rate_integral_scaled = 0; | |
229 | dc->writeback_rate_change = 0; | |
230 | ||
231 | /* | |
d2d05b88 CL |
232 | * In case new I/O arrives before | |
233 | * set_at_max_writeback_rate() returns. | |
ea8c5356 | 234 | */ |
d2d05b88 | 235 | if (!idle_counter_exceeded(c) || |
ea8c5356 CL |
236 | !atomic_read(&c->at_max_writeback_rate)) |
237 | return false; | |
238 | ||
239 | return true; | |
240 | } | |
241 | ||
cafe5635 KO |
242 | static void update_writeback_rate(struct work_struct *work) |
243 | { | |
244 | struct cached_dev *dc = container_of(to_delayed_work(work), | |
245 | struct cached_dev, | |
246 | writeback_rate_update); | |
771f393e | 247 | struct cache_set *c = dc->disk.c; |
cafe5635 | 248 | |
3fd47bfe CL |
249 | /* |
250 | * should check BCACHE_DEV_RATE_DW_RUNNING before calling | |
251 | * cancel_delayed_work_sync(). | |
252 | */ | |
253 | set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); | |
254 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ | |
b004aa86 | 255 | smp_mb__after_atomic(); |
3fd47bfe | 256 | |
771f393e CL |
257 | /* |
258 | * CACHE_SET_IO_DISABLE might be set via sysfs interface, | |
259 | * check it here too. | |
260 | */ | |
261 | if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || | |
262 | test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { | |
3fd47bfe CL |
263 | clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); |
264 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ | |
b004aa86 | 265 | smp_mb__after_atomic(); |
3fd47bfe CL |
266 | return; |
267 | } | |
268 | ||
a1a2d8f0 CL |
269 | /* |
270 | * If the whole cache set is idle, set_at_max_writeback_rate() | |
271 | * will set writeback rate to a max number. Then it is | |
272 | * unncessary to update writeback rate for an idle cache set | |
273 | * in maximum writeback rate number(s). | |
274 | */ | |
275 | if (atomic_read(&dc->has_dirty) && dc->writeback_percent && | |
276 | !set_at_max_writeback_rate(c, dc)) { | |
277 | do { | |
278 | if (!down_read_trylock((&dc->writeback_lock))) { | |
279 | dc->rate_update_retry++; | |
280 | if (dc->rate_update_retry <= | |
281 | BCH_WBRATE_UPDATE_MAX_SKIPS) | |
282 | break; | |
283 | down_read(&dc->writeback_lock); | |
284 | dc->rate_update_retry = 0; | |
285 | } | |
ea8c5356 | 286 | __update_writeback_rate(dc); |
7a671d8e | 287 | update_gc_after_writeback(c); |
ea8c5356 | 288 | up_read(&dc->writeback_lock); |
a1a2d8f0 | 289 | } while (0); |
ea8c5356 | 290 | } |
cafe5635 | 291 | |
5e6926da | 292 | |
771f393e CL |
293 | /* |
294 | * CACHE_SET_IO_DISABLE might be set via sysfs interface, | |
295 | * check it here too. | |
296 | */ | |
297 | if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && | |
298 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { | |
3fd47bfe | 299 | schedule_delayed_work(&dc->writeback_rate_update, |
5e6926da | 300 | dc->writeback_rate_update_seconds * HZ); |
3fd47bfe CL |
301 | } |
302 | ||
303 | /* | |
304 | * should check BCACHE_DEV_RATE_DW_RUNNING before calling | |
305 | * cancel_delayed_work_sync(). | |
306 | */ | |
307 | clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); | |
308 | /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ | |
b004aa86 | 309 | smp_mb__after_atomic(); |
cafe5635 KO |
310 | } |
311 | ||
6f10f7d1 CL |
312 | static unsigned int writeback_delay(struct cached_dev *dc, |
313 | unsigned int sectors) | |
cafe5635 | 314 | { |
c4d951dd | 315 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || |
cafe5635 KO |
316 | !dc->writeback_percent) |
317 | return 0; | |
318 | ||
16749c23 | 319 | return bch_next_delay(&dc->writeback_rate, sectors); |
cafe5635 KO |
320 | } |
321 | ||
5e6926da KO |
322 | struct dirty_io { |
323 | struct closure cl; | |
324 | struct cached_dev *dc; | |
6e6ccc67 | 325 | uint16_t sequence; |
5e6926da KO |
326 | struct bio bio; |
327 | }; | |
72c27061 | 328 | |
cafe5635 KO |
329 | static void dirty_init(struct keybuf_key *w) |
330 | { | |
331 | struct dirty_io *io = w->private; | |
332 | struct bio *bio = &io->bio; | |
333 | ||
49add496 CH |
334 | bio_init(bio, NULL, bio->bi_inline_vecs, |
335 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); | |
cafe5635 KO |
336 | if (!io->dc->writeback_percent) |
337 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | |
338 | ||
4f024f37 | 339 | bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; |
cafe5635 | 340 | bio->bi_private = w; |
169ef1cf | 341 | bch_bio_map(bio, NULL); |
cafe5635 KO |
342 | } |
343 | ||
d4e3b928 | 344 | static CLOSURE_CALLBACK(dirty_io_destructor) |
cafe5635 | 345 | { |
d4e3b928 | 346 | closure_type(io, struct dirty_io, cl); |
1fae7cf0 | 347 | |
cafe5635 KO |
348 | kfree(io); |
349 | } | |
350 | ||
d4e3b928 | 351 | static CLOSURE_CALLBACK(write_dirty_finish) |
cafe5635 | 352 | { |
d4e3b928 | 353 | closure_type(io, struct dirty_io, cl); |
cafe5635 KO |
354 | struct keybuf_key *w = io->bio.bi_private; |
355 | struct cached_dev *dc = io->dc; | |
cafe5635 | 356 | |
491221f8 | 357 | bio_free_pages(&io->bio); |
cafe5635 KO |
358 | |
359 | /* This is kind of a dumb way of signalling errors. */ | |
360 | if (KEY_DIRTY(&w->key)) { | |
cc7b8819 | 361 | int ret; |
6f10f7d1 | 362 | unsigned int i; |
0b93207a KO |
363 | struct keylist keys; |
364 | ||
0b93207a | 365 | bch_keylist_init(&keys); |
cafe5635 | 366 | |
1b207d80 KO |
367 | bkey_copy(keys.top, &w->key); |
368 | SET_KEY_DIRTY(keys.top, false); | |
369 | bch_keylist_push(&keys); | |
cafe5635 KO |
370 | |
371 | for (i = 0; i < KEY_PTRS(&w->key); i++) | |
372 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | |
373 | ||
cc7b8819 | 374 | ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); |
cafe5635 | 375 | |
6054c6d4 | 376 | if (ret) |
c37511b8 KO |
377 | trace_bcache_writeback_collision(&w->key); |
378 | ||
6054c6d4 | 379 | atomic_long_inc(ret |
cafe5635 KO |
380 | ? &dc->disk.c->writeback_keys_failed |
381 | : &dc->disk.c->writeback_keys_done); | |
382 | } | |
383 | ||
384 | bch_keybuf_del(&dc->writeback_keys, w); | |
c2a4f318 | 385 | up(&dc->in_flight); |
cafe5635 KO |
386 | |
387 | closure_return_with_destructor(cl, dirty_io_destructor); | |
388 | } | |
389 | ||
4246a0b6 | 390 | static void dirty_endio(struct bio *bio) |
cafe5635 KO |
391 | { |
392 | struct keybuf_key *w = bio->bi_private; | |
393 | struct dirty_io *io = w->private; | |
394 | ||
bf78980f | 395 | if (bio->bi_status) { |
cafe5635 | 396 | SET_KEY_DIRTY(&w->key, false); |
bf78980f CL |
397 | bch_count_backing_io_errors(io->dc, bio); |
398 | } | |
cafe5635 KO |
399 | |
400 | closure_put(&io->cl); | |
401 | } | |
402 | ||
d4e3b928 | 403 | static CLOSURE_CALLBACK(write_dirty) |
cafe5635 | 404 | { |
d4e3b928 | 405 | closure_type(io, struct dirty_io, cl); |
cafe5635 | 406 | struct keybuf_key *w = io->bio.bi_private; |
6e6ccc67 ML |
407 | struct cached_dev *dc = io->dc; |
408 | ||
409 | uint16_t next_sequence; | |
410 | ||
411 | if (atomic_read(&dc->writeback_sequence_next) != io->sequence) { | |
412 | /* Not our turn to write; wait for a write to complete */ | |
413 | closure_wait(&dc->writeback_ordering_wait, cl); | |
414 | ||
415 | if (atomic_read(&dc->writeback_sequence_next) == io->sequence) { | |
416 | /* | |
417 | * Edge case -- it happened in indeterminate order | |
418 | * relative to when we were added to the wait list. | |
419 | */ | |
420 | closure_wake_up(&dc->writeback_ordering_wait); | |
421 | } | |
422 | ||
423 | continue_at(cl, write_dirty, io->dc->writeback_write_wq); | |
424 | return; | |
425 | } | |
426 | ||
427 | next_sequence = io->sequence + 1; | |
cafe5635 | 428 | |
5fa89fb9 ML |
429 | /* |
430 | * IO errors are signalled using the dirty bit on the key. | |
431 | * If we failed to read, we should not attempt to write to the | |
432 | * backing device. Instead, immediately go to write_dirty_finish | |
433 | * to clean up. | |
434 | */ | |
435 | if (KEY_DIRTY(&w->key)) { | |
436 | dirty_init(w); | |
c34b7ac6 | 437 | io->bio.bi_opf = REQ_OP_WRITE; |
5fa89fb9 ML |
438 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); |
439 | bio_set_dev(&io->bio, io->dc->bdev); | |
440 | io->bio.bi_end_io = dirty_endio; | |
cafe5635 | 441 | |
27a40ab9 | 442 | /* I/O request sent to backing device */ |
771f393e | 443 | closure_bio_submit(io->dc->disk.c, &io->bio, cl); |
5fa89fb9 | 444 | } |
cafe5635 | 445 | |
6e6ccc67 ML |
446 | atomic_set(&dc->writeback_sequence_next, next_sequence); |
447 | closure_wake_up(&dc->writeback_ordering_wait); | |
448 | ||
9baf3097 | 449 | continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); |
cafe5635 KO |
450 | } |
451 | ||
4246a0b6 | 452 | static void read_dirty_endio(struct bio *bio) |
cafe5635 KO |
453 | { |
454 | struct keybuf_key *w = bio->bi_private; | |
455 | struct dirty_io *io = w->private; | |
456 | ||
5138ac67 | 457 | /* is_read = 1 */ |
11e9560e | 458 | bch_count_io_errors(io->dc->disk.c->cache, |
5138ac67 CL |
459 | bio->bi_status, 1, |
460 | "reading dirty data from cache"); | |
cafe5635 | 461 | |
4246a0b6 | 462 | dirty_endio(bio); |
cafe5635 KO |
463 | } |
464 | ||
d4e3b928 | 465 | static CLOSURE_CALLBACK(read_dirty_submit) |
cafe5635 | 466 | { |
d4e3b928 | 467 | closure_type(io, struct dirty_io, cl); |
cafe5635 | 468 | |
771f393e | 469 | closure_bio_submit(io->dc->disk.c, &io->bio, cl); |
cafe5635 | 470 | |
9baf3097 | 471 | continue_at(cl, write_dirty, io->dc->writeback_write_wq); |
cafe5635 KO |
472 | } |
473 | ||
5e6926da | 474 | static void read_dirty(struct cached_dev *dc) |
cafe5635 | 475 | { |
6f10f7d1 | 476 | unsigned int delay = 0; |
539d39eb TJ |
477 | struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; |
478 | size_t size; | |
479 | int nk, i; | |
cafe5635 | 480 | struct dirty_io *io; |
5e6926da | 481 | struct closure cl; |
6e6ccc67 | 482 | uint16_t sequence = 0; |
5e6926da | 483 | |
6e6ccc67 ML |
484 | BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); |
485 | atomic_set(&dc->writeback_sequence_next, sequence); | |
5e6926da | 486 | closure_init_stack(&cl); |
cafe5635 KO |
487 | |
488 | /* | |
489 | * XXX: if we error, background writeback just spins. Should use some | |
490 | * mempools. | |
491 | */ | |
492 | ||
539d39eb TJ |
493 | next = bch_keybuf_next(&dc->writeback_keys); |
494 | ||
771f393e CL |
495 | while (!kthread_should_stop() && |
496 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && | |
497 | next) { | |
539d39eb TJ |
498 | size = 0; |
499 | nk = 0; | |
500 | ||
501 | do { | |
502 | BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); | |
503 | ||
504 | /* | |
505 | * Don't combine too many operations, even if they | |
506 | * are all small. | |
507 | */ | |
508 | if (nk >= MAX_WRITEBACKS_IN_PASS) | |
509 | break; | |
510 | ||
511 | /* | |
512 | * If the current operation is very large, don't | |
513 | * further combine operations. | |
514 | */ | |
515 | if (size >= MAX_WRITESIZE_IN_PASS) | |
516 | break; | |
517 | ||
518 | /* | |
519 | * Operations are only eligible to be combined | |
520 | * if they are contiguous. | |
521 | * | |
522 | * TODO: add a heuristic willing to fire a | |
523 | * certain amount of non-contiguous IO per pass, | |
524 | * so that we can benefit from backing device | |
525 | * command queueing. | |
526 | */ | |
527 | if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, | |
528 | &START_KEY(&next->key))) | |
529 | break; | |
530 | ||
531 | size += KEY_SIZE(&next->key); | |
532 | keys[nk++] = next; | |
533 | } while ((next = bch_keybuf_next(&dc->writeback_keys))); | |
534 | ||
535 | /* Now we have gathered a set of 1..5 keys to write back. */ | |
536 | for (i = 0; i < nk; i++) { | |
537 | w = keys[i]; | |
538 | ||
29f1d5ca GS |
539 | io = kzalloc(struct_size(io, bio.bi_inline_vecs, |
540 | DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), | |
539d39eb TJ |
541 | GFP_KERNEL); |
542 | if (!io) | |
543 | goto err; | |
544 | ||
545 | w->private = io; | |
546 | io->dc = dc; | |
6e6ccc67 | 547 | io->sequence = sequence++; |
539d39eb TJ |
548 | |
549 | dirty_init(w); | |
c34b7ac6 | 550 | io->bio.bi_opf = REQ_OP_READ; |
539d39eb | 551 | io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); |
11e9560e | 552 | bio_set_dev(&io->bio, dc->disk.c->cache->bdev); |
539d39eb TJ |
553 | io->bio.bi_end_io = read_dirty_endio; |
554 | ||
555 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | |
556 | goto err_free; | |
557 | ||
558 | trace_bcache_writeback(&w->key); | |
559 | ||
560 | down(&dc->in_flight); | |
561 | ||
3be11dba CL |
562 | /* |
563 | * We've acquired a semaphore for the maximum | |
539d39eb TJ |
564 | * simultaneous number of writebacks; from here |
565 | * everything happens asynchronously. | |
566 | */ | |
567 | closure_call(&io->cl, read_dirty_submit, NULL, &cl); | |
568 | } | |
569 | ||
570 | delay = writeback_delay(dc, size); | |
571 | ||
771f393e CL |
572 | while (!kthread_should_stop() && |
573 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && | |
574 | delay) { | |
539d39eb TJ |
575 | schedule_timeout_interruptible(delay); |
576 | delay = writeback_delay(dc, 0); | |
577 | } | |
cafe5635 KO |
578 | } |
579 | ||
580 | if (0) { | |
581 | err_free: | |
582 | kfree(w->private); | |
583 | err: | |
584 | bch_keybuf_del(&dc->writeback_keys, w); | |
585 | } | |
586 | ||
c2a4f318 KO |
587 | /* |
588 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be | |
589 | * freed) before refilling again | |
590 | */ | |
5e6926da KO |
591 | closure_sync(&cl); |
592 | } | |
593 | ||
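Each pass of the outer loop above therefore gathers up to MAX_WRITEBACKS_IN_PASS contiguous keys totalling at most MAX_WRITESIZE_IN_PASS sectors (5 keys and 5000 sectors by my reading of the writeback.h defaults, which matches the "1..5 keys" comment), dispatches them as asynchronous reads, and then sleeps for the rate-limited delay before gathering the next batch.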
594 | /* Scan for dirty data */ | |
595 | ||
6f10f7d1 | 596 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, |
5e6926da KO |
597 | uint64_t offset, int nr_sectors) |
598 | { | |
599 | struct bcache_device *d = c->devices[inode]; | |
7a148126 CL |
600 | unsigned int stripe_offset, sectors_dirty; |
601 | int stripe; | |
5e6926da KO |
602 | |
603 | if (!d) | |
604 | return; | |
605 | ||
7a148126 CL |
606 | stripe = offset_to_stripe(d, offset); |
607 | if (stripe < 0) | |
608 | return; | |
609 | ||
99a27d59 TJ |
610 | if (UUID_FLASH_ONLY(&c->uuids[inode])) |
611 | atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); | |
612 | ||
5e6926da KO |
613 | stripe_offset = offset & (d->stripe_size - 1); |
614 | ||
615 | while (nr_sectors) { | |
6f10f7d1 | 616 | int s = min_t(unsigned int, abs(nr_sectors), |
5e6926da KO |
617 | d->stripe_size - stripe_offset); |
618 | ||
619 | if (nr_sectors < 0) | |
620 | s = -s; | |
621 | ||
48a915a8 KO |
622 | if (stripe >= d->nr_stripes) |
623 | return; | |
624 | ||
625 | sectors_dirty = atomic_add_return(s, | |
626 | d->stripe_sectors_dirty + stripe); | |
7b1002f7 MZ |
627 | if (sectors_dirty == d->stripe_size) { |
628 | if (!test_bit(stripe, d->full_dirty_stripes)) | |
629 | set_bit(stripe, d->full_dirty_stripes); | |
630 | } else { | |
631 | if (test_bit(stripe, d->full_dirty_stripes)) | |
632 | clear_bit(stripe, d->full_dirty_stripes); | |
633 | } | |
48a915a8 | 634 | |
5e6926da KO |
635 | nr_sectors -= s; |
636 | stripe_offset = 0; | |
637 | stripe++; | |
638 | } | |
639 | } | |
640 | ||
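The per-stripe split above can be shown in isolation. The sketch below assumes a stripe size of 2048 sectors (1 MiB); d->stripe_size must be a power of two for the mask to work, and the offset/length values are invented.

```c
/* Splitting one dirty-sector update across stripes, user space only. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define STRIPE_SIZE 2048u	/* sectors; assumed power of two */

int main(void)
{
	uint64_t offset = 3000;	/* first sector touched, invented */
	int nr_sectors = 4000;	/* positive marks dirty, negative cleans */

	uint64_t stripe = offset / STRIPE_SIZE;
	unsigned int stripe_offset = offset & (STRIPE_SIZE - 1);

	while (nr_sectors) {
		int s = abs(nr_sectors);

		/* Never cross the end of the current stripe. */
		if (s > (int)(STRIPE_SIZE - stripe_offset))
			s = (int)(STRIPE_SIZE - stripe_offset);
		if (nr_sectors < 0)
			s = -s;

		printf("stripe %llu: %+d sectors\n",
		       (unsigned long long)stripe, s);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
	return 0;
}
```

For offset 3000 and 4000 sectors this prints three chunks (1096, 2048, and 856 sectors) landing in consecutive stripes, mirroring the loop above.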
641 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | |
642 | { | |
b0d30981 CL |
643 | struct cached_dev *dc = container_of(buf, |
644 | struct cached_dev, | |
645 | writeback_keys); | |
627ccd20 KO |
646 | |
647 | BUG_ON(KEY_INODE(k) != dc->disk.id); | |
648 | ||
5e6926da KO |
649 | return KEY_DIRTY(k); |
650 | } | |
651 | ||
48a915a8 | 652 | static void refill_full_stripes(struct cached_dev *dc) |
5e6926da | 653 | { |
48a915a8 | 654 | struct keybuf *buf = &dc->writeback_keys; |
7a148126 CL |
655 | unsigned int start_stripe, next_stripe; |
656 | int stripe; | |
48a915a8 KO |
657 | bool wrapped = false; |
658 | ||
659 | stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); | |
7a148126 | 660 | if (stripe < 0) |
48a915a8 | 661 | stripe = 0; |
5e6926da | 662 | |
48a915a8 | 663 | start_stripe = stripe; |
5e6926da KO |
664 | |
665 | while (1) { | |
48a915a8 KO |
666 | stripe = find_next_bit(dc->disk.full_dirty_stripes, |
667 | dc->disk.nr_stripes, stripe); | |
5e6926da | 668 | |
48a915a8 KO |
669 | if (stripe == dc->disk.nr_stripes) |
670 | goto next; | |
5e6926da | 671 | |
48a915a8 KO |
672 | next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, |
673 | dc->disk.nr_stripes, stripe); | |
674 | ||
675 | buf->last_scanned = KEY(dc->disk.id, | |
676 | stripe * dc->disk.stripe_size, 0); | |
677 | ||
678 | bch_refill_keybuf(dc->disk.c, buf, | |
679 | &KEY(dc->disk.id, | |
680 | next_stripe * dc->disk.stripe_size, 0), | |
681 | dirty_pred); | |
682 | ||
683 | if (array_freelist_empty(&buf->freelist)) | |
684 | return; | |
685 | ||
686 | stripe = next_stripe; | |
687 | next: | |
688 | if (wrapped && stripe > start_stripe) | |
689 | return; | |
690 | ||
691 | if (stripe == dc->disk.nr_stripes) { | |
692 | stripe = 0; | |
693 | wrapped = true; | |
694 | } | |
5e6926da KO |
695 | } |
696 | } | |
697 | ||
627ccd20 KO |
698 | /* |
699 | * Returns true if we scanned the entire disk | |
700 | */ | |
5e6926da KO |
701 | static bool refill_dirty(struct cached_dev *dc) |
702 | { | |
703 | struct keybuf *buf = &dc->writeback_keys; | |
627ccd20 | 704 | struct bkey start = KEY(dc->disk.id, 0, 0); |
5e6926da | 705 | struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); |
627ccd20 KO |
706 | struct bkey start_pos; |
707 | ||
708 | /* | |
709 | * Make sure the keybuf position is inside the range for this disk; | |
710 | * at bringup we might not be attached yet, so this disk's inode nr | |
711 | * isn't initialized at that point. | |
712 | */ | |
713 | if (bkey_cmp(&buf->last_scanned, &start) < 0 || | |
714 | bkey_cmp(&buf->last_scanned, &end) > 0) | |
715 | buf->last_scanned = start; | |
48a915a8 KO |
716 | |
717 | if (dc->partial_stripes_expensive) { | |
718 | refill_full_stripes(dc); | |
719 | if (array_freelist_empty(&buf->freelist)) | |
720 | return false; | |
721 | } | |
5e6926da | 722 | |
627ccd20 | 723 | start_pos = buf->last_scanned; |
48a915a8 | 724 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); |
5e6926da | 725 | |
627ccd20 KO |
726 | if (bkey_cmp(&buf->last_scanned, &end) < 0) |
727 | return false; | |
728 | ||
729 | /* | |
730 | * If we get to the end start scanning again from the beginning, and | |
731 | * only scan up to where we initially started scanning from: | |
732 | */ | |
733 | buf->last_scanned = start; | |
734 | bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); | |
735 | ||
736 | return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; | |
5e6926da KO |
737 | } |
738 | ||
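Put differently, refill_dirty() makes at most two passes with bch_refill_keybuf(): one from last_scanned to the end of the device's keyspace and, if the keybuf still has free slots, a second from the beginning of the keyspace back up to the original starting position. Only when that second pass reaches start_pos has the entire disk been scanned, which is what the return value reports.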
739 | static int bch_writeback_thread(void *arg) | |
740 | { | |
741 | struct cached_dev *dc = arg; | |
771f393e | 742 | struct cache_set *c = dc->disk.c; |
5e6926da KO |
743 | bool searched_full_index; |
744 | ||
a8500fc8 ML |
745 | bch_ratelimit_reset(&dc->writeback_rate); |
746 | ||
771f393e CL |
747 | while (!kthread_should_stop() && |
748 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { | |
5e6926da | 749 | down_write(&dc->writeback_lock); |
99361bbf | 750 | set_current_state(TASK_INTERRUPTIBLE); |
fadd94e0 CL |
751 | /* |
752 | * If the bcache device is detaching, skip here and continue | |
753 | * to perform writeback. Otherwise, if there is no dirty data | |
754 | * in the cache, or there is dirty data but writeback is | |
755 | * disabled, the writeback thread should sleep here and wait | |
756 | * for others to wake it up. | |
757 | */ | |
758 | if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && | |
759 | (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { | |
5e6926da | 760 | up_write(&dc->writeback_lock); |
5e6926da | 761 | |
771f393e CL |
762 | if (kthread_should_stop() || |
763 | test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { | |
99361bbf | 764 | set_current_state(TASK_RUNNING); |
804f3c69 | 765 | break; |
99361bbf | 766 | } |
5e6926da | 767 | |
5e6926da KO |
768 | schedule(); |
769 | continue; | |
770 | } | |
99361bbf | 771 | set_current_state(TASK_RUNNING); |
5e6926da KO |
772 | |
773 | searched_full_index = refill_dirty(dc); | |
774 | ||
775 | if (searched_full_index && | |
776 | RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { | |
777 | atomic_set(&dc->has_dirty, 0); | |
5e6926da KO |
778 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); |
779 | bch_write_bdev_super(dc, NULL); | |
fadd94e0 CL |
780 | /* |
781 | * If bcache device is detaching via sysfs interface, | |
782 | * writeback thread should stop after there is no dirty | |
783 | * data on cache. BCACHE_DEV_DETACHING flag is set in | |
784 | * bch_cached_dev_detach(). | |
785 | */ | |
3943b040 | 786 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { |
df4ad532 DY |
787 | struct closure cl; |
788 | ||
789 | closure_init_stack(&cl); | |
790 | memset(&dc->sb.set_uuid, 0, 16); | |
791 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); | |
792 | ||
793 | bch_write_bdev_super(dc, &cl); | |
794 | closure_sync(&cl); | |
795 | ||
3943b040 | 796 | up_write(&dc->writeback_lock); |
fadd94e0 | 797 | break; |
3943b040 | 798 | } |
7a671d8e CL |
799 | |
800 | /* | |
801 | * When the dirty data rate is high (e.g. 50%+), there might | |
802 | * be heavy bucket fragmentation after writeback | |
803 | * finishes, which hurts subsequent write performance. | |
804 | * If users really care about write performance they | |
805 | * may set BCH_ENABLE_AUTO_GC via sysfs; then, when | |
806 | * BCH_DO_AUTO_GC is set, the garbage collection thread | |
807 | * will be woken up here. After moving gc, the shrunken | |
808 | * btree and the discarded free bucket space on the SSD may | |
809 | * help subsequent write requests. | |
810 | */ | |
811 | if (c->gc_after_writeback == | |
812 | (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { | |
813 | c->gc_after_writeback &= ~BCH_DO_AUTO_GC; | |
814 | force_wake_up_gc(c); | |
815 | } | |
5e6926da KO |
816 | } |
817 | ||
818 | up_write(&dc->writeback_lock); | |
819 | ||
5e6926da KO |
820 | read_dirty(dc); |
821 | ||
822 | if (searched_full_index) { | |
6f10f7d1 | 823 | unsigned int delay = dc->writeback_delay * HZ; |
5e6926da KO |
824 | |
825 | while (delay && | |
826 | !kthread_should_stop() && | |
771f393e | 827 | !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && |
c4d951dd | 828 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
9e5c3535 | 829 | delay = schedule_timeout_interruptible(delay); |
a8500fc8 ML |
830 | |
831 | bch_ratelimit_reset(&dc->writeback_rate); | |
5e6926da KO |
832 | } |
833 | } | |
834 | ||
97d26ae7 | 835 | if (dc->writeback_write_wq) |
7e865eba | 836 | destroy_workqueue(dc->writeback_write_wq); |
97d26ae7 | 837 | |
804f3c69 | 838 | cached_dev_put(dc); |
771f393e | 839 | wait_for_kthread_stop(); |
804f3c69 | 840 | |
5e6926da | 841 | return 0; |
cafe5635 KO |
842 | } |
843 | ||
444fc0b6 | 844 | /* Init */ |
94f71c16 | 845 | #define INIT_KEYS_EACH_TIME 500000 |
444fc0b6 | 846 | |
c18536a7 KO |
847 | struct sectors_dirty_init { |
848 | struct btree_op op; | |
6f10f7d1 | 849 | unsigned int inode; |
94f71c16 | 850 | size_t count; |
c18536a7 KO |
851 | }; |
852 | ||
853 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, | |
48dad8ba | 854 | struct bkey *k) |
444fc0b6 | 855 | { |
c18536a7 KO |
856 | struct sectors_dirty_init *op = container_of(_op, |
857 | struct sectors_dirty_init, op); | |
48dad8ba KO |
858 | if (KEY_INODE(k) > op->inode) |
859 | return MAP_DONE; | |
444fc0b6 | 860 | |
48dad8ba KO |
861 | if (KEY_DIRTY(k)) |
862 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | |
863 | KEY_START(k), KEY_SIZE(k)); | |
864 | ||
94f71c16 | 865 | op->count++; |
80db4e47 CL |
866 | if (!(op->count % INIT_KEYS_EACH_TIME)) |
867 | cond_resched(); | |
94f71c16 | 868 | |
48dad8ba | 869 | return MAP_CONTINUE; |
444fc0b6 KO |
870 | } |
871 | ||
b144e45f CL |
872 | static int bch_root_node_dirty_init(struct cache_set *c, |
873 | struct bcache_device *d, | |
874 | struct bkey *k) | |
444fc0b6 | 875 | { |
c18536a7 | 876 | struct sectors_dirty_init op; |
94f71c16 | 877 | int ret; |
444fc0b6 | 878 | |
b54d6934 | 879 | bch_btree_op_init(&op.op, -1); |
175206cf | 880 | op.inode = d->id; |
94f71c16 | 881 | op.count = 0; |
80db4e47 CL |
882 | |
883 | ret = bcache_btree(map_keys_recurse, | |
884 | k, | |
885 | c->root, | |
886 | &op.op, | |
887 | &KEY(op.inode, 0, 0), | |
888 | sectors_dirty_init_fn, | |
889 | 0); | |
890 | if (ret < 0) | |
891 | pr_warn("sectors dirty init failed, ret=%d!\n", ret); | |
b144e45f | 892 | |
f0854489 MZ |
893 | /* |
894 | * The op may have been added to the cache_set's btree_cache_wait | |
895 | * in mca_cannibalize(); we must ensure it is removed from | |
896 | * the list and that btree_cache_alloc_lock is released before | |
897 | * freeing the op's memory. | |
898 | * Otherwise, the btree_cache_wait list will be corrupted. | |
899 | */ | |
900 | bch_cannibalize_unlock(c); | |
901 | finish_wait(&c->btree_cache_wait, &(&op.op)->wait); | |
902 | ||
b144e45f CL |
903 | return ret; |
904 | } | |
905 | ||
906 | static int bch_dirty_init_thread(void *arg) | |
907 | { | |
908 | struct dirty_init_thrd_info *info = arg; | |
909 | struct bch_dirty_init_state *state = info->state; | |
910 | struct cache_set *c = state->c; | |
911 | struct btree_iter iter; | |
912 | struct bkey *k, *p; | |
913 | int cur_idx, prev_idx, skip_nr; | |
b144e45f CL |
914 | |
915 | k = p = NULL; | |
be93825f | 916 | prev_idx = 0; |
b144e45f CL |
917 | |
918 | bch_btree_iter_init(&c->root->keys, &iter, NULL); | |
919 | k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); | |
920 | BUG_ON(!k); | |
921 | ||
922 | p = k; | |
923 | ||
924 | while (k) { | |
925 | spin_lock(&state->idx_lock); | |
926 | cur_idx = state->key_idx; | |
927 | state->key_idx++; | |
928 | spin_unlock(&state->idx_lock); | |
929 | ||
930 | skip_nr = cur_idx - prev_idx; | |
931 | ||
932 | while (skip_nr) { | |
933 | k = bch_btree_iter_next_filter(&iter, | |
934 | &c->root->keys, | |
935 | bch_ptr_bad); | |
936 | if (k) | |
937 | p = k; | |
938 | else { | |
939 | atomic_set(&state->enough, 1); | |
940 | /* Update state->enough earlier */ | |
eb9b6666 | 941 | smp_mb__after_atomic(); |
b144e45f CL |
942 | goto out; |
943 | } | |
944 | skip_nr--; | |
b144e45f CL |
945 | } |
946 | ||
947 | if (p) { | |
948 | if (bch_root_node_dirty_init(c, state->d, p) < 0) | |
949 | goto out; | |
950 | } | |
951 | ||
952 | p = NULL; | |
953 | prev_idx = cur_idx; | |
b144e45f CL |
954 | } |
955 | ||
956 | out: | |
957 | /* In order to wake up state->wait in time */ | |
eb9b6666 | 958 | smp_mb__before_atomic(); |
b144e45f CL |
959 | if (atomic_dec_and_test(&state->started)) |
960 | wake_up(&state->wait); | |
961 | ||
962 | return 0; | |
963 | } | |
964 | ||
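The claim-and-skip work distribution above can be sketched with a small pthread program: every worker walks the same ordered sequence with a private position, while a mutex-protected shared index decides which element each worker actually owns. Plain integers stand in for btree keys; all names and counts here are invented.

```c
/* Work distribution in the style of bch_dirty_init_thread(). */
#include <pthread.h>
#include <stdio.h>

#define NKEYS 20
#define NTHREADS 4

static int key_idx;	/* next element to hand out, like state->key_idx */
static pthread_mutex_t idx_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
	long id = (long)arg;
	int prev_idx = 0, pos = 0;	/* pos: private "iterator" */

	for (;;) {
		int cur_idx;

		pthread_mutex_lock(&idx_lock);
		cur_idx = key_idx++;
		pthread_mutex_unlock(&idx_lock);

		/* Skip forward to the element we just claimed. */
		pos += cur_idx - prev_idx;
		if (pos >= NKEYS)	/* sequence exhausted: "enough" */
			break;

		printf("thread %ld processes key %d\n", id, pos);
		prev_idx = cur_idx;
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
```

Each of the 20 keys is claimed exactly once, and no worker re-walks elements another worker already owns, which is the point of the shared key_idx.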
965 | static int bch_btre_dirty_init_thread_nr(void) | |
966 | { | |
967 | int n = num_online_cpus()/2; | |
968 | ||
969 | if (n == 0) | |
970 | n = 1; | |
971 | else if (n > BCH_DIRTY_INIT_THRD_MAX) | |
972 | n = BCH_DIRTY_INIT_THRD_MAX; | |
973 | ||
974 | return n; | |
975 | } | |
976 | ||
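For example, a machine with 2 online CPUs runs a single dirty-init thread and one with 16 CPUs runs eight, with the count capped at BCH_DIRTY_INIT_THRD_MAX for larger machines.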
977 | void bch_sectors_dirty_init(struct bcache_device *d) | |
978 | { | |
979 | int i; | |
e34820f9 | 980 | struct btree *b = NULL; |
b144e45f CL |
981 | struct bkey *k = NULL; |
982 | struct btree_iter iter; | |
983 | struct sectors_dirty_init op; | |
984 | struct cache_set *c = d->c; | |
4dc34ae1 | 985 | struct bch_dirty_init_state state; |
b144e45f | 986 | |
e34820f9 MZ |
987 | retry_lock: |
988 | b = c->root; | |
989 | rw_lock(0, b, b->level); | |
990 | if (b != c->root) { | |
991 | rw_unlock(0, b); | |
992 | goto retry_lock; | |
993 | } | |
994 | ||
b144e45f CL |
995 | /* Just count root keys if no leaf node */ |
996 | if (c->root->level == 0) { | |
997 | bch_btree_op_init(&op.op, -1); | |
998 | op.inode = d->id; | |
999 | op.count = 0; | |
b144e45f CL |
1000 | |
1001 | for_each_key_filter(&c->root->keys, | |
7cc47e64 MZ |
1002 | k, &iter, bch_ptr_invalid) { |
1003 | if (KEY_INODE(k) != op.inode) | |
1004 | continue; | |
b144e45f | 1005 | sectors_dirty_init_fn(&op.op, c->root, k); |
7cc47e64 | 1006 | } |
80db4e47 | 1007 | |
e34820f9 | 1008 | rw_unlock(0, b); |
b144e45f CL |
1009 | return; |
1010 | } | |
1011 | ||
7d6b902e | 1012 | memset(&state, 0, sizeof(struct bch_dirty_init_state)); |
4dc34ae1 CL |
1013 | state.c = c; |
1014 | state.d = d; | |
1015 | state.total_threads = bch_btre_dirty_init_thread_nr(); | |
1016 | state.key_idx = 0; | |
1017 | spin_lock_init(&state.idx_lock); | |
1018 | atomic_set(&state.started, 0); | |
1019 | atomic_set(&state.enough, 0); | |
1020 | init_waitqueue_head(&state.wait); | |
1021 | ||
1022 | for (i = 0; i < state.total_threads; i++) { | |
1023 | /* Fetch latest state.enough earlier */ | |
eb9b6666 | 1024 | smp_mb__before_atomic(); |
4dc34ae1 | 1025 | if (atomic_read(&state.enough)) |
b144e45f CL |
1026 | break; |
1027 | ||
2faac25d | 1028 | atomic_inc(&state.started); |
4dc34ae1 CL |
1029 | state.infos[i].state = &state; |
1030 | state.infos[i].thread = | |
1031 | kthread_run(bch_dirty_init_thread, &state.infos[i], | |
1032 | "bch_dirtcnt[%d]", i); | |
1033 | if (IS_ERR(state.infos[i].thread)) { | |
46f5aa88 | 1034 | pr_err("fails to run thread bch_dirty_init[%d]\n", i); |
2faac25d | 1035 | atomic_dec(&state.started); |
b144e45f | 1036 | for (--i; i >= 0; i--) |
4dc34ae1 | 1037 | kthread_stop(state.infos[i].thread); |
b144e45f CL |
1038 | goto out; |
1039 | } | |
1040 | } | |
1041 | ||
b144e45f | 1042 | out: |
4dc34ae1 CL |
1043 | /* Must wait for all threads to stop. */ |
1044 | wait_event(state.wait, atomic_read(&state.started) == 0); | |
e34820f9 | 1045 | rw_unlock(0, b); |
444fc0b6 KO |
1046 | } |
1047 | ||
9e5c3535 | 1048 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
cafe5635 | 1049 | { |
c2a4f318 | 1050 | sema_init(&dc->in_flight, 64); |
cafe5635 | 1051 | init_rwsem(&dc->writeback_lock); |
72c27061 | 1052 | bch_keybuf_init(&dc->writeback_keys); |
cafe5635 KO |
1053 | |
1054 | dc->writeback_metadata = true; | |
79b79146 | 1055 | dc->writeback_running = false; |
71dda2a5 | 1056 | dc->writeback_consider_fragment = true; |
cafe5635 KO |
1057 | dc->writeback_percent = 10; |
1058 | dc->writeback_delay = 30; | |
ea8c5356 | 1059 | atomic_long_set(&dc->writeback_rate.rate, 1024); |
ae82ddbf | 1060 | dc->writeback_rate_minimum = 8; |
cafe5635 | 1061 | |
7a5e3ecb | 1062 | dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; |
1d316e65 | 1063 | dc->writeback_rate_p_term_inverse = 40; |
71dda2a5 | 1064 | dc->writeback_rate_fp_term_low = 1; |
1065 | dc->writeback_rate_fp_term_mid = 10; | |
1066 | dc->writeback_rate_fp_term_high = 1000; | |
1d316e65 | 1067 | dc->writeback_rate_i_term_inverse = 10000; |
cafe5635 | 1068 | |
a1a2d8f0 CL |
1069 | /* For dc->writeback_lock contention in update_writeback_rate() */ |
1070 | dc->rate_update_retry = 0; | |
1071 | ||
3fd47bfe | 1072 | WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); |
9e5c3535 SP |
1073 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
1074 | } | |
1075 | ||
1076 | int bch_cached_dev_writeback_start(struct cached_dev *dc) | |
1077 | { | |
9baf3097 TJ |
1078 | dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", |
1079 | WQ_MEM_RECLAIM, 0); | |
1080 | if (!dc->writeback_write_wq) | |
1081 | return -ENOMEM; | |
1082 | ||
804f3c69 | 1083 | cached_dev_get(dc); |
5e6926da KO |
1084 | dc->writeback_thread = kthread_create(bch_writeback_thread, dc, |
1085 | "bcache_writeback"); | |
804f3c69 CL |
1086 | if (IS_ERR(dc->writeback_thread)) { |
1087 | cached_dev_put(dc); | |
f54d801d | 1088 | destroy_workqueue(dc->writeback_write_wq); |
5e6926da | 1089 | return PTR_ERR(dc->writeback_thread); |
804f3c69 | 1090 | } |
79b79146 | 1091 | dc->writeback_running = true; |
5e6926da | 1092 | |
3fd47bfe | 1093 | WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); |
cafe5635 KO |
1094 | schedule_delayed_work(&dc->writeback_rate_update, |
1095 | dc->writeback_rate_update_seconds * HZ); | |
cafe5635 | 1096 | |
9e5c3535 SP |
1097 | bch_writeback_queue(dc); |
1098 | ||
cafe5635 KO |
1099 | return 0; |
1100 | } |