Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
7b3f84ea | 4 | #include "alloc_foreground.h" |
1c6fdbd8 KO |
5 | #include "btree_iter.h" |
6 | #include "buckets.h" | |
7 | #include "clock.h" | |
8 | #include "disk_groups.h" | |
9 | #include "extents.h" | |
10 | #include "io.h" | |
11 | #include "move.h" | |
12 | #include "rebalance.h" | |
13 | #include "super-io.h" | |
14 | #include "trace.h" | |
15 | ||
16 | #include <linux/freezer.h> | |
17 | #include <linux/kthread.h> | |
18 | #include <linux/sched/cputime.h> | |
19 | ||
20 | static inline bool rebalance_ptr_pred(struct bch_fs *c, | |
1742237b | 21 | struct extent_ptr_decoded p, |
1c6fdbd8 KO |
22 | struct bch_io_opts *io_opts) |
23 | { | |
24 | if (io_opts->background_target && | |
1742237b KO |
25 | !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && |
26 | !p.ptr.cached) | |
1c6fdbd8 KO |
27 | return true; |
28 | ||
29 | if (io_opts->background_compression && | |
1742237b | 30 | p.crc.compression_type != |
1c6fdbd8 KO |
31 | bch2_compression_opt_to_type[io_opts->background_compression]) |
32 | return true; | |
33 | ||
34 | return false; | |
35 | } | |
36 | ||
37 | void bch2_rebalance_add_key(struct bch_fs *c, | |
38 | struct bkey_s_c k, | |
39 | struct bch_io_opts *io_opts) | |
40 | { | |
1742237b KO |
41 | const union bch_extent_entry *entry; |
42 | struct extent_ptr_decoded p; | |
1c6fdbd8 KO |
43 | struct bkey_s_c_extent e; |
44 | ||
45 | if (!bkey_extent_is_data(k.k)) | |
46 | return; | |
47 | ||
48 | if (!io_opts->background_target && | |
49 | !io_opts->background_compression) | |
50 | return; | |
51 | ||
52 | e = bkey_s_c_to_extent(k); | |
53 | ||
1742237b KO |
54 | extent_for_each_ptr_decode(e, p, entry) |
55 | if (rebalance_ptr_pred(c, p, io_opts)) { | |
56 | struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); | |
1c6fdbd8 | 57 | |
1742237b | 58 | if (atomic64_add_return(p.crc.compressed_size, |
1c6fdbd8 | 59 | &ca->rebalance_work) == |
1742237b | 60 | p.crc.compressed_size) |
1c6fdbd8 KO |
61 | rebalance_wakeup(c); |
62 | } | |
63 | } | |
64 | ||
65 | void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) | |
66 | { | |
67 | if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == | |
68 | sectors) | |
69 | rebalance_wakeup(c); | |
70 | } | |
71 | ||
72 | static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, | |
26609b61 | 73 | struct bkey_s_c k, |
1c6fdbd8 KO |
74 | struct bch_io_opts *io_opts, |
75 | struct data_opts *data_opts) | |
76 | { | |
26609b61 KO |
77 | switch (k.k->type) { |
78 | case KEY_TYPE_extent: { | |
79 | struct bkey_s_c_extent e = bkey_s_c_to_extent(k); | |
80 | const union bch_extent_entry *entry; | |
81 | struct extent_ptr_decoded p; | |
1c6fdbd8 | 82 | |
26609b61 KO |
83 | /* Make sure we have room to add a new pointer: */ |
84 | if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > | |
85 | BKEY_EXTENT_VAL_U64s_MAX) | |
86 | return DATA_SKIP; | |
1c6fdbd8 | 87 | |
26609b61 KO |
88 | extent_for_each_ptr_decode(e, p, entry) |
89 | if (rebalance_ptr_pred(c, p, io_opts)) | |
90 | goto found; | |
1c6fdbd8 | 91 | |
26609b61 | 92 | return DATA_SKIP; |
1c6fdbd8 | 93 | found: |
26609b61 KO |
94 | data_opts->target = io_opts->background_target; |
95 | data_opts->btree_insert_flags = 0; | |
96 | return DATA_ADD_REPLICAS; | |
97 | } | |
98 | default: | |
99 | return DATA_SKIP; | |
100 | } | |
1c6fdbd8 KO |
101 | } |
102 | ||
103 | struct rebalance_work { | |
104 | int dev_most_full_idx; | |
105 | unsigned dev_most_full_percent; | |
106 | u64 dev_most_full_work; | |
107 | u64 dev_most_full_capacity; | |
108 | u64 total_work; | |
109 | }; | |
110 | ||
111 | static void rebalance_work_accumulate(struct rebalance_work *w, | |
112 | u64 dev_work, u64 unknown_dev, u64 capacity, int idx) | |
113 | { | |
114 | unsigned percent_full; | |
115 | u64 work = dev_work + unknown_dev; | |
116 | ||
117 | if (work < dev_work || work < unknown_dev) | |
118 | work = U64_MAX; | |
119 | work = min(work, capacity); | |
120 | ||
cf0517af | 121 | percent_full = div64_u64(work * 100, capacity); |
1c6fdbd8 KO |
122 | |
123 | if (percent_full >= w->dev_most_full_percent) { | |
124 | w->dev_most_full_idx = idx; | |
125 | w->dev_most_full_percent = percent_full; | |
126 | w->dev_most_full_work = work; | |
127 | w->dev_most_full_capacity = capacity; | |
128 | } | |
129 | ||
130 | if (w->total_work + dev_work >= w->total_work && | |
131 | w->total_work + dev_work >= dev_work) | |
132 | w->total_work += dev_work; | |
133 | } | |
134 | ||
135 | static struct rebalance_work rebalance_work(struct bch_fs *c) | |
136 | { | |
137 | struct bch_dev *ca; | |
138 | struct rebalance_work ret = { .dev_most_full_idx = -1 }; | |
139 | u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); | |
140 | unsigned i; | |
141 | ||
142 | for_each_online_member(ca, c, i) | |
143 | rebalance_work_accumulate(&ret, | |
144 | atomic64_read(&ca->rebalance_work), | |
145 | unknown_dev, | |
146 | bucket_to_sector(ca, ca->mi.nbuckets - | |
147 | ca->mi.first_bucket), | |
148 | i); | |
149 | ||
150 | rebalance_work_accumulate(&ret, | |
151 | unknown_dev, 0, c->capacity, -1); | |
152 | ||
153 | return ret; | |
154 | } | |
155 | ||
156 | static void rebalance_work_reset(struct bch_fs *c) | |
157 | { | |
158 | struct bch_dev *ca; | |
159 | unsigned i; | |
160 | ||
161 | for_each_online_member(ca, c, i) | |
162 | atomic64_set(&ca->rebalance_work, 0); | |
163 | ||
164 | atomic64_set(&c->rebalance.work_unknown_dev, 0); | |
165 | } | |
166 | ||
167 | static unsigned long curr_cputime(void) | |
168 | { | |
169 | u64 utime, stime; | |
170 | ||
171 | task_cputime_adjusted(current, &utime, &stime); | |
172 | return nsecs_to_jiffies(utime + stime); | |
173 | } | |
174 | ||
175 | static int bch2_rebalance_thread(void *arg) | |
176 | { | |
177 | struct bch_fs *c = arg; | |
178 | struct bch_fs_rebalance *r = &c->rebalance; | |
179 | struct io_clock *clock = &c->io_clock[WRITE]; | |
180 | struct rebalance_work w, p; | |
181 | unsigned long start, prev_start; | |
182 | unsigned long prev_run_time, prev_run_cputime; | |
183 | unsigned long cputime, prev_cputime; | |
184 | unsigned long io_start; | |
185 | long throttle; | |
186 | ||
187 | set_freezable(); | |
188 | ||
189 | io_start = atomic_long_read(&clock->now); | |
190 | p = rebalance_work(c); | |
191 | prev_start = jiffies; | |
192 | prev_cputime = curr_cputime(); | |
193 | ||
194 | while (!kthread_wait_freezable(r->enabled)) { | |
195 | start = jiffies; | |
196 | cputime = curr_cputime(); | |
197 | ||
198 | prev_run_time = start - prev_start; | |
199 | prev_run_cputime = cputime - prev_cputime; | |
200 | ||
201 | w = rebalance_work(c); | |
202 | BUG_ON(!w.dev_most_full_capacity); | |
203 | ||
204 | if (!w.total_work) { | |
205 | r->state = REBALANCE_WAITING; | |
206 | kthread_wait_freezable(rebalance_work(c).total_work); | |
207 | continue; | |
208 | } | |
209 | ||
210 | /* | |
211 | * If there isn't much work to do, throttle cpu usage: | |
212 | */ | |
213 | throttle = prev_run_cputime * 100 / | |
214 | max(1U, w.dev_most_full_percent) - | |
215 | prev_run_time; | |
216 | ||
217 | if (w.dev_most_full_percent < 20 && throttle > 0) { | |
218 | r->state = REBALANCE_THROTTLED; | |
219 | r->throttled_until_iotime = io_start + | |
220 | div_u64(w.dev_most_full_capacity * | |
221 | (20 - w.dev_most_full_percent), | |
222 | 50); | |
223 | r->throttled_until_cputime = start + throttle; | |
224 | ||
225 | bch2_kthread_io_clock_wait(clock, | |
226 | r->throttled_until_iotime, | |
227 | throttle); | |
228 | continue; | |
229 | } | |
230 | ||
231 | /* minimum 1 mb/sec: */ | |
232 | r->pd.rate.rate = | |
233 | max_t(u64, 1 << 11, | |
234 | r->pd.rate.rate * | |
235 | max(p.dev_most_full_percent, 1U) / | |
236 | max(w.dev_most_full_percent, 1U)); | |
237 | ||
238 | io_start = atomic_long_read(&clock->now); | |
239 | p = w; | |
240 | prev_start = start; | |
241 | prev_cputime = cputime; | |
242 | ||
243 | r->state = REBALANCE_RUNNING; | |
244 | memset(&r->move_stats, 0, sizeof(r->move_stats)); | |
245 | rebalance_work_reset(c); | |
246 | ||
247 | bch2_move_data(c, | |
248 | /* ratelimiting disabled for now */ | |
249 | NULL, /* &r->pd.rate, */ | |
250 | writepoint_ptr(&c->rebalance_write_point), | |
251 | POS_MIN, POS_MAX, | |
252 | rebalance_pred, NULL, | |
253 | &r->move_stats); | |
254 | } | |
255 | ||
256 | return 0; | |
257 | } | |
258 | ||
259 | ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) | |
260 | { | |
319f9ac3 | 261 | struct printbuf out = _PBUF(buf, PAGE_SIZE); |
1c6fdbd8 KO |
262 | struct bch_fs_rebalance *r = &c->rebalance; |
263 | struct rebalance_work w = rebalance_work(c); | |
264 | char h1[21], h2[21]; | |
265 | ||
266 | bch2_hprint(h1, w.dev_most_full_work << 9); | |
267 | bch2_hprint(h2, w.dev_most_full_capacity << 9); | |
319f9ac3 KO |
268 | pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", |
269 | w.dev_most_full_idx, h1, h2); | |
1c6fdbd8 KO |
270 | |
271 | bch2_hprint(h1, w.total_work << 9); | |
272 | bch2_hprint(h2, c->capacity << 9); | |
319f9ac3 | 273 | pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); |
1c6fdbd8 | 274 | |
319f9ac3 | 275 | pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); |
1c6fdbd8 KO |
276 | |
277 | switch (r->state) { | |
278 | case REBALANCE_WAITING: | |
319f9ac3 | 279 | pr_buf(&out, "waiting\n"); |
1c6fdbd8 KO |
280 | break; |
281 | case REBALANCE_THROTTLED: | |
282 | bch2_hprint(h1, | |
283 | (r->throttled_until_iotime - | |
284 | atomic_long_read(&c->io_clock[WRITE].now)) << 9); | |
319f9ac3 KO |
285 | pr_buf(&out, "throttled for %lu sec or %s io\n", |
286 | (r->throttled_until_cputime - jiffies) / HZ, | |
287 | h1); | |
1c6fdbd8 KO |
288 | break; |
289 | case REBALANCE_RUNNING: | |
319f9ac3 KO |
290 | pr_buf(&out, "running\n"); |
291 | pr_buf(&out, "pos %llu:%llu\n", | |
292 | r->move_stats.iter.pos.inode, | |
293 | r->move_stats.iter.pos.offset); | |
1c6fdbd8 KO |
294 | break; |
295 | } | |
296 | ||
319f9ac3 | 297 | return out.pos - buf; |
1c6fdbd8 KO |
298 | } |
299 | ||
300 | void bch2_rebalance_stop(struct bch_fs *c) | |
301 | { | |
302 | struct task_struct *p; | |
303 | ||
304 | c->rebalance.pd.rate.rate = UINT_MAX; | |
305 | bch2_ratelimit_reset(&c->rebalance.pd.rate); | |
306 | ||
307 | p = rcu_dereference_protected(c->rebalance.thread, 1); | |
308 | c->rebalance.thread = NULL; | |
309 | ||
310 | if (p) { | |
311 | /* for sychronizing with rebalance_wakeup() */ | |
312 | synchronize_rcu(); | |
313 | ||
314 | kthread_stop(p); | |
315 | put_task_struct(p); | |
316 | } | |
317 | } | |
318 | ||
319 | int bch2_rebalance_start(struct bch_fs *c) | |
320 | { | |
321 | struct task_struct *p; | |
322 | ||
323 | if (c->opts.nochanges) | |
324 | return 0; | |
325 | ||
326 | p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); | |
327 | if (IS_ERR(p)) | |
328 | return PTR_ERR(p); | |
329 | ||
330 | get_task_struct(p); | |
331 | rcu_assign_pointer(c->rebalance.thread, p); | |
332 | wake_up_process(p); | |
333 | return 0; | |
334 | } | |
335 | ||
336 | void bch2_fs_rebalance_init(struct bch_fs *c) | |
337 | { | |
338 | bch2_pd_controller_init(&c->rebalance.pd); | |
339 | ||
340 | atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); | |
341 | } |