fs/bcachefs/io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bset.h"
12 #include "btree_update.h"
13 #include "buckets.h"
14 #include "checksum.h"
15 #include "compress.h"
16 #include "clock.h"
17 #include "debug.h"
18 #include "disk_groups.h"
19 #include "ec.h"
20 #include "error.h"
21 #include "extents.h"
22 #include "io.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "move.h"
26 #include "rebalance.h"
27 #include "super.h"
28 #include "super-io.h"
29 #include "trace.h"
30
31 #include <linux/blkdev.h>
32 #include <linux/random.h>
33
34 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
35
36 static bool bch2_target_congested(struct bch_fs *c, u16 target)
37 {
38         const struct bch_devs_mask *devs;
39         unsigned d, nr = 0, total = 0;
40         u64 now = local_clock(), last;
41         s64 congested;
42         struct bch_dev *ca;
43
44         if (!target)
45                 return false;
46
47         rcu_read_lock();
48         devs = bch2_target_to_mask(c, target);
49         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
50                 ca = rcu_dereference(c->devs[d]);
51                 if (!ca)
52                         continue;
53
54                 congested = atomic_read(&ca->congested);
55                 last = READ_ONCE(ca->congested_last);
56                 if (time_after64(now, last))
57                         congested -= (now - last) >> 12;
58
59                 total += max(congested, 0LL);
60                 nr++;
61         }
62         rcu_read_unlock();
63
64         return bch2_rand_range(nr * CONGESTED_MAX) < total;
65 }
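/*
 * A sketch of the check above: assuming bch2_rand_range(n) returns a uniform
 * value in [0, n), this returns true with probability
 * total / (nr * CONGESTED_MAX) - i.e. the average per-device congestion
 * across the target, as a fraction of CONGESTED_MAX. The (now - last) >> 12
 * term decays each device's congestion count by one per ~4 microseconds
 * since its last over-threshold IO (local_clock() is in nanoseconds).
 */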
66
67 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
68                                        u64 now, int rw)
69 {
70         u64 latency_capable =
71                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
72         /* ideally we'd be taking into account the device's variance here: */
73         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
74         s64 latency_over = io_latency - latency_threshold;
75
76         if (latency_threshold && latency_over > 0) {
77                 /*
78                  * bump up congested by approximately latency_over * 4 /
79                  * latency_threshold - we don't need much accuracy here so don't
80                  * bother with the divide:
81                  */
82                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
83                         atomic_add(latency_over >>
84                                    max_t(int, ilog2(latency_threshold) - 2, 0),
85                                    &ca->congested);
86
87                 ca->congested_last = now;
88         } else if (atomic_read(&ca->congested) > 0) {
89                 atomic_dec(&ca->congested);
90         }
91 }
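/*
 * The threshold above is derived from the device's observed latency
 * quantiles: reads are considered congested past roughly 4x the device's
 * capable latency, writes past roughly 8x. The shift by
 * max(ilog2(latency_threshold) - 2, 0) approximates the divide in
 * latency_over * 4 / latency_threshold, as the comment in the function notes.
 */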
92
93 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
94 {
95         atomic64_t *latency = &ca->cur_latency[rw];
96         u64 now = local_clock();
97         u64 io_latency = time_after64(now, submit_time)
98                 ? now - submit_time
99                 : 0;
100         u64 old, new, v = atomic64_read(latency);
101
102         do {
103                 old = v;
104
105                 /*
106                  * If the IO latency was reasonably close to the current
107                  * average, skip the update and atomic operation most of
108                  * the time:
109                  */
110                 if (abs((int) (old - io_latency)) < (old >> 1) &&
111                     now & ~(~0 << 5))
112                         break;
113
114                 new = ewma_add(old, io_latency, 5);
115         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
116
117         bch2_congested_acct(ca, io_latency, now, rw);
118
119         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
120 }
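/*
 * A worked example of the update above, assuming ewma_add() keeps its usual
 * bcache definition of new = ((old << weight) - old + val) >> weight: with
 * weight 5 the running average moves to (31 * old + io_latency) / 32, so a
 * single outlier shifts it by only ~3%. The "now & ~(~0 << 5)" test skips
 * the cmpxchg unless the low 5 bits of the clock happen to be zero, i.e.
 * roughly one update in 32 when the new sample is close to the average.
 */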
121
122 #else
123
124 static bool bch2_target_congested(struct bch_fs *c, u16 target)
125 {
126         return false;
127 }
128
129 #endif
130
131 /* Allocate, free from mempool: */
132
133 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
134 {
135         struct bvec_iter_all iter;
136         struct bio_vec *bv;
137
138         bio_for_each_segment_all(bv, bio, iter)
139                 if (bv->bv_page != ZERO_PAGE(0))
140                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
141         bio->bi_vcnt = 0;
142 }
143
144 static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
145                                     bool *using_mempool)
146 {
147         struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
148
149         if (likely(!*using_mempool)) {
150                 bv->bv_page = alloc_page(GFP_NOIO);
151                 if (unlikely(!bv->bv_page)) {
152                         mutex_lock(&c->bio_bounce_pages_lock);
153                         *using_mempool = true;
154                         goto pool_alloc;
155
156                 }
157         } else {
158 pool_alloc:
159                 bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
160         }
161
162         bv->bv_len = PAGE_SIZE;
163         bv->bv_offset = 0;
164 }
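/*
 * Note on the fallback above: once alloc_page() fails we switch to the
 * bounce-page mempool and take bio_bounce_pages_lock, which the caller
 * (bch2_bio_alloc_pages_pool()) releases only after the whole bio is
 * populated - presumably so that concurrent fallback allocators are
 * serialized rather than each holding a partial set of mempool pages.
 */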
165
166 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
167                                size_t bytes)
168 {
169         bool using_mempool = false;
170
171         BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
172
173         bio->bi_iter.bi_size = bytes;
174
175         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
176                 bch2_bio_alloc_page_pool(c, bio, &using_mempool);
177
178         if (using_mempool)
179                 mutex_unlock(&c->bio_bounce_pages_lock);
180 }
181
182 void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
183                                     size_t bytes)
184 {
185         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
186                 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
187
188                 BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
189
190                 bv->bv_page = alloc_page(GFP_NOIO);
191                 if (!bv->bv_page) {
192                         /*
193                          * We already allocated from the mempool; we can't allocate from it
194                          * again without freeing the pages we already allocated, or else we
195                          * could deadlock:
196                          */
197                         bch2_bio_free_pages_pool(c, bio);
198                         bch2_bio_alloc_pages_pool(c, bio, bytes);
199                         return;
200                 }
201
202                 bv->bv_len = PAGE_SIZE;
203                 bv->bv_offset = 0;
204                 bio->bi_vcnt++;
205         }
206
207         bio->bi_iter.bi_size = bytes;
208 }
209
210 /* Writes */
211
212 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
213                                enum bch_data_type type,
214                                const struct bkey_i *k)
215 {
216         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
217         const struct bch_extent_ptr *ptr;
218         struct bch_write_bio *n;
219         struct bch_dev *ca;
220
221         BUG_ON(c->opts.nochanges);
222
223         bkey_for_each_ptr(ptrs, ptr) {
224                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
225                        !c->devs[ptr->dev]);
226
227                 ca = bch_dev_bkey_exists(c, ptr->dev);
228
229                 if (to_entry(ptr + 1) < ptrs.end) {
230                         n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
231                                                 GFP_NOIO, &ca->replica_set));
232
233                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
234                         n->bio.bi_private       = wbio->bio.bi_private;
235                         n->parent               = wbio;
236                         n->split                = true;
237                         n->bounce               = false;
238                         n->put_bio              = true;
239                         n->bio.bi_opf           = wbio->bio.bi_opf;
240                         bio_inc_remaining(&wbio->bio);
241                 } else {
242                         n = wbio;
243                         n->split                = false;
244                 }
245
246                 n->c                    = c;
247                 n->dev                  = ptr->dev;
248                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
249                 n->submit_time          = local_clock();
250                 n->bio.bi_iter.bi_sector = ptr->offset;
251
252                 if (!journal_flushes_device(ca))
253                         n->bio.bi_opf |= REQ_FUA;
254
255                 if (likely(n->have_ioref)) {
256                         this_cpu_add(ca->io_done->sectors[WRITE][type],
257                                      bio_sectors(&n->bio));
258
259                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
260
261                         if (type != BCH_DATA_BTREE && unlikely(c->opts.no_data_io)) {
262                                 bio_endio(&n->bio);
263                                 continue;
264                         }
265
266                         submit_bio(&n->bio);
267                 } else {
268                         n->bio.bi_status        = BLK_STS_REMOVED;
269                         bio_endio(&n->bio);
270                 }
271         }
272 }
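/*
 * Summary of the replication fan-out above: every extent pointer but the
 * last gets its own clone of the original write bio (bio_inc_remaining()
 * keeps the parent's completion pending); the last pointer reuses the
 * original bio. Writes to devices that aren't already flushed by journal
 * writes get REQ_FUA, and pointers to devices we can't get an ioref for
 * complete immediately with BLK_STS_REMOVED.
 */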
273
274 static void __bch2_write(struct closure *);
275
276 static void bch2_write_done(struct closure *cl)
277 {
278         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
279         struct bch_fs *c = op->c;
280
281         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
282                 op->error = bch2_journal_error(&c->journal);
283
284         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
285                 bch2_disk_reservation_put(c, &op->res);
286         percpu_ref_put(&c->writes);
287         bch2_keylist_free(&op->insert_keys, op->inline_keys);
288
289         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
290
291         closure_return(cl);
292 }
293
294 int bch2_write_index_default(struct bch_write_op *op)
295 {
296         struct keylist *keys = &op->insert_keys;
297         struct btree_iter iter;
298         int ret;
299
300         bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
301                              bkey_start_pos(&bch2_keylist_front(keys)->k),
302                              BTREE_ITER_INTENT);
303
304         ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
305                                         op_journal_seq(op),
306                                         BTREE_INSERT_NOFAIL|
307                                         BTREE_INSERT_USE_RESERVE);
308         bch2_btree_iter_unlock(&iter);
309
310         return ret;
311 }
312
313 /**
314  * __bch2_write_index - after a write, update the index to point to the new data
315  */
316 static void __bch2_write_index(struct bch_write_op *op)
317 {
318         struct bch_fs *c = op->c;
319         struct keylist *keys = &op->insert_keys;
320         struct bch_extent_ptr *ptr;
321         struct bkey_i *src, *dst = keys->keys, *n, *k;
322         unsigned dev;
323         int ret;
324
325         for (src = keys->keys; src != keys->top; src = n) {
326                 n = bkey_next(src);
327                 bkey_copy(dst, src);
328
329                 bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
330                         test_bit(ptr->dev, op->failed.d));
331
332                 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
333                         ret = -EIO;
334                         goto err;
335                 }
336
337                 dst = bkey_next(dst);
338         }
339
340         keys->top = dst;
341
342         /*
343          * probably not the ideal place to hook this in, but I don't
344          * particularly want to plumb io_opts all the way through the btree
345          * update stack right now
346          */
347         for_each_keylist_key(keys, k)
348                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
349
350         if (!bch2_keylist_empty(keys)) {
351                 u64 sectors_start = keylist_sectors(keys);
352                 int ret = op->index_update_fn(op);
353
354                 BUG_ON(keylist_sectors(keys) && !ret);
355
356                 op->written += sectors_start - keylist_sectors(keys);
357
358                 if (ret) {
359                         __bcache_io_error(c, "btree IO error %i", ret);
360                         op->error = ret;
361                 }
362         }
363 out:
364         /* If a bucket wasn't written, we can't erasure code it: */
365         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
366                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
367
368         bch2_open_buckets_put(c, &op->open_buckets);
369         return;
370 err:
371         keys->top = keys->keys;
372         op->error = ret;
373         goto out;
374 }
375
376 static void bch2_write_index(struct closure *cl)
377 {
378         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
379         struct bch_fs *c = op->c;
380
381         __bch2_write_index(op);
382
383         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
384                 bch2_journal_flush_seq_async(&c->journal,
385                                              *op_journal_seq(op),
386                                              cl);
387                 continue_at(cl, bch2_write_done, index_update_wq(op));
388         } else {
389                 continue_at_nobarrier(cl, bch2_write_done, NULL);
390         }
391 }
392
393 static void bch2_write_endio(struct bio *bio)
394 {
395         struct closure *cl              = bio->bi_private;
396         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
397         struct bch_write_bio *wbio      = to_wbio(bio);
398         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
399         struct bch_fs *c                = wbio->c;
400         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
401
402         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
403                 set_bit(wbio->dev, op->failed.d);
404
405         if (wbio->have_ioref) {
406                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
407                 percpu_ref_put(&ca->io_ref);
408         }
409
410         if (wbio->bounce)
411                 bch2_bio_free_pages_pool(c, bio);
412
413         if (wbio->put_bio)
414                 bio_put(bio);
415
416         if (parent)
417                 bio_endio(&parent->bio);
418         else
419                 closure_put(cl);
420 }
421
422 static void init_append_extent(struct bch_write_op *op,
423                                struct write_point *wp,
424                                struct bversion version,
425                                struct bch_extent_crc_unpacked crc)
426 {
427         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
428         struct bch_extent_ptr *ptr;
429
430         op->pos.offset += crc.uncompressed_size;
431         e->k.p          = op->pos;
432         e->k.size       = crc.uncompressed_size;
433         e->k.version    = version;
434
435         if (crc.csum_type ||
436             crc.compression_type ||
437             crc.nonce)
438                 bch2_extent_crc_append(e, crc);
439
440         bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size);
441
442         if (op->flags & BCH_WRITE_CACHED)
443                 extent_for_each_ptr(extent_i_to_s(e), ptr)
444                         ptr->cached = true;
445
446         bch2_keylist_push(&op->insert_keys);
447 }
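/*
 * The key built above covers crc.uncompressed_size sectors of the logical
 * address space (op->pos is advanced by that amount), while the pointers
 * appended from the write point only consume crc.compressed_size sectors on
 * disk. A crc entry is only appended when there's actually a checksum,
 * compression type or nonce to record, and cached writes mark every pointer
 * as cached (i.e. not counted as a durable replica).
 */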
448
449 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
450                                         struct write_point *wp,
451                                         struct bio *src,
452                                         bool *page_alloc_failed,
453                                         void *buf)
454 {
455         struct bch_write_bio *wbio;
456         struct bio *bio;
457         unsigned output_available =
458                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
459         unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
460
461         bio = bio_alloc_bioset(NULL, pages, 0,
462                                GFP_NOIO, &c->bio_write);
463         wbio                    = wbio_init(bio);
464         wbio->put_bio           = true;
465         /* copy WRITE_SYNC flag */
466         wbio->bio.bi_opf        = src->bi_opf;
467
468         if (buf) {
469                 bio->bi_iter.bi_size = output_available;
470                 bch2_bio_map(bio, buf);
471                 return bio;
472         }
473
474         wbio->bounce            = true;
475
476         /*
477          * We can't use mempool for more than c->sb.encoded_extent_max
478          * worth of pages, but we'd like to allocate more if we can:
479          */
480         while (bio->bi_iter.bi_size < output_available) {
481                 unsigned len = min_t(unsigned, PAGE_SIZE,
482                                      output_available - bio->bi_iter.bi_size);
483                 struct page *p;
484
485                 p = alloc_page(GFP_NOIO);
486                 if (!p) {
487                         unsigned pool_max =
488                                 min_t(unsigned, output_available,
489                                       c->sb.encoded_extent_max << 9);
490
491                         if (bio_sectors(bio) < pool_max)
492                                 bch2_bio_alloc_pages_pool(c, bio, pool_max);
493                         break;
494                 }
495
496                 bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
497                         .bv_page        = p,
498                         .bv_len         = len,
499                         .bv_offset      = 0,
500                 };
501                 bio->bi_iter.bi_size += len;
502         }
503
504         *page_alloc_failed = bio->bi_vcnt < pages;
505         return bio;
506 }
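/*
 * Bounce buffer sizing above: the bio is never larger than what the write
 * point can accept (wp->sectors_free). Pages come from alloc_page() first;
 * if that fails we fall back to the bounce mempool, but only up to
 * c->sb.encoded_extent_max worth, and *page_alloc_failed signals the caller
 * that the bio ended up smaller than the available output space.
 */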
507
508 static int bch2_write_rechecksum(struct bch_fs *c,
509                                  struct bch_write_op *op,
510                                  unsigned new_csum_type)
511 {
512         struct bio *bio = &op->wbio.bio;
513         struct bch_extent_crc_unpacked new_crc;
514         int ret;
515
516         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
517
518         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
519             bch2_csum_type_is_encryption(new_csum_type))
520                 new_csum_type = op->crc.csum_type;
521
522         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
523                                   NULL, &new_crc,
524                                   op->crc.offset, op->crc.live_size,
525                                   new_csum_type);
526         if (ret)
527                 return ret;
528
529         bio_advance(bio, op->crc.offset << 9);
530         bio->bi_iter.bi_size = op->crc.live_size << 9;
531         op->crc = new_crc;
532         return 0;
533 }
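/*
 * After the rechecksum above, the bio and op->crc describe only the live
 * region of the extent: bch2_rechecksum_bio() computes a checksum over
 * [crc.offset, crc.offset + live_size) without decrypting anything, and the
 * bio is then advanced and trimmed to match. Note that the checksum type is
 * left unchanged if switching it would change whether the data is encrypted.
 */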
534
535 static int bch2_write_decrypt(struct bch_write_op *op)
536 {
537         struct bch_fs *c = op->c;
538         struct nonce nonce = extent_nonce(op->version, op->crc);
539         struct bch_csum csum;
540
541         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
542                 return 0;
543
544         /*
545          * If we need to decrypt data in the write path, we'll no longer be able
546          * to verify the existing checksum (poly1305 mac, in this case) after
547          * it's decrypted - this is the last point we'll be able to reverify the
548          * checksum:
549          */
550         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
551         if (bch2_crc_cmp(op->crc.csum, csum))
552                 return -EIO;
553
554         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
555         op->crc.csum_type = 0;
556         op->crc.csum = (struct bch_csum) { 0, 0 };
557         return 0;
558 }
559
560 static enum prep_encoded_ret {
561         PREP_ENCODED_OK,
562         PREP_ENCODED_ERR,
563         PREP_ENCODED_CHECKSUM_ERR,
564         PREP_ENCODED_DO_WRITE,
565 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
566 {
567         struct bch_fs *c = op->c;
568         struct bio *bio = &op->wbio.bio;
569
570         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
571                 return PREP_ENCODED_OK;
572
573         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
574
575         /* Can we just write the entire extent as is? */
576         if (op->crc.uncompressed_size == op->crc.live_size &&
577             op->crc.compressed_size <= wp->sectors_free &&
578             op->crc.compression_type == op->compression_type) {
579                 if (!op->crc.compression_type &&
580                     op->csum_type != op->crc.csum_type &&
581                     bch2_write_rechecksum(c, op, op->csum_type))
582                         return PREP_ENCODED_CHECKSUM_ERR;
583
584                 return PREP_ENCODED_DO_WRITE;
585         }
586
587         /*
588          * If the data is compressed and we couldn't write the entire extent as
589          * is, we have to decompress it:
590          */
591         if (op->crc.compression_type) {
592                 struct bch_csum csum;
593
594                 if (bch2_write_decrypt(op))
595                         return PREP_ENCODED_CHECKSUM_ERR;
596
597                 /* Last point we can still verify checksum: */
598                 csum = bch2_checksum_bio(c, op->crc.csum_type,
599                                          extent_nonce(op->version, op->crc),
600                                          bio);
601                 if (bch2_crc_cmp(op->crc.csum, csum))
602                         return PREP_ENCODED_CHECKSUM_ERR;
603
604                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
605                         return PREP_ENCODED_ERR;
606         }
607
608         /*
609          * No longer have compressed data after this point - data might be
610          * encrypted:
611          */
612
613         /*
614          * If the data is checksummed and we're only writing a subset,
615          * rechecksum and adjust bio to point to currently live data:
616          */
617         if ((op->crc.live_size != op->crc.uncompressed_size ||
618              op->crc.csum_type != op->csum_type) &&
619             bch2_write_rechecksum(c, op, op->csum_type))
620                 return PREP_ENCODED_CHECKSUM_ERR;
621
622         /*
623          * If we want to compress the data, it has to be decrypted:
624          */
625         if ((op->compression_type ||
626              bch2_csum_type_is_encryption(op->crc.csum_type) !=
627              bch2_csum_type_is_encryption(op->csum_type)) &&
628             bch2_write_decrypt(op))
629                 return PREP_ENCODED_CHECKSUM_ERR;
630
631         return PREP_ENCODED_OK;
632 }
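/*
 * Return values of bch2_write_prep_encoded_data(), as consumed by
 * bch2_write_extent() below: PREP_ENCODED_DO_WRITE means the already-encoded
 * extent can be written out exactly as is; PREP_ENCODED_OK means the data
 * either wasn't pre-encoded or has been decompressed/decrypted as needed and
 * should go through the normal encode path; PREP_ENCODED_CHECKSUM_ERR means
 * an existing checksum failed to verify; PREP_ENCODED_ERR is any other hard
 * error (e.g. decompression failure).
 */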
633
634 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
635 {
636         struct bch_fs *c = op->c;
637         struct bio *src = &op->wbio.bio, *dst = src;
638         struct bvec_iter saved_iter;
639         struct bkey_i *key_to_write;
640         void *ec_buf;
641         unsigned key_to_write_offset = op->insert_keys.top_p -
642                 op->insert_keys.keys_p;
643         unsigned total_output = 0, total_input = 0;
644         bool bounce = false;
645         bool page_alloc_failed = false;
646         int ret, more = 0;
647
648         BUG_ON(!bio_sectors(src));
649
650         ec_buf = bch2_writepoint_ec_buf(c, wp);
651
652         switch (bch2_write_prep_encoded_data(op, wp)) {
653         case PREP_ENCODED_OK:
654                 break;
655         case PREP_ENCODED_ERR:
656                 ret = -EIO;
657                 goto err;
658         case PREP_ENCODED_CHECKSUM_ERR:
659                 goto csum_err;
660         case PREP_ENCODED_DO_WRITE:
661                 if (ec_buf) {
662                         dst = bch2_write_bio_alloc(c, wp, src,
663                                                    &page_alloc_failed,
664                                                    ec_buf);
665                         bio_copy_data(dst, src);
666                         bounce = true;
667                 }
668                 init_append_extent(op, wp, op->version, op->crc);
669                 goto do_write;
670         }
671
672         if (ec_buf ||
673             op->compression_type ||
674             (op->csum_type &&
675              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
676             (bch2_csum_type_is_encryption(op->csum_type) &&
677              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
678                 dst = bch2_write_bio_alloc(c, wp, src,
679                                            &page_alloc_failed,
680                                            ec_buf);
681                 bounce = true;
682         }
683
684         saved_iter = dst->bi_iter;
685
686         do {
687                 struct bch_extent_crc_unpacked crc =
688                         (struct bch_extent_crc_unpacked) { 0 };
689                 struct bversion version = op->version;
690                 size_t dst_len, src_len;
691
692                 if (page_alloc_failed &&
693                     bio_sectors(dst) < wp->sectors_free &&
694                     bio_sectors(dst) < c->sb.encoded_extent_max)
695                         break;
696
697                 BUG_ON(op->compression_type &&
698                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
699                        bch2_csum_type_is_encryption(op->crc.csum_type));
700                 BUG_ON(op->compression_type && !bounce);
701
702                 crc.compression_type = op->compression_type
703                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
704                                              op->compression_type)
705                         : 0;
706                 if (!crc.compression_type) {
707                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
708                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
709
710                         if (op->csum_type)
711                                 dst_len = min_t(unsigned, dst_len,
712                                                 c->sb.encoded_extent_max << 9);
713
714                         if (bounce) {
715                                 swap(dst->bi_iter.bi_size, dst_len);
716                                 bio_copy_data(dst, src);
717                                 swap(dst->bi_iter.bi_size, dst_len);
718                         }
719
720                         src_len = dst_len;
721                 }
722
723                 BUG_ON(!src_len || !dst_len);
724
725                 if (bch2_csum_type_is_encryption(op->csum_type)) {
726                         if (bversion_zero(version)) {
727                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
728                         } else {
729                                 crc.nonce = op->nonce;
730                                 op->nonce += src_len >> 9;
731                         }
732                 }
733
734                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
735                     !crc.compression_type &&
736                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
737                     bch2_csum_type_is_encryption(op->csum_type)) {
738                         /*
739                          * Note: when we're using rechecksum(), we need to be
740                          * checksumming @src because it has all the data our
741                          * existing checksum covers - if we bounced (because we
742                          * were trying to compress), @dst will only have the
743                          * part of the data the new checksum will cover.
744                          *
745                          * But normally we want to be checksumming post bounce,
746                          * because part of the reason for bouncing is so the
747                          * data can't be modified (by userspace) while it's in
748                          * flight.
749                          */
750                         if (bch2_rechecksum_bio(c, src, version, op->crc,
751                                         &crc, &op->crc,
752                                         src_len >> 9,
753                                         bio_sectors(src) - (src_len >> 9),
754                                         op->csum_type))
755                                 goto csum_err;
756                 } else {
757                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
758                             bch2_rechecksum_bio(c, src, version, op->crc,
759                                         NULL, &op->crc,
760                                         src_len >> 9,
761                                         bio_sectors(src) - (src_len >> 9),
762                                         op->crc.csum_type))
763                                 goto csum_err;
764
765                         crc.compressed_size     = dst_len >> 9;
766                         crc.uncompressed_size   = src_len >> 9;
767                         crc.live_size           = src_len >> 9;
768
769                         swap(dst->bi_iter.bi_size, dst_len);
770                         bch2_encrypt_bio(c, op->csum_type,
771                                          extent_nonce(version, crc), dst);
772                         crc.csum = bch2_checksum_bio(c, op->csum_type,
773                                          extent_nonce(version, crc), dst);
774                         crc.csum_type = op->csum_type;
775                         swap(dst->bi_iter.bi_size, dst_len);
776                 }
777
778                 init_append_extent(op, wp, version, crc);
779
780                 if (dst != src)
781                         bio_advance(dst, dst_len);
782                 bio_advance(src, src_len);
783                 total_output    += dst_len;
784                 total_input     += src_len;
785         } while (dst->bi_iter.bi_size &&
786                  src->bi_iter.bi_size &&
787                  wp->sectors_free &&
788                  !bch2_keylist_realloc(&op->insert_keys,
789                                       op->inline_keys,
790                                       ARRAY_SIZE(op->inline_keys),
791                                       BKEY_EXTENT_U64s_MAX));
792
793         more = src->bi_iter.bi_size != 0;
794
795         dst->bi_iter = saved_iter;
796
797         if (dst == src && more) {
798                 BUG_ON(total_output != total_input);
799
800                 dst = bio_split(src, total_input >> 9,
801                                 GFP_NOIO, &c->bio_write);
802                 wbio_init(dst)->put_bio = true;
803                 /* copy WRITE_SYNC flag */
804                 dst->bi_opf             = src->bi_opf;
805         }
806
807         dst->bi_iter.bi_size = total_output;
808
809         /* Free unneeded pages after compressing: */
810         if (to_wbio(dst)->bounce)
811                 while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
812                         mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
813                                      &c->bio_bounce_pages);
814 do_write:
815         /* might have done a realloc... */
816
817         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
818
819         bch2_ec_add_backpointer(c, wp,
820                                 bkey_start_pos(&key_to_write->k),
821                                 total_input >> 9);
822
823         dst->bi_end_io  = bch2_write_endio;
824         dst->bi_private = &op->cl;
825         dst->bi_opf     = REQ_OP_WRITE;
826
827         closure_get(dst->bi_private);
828
829         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
830                                   key_to_write);
831         return more;
832 csum_err:
833         bch_err(c, "error verifying existing checksum while "
834                 "rewriting existing data (memory corruption?)");
835         ret = -EIO;
836 err:
837         if (to_wbio(dst)->bounce)
838                 bch2_bio_free_pages_pool(c, dst);
839         if (to_wbio(dst)->put_bio)
840                 bio_put(dst);
841
842         return ret;
843 }
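/*
 * bch2_write_extent() above carves the source bio into chunks bounded by the
 * write point's free space and (when checksumming) encoded_extent_max,
 * optionally compressing, encrypting and checksumming each chunk into a
 * bounce bio, and appends one extent key per chunk. The return value is
 * positive ("more") if source data remains, zero when the write is fully
 * consumed, and negative on error; __bch2_write() keeps allocating new write
 * points as long as it gets a positive return.
 */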
844
845 static void __bch2_write(struct closure *cl)
846 {
847         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
848         struct bch_fs *c = op->c;
849         struct write_point *wp;
850         int ret;
851 again:
852         memset(&op->failed, 0, sizeof(op->failed));
853
854         do {
855                 /* +1 for possible cache device: */
856                 if (op->open_buckets.nr + op->nr_replicas + 1 >
857                     ARRAY_SIZE(op->open_buckets.v))
858                         goto flush_io;
859
860                 if (bch2_keylist_realloc(&op->insert_keys,
861                                         op->inline_keys,
862                                         ARRAY_SIZE(op->inline_keys),
863                                         BKEY_EXTENT_U64s_MAX))
864                         goto flush_io;
865
866                 wp = bch2_alloc_sectors_start(c,
867                         op->target,
868                         op->opts.erasure_code,
869                         op->write_point,
870                         &op->devs_have,
871                         op->nr_replicas,
872                         op->nr_replicas_required,
873                         op->alloc_reserve,
874                         op->flags,
875                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
876                 EBUG_ON(!wp);
877
878                 if (unlikely(IS_ERR(wp))) {
879                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
880                                 ret = PTR_ERR(wp);
881                                 goto err;
882                         }
883
884                         goto flush_io;
885                 }
886
887                 ret = bch2_write_extent(op, wp);
888
889                 bch2_open_bucket_get(c, wp, &op->open_buckets);
890                 bch2_alloc_sectors_done(c, wp);
891
892                 if (ret < 0)
893                         goto err;
894         } while (ret);
895
896         continue_at(cl, bch2_write_index, index_update_wq(op));
897         return;
898 err:
899         op->error = ret;
900
901         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
902                     ? bch2_write_index
903                     : bch2_write_done, index_update_wq(op));
904         return;
905 flush_io:
906         closure_sync(cl);
907
908         if (!bch2_keylist_empty(&op->insert_keys)) {
909                 __bch2_write_index(op);
910
911                 if (op->error) {
912                         continue_at_nobarrier(cl, bch2_write_done, NULL);
913                         return;
914                 }
915         }
916
917         goto again;
918 }
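/*
 * The flush_io path above handles the cases where we can't make forward
 * progress right now: the open_buckets array is nearly full, the keylist
 * can't be grown, or the allocator returned -EAGAIN. closure_sync() waits
 * for outstanding IO (and, with -EAGAIN, presumably for the write point to
 * signal free space), then we do index updates for whatever has already been
 * written and retry from the top.
 */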
919
920 /**
921  * bch2_write - handle a write to a cache device or flash only volume
922  *
923  * This is the starting point for any data to end up in a cache device; it could
924  * be from a normal write, or a writeback write, or a write to a flash only
925  * volume - it's also used by the moving garbage collector to compact data in
926  * mostly empty buckets.
927  *
928  * It first writes the data to the cache, creating a list of keys to be inserted
929  * (if the data won't fit in a single open bucket, there will be multiple keys);
930  * after the data is written it calls bch_journal, and after the keys have been
931  * after the data is written the keys are journalled, and after they have been
932  *
933  * If op->discard is true, instead of inserting the data it invalidates the
934  * region of the cache represented by op->bio and op->inode.
935  */
936 void bch2_write(struct closure *cl)
937 {
938         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
939         struct bch_fs *c = op->c;
940
941         BUG_ON(!op->nr_replicas);
942         BUG_ON(!op->write_point.v);
943         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
944         BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
945
946         op->start_time = local_clock();
947
948         bch2_keylist_init(&op->insert_keys, op->inline_keys);
949         wbio_init(&op->wbio.bio)->put_bio = false;
950
951         if (c->opts.nochanges ||
952             !percpu_ref_tryget(&c->writes)) {
953                 __bcache_io_error(c, "read only");
954                 op->error = -EROFS;
955                 if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
956                         bch2_disk_reservation_put(c, &op->res);
957                 closure_return(cl);
958                 return;
959         }
960
961         bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
962
963         continue_at_nobarrier(cl, __bch2_write, NULL);
964 }
965
966 /* Cache promotion on read */
967
968 struct promote_op {
969         struct closure          cl;
970         struct rcu_head         rcu;
971         u64                     start_time;
972
973         struct rhash_head       hash;
974         struct bpos             pos;
975
976         struct migrate_write    write;
977         struct bio_vec          bi_inline_vecs[0]; /* must be last */
978 };
979
980 static const struct rhashtable_params bch_promote_params = {
981         .head_offset    = offsetof(struct promote_op, hash),
982         .key_offset     = offsetof(struct promote_op, pos),
983         .key_len        = sizeof(struct bpos),
984 };
985
986 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
987                                   struct bpos pos,
988                                   struct bch_io_opts opts,
989                                   unsigned flags)
990 {
991         if (!opts.promote_target)
992                 return false;
993
994         if (!(flags & BCH_READ_MAY_PROMOTE))
995                 return false;
996
997         if (percpu_ref_is_dying(&c->writes))
998                 return false;
999
1000         if (!bkey_extent_is_data(k.k))
1001                 return false;
1002
1003         if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
1004                 return false;
1005
1006         if (bch2_target_congested(c, opts.promote_target))
1007                 return false;
1008
1009         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1010                                    bch_promote_params))
1011                 return false;
1012
1013         return true;
1014 }
1015
1016 static void promote_free(struct bch_fs *c, struct promote_op *op)
1017 {
1018         int ret;
1019
1020         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1021                                      bch_promote_params);
1022         BUG_ON(ret);
1023         percpu_ref_put(&c->writes);
1024         kfree_rcu(op, rcu);
1025 }
1026
1027 static void promote_done(struct closure *cl)
1028 {
1029         struct promote_op *op =
1030                 container_of(cl, struct promote_op, cl);
1031         struct bch_fs *c = op->write.op.c;
1032
1033         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1034                                op->start_time);
1035
1036         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1037         promote_free(c, op);
1038 }
1039
1040 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1041 {
1042         struct bch_fs *c = rbio->c;
1043         struct closure *cl = &op->cl;
1044         struct bio *bio = &op->write.op.wbio.bio;
1045
1046         trace_promote(&rbio->bio);
1047
1048         /* we now own pages: */
1049         BUG_ON(!rbio->bounce);
1050         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1051
1052         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1053                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1054         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1055
1056         bch2_migrate_read_done(&op->write, rbio);
1057
1058         closure_init(cl, NULL);
1059         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1060         closure_return_with_destructor(cl, promote_done);
1061 }
1062
1063 noinline
1064 static struct promote_op *__promote_alloc(struct bch_fs *c,
1065                                           struct bpos pos,
1066                                           struct extent_ptr_decoded *pick,
1067                                           struct bch_io_opts opts,
1068                                           unsigned rbio_sectors,
1069                                           struct bch_read_bio **rbio)
1070 {
1071         struct promote_op *op = NULL;
1072         struct bio *bio;
1073         unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
1074         /* data might have to be decompressed in the write path: */
1075         unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
1076                                            PAGE_SECTORS);
1077         int ret;
1078
1079         if (!percpu_ref_tryget(&c->writes))
1080                 return NULL;
1081
1082         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
1083                      GFP_NOIO);
1084         if (!op)
1085                 goto err;
1086
1087         op->start_time = local_clock();
1088         op->pos = pos;
1089
1090         /*
1091          * promotes require bouncing, but if the extent isn't
1092          * checksummed/compressed it might be too big for the mempool:
1093          */
1094         if (rbio_sectors > c->sb.encoded_extent_max) {
1095                 *rbio = kzalloc(sizeof(struct bch_read_bio) +
1096                                 sizeof(struct bio_vec) * rbio_pages,
1097                                 GFP_NOIO);
1098                 if (!*rbio)
1099                         goto err;
1100
1101                 rbio_init(&(*rbio)->bio, opts);
1102                 bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, rbio_pages, 0);
1103
1104                 if (bch2_bio_alloc_pages(&(*rbio)->bio, rbio_sectors << 9,
1105                                          GFP_NOIO))
1106                         goto err;
1107
1108                 (*rbio)->bounce         = true;
1109                 (*rbio)->split          = true;
1110                 (*rbio)->kmalloc        = true;
1111         }
1112
1113         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1114                                           bch_promote_params))
1115                 goto err;
1116
1117         bio = &op->write.op.wbio.bio;
1118         bio_init(bio, NULL, bio->bi_inline_vecs, wbio_pages, 0);
1119
1120         ret = bch2_migrate_write_init(c, &op->write,
1121                         writepoint_hashed((unsigned long) current),
1122                         opts,
1123                         DATA_PROMOTE,
1124                         (struct data_opts) {
1125                                 .target = opts.promote_target
1126                         },
1127                         bkey_s_c_null);
1128         BUG_ON(ret);
1129
1130         return op;
1131 err:
1132         if (*rbio)
1133                 bio_free_pages(&(*rbio)->bio);
1134         kfree(*rbio);
1135         *rbio = NULL;
1136         kfree(op);
1137         percpu_ref_put(&c->writes);
1138         return NULL;
1139 }
1140
1141 static inline struct promote_op *promote_alloc(struct bch_fs *c,
1142                                                struct bvec_iter iter,
1143                                                struct bkey_s_c k,
1144                                                struct extent_ptr_decoded *pick,
1145                                                struct bch_io_opts opts,
1146                                                unsigned flags,
1147                                                struct bch_read_bio **rbio,
1148                                                bool *bounce,
1149                                                bool *read_full)
1150 {
1151         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1152         unsigned sectors = promote_full
1153                 ? pick->crc.compressed_size
1154                 : bvec_iter_sectors(iter);
1155         struct bpos pos = promote_full
1156                 ? bkey_start_pos(k.k)
1157                 : POS(k.k->p.inode, iter.bi_sector);
1158         struct promote_op *promote;
1159
1160         if (!should_promote(c, k, pos, opts, flags))
1161                 return NULL;
1162
1163         promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
1164         if (!promote)
1165                 return NULL;
1166
1167         *bounce         = true;
1168         *read_full      = promote_full;
1169         return promote;
1170 }
1171
1172 /* Read */
1173
1174 #define READ_RETRY_AVOID        1
1175 #define READ_RETRY              2
1176 #define READ_ERR                3
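/*
 * Retry dispositions for the read path: READ_RETRY_AVOID retries the read
 * while marking the failed device/pointer to be avoided, READ_RETRY retries
 * without recording a failure, and READ_ERR gives up and completes the read
 * with an error.
 */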
1177
1178 enum rbio_context {
1179         RBIO_CONTEXT_NULL,
1180         RBIO_CONTEXT_HIGHPRI,
1181         RBIO_CONTEXT_UNBOUND,
1182 };
1183
1184 static inline struct bch_read_bio *
1185 bch2_rbio_parent(struct bch_read_bio *rbio)
1186 {
1187         return rbio->split ? rbio->parent : rbio;
1188 }
1189
1190 __always_inline
1191 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1192                            enum rbio_context context,
1193                            struct workqueue_struct *wq)
1194 {
1195         if (context <= rbio->context) {
1196                 fn(&rbio->work);
1197         } else {
1198                 rbio->work.func         = fn;
1199                 rbio->context           = context;
1200                 queue_work(wq, &rbio->work);
1201         }
1202 }
1203
1204 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1205 {
1206         BUG_ON(rbio->bounce && !rbio->split);
1207
1208         if (rbio->promote)
1209                 promote_free(rbio->c, rbio->promote);
1210         rbio->promote = NULL;
1211
1212         if (rbio->bounce)
1213                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1214
1215         if (rbio->split) {
1216                 struct bch_read_bio *parent = rbio->parent;
1217
1218                 if (rbio->kmalloc)
1219                         kfree(rbio);
1220                 else
1221                         bio_put(&rbio->bio);
1222
1223                 rbio = parent;
1224         }
1225
1226         return rbio;
1227 }
1228
1229 static void bch2_rbio_done(struct bch_read_bio *rbio)
1230 {
1231         bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1232                                rbio->start_time);
1233         bio_endio(&rbio->bio);
1234 }
1235
1236 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1237                                      struct bvec_iter bvec_iter, u64 inode,
1238                                      struct bch_io_failures *failed,
1239                                      unsigned flags)
1240 {
1241         struct btree_iter iter;
1242         BKEY_PADDED(k) tmp;
1243         struct bkey_s_c k;
1244         int ret;
1245
1246         flags &= ~BCH_READ_LAST_FRAGMENT;
1247
1248         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
1249                              rbio->pos, BTREE_ITER_SLOTS);
1250 retry:
1251         rbio->bio.bi_status = 0;
1252
1253         k = bch2_btree_iter_peek_slot(&iter);
1254         if (btree_iter_err(k)) {
1255                 bch2_btree_iter_unlock(&iter);
1256                 goto err;
1257         }
1258
1259         bkey_reassemble(&tmp.k, k);
1260         k = bkey_i_to_s_c(&tmp.k);
1261         bch2_btree_iter_unlock(&iter);
1262
1263         if (!bkey_extent_is_data(k.k) ||
1264             !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
1265                                      rbio->pick.ptr,
1266                                      rbio->pos.offset -
1267                                      rbio->pick.crc.offset)) {
1268                 /* extent we wanted to read no longer exists: */
1269                 rbio->hole = true;
1270                 goto out;
1271         }
1272
1273         ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1274         if (ret == READ_RETRY)
1275                 goto retry;
1276         if (ret)
1277                 goto err;
1278         goto out;
1279 err:
1280         rbio->bio.bi_status = BLK_STS_IOERR;
1281 out:
1282         bch2_rbio_done(rbio);
1283 }
1284
1285 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1286                             struct bvec_iter bvec_iter, u64 inode,
1287                             struct bch_io_failures *failed, unsigned flags)
1288 {
1289         struct btree_iter iter;
1290         struct bkey_s_c k;
1291         int ret;
1292
1293         flags &= ~BCH_READ_LAST_FRAGMENT;
1294         flags |= BCH_READ_MUST_CLONE;
1295 retry:
1296         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1297                            POS(inode, bvec_iter.bi_sector),
1298                            BTREE_ITER_SLOTS, k) {
1299                 BKEY_PADDED(k) tmp;
1300                 unsigned bytes;
1301
1302                 bkey_reassemble(&tmp.k, k);
1303                 k = bkey_i_to_s_c(&tmp.k);
1304                 bch2_btree_iter_unlock(&iter);
1305
1306                 bytes = min_t(unsigned, bvec_iter.bi_size,
1307                               (k.k->p.offset - bvec_iter.bi_sector) << 9);
1308                 swap(bvec_iter.bi_size, bytes);
1309
1310                 ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1311                 switch (ret) {
1312                 case READ_RETRY:
1313                         goto retry;
1314                 case READ_ERR:
1315                         goto err;
1316                 };
1317
1318                 if (bytes == bvec_iter.bi_size)
1319                         goto out;
1320
1321                 swap(bvec_iter.bi_size, bytes);
1322                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1323         }
1324
1325         /*
1326          * If we get here, it had better have been because there was an error
1327          * reading a btree node
1328          */
1329         ret = bch2_btree_iter_unlock(&iter);
1330         BUG_ON(!ret);
1331         __bcache_io_error(c, "btree IO error %i", ret);
1332 err:
1333         rbio->bio.bi_status = BLK_STS_IOERR;
1334 out:
1335         bch2_rbio_done(rbio);
1336 }
1337
1338 static void bch2_rbio_retry(struct work_struct *work)
1339 {
1340         struct bch_read_bio *rbio =
1341                 container_of(work, struct bch_read_bio, work);
1342         struct bch_fs *c        = rbio->c;
1343         struct bvec_iter iter   = rbio->bvec_iter;
1344         unsigned flags          = rbio->flags;
1345         u64 inode               = rbio->pos.inode;
1346         struct bch_io_failures failed = { .nr = 0 };
1347
1348         trace_read_retry(&rbio->bio);
1349
1350         if (rbio->retry == READ_RETRY_AVOID)
1351                 bch2_mark_io_failure(&failed, &rbio->pick);
1352
1353         rbio->bio.bi_status = 0;
1354
1355         rbio = bch2_rbio_free(rbio);
1356
1357         flags |= BCH_READ_IN_RETRY;
1358         flags &= ~BCH_READ_MAY_PROMOTE;
1359
1360         if (flags & BCH_READ_NODECODE)
1361                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1362         else
1363                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1364 }
1365
1366 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1367                             blk_status_t error)
1368 {
1369         rbio->retry = retry;
1370
1371         if (rbio->flags & BCH_READ_IN_RETRY)
1372                 return;
1373
1374         if (retry == READ_ERR) {
1375                 rbio = bch2_rbio_free(rbio);
1376
1377                 rbio->bio.bi_status = error;
1378                 bch2_rbio_done(rbio);
1379         } else {
1380                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1381                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1382         }
1383 }
1384
1385 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1386 {
1387         struct bch_fs *c = rbio->c;
1388         struct btree_iter iter;
1389         struct bkey_s_c k;
1390         struct bkey_i_extent *e;
1391         BKEY_PADDED(k) new;
1392         struct bch_extent_crc_unpacked new_crc;
1393         unsigned offset;
1394         int ret;
1395
1396         if (rbio->pick.crc.compression_type)
1397                 return;
1398
1399         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
1400                              BTREE_ITER_INTENT);
1401 retry:
1402         k = bch2_btree_iter_peek(&iter);
1403         if (IS_ERR_OR_NULL(k.k))
1404                 goto out;
1405
1406         if (!bkey_extent_is_data(k.k))
1407                 goto out;
1408
1409         bkey_reassemble(&new.k, k);
1410         e = bkey_i_to_extent(&new.k);
1411
1412         if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
1413                                      rbio->pick.ptr,
1414                                      rbio->pos.offset -
1415                                      rbio->pick.crc.offset) ||
1416             bversion_cmp(e->k.version, rbio->version))
1417                 goto out;
1418
1419         /* Extent was merged? */
1420         if (bkey_start_offset(&e->k) < rbio->pos.offset ||
1421             e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
1422                 goto out;
1423
1424         /* The extent might have been partially overwritten since we read it: */
1425         offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
1426
1427         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1428                                 rbio->pick.crc, NULL, &new_crc,
1429                                 offset, e->k.size,
1430                                 rbio->pick.crc.csum_type)) {
1431                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1432                 goto out;
1433         }
1434
1435         if (!bch2_extent_narrow_crcs(e, new_crc))
1436                 goto out;
1437
1438         ret = bch2_btree_insert_at(c, NULL, NULL,
1439                                    BTREE_INSERT_ATOMIC|
1440                                    BTREE_INSERT_NOFAIL|
1441                                    BTREE_INSERT_NOWAIT,
1442                                    BTREE_INSERT_ENTRY(&iter, &e->k_i));
1443         if (ret == -EINTR)
1444                 goto retry;
1445 out:
1446         bch2_btree_iter_unlock(&iter);
1447 }
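/*
 * Rationale for the narrowing above: we've just read and checksum-verified
 * the full extent, so this is a chance to rewrite the key with a checksum
 * covering only the currently live portion. That way future reads of an
 * extent that has since been partially overwritten don't have to read (and
 * checksum) data that's no longer referenced. The update is best effort -
 * any failure just leaves the existing key alone.
 */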
1448
1449 static bool should_narrow_crcs(struct bkey_s_c k,
1450                                struct extent_ptr_decoded *pick,
1451                                unsigned flags)
1452 {
1453         return !(flags & BCH_READ_IN_RETRY) &&
1454                 bkey_extent_is_data(k.k) &&
1455                 bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
1456 }
1457
1458 /* Inner part that may run in process context */
1459 static void __bch2_read_endio(struct work_struct *work)
1460 {
1461         struct bch_read_bio *rbio =
1462                 container_of(work, struct bch_read_bio, work);
1463         struct bch_fs *c        = rbio->c;
1464         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1465         struct bio *src         = &rbio->bio;
1466         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1467         struct bvec_iter dst_iter = rbio->bvec_iter;
1468         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1469         struct nonce nonce = extent_nonce(rbio->version, crc);
1470         struct bch_csum csum;
1471
1472         /* Reset iterator for checksumming and copying bounced data: */
1473         if (rbio->bounce) {
1474                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1475                 src->bi_iter.bi_idx             = 0;
1476                 src->bi_iter.bi_bvec_done       = 0;
1477         } else {
1478                 src->bi_iter                    = rbio->bvec_iter;
1479         }
1480
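             /*
              * Checksum the data exactly as it came off the device (still
              * compressed and/or encrypted) and compare against the checksum
              * stored in the extent:
              */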
1481         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1482         if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
1483                 goto csum_err;
1484
1485         if (unlikely(rbio->narrow_crcs))
1486                 bch2_rbio_narrow_crcs(rbio);
1487
1488         if (rbio->flags & BCH_READ_NODECODE)
1489                 goto nodecode;
1490
1491         /* Adjust crc to point to subset of data we want: */
1492         crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
1493         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1494
1495         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1496                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1497                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1498                         goto decompression_err;
1499         } else {
1500                 /* don't need to decrypt the entire bio: */
1501                 nonce = nonce_add(nonce, crc.offset << 9);
1502                 bio_advance(src, crc.offset << 9);
1503
1504                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1505                 src->bi_iter.bi_size = dst_iter.bi_size;
1506
1507                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1508
1509                 if (rbio->bounce) {
1510                         struct bvec_iter src_iter = src->bi_iter;
1511                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1512                 }
1513         }
1514
1515         if (rbio->promote) {
1516                 /*
1517                  * Re-encrypt the data we decrypted, so it's consistent with
1518                  * rbio->crc:
1519                  */
1520                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1521                 promote_start(rbio->promote, rbio);
1522                 rbio->promote = NULL;
1523         }
1524 nodecode:
1525         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1526                 rbio = bch2_rbio_free(rbio);
1527                 bch2_rbio_done(rbio);
1528         }
1529         return;
1530 csum_err:
1531         /*
1532          * Checksum error: if the bio wasn't bounced, we may have been
1533          * reading into buffers owned by userspace (that userspace can
1534          * scribble over) - retry the read, bouncing it this time:
1535          */
1536         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1537                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1538                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1539                 return;
1540         }
1541
1542         bch2_dev_io_error(ca,
1543                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1544                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1545                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1546                 csum.hi, csum.lo, crc.csum_type);
1547         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1548         return;
1549 decompression_err:
1550         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1551                           rbio->pos.inode,
1552                           (u64) rbio->bvec_iter.bi_sector);
1553         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1554         return;
1555 }
1556
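     /*
      * Completion handler for the bio submitted in __bch2_read_extent():
      * accounts device latency, catches IO errors and stale cached pointers,
      * then hands the rest of the work off to __bch2_read_endio() via
      * bch2_rbio_punt() when it shouldn't run in interrupt context.
      */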
1557 static void bch2_read_endio(struct bio *bio)
1558 {
1559         struct bch_read_bio *rbio =
1560                 container_of(bio, struct bch_read_bio, bio);
1561         struct bch_fs *c        = rbio->c;
1562         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1563         struct workqueue_struct *wq = NULL;
1564         enum rbio_context context = RBIO_CONTEXT_NULL;
1565
1566         if (rbio->have_ioref) {
1567                 bch2_latency_acct(ca, rbio->submit_time, READ);
1568                 percpu_ref_put(&ca->io_ref);
1569         }
1570
1571         if (!rbio->split)
1572                 rbio->bio.bi_end_io = rbio->end_io;
1573
1574         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1575                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1576                 return;
1577         }
1578
1579         if (rbio->pick.ptr.cached &&
1580             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1581              ptr_stale(ca, &rbio->pick.ptr))) {
1582                 atomic_long_inc(&c->read_realloc_races);
1583
1584                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1585                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1586                 else
1587                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1588                 return;
1589         }
1590
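             /*
              * Crc narrowing, decompression and decryption need process context,
              * so that work goes to the unbound workqueue; plain checksum
              * verification is punted to the highpri workqueue:
              */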
1591         if (rbio->narrow_crcs ||
1592             rbio->pick.crc.compression_type ||
1593             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1594                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1595         else if (rbio->pick.crc.csum_type)
1596                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1597
1598         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1599 }
1600
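     /*
      * Read a single extent (or part of one): pick a device to read from,
      * decide whether the read needs to be bounced or cloned, set up a
      * bch_read_bio and submit it.  With BCH_READ_IN_RETRY the IO is issued
      * synchronously and the return value tells the caller whether to retry.
      */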
1601 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1602                        struct bvec_iter iter, struct bkey_s_c k,
1603                        struct bch_io_failures *failed, unsigned flags)
1604 {
1605         struct extent_ptr_decoded pick;
1606         struct bch_read_bio *rbio = NULL;
1607         struct bch_dev *ca;
1608         struct promote_op *promote = NULL;
1609         bool bounce = false, read_full = false, narrow_crcs = false;
1610         struct bpos pos = bkey_start_pos(k.k);
1611         int pick_ret;
1612
1613         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1614
1615         /* hole or reservation - just zero fill: */
1616         if (!pick_ret)
1617                 goto hole;
1618
1619         if (pick_ret < 0) {
1620                 __bcache_io_error(c, "no device to read from");
1621                 goto err;
1622         }
1623
1624         if (pick_ret > 0)
1625                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1626
1627         if (flags & BCH_READ_NODECODE) {
1628                 /*
1629                  * This can happen if we retried and the extent we were going to
1630                  * read has been merged in the meantime:
1631                  */
1632                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1633                         goto hole;
1634
1635                 iter.bi_sector  = pos.offset;
1636                 iter.bi_size    = pick.crc.compressed_size << 9;
1637                 goto noclone;
1638         }
1639
1640         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1641             bio_flagged(&orig->bio, BIO_CHAIN))
1642                 flags |= BCH_READ_MUST_CLONE;
1643
1644         narrow_crcs = should_narrow_crcs(k, &pick, flags);
1645
1646         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1647                 flags |= BCH_READ_MUST_BOUNCE;
1648
1649         EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
1650                 k.k->p.offset < bvec_iter_end_sector(iter));
1651
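             /*
              * We have to read the whole extent, and bounce it, when we can't
              * read just the range the caller asked for: compressed extents,
              * checksummed extents where the read doesn't cover the whole
              * checksummed region, encrypted data being read into user mapped
              * memory, or anything that already forced BCH_READ_MUST_BOUNCE:
              */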
1652         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1653             (pick.crc.csum_type != BCH_CSUM_NONE &&
1654              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1655               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1656                (flags & BCH_READ_USER_MAPPED)) ||
1657               (flags & BCH_READ_MUST_BOUNCE)))) {
1658                 read_full = true;
1659                 bounce = true;
1660         }
1661
1662         promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
1663                                 &rbio, &bounce, &read_full);
1664
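             /*
              * If we're reading only part of the extent, rewrite the pointer
              * and crc to describe just the subrange being read:
              */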
1665         if (!read_full) {
1666                 EBUG_ON(pick.crc.compression_type);
1667                 EBUG_ON(pick.crc.csum_type &&
1668                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1669                          bvec_iter_sectors(iter) != pick.crc.live_size ||
1670                          pick.crc.offset ||
1671                          iter.bi_sector != pos.offset));
1672
1673                 pick.ptr.offset += pick.crc.offset +
1674                         (iter.bi_sector - pos.offset);
1675                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
1676                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
1677                 pick.crc.offset                 = 0;
1678                 pick.crc.live_size              = bvec_iter_sectors(iter);
1679                 pos.offset                      = iter.bi_sector;
1680         }
1681
1682         if (rbio) {
1683                 /* promote_alloc() already allocated a bounce rbio for us */
1684         } else if (bounce) {
1685                 unsigned sectors = pick.crc.compressed_size;
1686
1687                 rbio = rbio_init(bio_alloc_bioset(NULL,
1688                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
1689                                                   0,
1690                                                   GFP_NOIO,
1691                                                   &c->bio_read_split),
1692                                  orig->opts);
1693
1694                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1695                 rbio->bounce    = true;
1696                 rbio->split     = true;
1697         } else if (flags & BCH_READ_MUST_CLONE) {
1698                 /*
1699                  * Have to clone if there were any splits, because of error
1700                  * reporting issues: if a split errors and retrying doesn't fix
1701                  * it, then when it reports the error to its parent (us) we can't
1702                  * tell whether the error came from our bio - in which case we
1703                  * should retry - or from the whole bio, in which case we don't
1704                  * want to retry and lose the error.
1705                  */
1706                 rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
1707                                                  &c->bio_read_split),
1708                                  orig->opts);
1709                 rbio->bio.bi_iter = iter;
1710                 rbio->split     = true;
1711         } else {
1712 noclone:
1713                 rbio = orig;
1714                 rbio->bio.bi_iter = iter;
1715                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1716         }
1717
1718         BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1719
1720         rbio->c                 = c;
1721         rbio->submit_time       = local_clock();
1722         if (rbio->split)
1723                 rbio->parent    = orig;
1724         else
1725                 rbio->end_io    = orig->bio.bi_end_io;
1726         rbio->bvec_iter         = iter;
1727         rbio->flags             = flags;
1728         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
1729         rbio->narrow_crcs       = narrow_crcs;
1730         rbio->hole              = 0;
1731         rbio->retry             = 0;
1732         rbio->context           = 0;
1733         rbio->devs_have         = bch2_bkey_devs(k);
1734         rbio->pick              = pick;
1735         rbio->pos               = pos;
1736         rbio->version           = k.k->version;
1737         rbio->promote           = promote;
1738         INIT_WORK(&rbio->work, NULL);
1739
1740         rbio->bio.bi_opf        = orig->bio.bi_opf;
1741         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1742         rbio->bio.bi_end_io     = bch2_read_endio;
1743
1744         if (rbio->bounce)
1745                 trace_read_bounce(&rbio->bio);
1746
1747         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1748
1749         percpu_down_read(&c->mark_lock);
1750         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
1751         percpu_up_read(&c->mark_lock);
1752
1753         if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
1754                 bio_inc_remaining(&orig->bio);
1755                 trace_read_split(&orig->bio);
1756         }
1757
1758         if (!rbio->pick.idx) {
1759                 if (!rbio->have_ioref) {
1760                         __bcache_io_error(c, "no device to read from");
1761                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1762                         goto out;
1763                 }
1764
1765                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
1766                              bio_sectors(&rbio->bio));
1767                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1768
1769                 if (unlikely(c->opts.no_data_io)) {
1770                         if (likely(!(flags & BCH_READ_IN_RETRY)))
1771                                 bio_endio(&rbio->bio);
1772                 } else {
1773                         if (likely(!(flags & BCH_READ_IN_RETRY)))
1774                                 submit_bio(&rbio->bio);
1775                         else
1776                                 submit_bio_wait(&rbio->bio);
1777                 }
1778         } else {
1779                 /* Attempting a reconstruct read (erasure coded data): */
1780                 if (bch2_ec_read_extent(c, rbio)) {
1781                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1782                         goto out;
1783                 }
1784
1785                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1786                         bio_endio(&rbio->bio);
1787         }
1788 out:
1789         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1790                 return 0;
1791         } else {
1792                 int ret;
1793
1794                 rbio->context = RBIO_CONTEXT_UNBOUND;
1795                 bch2_read_endio(&rbio->bio);
1796
1797                 ret = rbio->retry;
1798                 rbio = bch2_rbio_free(rbio);
1799
1800                 if (ret == READ_RETRY_AVOID) {
1801                         bch2_mark_io_failure(failed, &pick);
1802                         ret = READ_RETRY;
1803                 }
1804
1805                 return ret;
1806         }
1807
1808 err:
1809         if (flags & BCH_READ_IN_RETRY)
1810                 return READ_ERR;
1811
1812         orig->bio.bi_status = BLK_STS_IOERR;
1813         goto out_read_done;
1814
1815 hole:
1816          * A hole won't normally happen in the BCH_READ_NODECODE
1817          * (bch2_move_extent()) path, but if we retried and the extent we
1818          * wanted to read no longer exists, we have to signal that:
1819          * to read no longer exists we have to signal that:
1820          */
1821         if (flags & BCH_READ_NODECODE)
1822                 orig->hole = true;
1823
1824         zero_fill_bio_iter(&orig->bio, iter);
1825 out_read_done:
1826         if (flags & BCH_READ_LAST_FRAGMENT)
1827                 bch2_rbio_done(orig);
1828         return 0;
1829 }
1830
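     /*
      * Top level entry point for normal (decoded, user mapped) reads: walks
      * the extents btree for @inode and hands each fragment of the bio to
      * bch2_read_extent(), marking the last fragment so completion is
      * signalled exactly once.
      *
      * Rough usage sketch (illustrative only; identifiers other than
      * rbio_init() and bch2_read() are placeholders):
      *
      *         struct bch_read_bio *rbio = rbio_init(bio, io_opts);
      *
      *         rbio->bio.bi_end_io = my_endio;
      *         bch2_read(c, rbio, inode_nr);
      */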
1831 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
1832 {
1833         struct btree_iter iter;
1834         struct bkey_s_c k;
1835         unsigned flags = BCH_READ_RETRY_IF_STALE|
1836                 BCH_READ_MAY_PROMOTE|
1837                 BCH_READ_USER_MAPPED;
1838         int ret;
1839
1840         BUG_ON(rbio->_state);
1841         BUG_ON(flags & BCH_READ_NODECODE);
1842         BUG_ON(flags & BCH_READ_IN_RETRY);
1843
1844         rbio->c = c;
1845         rbio->start_time = local_clock();
1846
1847         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1848                            POS(inode, rbio->bio.bi_iter.bi_sector),
1849                            BTREE_ITER_SLOTS, k) {
1850                 BKEY_PADDED(k) tmp;
1851                 unsigned bytes;
1852
1853                 /*
1854                  * Copy the key and unlock the iterator before doing the IO, while
1855                  * the btree node's lock is still in cache:
1856                  */
1857                 bkey_reassemble(&tmp.k, k);
1858                 k = bkey_i_to_s_c(&tmp.k);
1859                 bch2_btree_iter_unlock(&iter);
1860
1861                 bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
1862                               (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
1863                 swap(rbio->bio.bi_iter.bi_size, bytes);
1864
1865                 if (rbio->bio.bi_iter.bi_size == bytes)
1866                         flags |= BCH_READ_LAST_FRAGMENT;
1867
1868                 bch2_read_extent(c, rbio, k, flags);
1869
1870                 if (flags & BCH_READ_LAST_FRAGMENT)
1871                         return;
1872
1873                 swap(rbio->bio.bi_iter.bi_size, bytes);
1874                 bio_advance(&rbio->bio, bytes);
1875         }
1876
1877         /*
1878          * If we get here, it had better have been because there was an
1879          * error reading a btree node:
1880          */
1881         ret = bch2_btree_iter_unlock(&iter);
1882         BUG_ON(!ret);
1883         bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
1884         bch2_rbio_done(rbio);
1885 }
1886
1887 void bch2_fs_io_exit(struct bch_fs *c)
1888 {
1889         if (c->promote_table.tbl)
1890                 rhashtable_destroy(&c->promote_table);
1891         mempool_exit(&c->bio_bounce_pages);
1892         bioset_exit(&c->bio_write);
1893         bioset_exit(&c->bio_read_split);
1894         bioset_exit(&c->bio_read);
1895 }
1896
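     /*
      * Allocate the biosets used by the read and write paths, the bounce page
      * pool (with enough reserved pages for one maximally sized encoded
      * extent or btree node), and the promote rhashtable:
      */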
1897 int bch2_fs_io_init(struct bch_fs *c)
1898 {
1899         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1900                         BIOSET_NEED_BVECS) ||
1901             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1902                         BIOSET_NEED_BVECS) ||
1903             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
1904                         BIOSET_NEED_BVECS) ||
1905             mempool_init_page_pool(&c->bio_bounce_pages,
1906                                    max_t(unsigned,
1907                                          c->opts.btree_node_size,
1908                                          c->sb.encoded_extent_max) /
1909                                    PAGE_SECTORS, 0) ||
1910             rhashtable_init(&c->promote_table, &bch_promote_params))
1911                 return -ENOMEM;
1912
1913         return 0;
1914 }