// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "io.h"
#include "keylist.h"
#include "quota.h"
#include "reflink.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
#include <linux/writeback.h>

#include <trace/events/writeback.h>

static inline bool bio_full(struct bio *bio, unsigned len)
{
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return true;
	if (bio->bi_iter.bi_size > UINT_MAX - len)
		return true;
	return false;
}

static inline struct address_space *faults_disabled_mapping(void)
{
	return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
}

static inline void set_fdm_dropped_locks(void)
{
	current->faults_disabled_mapping =
		(void *) (((unsigned long) current->faults_disabled_mapping)|1);
}

static inline bool fdm_dropped_locks(void)
{
	return ((unsigned long) current->faults_disabled_mapping) & 1;
}
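
/*
 * The helpers above treat current->faults_disabled_mapping as a tagged
 * pointer: bit 0 is a "locks were dropped" flag, the remaining bits are
 * the address_space that page faults must not recurse into.  A hedged
 * sketch of the intended protocol on the dio write side (not shown in
 * this section; inferred from bch2_page_fault() below):
 *
 *	current->faults_disabled_mapping = mapping;
 *	// a fault on @mapping itself now returns VM_FAULT_SIGBUS;
 *	// a fault on another mapping may drop our pagecache_block lock
 *	// and set bit 0 before returning SIGBUS
 *	if (fdm_dropped_locks())
 *		// revalidate whatever the dropped lock was protecting
 *	current->faults_disabled_mapping = NULL;
 */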

struct quota_res {
	u64				sectors;
};

struct bch_writepage_io {
	struct bch_inode_info		*inode;

	/* must be last: */
	struct bch_write_op		op;
};

struct dio_write {
	struct completion		done;
	struct kiocb			*req;
	struct mm_struct		*mm;
	unsigned			loop:1,
					sync:1,
					free_iov:1;
	struct quota_res		quota_res;
	u64				written;

	struct iov_iter			iter;
	struct iovec			inline_vecs[2];

	/* must be last: */
	struct bch_write_op		op;
};

struct dio_read {
	struct closure			cl;
	struct kiocb			*req;
	long				ret;
	bool				should_dirty;
	struct bch_read_bio		rbio;
};

/* pagecache_block must be held */
static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
						       loff_t start, loff_t end)
{
	int ret;

	/*
	 * XXX: the way this is currently implemented, we can spin if a process
	 * is continually redirtying a specific page
	 */
	do {
		if (!mapping->nrpages)
			return 0;

		ret = filemap_write_and_wait_range(mapping, start, end);
		if (ret)
			break;

		if (!mapping->nrpages)
			return 0;

		ret = invalidate_inode_pages2_range(mapping,
				start >> PAGE_SHIFT,
				end >> PAGE_SHIFT);
	} while (ret == -EBUSY);

	return ret;
}

/* quotas */

#ifdef CONFIG_BCACHEFS_QUOTA

static void bch2_quota_reservation_put(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       struct quota_res *res)
{
	if (!res->sectors)
		return;

	mutex_lock(&inode->ei_quota_lock);
	BUG_ON(res->sectors > inode->ei_quota_reserved);

	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
	inode->ei_quota_reserved -= res->sectors;
	mutex_unlock(&inode->ei_quota_lock);

	res->sectors = 0;
}

static int bch2_quota_reservation_add(struct bch_fs *c,
				      struct bch_inode_info *inode,
				      struct quota_res *res,
				      unsigned sectors,
				      bool check_enospc)
{
	int ret;

	mutex_lock(&inode->ei_quota_lock);
	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
	if (likely(!ret)) {
		inode->ei_quota_reserved += sectors;
		res->sectors += sectors;
	}
	mutex_unlock(&inode->ei_quota_lock);

	return ret;
}

#else

static void bch2_quota_reservation_put(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       struct quota_res *res)
{
}

static int bch2_quota_reservation_add(struct bch_fs *c,
				      struct bch_inode_info *inode,
				      struct quota_res *res,
				      unsigned sectors,
				      bool check_enospc)
{
	return 0;
}

#endif

/* i_size updates: */

struct inode_new_size {
	loff_t		new_size;
	u64		now;
	unsigned	fields;
};

static int inode_set_size(struct bch_inode_info *inode,
			  struct bch_inode_unpacked *bi,
			  void *p)
{
	struct inode_new_size *s = p;

	bi->bi_size = s->new_size;
	if (s->fields & ATTR_ATIME)
		bi->bi_atime = s->now;
	if (s->fields & ATTR_MTIME)
		bi->bi_mtime = s->now;
	if (s->fields & ATTR_CTIME)
		bi->bi_ctime = s->now;

	return 0;
}

int __must_check bch2_write_inode_size(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       loff_t new_size, unsigned fields)
{
	struct inode_new_size s = {
		.new_size	= new_size,
		.now		= bch2_current_time(c),
		.fields		= fields,
	};

	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}

static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
			   struct quota_res *quota_res, s64 sectors)
{
	if (!sectors)
		return;

	mutex_lock(&inode->ei_quota_lock);
#ifdef CONFIG_BCACHEFS_QUOTA
	if (quota_res && sectors > 0) {
		BUG_ON(sectors > quota_res->sectors);
		BUG_ON(sectors > inode->ei_quota_reserved);

		quota_res->sectors -= sectors;
		inode->ei_quota_reserved -= sectors;
	} else {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
	}
#endif
	inode->v.i_blocks += sectors;
	mutex_unlock(&inode->ei_quota_lock);
}

/* page state: */

/* stored in page->private: */

struct bch_page_sector {
	/* Uncompressed, fully allocated replicas: */
	unsigned		nr_replicas:3;

	/* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
	unsigned		replicas_reserved:3;

	/* i_sectors: */
	enum {
		SECTOR_UNALLOCATED,
		SECTOR_RESERVED,
		SECTOR_DIRTY,
		SECTOR_ALLOCATED,
	}			state:2;
};

struct bch_page_state {
	spinlock_t		lock;
	atomic_t		write_count;
	struct bch_page_sector	s[PAGE_SECTORS];
};
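
/*
 * Note that the sector states are declared in increasing order of "how
 * allocated" the data is; code below relies on that ordering being
 * load-bearing, e.g. __bch2_writepage() skips sectors with
 * state < SECTOR_DIRTY and bch2_set_page_dirty() promotes with
 * max_t(unsigned, state, SECTOR_DIRTY), so new states must preserve
 * UNALLOCATED < RESERVED < DIRTY < ALLOCATED.
 */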

static inline struct bch_page_state *__bch2_page_state(struct page *page)
{
	return page_has_private(page)
		? (struct bch_page_state *) page_private(page)
		: NULL;
}

static inline struct bch_page_state *bch2_page_state(struct page *page)
{
	EBUG_ON(!PageLocked(page));

	return __bch2_page_state(page);
}

/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
	kfree(detach_page_private(page));
}

static void bch2_page_state_release(struct page *page)
{
	EBUG_ON(!PageLocked(page));
	__bch2_page_state_release(page);
}

/* for newly allocated pages: */
static struct bch_page_state *__bch2_page_state_create(struct page *page,
						       gfp_t gfp)
{
	struct bch_page_state *s;

	s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
	if (!s)
		return NULL;

	spin_lock_init(&s->lock);
	attach_page_private(page, s);
	return s;
}

static struct bch_page_state *bch2_page_state_create(struct page *page,
						     gfp_t gfp)
{
	return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
}

static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
{
	/* XXX: this should not be open coded */
	return inode->ei_inode.bi_data_replicas
		? inode->ei_inode.bi_data_replicas - 1
		: c->opts.data_replicas;
}

static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
					  unsigned nr_replicas)
{
	return max(0, (int) nr_replicas -
		   s->nr_replicas -
		   s->replicas_reserved);
}
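
/*
 * Worked example: with a replication goal of nr_replicas = 2, a sector
 * that already has 1 fully allocated replica and 0 reserved needs
 * 2 - 1 - 0 = 1 more sector of reservation; a sector already at or
 * above the goal clamps to 0, hence the (int) cast and the max() above,
 * since the subtraction can go negative.
 */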

static int bch2_get_page_disk_reservation(struct bch_fs *c,
				struct bch_inode_info *inode,
				struct page *page, bool check_enospc)
{
	struct bch_page_state *s = bch2_page_state_create(page, 0);
	unsigned nr_replicas = inode_nr_replicas(c, inode);
	struct disk_reservation disk_res = { 0 };
	unsigned i, disk_res_sectors = 0;
	int ret;

	if (!s)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(s->s); i++)
		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);

	if (!disk_res_sectors)
		return 0;

	ret = bch2_disk_reservation_get(c, &disk_res,
					disk_res_sectors, 1,
					!check_enospc
					? BCH_DISK_RESERVATION_NOFAIL
					: 0);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < ARRAY_SIZE(s->s); i++)
		s->s[i].replicas_reserved +=
			sectors_to_reserve(&s->s[i], nr_replicas);

	return 0;
}

struct bch2_page_reservation {
	struct disk_reservation	disk;
	struct quota_res	quota;
};

static void bch2_page_reservation_init(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       struct bch2_page_reservation *res)
{
	memset(res, 0, sizeof(*res));

	res->disk.nr_replicas = inode_nr_replicas(c, inode);
}

static void bch2_page_reservation_put(struct bch_fs *c,
				      struct bch_inode_info *inode,
				      struct bch2_page_reservation *res)
{
	bch2_disk_reservation_put(c, &res->disk);
	bch2_quota_reservation_put(c, inode, &res->quota);
}

static int bch2_page_reservation_get(struct bch_fs *c,
			struct bch_inode_info *inode, struct page *page,
			struct bch2_page_reservation *res,
			unsigned offset, unsigned len, bool check_enospc)
{
	struct bch_page_state *s = bch2_page_state_create(page, 0);
	unsigned i, disk_sectors = 0, quota_sectors = 0;
	int ret;

	if (!s)
		return -ENOMEM;

	for (i = round_down(offset, block_bytes(c)) >> 9;
	     i < round_up(offset + len, block_bytes(c)) >> 9;
	     i++) {
		disk_sectors += sectors_to_reserve(&s->s[i],
						res->disk.nr_replicas);
		quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
	}

	if (disk_sectors) {
		ret = bch2_disk_reservation_add(c, &res->disk,
						disk_sectors,
						!check_enospc
						? BCH_DISK_RESERVATION_NOFAIL
						: 0);
		if (unlikely(ret))
			return ret;
	}

	if (quota_sectors) {
		ret = bch2_quota_reservation_add(c, inode, &res->quota,
						 quota_sectors,
						 check_enospc);
		if (unlikely(ret)) {
			struct disk_reservation tmp = {
				.sectors = disk_sectors
			};

			bch2_disk_reservation_put(c, &tmp);
			res->disk.sectors -= disk_sectors;
			return ret;
		}
	}

	return 0;
}
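
/*
 * A page reservation covers two resources at once: disk space, for
 * sectors that don't yet have enough replicas allocated or reserved,
 * and quota, counted only for sectors still SECTOR_UNALLOCATED (a
 * rewrite of already-allocated data doesn't change i_blocks).  If the
 * quota reservation fails after the disk reservation succeeded, the
 * disk side is unwound via the temporary disk_reservation so the
 * caller sees all-or-nothing behaviour.
 */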

static void bch2_clear_page_bits(struct page *page)
{
	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_page_state *s = bch2_page_state(page);
	struct disk_reservation disk_res = { 0 };
	int i, dirty_sectors = 0;

	if (!s)
		return;

	EBUG_ON(!PageLocked(page));
	EBUG_ON(PageWriteback(page));

	for (i = 0; i < ARRAY_SIZE(s->s); i++) {
		disk_res.sectors += s->s[i].replicas_reserved;
		s->s[i].replicas_reserved = 0;

		if (s->s[i].state == SECTOR_DIRTY) {
			dirty_sectors++;
			s->s[i].state = SECTOR_UNALLOCATED;
		}
	}

	bch2_disk_reservation_put(c, &disk_res);

	if (dirty_sectors)
		i_sectors_acct(c, inode, NULL, -dirty_sectors);

	bch2_page_state_release(page);
}

static void bch2_set_page_dirty(struct bch_fs *c,
			struct bch_inode_info *inode, struct page *page,
			struct bch2_page_reservation *res,
			unsigned offset, unsigned len)
{
	struct bch_page_state *s = bch2_page_state(page);
	unsigned i, dirty_sectors = 0;

	WARN_ON((u64) page_offset(page) + offset + len >
		round_up((u64) i_size_read(&inode->v), block_bytes(c)));

	spin_lock(&s->lock);

	for (i = round_down(offset, block_bytes(c)) >> 9;
	     i < round_up(offset + len, block_bytes(c)) >> 9;
	     i++) {
		unsigned sectors = sectors_to_reserve(&s->s[i],
						res->disk.nr_replicas);

		/*
		 * This can happen if we race with the error path in
		 * bch2_writepage_io_done():
		 */
		sectors = min_t(unsigned, sectors, res->disk.sectors);

		s->s[i].replicas_reserved += sectors;
		res->disk.sectors -= sectors;

		if (s->s[i].state == SECTOR_UNALLOCATED)
			dirty_sectors++;

		s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
	}

	spin_unlock(&s->lock);

	if (dirty_sectors)
		i_sectors_acct(c, inode, &res->quota, dirty_sectors);

	if (!PageDirty(page))
		filemap_dirty_folio(inode->v.i_mapping, page_folio(page));
}
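
/*
 * Dirtying a range transfers ownership of reservation: sectors move
 * from res->disk (held by the caller, e.g. bch2_write_begin()) into the
 * per-sector replicas_reserved counts, from where writeback or
 * bch2_clear_page_bits() will eventually release them.
 */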

vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct address_space *fdm = faults_disabled_mapping();
	struct bch_inode_info *inode = file_bch_inode(file);
	int ret;

	if (fdm == mapping)
		return VM_FAULT_SIGBUS;

	/* Lock ordering: */
	if (fdm > mapping) {
		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);

		if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
			goto got_lock;

		bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);

		bch2_pagecache_add_get(&inode->ei_pagecache_lock);
		bch2_pagecache_add_put(&inode->ei_pagecache_lock);

		bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);

		/* Signal that lock has been dropped: */
		set_fdm_dropped_locks();
		return VM_FAULT_SIGBUS;
	}

	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
got_lock:
	ret = filemap_fault(vmf);
	bch2_pagecache_add_put(&inode->ei_pagecache_lock);

	return ret;
}
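
/*
 * The fdm > mapping comparison above orders pagecache locks by address:
 * if servicing this fault while a dio write holds the other inode's
 * pagecache_block lock would take the two locks out of order, we
 * instead drop the dio side's lock, cycle our add lock to wait out
 * contention, retake the block lock, and return SIGBUS with the
 * "dropped locks" bit set so the dio path knows it must revalidate and
 * retry.
 */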

vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct file *file = vmf->vma->vm_file;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct address_space *mapping = file->f_mapping;
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_page_reservation res;
	unsigned len;
	loff_t isize;
	int ret = VM_FAULT_LOCKED;

	bch2_page_reservation_init(c, inode, &res);

	sb_start_pagefault(inode->v.i_sb);
	file_update_time(file);

	/*
	 * Not strictly necessary, but helps avoid dio writes livelocking in
	 * write_invalidate_inode_pages_range() - can drop this if/when we get
	 * a write_invalidate_inode_pages_range() that works without dropping
	 * page lock before invalidating page
	 */
	bch2_pagecache_add_get(&inode->ei_pagecache_lock);

	lock_page(page);
	isize = i_size_read(&inode->v);

	if (page->mapping != mapping || page_offset(page) >= isize) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));

	if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
		unlock_page(page);
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	bch2_set_page_dirty(c, inode, page, &res, 0, len);
	bch2_page_reservation_put(c, inode, &res);

	wait_for_stable_page(page);
out:
	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
	sb_end_pagefault(inode->v.i_sb);

	return ret;
}

void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
	if (offset || length < folio_size(folio))
		return;

	bch2_clear_page_bits(&folio->page);
}

bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
	if (folio_test_dirty(folio) || folio_test_writeback(folio))
		return false;

	bch2_clear_page_bits(&folio->page);
	return true;
}

/* readpage(s): */

static void bch2_readpages_end_io(struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bio_for_each_segment_all(bv, bio, iter) {
		struct page *page = bv->bv_page;

		if (!bio->bi_status) {
			SetPageUptodate(page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_page(page);
	}

	bio_put(bio);
}

struct readpages_iter {
	struct address_space	*mapping;
	struct page		**pages;
	unsigned		nr_pages;
	unsigned		idx;
	pgoff_t			offset;
};

static int readpages_iter_init(struct readpages_iter *iter,
			       struct readahead_control *ractl)
{
	unsigned i, nr_pages = readahead_count(ractl);

	memset(iter, 0, sizeof(*iter));

	iter->mapping	= ractl->mapping;
	iter->offset	= readahead_index(ractl);
	iter->nr_pages	= nr_pages;

	iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!iter->pages)
		return -ENOMEM;

	nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
	for (i = 0; i < nr_pages; i++) {
		__bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
		put_page(iter->pages[i]);
	}

	return 0;
}

static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
	if (iter->idx >= iter->nr_pages)
		return NULL;

	EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);

	return iter->pages[iter->idx];
}

static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
	struct bvec_iter iter;
	struct bio_vec bv;
	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
		? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
	unsigned state = k.k->type == KEY_TYPE_reservation
		? SECTOR_RESERVED
		: SECTOR_ALLOCATED;

	bio_for_each_segment(bv, bio, iter) {
		struct bch_page_state *s = bch2_page_state(bv.bv_page);
		unsigned i;

		for (i = bv.bv_offset >> 9;
		     i < (bv.bv_offset + bv.bv_len) >> 9;
		     i++) {
			s->s[i].nr_replicas	= nr_ptrs;
			s->s[i].state		= state;
		}
	}
}

static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc.csum_type || crc.compression_type)
			return true;
	return false;
}
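
/*
 * Partial reads of an extent are expensive when it carries a checksum
 * or is compressed, since the whole checksummed/compressed region has
 * to be read and verified anyway; bchfs_read() uses this predicate to
 * tell readpage_bio_extend() to widen the bio beyond the readahead
 * window in that case.
 */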

static void readpage_bio_extend(struct readpages_iter *iter,
				struct bio *bio,
				unsigned sectors_this_extent,
				bool get_more)
{
	while (bio_sectors(bio) < sectors_this_extent &&
	       bio->bi_vcnt < bio->bi_max_vecs) {
		pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
		struct page *page = readpage_iter_next(iter);
		int ret;

		if (page) {
			if (iter->offset + iter->idx != page_offset)
				break;

			iter->idx++;
		} else {
			if (!get_more)
				break;

			page = xa_load(&iter->mapping->i_pages, page_offset);
			if (page && !xa_is_value(page))
				break;

			page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
			if (!page)
				break;

			if (!__bch2_page_state_create(page, 0)) {
				put_page(page);
				break;
			}

			ret = add_to_page_cache_lru(page, iter->mapping,
						    page_offset, GFP_NOFS);
			if (ret) {
				__bch2_page_state_release(page);
				put_page(page);
				break;
			}

			put_page(page);
		}

		BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
	}
}

static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
		       struct bch_read_bio *rbio, u64 inum,
		       struct readpages_iter *readpages_iter)
{
	struct bch_fs *c = trans->c;
	struct bkey_buf sk;
	int flags = BCH_READ_RETRY_IF_STALE|
		BCH_READ_MAY_PROMOTE;
	int ret = 0;

	rbio->c = c;
	rbio->start_time = local_clock();

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);

	while (1) {
		struct bkey_s_c k;
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		if (!bch2_trans_relock(trans)) {
			ret = -EINTR;
			break;
		}

		bch2_btree_iter_set_pos(iter,
				POS(inum, rbio->bio.bi_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter->pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		sectors = min(sectors, k.k->size - offset_into_extent);

		bch2_trans_unlock(trans);

		if (readpages_iter)
			readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
					    extent_partial_reads_expensive(k));

		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
		swap(rbio->bio.bi_iter.bi_size, bytes);

		if (rbio->bio.bi_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		if (bkey_extent_is_allocation(k.k))
			bch2_add_page_sectors(&rbio->bio, k);

		bch2_read_extent(trans, rbio, iter->pos,
				 data_btree, k, offset_into_extent, flags);

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(rbio->bio.bi_iter.bi_size, bytes);
		bio_advance(&rbio->bio, bytes);
	}

	if (ret == -EINTR)
		goto retry;

	if (ret) {
		bch_err_inum_ratelimited(c, inum,
				"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bio_endio(&rbio->bio);
	}

	bch2_bkey_buf_exit(&sk, c);
}
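
/*
 * Note the swap() of bi_iter.bi_size in the loop above: it is how one
 * logical read is split across extents. The bio is temporarily shrunk
 * to just the bytes backed by the current extent, submitted via
 * bch2_read_extent(), then restored and advanced so the next iteration
 * handles the following extent.  -EINTR here signals a btree
 * transaction restart, not an interrupted signal, hence the retry loop.
 */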

void bch2_readahead(struct readahead_control *ractl)
{
	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
	struct btree_trans trans;
	struct btree_iter *iter;
	struct page *page;
	struct readpages_iter readpages_iter;
	int ret;

	ret = readpages_iter_init(&readpages_iter, ractl);
	BUG_ON(ret);

	bch2_trans_init(&trans, c, 0, 0);
	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
				   BTREE_ITER_SLOTS);

	bch2_pagecache_add_get(&inode->ei_pagecache_lock);

	while ((page = readpage_iter_next(&readpages_iter))) {
		pgoff_t index = readpages_iter.offset + readpages_iter.idx;
		unsigned n = min_t(unsigned,
				   readpages_iter.nr_pages -
				   readpages_iter.idx,
				   BIO_MAX_VECS);
		struct bch_read_bio *rbio =
			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
						   GFP_NOFS, &c->bio_read),
				  opts);

		readpages_iter.idx++;

		rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
		rbio->bio.bi_end_io = bch2_readpages_end_io;
		BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));

		bchfs_read(&trans, iter, rbio, inode->v.i_ino,
			   &readpages_iter);
	}

	bch2_pagecache_add_put(&inode->ei_pagecache_lock);

	bch2_trans_iter_put(&trans, iter);
	bch2_trans_exit(&trans);
	kfree(readpages_iter.pages);
}

static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
			     u64 inum, struct page *page)
{
	struct btree_trans trans;
	struct btree_iter *iter;

	bch2_page_state_create(page, __GFP_NOFAIL);

	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
	rbio->bio.bi_iter.bi_sector =
		(sector_t) page->index << PAGE_SECTOR_SHIFT;
	BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));

	bch2_trans_init(&trans, c, 0, 0);
	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
				   BTREE_ITER_SLOTS);

	bchfs_read(&trans, iter, rbio, inum, NULL);

	bch2_trans_iter_put(&trans, iter);
	bch2_trans_exit(&trans);
}

static void bch2_read_single_page_end_io(struct bio *bio)
{
	complete(bio->bi_private);
}

static int bch2_read_single_page(struct page *page,
				 struct address_space *mapping)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_read_bio *rbio;
	int ret;
	DECLARE_COMPLETION_ONSTACK(done);

	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
			 io_opts(c, &inode->ei_inode));
	rbio->bio.bi_private = &done;
	rbio->bio.bi_end_io = bch2_read_single_page_end_io;

	__bchfs_readpage(c, rbio, inode->v.i_ino, page);
	wait_for_completion(&done);

	ret = blk_status_to_errno(rbio->bio.bi_status);
	bio_put(&rbio->bio);

	if (ret < 0)
		return ret;

	SetPageUptodate(page);
	return 0;
}

int bch2_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;
	int ret;

	ret = bch2_read_single_page(page, page->mapping);
	folio_unlock(folio);
	return ret;
}

/* writepages: */

struct bch_writepage_state {
	struct bch_writepage_io	*io;
	struct bch_io_opts	opts;
};

static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
								  struct bch_inode_info *inode)
{
	return (struct bch_writepage_state) {
		.opts = io_opts(c, &inode->ei_inode)
	};
}

static void bch2_writepage_io_done(struct bch_write_op *op)
{
	struct bch_writepage_io *io =
		container_of(op, struct bch_writepage_io, op);
	struct bch_fs *c = io->op.c;
	struct bio *bio = &io->op.wbio.bio;
	struct bvec_iter_all iter;
	struct bio_vec *bvec;
	unsigned i;

	up(&io->op.c->io_in_flight);

	if (io->op.error) {
		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);

		bio_for_each_segment_all(bvec, bio, iter) {
			struct bch_page_state *s;

			SetPageError(bvec->bv_page);
			mapping_set_error(bvec->bv_page->mapping, -EIO);

			s = __bch2_page_state(bvec->bv_page);
			spin_lock(&s->lock);
			for (i = 0; i < PAGE_SECTORS; i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
		bio_for_each_segment_all(bvec, bio, iter) {
			struct bch_page_state *s;

			s = __bch2_page_state(bvec->bv_page);
			spin_lock(&s->lock);
			for (i = 0; i < PAGE_SECTORS; i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	/*
	 * racing with fallocate can cause us to add fewer sectors than
	 * expected - but we shouldn't add more sectors than expected:
	 */
	BUG_ON(io->op.i_sectors_delta > 0);

	/*
	 * (error (due to going RO) halfway through a page can screw that up
	 * slightly)
	 * XXX wtf?
	BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
	*/

	/*
	 * PageWriteback is effectively our ref on the inode - fixup i_blocks
	 * before calling end_page_writeback:
	 */
	i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);

	bio_for_each_segment_all(bvec, bio, iter) {
		struct bch_page_state *s = __bch2_page_state(bvec->bv_page);

		if (atomic_dec_and_test(&s->write_count))
			end_page_writeback(bvec->bv_page);
	}

	bio_put(&io->op.wbio.bio);
}

static void bch2_writepage_do_io(struct bch_writepage_state *w)
{
	struct bch_writepage_io *io = w->io;

	down(&io->op.c->io_in_flight);

	w->io = NULL;
	closure_call(&io->op.cl, bch2_write, NULL, NULL);
}

/*
 * Get a bch_writepage_io and add @page to it - appending to an existing one if
 * possible, else allocating a new one:
 */
static void bch2_writepage_io_alloc(struct bch_fs *c,
				    struct writeback_control *wbc,
				    struct bch_writepage_state *w,
				    struct bch_inode_info *inode,
				    u64 sector,
				    unsigned nr_replicas)
{
	struct bch_write_op *op;

	w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
					      REQ_OP_WRITE,
					      GFP_NOFS,
					      &c->writepage_bioset),
			     struct bch_writepage_io, op.wbio.bio);

	w->io->inode		= inode;
	op			= &w->io->op;
	bch2_write_op_init(op, c, w->opts);
	op->target		= w->opts.foreground_target;
	op_journal_seq_set(op, &inode->ei_journal_seq);
	op->nr_replicas		= nr_replicas;
	op->res.nr_replicas	= nr_replicas;
	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
	op->pos			= POS(inode->v.i_ino, sector);
	op->end_io		= bch2_writepage_io_done;
	op->wbio.bio.bi_iter.bi_sector = sector;
	op->wbio.bio.bi_opf	= wbc_to_write_flags(wbc);
}

static int __bch2_writepage(struct folio *folio,
			    struct writeback_control *wbc,
			    void *data)
{
	struct page *page = &folio->page;
	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_writepage_state *w = data;
	struct bch_page_state *s, orig;
	unsigned i, offset, nr_replicas_this_write = U32_MAX;
	loff_t i_size = i_size_read(&inode->v);
	pgoff_t end_index = i_size >> PAGE_SHIFT;
	int ret;

	EBUG_ON(!PageUptodate(page));

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		goto do_io;

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_SIZE - 1);
	if (page->index > end_index || !offset) {
		unlock_page(page);
		return 0;
	}

	/*
	 * The page straddles i_size. It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped. "A file is mapped
	 * in multiples of the page size. For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_SIZE);
do_io:
	s = bch2_page_state_create(page, __GFP_NOFAIL);

	ret = bch2_get_page_disk_reservation(c, inode, page, true);
	if (ret) {
		SetPageError(page);
		mapping_set_error(page->mapping, ret);
		unlock_page(page);
		return 0;
	}

	/* Before unlocking the page, get copy of reservations: */
	orig = *s;

	for (i = 0; i < PAGE_SECTORS; i++) {
		if (s->s[i].state < SECTOR_DIRTY)
			continue;

		nr_replicas_this_write =
			min_t(unsigned, nr_replicas_this_write,
			      s->s[i].nr_replicas +
			      s->s[i].replicas_reserved);
	}

	for (i = 0; i < PAGE_SECTORS; i++) {
		if (s->s[i].state < SECTOR_DIRTY)
			continue;

		s->s[i].nr_replicas = w->opts.compression
			? 0 : nr_replicas_this_write;

		s->s[i].replicas_reserved = 0;
		s->s[i].state = SECTOR_ALLOCATED;
	}

	BUG_ON(atomic_read(&s->write_count));
	atomic_set(&s->write_count, 1);

	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	unlock_page(page);

	offset = 0;
	while (1) {
		unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
		u64 sector;

		while (offset < PAGE_SECTORS &&
		       orig.s[offset].state < SECTOR_DIRTY)
			offset++;

		if (offset == PAGE_SECTORS)
			break;

		sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;

		while (offset + sectors < PAGE_SECTORS &&
		       orig.s[offset + sectors].state >= SECTOR_DIRTY)
			sectors++;

		for (i = offset; i < offset + sectors; i++) {
			reserved_sectors += orig.s[i].replicas_reserved;
			dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
		}

		if (w->io &&
		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
		     bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
		     w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
		     (BIO_MAX_VECS * PAGE_SIZE) ||
		     bio_end_sector(&w->io->op.wbio.bio) != sector))
			bch2_writepage_do_io(w);

		if (!w->io)
			bch2_writepage_io_alloc(c, wbc, w, inode, sector,
						nr_replicas_this_write);

		atomic_inc(&s->write_count);

		BUG_ON(inode != w->io->inode);
		BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
				     sectors << 9, offset << 9));

		/* Check for writing past i_size: */
		WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
			round_up(i_size, block_bytes(c)));

		w->io->op.res.sectors += reserved_sectors;
		w->io->op.i_sectors_delta -= dirty_sectors;
		w->io->op.new_i_size = i_size;

		offset += sectors;
	}

	if (atomic_dec_and_test(&s->write_count))
		end_page_writeback(page);

	return 0;
}
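
/*
 * Each contiguous run of sectors at or above SECTOR_DIRTY becomes one
 * segment of a writepage_io above; s->write_count starts at 1 and is
 * incremented per segment, so end_page_writeback() only fires once
 * every segment's write has completed - a single page may be spread
 * across several bch_write_ops when its dirty ranges are discontiguous
 * or the current bio fills up.
 */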

int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
	struct bch_writepage_state w =
		bch_writepage_state_init(c, to_bch_ei(mapping->host));
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
	if (w.io)
		bch2_writepage_do_io(&w);
	blk_finish_plug(&plug);
	return ret;
}

int bch2_writepage(struct page *page, struct writeback_control *wbc)
{
	struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
	struct bch_writepage_state w =
		bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
	int ret;

	ret = __bch2_writepage(page_folio(page), wbc, &w);
	if (w.io)
		bch2_writepage_do_io(&w);

	return ret;
}

/* buffered writes: */

int bch2_write_begin(struct file *file, struct address_space *mapping,
		     loff_t pos, unsigned len,
		     struct page **pagep, void **fsdata)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_page_reservation *res;
	pgoff_t index = pos >> PAGE_SHIFT;
	unsigned offset = pos & (PAGE_SIZE - 1);
	struct page *page;
	int ret = -ENOMEM;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return -ENOMEM;

	bch2_page_reservation_init(c, inode, res);
	*fsdata = res;

	bch2_pagecache_add_get(&inode->ei_pagecache_lock);

	page = grab_cache_page_write_begin(mapping, index);
	if (!page)
		goto err_unlock;

	if (PageUptodate(page))
		goto out;

	/* If we're writing entire page, don't need to read it in first: */
	if (len == PAGE_SIZE)
		goto out;

	if (!offset && pos + len >= inode->v.i_size) {
		zero_user_segment(page, len, PAGE_SIZE);
		flush_dcache_page(page);
		goto out;
	}

	if (index > inode->v.i_size >> PAGE_SHIFT) {
		zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
		flush_dcache_page(page);
		goto out;
	}
readpage:
	ret = bch2_read_single_page(page, mapping);
	if (ret)
		goto err;
out:
	ret = bch2_page_reservation_get(c, inode, page, res,
					offset, len, true);
	if (ret) {
		if (!PageUptodate(page)) {
			/*
			 * If the page hasn't been read in, we won't know if we
			 * actually need a reservation - we don't actually need
			 * to read here, we just need to check if the page is
			 * fully backed by uncompressed data:
			 */
			goto readpage;
		}

		goto err;
	}

	*pagep = page;
	return 0;
err:
	unlock_page(page);
	put_page(page);
	*pagep = NULL;
err_unlock:
	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
	kfree(res);
	*fsdata = NULL;
	return ret;
}

int bch2_write_end(struct file *file, struct address_space *mapping,
		   loff_t pos, unsigned len, unsigned copied,
		   struct page *page, void *fsdata)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_page_reservation *res = fsdata;
	unsigned offset = pos & (PAGE_SIZE - 1);

	lockdep_assert_held(&inode->v.i_rwsem);

	if (unlikely(copied < len && !PageUptodate(page))) {
		/*
		 * The page needs to be read in, but that would destroy
		 * our partial write - simplest thing is to just force
		 * userspace to redo the write:
		 */
		zero_user(page, 0, PAGE_SIZE);
		flush_dcache_page(page);
		copied = 0;
	}

	spin_lock(&inode->v.i_lock);
	if (pos + copied > inode->v.i_size)
		i_size_write(&inode->v, pos + copied);
	spin_unlock(&inode->v.i_lock);

	if (copied) {
		if (!PageUptodate(page))
			SetPageUptodate(page);

		bch2_set_page_dirty(c, inode, page, res, offset, copied);

		inode->ei_last_dirtied = (unsigned long) current;
	}

	unlock_page(page);
	put_page(page);
	bch2_pagecache_add_put(&inode->ei_pagecache_lock);

	bch2_page_reservation_put(c, inode, res);
	kfree(res);

	return copied;
}

#define WRITE_BATCH_PAGES	32

static int __bch2_buffered_write(struct bch_inode_info *inode,
				 struct address_space *mapping,
				 struct iov_iter *iter,
				 loff_t pos, unsigned len)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct page *pages[WRITE_BATCH_PAGES];
	struct bch2_page_reservation res;
	unsigned long index = pos >> PAGE_SHIFT;
	unsigned offset = pos & (PAGE_SIZE - 1);
	unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	unsigned i, reserved = 0, set_dirty = 0;
	unsigned copied = 0, nr_pages_copied = 0;
	int ret = 0;

	BUG_ON(!len);
	BUG_ON(nr_pages > ARRAY_SIZE(pages));

	bch2_page_reservation_init(c, inode, &res);

	for (i = 0; i < nr_pages; i++) {
		pages[i] = grab_cache_page_write_begin(mapping, index + i);
		if (!pages[i]) {
			nr_pages = i;
			if (!i) {
				ret = -ENOMEM;
				goto out;
			}
			len = min_t(unsigned, len,
				    nr_pages * PAGE_SIZE - offset);
			break;
		}
	}

	if (offset && !PageUptodate(pages[0])) {
		ret = bch2_read_single_page(pages[0], mapping);
		if (ret)
			goto out;
	}

	if ((pos + len) & (PAGE_SIZE - 1) &&
	    !PageUptodate(pages[nr_pages - 1])) {
		if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
			zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
		} else {
			ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
			if (ret)
				goto out;
		}
	}

	while (reserved < len) {
		struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
		unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
		unsigned pg_len = min_t(unsigned, len - reserved,
					PAGE_SIZE - pg_offset);
retry_reservation:
		ret = bch2_page_reservation_get(c, inode, page, &res,
						pg_offset, pg_len, true);

		if (ret && !PageUptodate(page)) {
			ret = bch2_read_single_page(page, mapping);
			if (!ret)
				goto retry_reservation;
		}

		if (ret)
			goto out;

		reserved += pg_len;
	}

	if (mapping_writably_mapped(mapping))
		for (i = 0; i < nr_pages; i++)
			flush_dcache_page(pages[i]);

	while (copied < len) {
		struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
		unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
		unsigned pg_len = min_t(unsigned, len - copied,
					PAGE_SIZE - pg_offset);
		unsigned pg_copied = copy_page_from_iter_atomic(page,
						pg_offset, pg_len, iter);

		if (!pg_copied)
			break;

		if (!PageUptodate(page) &&
		    pg_copied != PAGE_SIZE &&
		    pos + copied + pg_copied < inode->v.i_size) {
			zero_user(page, 0, PAGE_SIZE);
			break;
		}

		flush_dcache_page(page);
		copied += pg_copied;

		if (pg_copied != pg_len)
			break;
	}

	if (!copied)
		goto out;

	spin_lock(&inode->v.i_lock);
	if (pos + copied > inode->v.i_size)
		i_size_write(&inode->v, pos + copied);
	spin_unlock(&inode->v.i_lock);

	while (set_dirty < copied) {
		struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
		unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
		unsigned pg_len = min_t(unsigned, copied - set_dirty,
					PAGE_SIZE - pg_offset);

		if (!PageUptodate(page))
			SetPageUptodate(page);

		bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
		unlock_page(page);
		put_page(page);

		set_dirty += pg_len;
	}

	nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
	inode->ei_last_dirtied = (unsigned long) current;
out:
	for (i = nr_pages_copied; i < nr_pages; i++) {
		unlock_page(pages[i]);
		put_page(pages[i]);
	}

	bch2_page_reservation_put(c, inode, &res);

	return copied ?: ret;
}
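
/*
 * The batched write above runs in distinct phases: grab up to
 * WRITE_BATCH_PAGES pages, read in any partially overwritten edge
 * pages, reserve space for the whole range, copy from the user buffer,
 * then mark everything dirty.  The copy uses
 * copy_page_from_iter_atomic(), which runs with page faults disabled,
 * so holding the page locks across it cannot deadlock against a fault
 * on the source buffer; the caller pre-faults the source instead.
 */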

static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct bch_inode_info *inode = file_bch_inode(file);
	loff_t pos = iocb->ki_pos;
	ssize_t written = 0;
	int ret = 0;

	bch2_pagecache_add_get(&inode->ei_pagecache_lock);

	do {
		unsigned offset = pos & (PAGE_SIZE - 1);
		unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
			      PAGE_SIZE * WRITE_BATCH_PAGES - offset);
again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
			bytes = min_t(unsigned long, iov_iter_count(iter),
				      PAGE_SIZE - offset);

			if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
				ret = -EFAULT;
				break;
			}
		}

		if (unlikely(fatal_signal_pending(current))) {
			ret = -EINTR;
			break;
		}

		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
		if (unlikely(ret < 0))
			break;

		cond_resched();

		if (unlikely(ret == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
				      iov_iter_single_seg_count(iter));
			goto again;
		}
		pos += ret;
		written += ret;
		ret = 0;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(iter));

	bch2_pagecache_add_put(&inode->ei_pagecache_lock);

	return written ? written : ret;
}
1607 | ||
1608 | /* O_DIRECT reads */ | |
1609 | ||
b4725cc1 KO |
1610 | static void bio_check_or_release(struct bio *bio, bool check_dirty) |
1611 | { | |
1612 | if (check_dirty) { | |
1613 | bio_check_pages_dirty(bio); | |
1614 | } else { | |
1615 | bio_release_pages(bio, false); | |
1616 | bio_put(bio); | |
1617 | } | |
1618 | } | |
1619 | ||
1c6fdbd8 KO |
1620 | static void bch2_dio_read_complete(struct closure *cl) |
1621 | { | |
1622 | struct dio_read *dio = container_of(cl, struct dio_read, cl); | |
1623 | ||
1624 | dio->req->ki_complete(dio->req, dio->ret); | |
b4725cc1 | 1625 | bio_check_or_release(&dio->rbio.bio, dio->should_dirty); |
1c6fdbd8 KO |
1626 | } |
1627 | ||
1628 | static void bch2_direct_IO_read_endio(struct bio *bio) | |
1629 | { | |
1630 | struct dio_read *dio = bio->bi_private; | |
1631 | ||
1632 | if (bio->bi_status) | |
1633 | dio->ret = blk_status_to_errno(bio->bi_status); | |
1634 | ||
1635 | closure_put(&dio->cl); | |
1636 | } | |
1637 | ||
1638 | static void bch2_direct_IO_read_split_endio(struct bio *bio) | |
1639 | { | |
b4725cc1 KO |
1640 | struct dio_read *dio = bio->bi_private; |
1641 | bool should_dirty = dio->should_dirty; | |
1642 | ||
1c6fdbd8 | 1643 | bch2_direct_IO_read_endio(bio); |
b4725cc1 | 1644 | bio_check_or_release(bio, should_dirty); |
1c6fdbd8 KO |
1645 | } |
1646 | ||
1647 | static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) | |
1648 | { | |
1649 | struct file *file = req->ki_filp; | |
1650 | struct bch_inode_info *inode = file_bch_inode(file); | |
1651 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
9a3df993 | 1652 | struct bch_io_opts opts = io_opts(c, &inode->ei_inode); |
1c6fdbd8 KO |
1653 | struct dio_read *dio; |
1654 | struct bio *bio; | |
1655 | loff_t offset = req->ki_pos; | |
1656 | bool sync = is_sync_kiocb(req); | |
1657 | size_t shorten; | |
1658 | ssize_t ret; | |
1659 | ||
1660 | if ((offset|iter->count) & (block_bytes(c) - 1)) | |
1661 | return -EINVAL; | |
1662 | ||
1663 | ret = min_t(loff_t, iter->count, | |
1664 | max_t(loff_t, 0, i_size_read(&inode->v) - offset)); | |
1665 | ||
1666 | if (!ret) | |
1667 | return ret; | |
1668 | ||
1669 | shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); | |
1670 | iter->count -= shorten; | |
1671 | ||
1672 | bio = bio_alloc_bioset(NULL, | |
1673 | iov_iter_npages(iter, BIO_MAX_VECS), | |
1674 | REQ_OP_READ, | |
1675 | GFP_KERNEL, | |
1676 | &c->dio_read_bioset); | |
1677 | ||
1678 | bio->bi_end_io = bch2_direct_IO_read_endio; | |
1679 | ||
1680 | dio = container_of(bio, struct dio_read, rbio.bio); | |
1681 | closure_init(&dio->cl, NULL); | |
1682 | ||
1683 | /* | |
1684 | * this is a _really_ horrible hack just to avoid an atomic sub at the | |
1685 | * end: | |
1686 | */ | |
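/*
 * For async completion, the closure is set up so that the final
 * closure_put() from the last bio's endio runs bch2_dio_read_complete()
 * as the destructor; for sync IO we instead hold an extra ref so that
 * closure_sync() below can wait for all the split bios to finish:
 */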
1687 | if (!sync) { | |
1688 | set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); | |
1689 | atomic_set(&dio->cl.remaining, | |
1690 | CLOSURE_REMAINING_INITIALIZER - | |
1691 | CLOSURE_RUNNING + | |
1692 | CLOSURE_DESTRUCTOR); | |
1693 | } else { | |
1694 | atomic_set(&dio->cl.remaining, | |
1695 | CLOSURE_REMAINING_INITIALIZER + 1); | |
1696 | } | |
1697 | ||
1698 | dio->req = req; | |
1699 | dio->ret = ret; | |
b4725cc1 KO |
1700 | /* |
1701 | * This is one of the sketchier things I've encountered: we have to skip | |
1702 | * the dirtying of requests that are internal to the kernel (i.e. from |
1703 | * loopback), because we'll deadlock on page_lock. | |
1704 | */ | |
1705 | dio->should_dirty = iter_is_iovec(iter); | |
1c6fdbd8 KO |
1706 | |
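/*
 * The first bio is embedded in the dio, allocated above from
 * dio_read_bioset: jump into the middle of the loop to submit it, and
 * allocate any further split bios from c->bio_read:
 */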
1707 | goto start; | |
1708 | while (iter->count) { | |
1709 | bio = bio_alloc_bioset(NULL, | |
1710 | iov_iter_npages(iter, BIO_MAX_VECS), | |
1711 | REQ_OP_READ, | |
1712 | GFP_KERNEL, | |
1713 | &c->bio_read); | |
1714 | bio->bi_end_io = bch2_direct_IO_read_split_endio; | |
1715 | start: | |
1716 | bio->bi_opf = REQ_OP_READ|REQ_SYNC; | |
1717 | bio->bi_iter.bi_sector = offset >> 9; | |
1718 | bio->bi_private = dio; | |
1719 | ||
1720 | ret = bio_iov_iter_get_pages(bio, iter); | |
1721 | if (ret < 0) { | |
1722 | /* XXX: fault inject this path */ | |
1723 | bio->bi_status = BLK_STS_RESOURCE; | |
1724 | bio_endio(bio); | |
1725 | break; | |
1726 | } | |
1727 | ||
1728 | offset += bio->bi_iter.bi_size; | |
b4725cc1 KO |
1729 | |
1730 | if (dio->should_dirty) | |
1731 | bio_set_pages_dirty(bio); | |
1c6fdbd8 KO |
1732 | |
1733 | if (iter->count) | |
1734 | closure_get(&dio->cl); | |
1735 | ||
1736 | bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); | |
1737 | } | |
1738 | ||
1739 | iter->count += shorten; | |
1740 | ||
1741 | if (sync) { | |
1742 | closure_sync(&dio->cl); | |
1743 | closure_debug_destroy(&dio->cl); | |
1744 | ret = dio->ret; | |
b4725cc1 | 1745 | bio_check_or_release(&dio->rbio.bio, dio->should_dirty); |
1c6fdbd8 KO |
1746 | return ret; |
1747 | } else { | |
1748 | return -EIOCBQUEUED; | |
1749 | } | |
1750 | } | |
1751 | ||
1752 | ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |
1753 | { | |
1754 | struct file *file = iocb->ki_filp; | |
1755 | struct bch_inode_info *inode = file_bch_inode(file); | |
1756 | struct address_space *mapping = file->f_mapping; | |
1757 | size_t count = iov_iter_count(iter); | |
1758 | ssize_t ret; | |
1759 | ||
1760 | if (!count) | |
1761 | return 0; /* skip atime */ | |
1762 | ||
1763 | if (iocb->ki_flags & IOCB_DIRECT) { | |
1764 | struct blk_plug plug; | |
1765 | ||
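/*
 * Flush dirty pagecache before issuing the O_DIRECT read, so we
 * don't return stale data that was only written through the page
 * cache:
 */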
a023127a KO |
1766 | if (unlikely(mapping->nrpages)) { |
1767 | ret = filemap_write_and_wait_range(mapping, | |
1768 | iocb->ki_pos, | |
1769 | iocb->ki_pos + count - 1); | |
1770 | if (ret < 0) | |
1771 | return ret; | |
1772 | } | |
1c6fdbd8 KO |
1773 | |
1774 | file_accessed(file); | |
1775 | ||
1776 | blk_start_plug(&plug); | |
1777 | ret = bch2_direct_IO_read(iocb, iter); | |
1778 | blk_finish_plug(&plug); | |
1779 | ||
1780 | if (ret >= 0) | |
1781 | iocb->ki_pos += ret; | |
1782 | } else { | |
1783 | bch2_pagecache_add_get(&inode->ei_pagecache_lock); | |
1784 | ret = generic_file_read_iter(iocb, iter); | |
1785 | bch2_pagecache_add_put(&inode->ei_pagecache_lock); | |
1786 | } | |
1787 | ||
1788 | return ret; | |
1789 | } | |
1790 | ||
1791 | /* O_DIRECT writes */ | |
1792 | ||
1793 | /* | |
1794 | * We're going to return -EIOCBQUEUED, but we haven't finished consuming the | |
1795 | * iov_iter yet, so we need to stash a copy of the iovec: it might be on the | |
1796 | * caller's stack, and we're not guaranteed that it will live for the duration of |
1797 | * the IO: | |
1798 | */ | |
1799 | static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) | |
1800 | { | |
1801 | struct iovec *iov = dio->inline_vecs; | |
1802 | ||
1803 | /* | |
1804 | * a ubuf iov_iter points at a single user buffer - no iovec array to stash: |
1805 | */ | |
1806 | if (iter_is_ubuf(&dio->iter)) | |
1807 | return 0; | |
1808 | ||
1809 | /* | |
1810 | * We don't currently handle non-iovec iov_iters here - return an error, | |
1811 | * and we'll fall back to doing the IO synchronously: | |
1812 | */ | |
1813 | if (!iter_is_iovec(&dio->iter)) | |
1814 | return -1; | |
1815 | ||
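/*
 * Small iovecs fit in dio->inline_vecs; larger ones need a heap
 * allocation:
 */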
1816 | if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { | |
1817 | iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), | |
1818 | GFP_KERNEL); | |
1819 | if (unlikely(!iov)) | |
1820 | return -ENOMEM; | |
1821 | ||
1822 | dio->free_iov = true; | |
1823 | } | |
1824 | ||
1825 | memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); | |
1826 | dio->iter.__iov = iov; | |
1827 | return 0; | |
1828 | } | |
1829 | ||
042a1f26 KO |
1830 | static void bch2_dio_write_loop_async(struct bch_write_op *); |
1831 | ||
1c6fdbd8 KO |
1832 | static long bch2_dio_write_loop(struct dio_write *dio) |
1833 | { | |
ed484030 | 1834 | bool kthread = (current->flags & PF_KTHREAD) != 0; |
1c6fdbd8 KO |
1835 | struct kiocb *req = dio->req; |
1836 | struct address_space *mapping = req->ki_filp->f_mapping; | |
9a3df993 | 1837 | struct bch_inode_info *inode = file_bch_inode(req->ki_filp); |
042a1f26 | 1838 | struct bch_fs *c = inode->v.i_sb->s_fs_info; |
9a3df993 | 1839 | struct bio *bio = &dio->op.wbio.bio; |
eb8e6e9c KO |
1840 | unsigned unaligned, iter_count; |
1841 | bool sync = dio->sync, dropped_locks; | |
1c6fdbd8 KO |
1842 | long ret; |
1843 | ||
1844 | if (dio->loop) | |
1845 | goto loop; | |
1846 | ||
1c6fdbd8 | 1847 | while (1) { |
eb8e6e9c KO |
1848 | iter_count = dio->iter.count; |
1849 | ||
ed484030 KO |
1850 | if (kthread) |
1851 | kthread_use_mm(dio->mm); | |
1c6fdbd8 KO |
1852 | BUG_ON(current->faults_disabled_mapping); |
1853 | current->faults_disabled_mapping = mapping; | |
1854 | ||
1855 | ret = bio_iov_iter_get_pages(bio, &dio->iter); | |
1856 | ||
eb8e6e9c KO |
1857 | dropped_locks = fdm_dropped_locks(); |
1858 | ||
1c6fdbd8 | 1859 | current->faults_disabled_mapping = NULL; |
ed484030 KO |
1860 | if (kthread) |
1861 | kthread_unuse_mm(dio->mm); | |
1c6fdbd8 | 1862 | |
eb8e6e9c KO |
1863 | /* |
1864 | * If the fault handler returned an error but also signalled | |
1865 | * that it dropped & retook ei_pagecache_lock, we just need to | |
1866 | * re-shoot down the page cache and retry: | |
1867 | */ | |
1868 | if (dropped_locks && ret) | |
1869 | ret = 0; | |
1870 | ||
1c6fdbd8 KO |
1871 | if (unlikely(ret < 0)) |
1872 | goto err; | |
1873 | ||
eb8e6e9c KO |
1874 | if (unlikely(dropped_locks)) { |
1875 | ret = write_invalidate_inode_pages_range(mapping, | |
1876 | req->ki_pos, | |
1877 | req->ki_pos + iter_count - 1); | |
1878 | if (unlikely(ret)) | |
1879 | goto err; | |
1880 | ||
1881 | if (!bio->bi_iter.bi_size) | |
1882 | continue; | |
1883 | } | |
1884 | ||
0a426c32 KO |
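/*
 * O_DIRECT writes must be block aligned: trim any sub-block tail off
 * the bio and push it back into the iterator, to be picked up on the
 * next pass:
 */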
1885 | unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); |
1886 | bio->bi_iter.bi_size -= unaligned; | |
1887 | iov_iter_revert(&dio->iter, unaligned); | |
1888 | ||
1889 | if (!bio->bi_iter.bi_size) { | |
1890 | /* | |
1891 | * bio_iov_iter_get_pages was only able to get < | |
1892 | * blocksize worth of pages: | |
1893 | */ | |
0a426c32 KO |
1894 | ret = -EFAULT; |
1895 | goto err; | |
1896 | } | |
1897 | ||
042a1f26 KO |
1898 | bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); |
1899 | dio->op.end_io = bch2_dio_write_loop_async; | |
1900 | dio->op.target = dio->op.opts.foreground_target; | |
1901 | op_journal_seq_set(&dio->op, &inode->ei_journal_seq); | |
1902 | dio->op.write_point = writepoint_hashed((unsigned long) current); | |
1903 | dio->op.nr_replicas = dio->op.opts.data_replicas; | |
1904 | dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); | |
1905 | ||
1906 | if ((req->ki_flags & IOCB_DSYNC) && | |
1907 | !c->opts.journal_flush_disabled) | |
1908 | dio->op.flags |= BCH_WRITE_FLUSH; | |
a6336910 | 1909 | dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; |
042a1f26 KO |
1910 | |
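/*
 * If we couldn't get a disk reservation, the write can still proceed
 * when the entire range is already allocated with enough replicas -
 * then we're strictly overwriting:
 */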
1911 | ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), | |
1912 | dio->op.opts.data_replicas, 0); | |
1913 | if (unlikely(ret) && | |
1914 | !bch2_check_range_allocated(c, dio->op.pos, | |
35a067b4 KO |
1915 | bio_sectors(bio), |
1916 | dio->op.opts.data_replicas, | |
1917 | dio->op.opts.compression != 0)) | |
042a1f26 | 1918 | goto err; |
1c6fdbd8 KO |
1919 | |
1920 | task_io_account_write(bio->bi_iter.bi_size); | |
1921 | ||
1c6fdbd8 KO |
1922 | if (!dio->sync && !dio->loop && dio->iter.count) { |
1923 | if (bch2_dio_write_copy_iov(dio)) { | |
286d8ad0 | 1924 | dio->sync = sync = true; |
f8f30863 | 1925 | goto do_io; |
1c6fdbd8 KO |
1926 | } |
1927 | } | |
f8f30863 | 1928 | do_io: |
1c6fdbd8 | 1929 | dio->loop = true; |
f8f30863 | 1930 | closure_call(&dio->op.cl, bch2_write, NULL, NULL); |
1c6fdbd8 | 1931 | |
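/*
 * For async writes, the write completion re-enters this loop via
 * bch2_dio_write_loop_async(): we return -EIOCBQUEUED now, and
 * ki_complete() is called when the final iteration finishes:
 */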
286d8ad0 | 1932 | if (sync) |
f8f30863 KO |
1933 | wait_for_completion(&dio->done); |
1934 | else | |
1c6fdbd8 | 1935 | return -EIOCBQUEUED; |
1c6fdbd8 | 1936 | loop: |
9a3df993 KO |
1937 | i_sectors_acct(c, inode, &dio->quota_res, |
1938 | dio->op.i_sectors_delta); | |
042a1f26 KO |
1939 | req->ki_pos += (u64) dio->op.written << 9; |
1940 | dio->written += dio->op.written; | |
9a3df993 KO |
1941 | |
1942 | spin_lock(&inode->v.i_lock); | |
042a1f26 KO |
1943 | if (req->ki_pos > inode->v.i_size) |
1944 | i_size_write(&inode->v, req->ki_pos); | |
9a3df993 KO |
1945 | spin_unlock(&inode->v.i_lock); |
1946 | ||
032ac32c | 1947 | bio_release_pages(bio, false); |
5468f119 | 1948 | bio->bi_vcnt = 0; |
33c74e41 KO |
1949 | |
1950 | if (dio->op.error) { | |
1951 | set_bit(EI_INODE_ERROR, &inode->ei_flags); | |
1952 | break; | |
1953 | } | |
1954 | ||
1955 | if (!dio->iter.count) | |
1c6fdbd8 | 1956 | break; |
f8f30863 | 1957 | |
1c6fdbd8 | 1958 | bio_reset(bio, NULL, REQ_OP_WRITE); |
f8f30863 | 1959 | reinit_completion(&dio->done); |
1c6fdbd8 KO |
1960 | } |
1961 | ||
042a1f26 | 1962 | ret = dio->op.error ?: ((long) dio->written << 9); |
1c6fdbd8 KO |
1963 | err: |
1964 | bch2_pagecache_block_put(&inode->ei_pagecache_lock); | |
0a426c32 | 1965 | bch2_quota_reservation_put(c, inode, &dio->quota_res); |
1c6fdbd8 KO |
1966 | |
1967 | if (dio->free_iov) | |
1968 | kfree(dio->iter.__iov); | |
1969 | ||
5468f119 | 1970 | bio_release_pages(bio, false); |
1c6fdbd8 KO |
1971 | bio_put(bio); |
1972 | ||
1973 | /* inode->i_dio_count is our ref on inode and thus bch_fs */ | |
1974 | inode_dio_end(&inode->v); | |
1975 | ||
1976 | if (!sync) { | |
1977 | req->ki_complete(req, ret); | |
1978 | ret = -EIOCBQUEUED; | |
1979 | } | |
1980 | return ret; | |
1981 | } | |
1982 | ||
f8f30863 | 1983 | static void bch2_dio_write_loop_async(struct bch_write_op *op) |
1c6fdbd8 | 1984 | { |
f8f30863 | 1985 | struct dio_write *dio = container_of(op, struct dio_write, op); |
1c6fdbd8 | 1986 | |
f8f30863 KO |
1987 | if (dio->sync) |
1988 | complete(&dio->done); | |
1989 | else | |
1990 | bch2_dio_write_loop(dio); | |
1c6fdbd8 KO |
1991 | } |
1992 | ||
1993 | static noinline | |
1994 | ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) | |
1995 | { | |
1996 | struct file *file = req->ki_filp; | |
54847d25 | 1997 | struct address_space *mapping = file->f_mapping; |
1c6fdbd8 KO |
1998 | struct bch_inode_info *inode = file_bch_inode(file); |
1999 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
2000 | struct dio_write *dio; | |
2001 | struct bio *bio; | |
7edcfbfe | 2002 | bool locked = true, extending; |
1c6fdbd8 KO |
2003 | ssize_t ret; |
2004 | ||
7edcfbfe KO |
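/* prefetch the fields we're about to use, to hide the cache misses: */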
2005 | prefetch(&c->opts); |
2006 | prefetch((void *) &c->opts + 64); | |
2007 | prefetch(&inode->ei_inode); | |
2008 | prefetch((void *) &inode->ei_inode + 64); | |
1c6fdbd8 | 2009 | |
7edcfbfe KO |
2010 | inode_lock(&inode->v); |
2011 | ||
2012 | ret = generic_write_checks(req, iter); | |
2013 | if (unlikely(ret <= 0)) | |
2014 | goto err; | |
2015 | ||
2016 | ret = file_remove_privs(file); | |
2017 | if (unlikely(ret)) | |
2018 | goto err; | |
2019 | ||
2020 | ret = file_update_time(file); | |
2021 | if (unlikely(ret)) | |
2022 | goto err; | |
1c6fdbd8 | 2023 | |
919dbbd1 | 2024 | ret = -EINVAL;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) |
7edcfbfe KO |
2025 | goto err; |
2026 | ||
2027 | inode_dio_begin(&inode->v); | |
2028 | bch2_pagecache_block_get(&inode->ei_pagecache_lock); | |
2029 | ||
2030 | extending = req->ki_pos + iter->count > inode->v.i_size; | |
2031 | if (!extending) { | |
2032 | inode_unlock(&inode->v); | |
2033 | locked = false; | |
2034 | } | |
1c6fdbd8 KO |
2035 | |
2036 | bio = bio_alloc_bioset(NULL, | |
2ed5cd50 KO |
2037 | iov_iter_is_bvec(iter) |
2038 | ? 0 | |
2039 | : iov_iter_npages(iter, BIO_MAX_VECS), | |
1c6fdbd8 KO |
2040 | REQ_OP_WRITE, |
2041 | GFP_KERNEL, | |
2042 | &c->dio_write_bioset); | |
9a3df993 | 2043 | dio = container_of(bio, struct dio_write, op.wbio.bio); |
f8f30863 | 2044 | init_completion(&dio->done); |
1c6fdbd8 | 2045 | dio->req = req; |
ed484030 | 2046 | dio->mm = current->mm; |
1c6fdbd8 | 2047 | dio->loop = false; |
7edcfbfe | 2048 | dio->sync = is_sync_kiocb(req) || extending; |
1c6fdbd8 KO |
2049 | dio->free_iov = false; |
2050 | dio->quota_res.sectors = 0; | |
042a1f26 | 2051 | dio->written = 0; |
1c6fdbd8 | 2052 | dio->iter = *iter; |
9a3df993 | 2053 | |
1c6fdbd8 KO |
2054 | ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, |
2055 | iter->count >> 9, true); | |
2056 | if (unlikely(ret)) | |
7edcfbfe | 2057 | goto err_put_bio; |
1c6fdbd8 | 2058 | |
a023127a KO |
2059 | if (unlikely(mapping->nrpages)) { |
2060 | ret = write_invalidate_inode_pages_range(mapping, | |
2061 | req->ki_pos, | |
2062 | req->ki_pos + iter->count - 1); | |
2063 | if (unlikely(ret)) | |
2064 | goto err_put_bio; | |
2065 | } | |
54847d25 | 2066 | |
7edcfbfe | 2067 | ret = bch2_dio_write_loop(dio); |
1c6fdbd8 | 2068 | err: |
7edcfbfe KO |
2069 | if (locked) |
2070 | inode_unlock(&inode->v); | |
7edcfbfe KO |
2071 | return ret; |
2072 | err_put_bio: | |
2073 | bch2_pagecache_block_put(&inode->ei_pagecache_lock); | |
1c6fdbd8 | 2074 | bch2_quota_reservation_put(c, inode, &dio->quota_res); |
1c6fdbd8 | 2075 | bio_put(bio); |
7edcfbfe KO |
2076 | inode_dio_end(&inode->v); |
2077 | goto err; | |
1c6fdbd8 KO |
2078 | } |
2079 | ||
7edcfbfe | 2080 | ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) |
1c6fdbd8 KO |
2081 | { |
2082 | struct file *file = iocb->ki_filp; | |
7edcfbfe | 2083 | struct bch_inode_info *inode = file_bch_inode(file); |
1c6fdbd8 KO |
2084 | ssize_t ret; |
2085 | ||
2086 | if (iocb->ki_flags & IOCB_DIRECT) | |
2087 | return bch2_direct_write(iocb, from); | |
2088 | ||
7edcfbfe KO |
2089 | inode_lock(&inode->v); |
2090 | ||
2091 | ret = generic_write_checks(iocb, from); | |
2092 | if (ret <= 0) | |
2093 | goto unlock; | |
2094 | ||
1c6fdbd8 KO |
2095 | ret = file_remove_privs(file); |
2096 | if (ret) | |
7edcfbfe | 2097 | goto unlock; |
1c6fdbd8 KO |
2098 | |
2099 | ret = file_update_time(file); | |
2100 | if (ret) | |
7edcfbfe | 2101 | goto unlock; |
1c6fdbd8 | 2102 | |
7edcfbfe | 2103 | ret = bch2_buffered_write(iocb, from); |
1c6fdbd8 KO |
2104 | if (likely(ret > 0)) |
2105 | iocb->ki_pos += ret; | |
7edcfbfe | 2106 | unlock: |
1c6fdbd8 KO |
2107 | inode_unlock(&inode->v); |
2108 | ||
7edcfbfe | 2109 | if (ret > 0) |
1c6fdbd8 KO |
2110 | ret = generic_write_sync(iocb, ret); |
2111 | ||
2112 | return ret; | |
2113 | } | |
2114 | ||
2115 | /* fsync: */ | |
2116 | ||
2117 | int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |
2118 | { | |
2119 | struct bch_inode_info *inode = file_bch_inode(file); | |
2120 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
bb1b3658 | 2121 | int ret, ret2; |
1c6fdbd8 KO |
2122 | |
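/*
 * Three steps: flush dirty pages, then sync inode metadata, then flush
 * the journal up to the inode's last journal sequence number:
 */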
2123 | ret = file_write_and_wait_range(file, start, end); | |
2124 | if (ret) | |
2125 | return ret; | |
2126 | ||
2127 | if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) | |
2128 | goto out; | |
2129 | ||
2130 | ret = sync_inode_metadata(&inode->v, 1); | |
2131 | if (ret) | |
2132 | return ret; | |
2133 | out: | |
75812e70 KO |
2134 | if (!c->opts.journal_flush_disabled) |
2135 | ret = bch2_journal_flush_seq(&c->journal, | |
2136 | inode->ei_journal_seq); | |
bb1b3658 KO |
2137 | ret2 = file_check_and_advance_wb_err(file); |
2138 | ||
2139 | return ret ?: ret2; | |
1c6fdbd8 KO |
2140 | } |
2141 | ||
2142 | /* truncate: */ | |
2143 | ||
2144 | static inline int range_has_data(struct bch_fs *c, | |
2145 | struct bpos start, | |
2146 | struct bpos end) | |
2147 | { | |
424eb881 KO |
2148 | struct btree_trans trans; |
2149 | struct btree_iter *iter; | |
1c6fdbd8 KO |
2150 | struct bkey_s_c k; |
2151 | int ret = 0; | |
2152 | ||
20bceecb | 2153 | bch2_trans_init(&trans, c, 0, 0); |
424eb881 | 2154 | |
41f8b09e | 2155 | for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { |
1c6fdbd8 KO |
2156 | if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) |
2157 | break; | |
2158 | ||
2159 | if (bkey_extent_is_data(k.k)) { | |
2160 | ret = 1; | |
2161 | break; | |
2162 | } | |
2163 | } | |
50dc0f69 | 2164 | bch2_trans_iter_put(&trans, iter); |
1c6fdbd8 | 2165 | |
424eb881 | 2166 | return bch2_trans_exit(&trans) ?: ret; |
1c6fdbd8 KO |
2167 | } |
2168 | ||
2169 | static int __bch2_truncate_page(struct bch_inode_info *inode, | |
2170 | pgoff_t index, loff_t start, loff_t end) | |
2171 | { | |
2172 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
2173 | struct address_space *mapping = inode->v.i_mapping; | |
a99b1caf | 2174 | struct bch_page_state *s; |
1c6fdbd8 KO |
2175 | unsigned start_offset = start & (PAGE_SIZE - 1); |
2176 | unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; | |
a99b1caf | 2177 | unsigned i; |
1c6fdbd8 KO |
2178 | struct page *page; |
2179 | int ret = 0; | |
2180 | ||
2181 | /* Page boundary? Nothing to do */ | |
2182 | if (!((index == start >> PAGE_SHIFT && start_offset) || | |
2183 | (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) | |
2184 | return 0; | |
2185 | ||
2186 | /* Above i_size? */ | |
2187 | if (index << PAGE_SHIFT >= inode->v.i_size) | |
2188 | return 0; | |
2189 | ||
2190 | page = find_lock_page(mapping, index); | |
2191 | if (!page) { | |
2192 | /* | |
2193 | * XXX: we're doing two index lookups when we end up reading the | |
2194 | * page | |
2195 | */ | |
2196 | ret = range_has_data(c, | |
2197 | POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), | |
2198 | POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); | |
2199 | if (ret <= 0) | |
2200 | return ret; | |
2201 | ||
2202 | page = find_or_create_page(mapping, index, GFP_KERNEL); | |
2203 | if (unlikely(!page)) { | |
2204 | ret = -ENOMEM; | |
2205 | goto out; | |
2206 | } | |
2207 | } | |
2208 | ||
a99b1caf KO |
2209 | s = bch2_page_state_create(page, 0); |
2210 | if (!s) { | |
2211 | ret = -ENOMEM; | |
2212 | goto unlock; | |
2213 | } | |
2214 | ||
1c6fdbd8 KO |
2215 | if (!PageUptodate(page)) { |
2216 | ret = bch2_read_single_page(page, mapping); | |
2217 | if (ret) | |
2218 | goto unlock; | |
2219 | } | |
2220 | ||
a99b1caf KO |
2221 | if (index != start >> PAGE_SHIFT) |
2222 | start_offset = 0; | |
2223 | if (index != end >> PAGE_SHIFT) | |
2224 | end_offset = PAGE_SIZE; | |
2225 | ||
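/*
 * The blocks being discarded no longer have data on disk: mark their
 * sectors unallocated, with no replicas, before zeroing them in the
 * page cache:
 */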
2226 | for (i = round_up(start_offset, block_bytes(c)) >> 9; | |
2227 | i < round_down(end_offset, block_bytes(c)) >> 9; | |
2228 | i++) { | |
2229 | s->s[i].nr_replicas = 0; | |
2230 | s->s[i].state = SECTOR_UNALLOCATED; | |
2231 | } | |
2232 | ||
2233 | zero_user_segment(page, start_offset, end_offset); | |
2234 | ||
1c6fdbd8 KO |
2235 | /* |
2236 | * Bit of a hack - we don't want truncate to fail due to -ENOSPC. | |
2237 | * | |
2238 | * XXX: because we aren't currently tracking whether the page has actual | |
2239 | * data in it (vs. just 0s, or only partially written) this is wrong. ick. |
2240 | */ | |
d1542e03 | 2241 | ret = bch2_get_page_disk_reservation(c, inode, page, false); |
1c6fdbd8 KO |
2242 | BUG_ON(ret); |
2243 | ||
9ba2eb25 KO |
2244 | /* |
2245 | * This removes any writeable userspace mappings; we need to force | |
2246 | * .page_mkwrite to be called again before any mmapped writes, to | |
2247 | * redirty the full page: | |
2248 | */ | |
2249 | page_mkclean(page); | |
d1542e03 | 2250 | filemap_dirty_folio(mapping, page_folio(page)); |
1c6fdbd8 KO |
2251 | unlock: |
2252 | unlock_page(page); | |
2253 | put_page(page); | |
2254 | out: | |
2255 | return ret; | |
2256 | } | |
2257 | ||
2258 | static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) | |
2259 | { | |
2260 | return __bch2_truncate_page(inode, from >> PAGE_SHIFT, | |
a99b1caf | 2261 | from, round_up(from, PAGE_SIZE)); |
1c6fdbd8 KO |
2262 | } |
2263 | ||
68a507a2 KO |
2264 | static int bch2_extend(struct mnt_idmap *idmap, |
2265 | struct bch_inode_info *inode, | |
e0541a93 KO |
2266 | struct bch_inode_unpacked *inode_u, |
2267 | struct iattr *iattr) | |
1c6fdbd8 | 2268 | { |
1c6fdbd8 KO |
2269 | struct address_space *mapping = inode->v.i_mapping; |
2270 | int ret; | |
2271 | ||
e0541a93 KO |
2272 | /* |
2273 | * sync appends: | |
2925fc49 KO |
2274 | * |
2275 | * this has to be done _before_ extending i_size: | |
e0541a93 KO |
2276 | */ |
2277 | ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); | |
1c6fdbd8 KO |
2278 | if (ret) |
2279 | return ret; | |
2280 | ||
2281 | truncate_setsize(&inode->v, iattr->ia_size); | |
1c6fdbd8 | 2282 | |
68a507a2 | 2283 | return bch2_setattr_nonsize(idmap, inode, iattr); |
1c6fdbd8 KO |
2284 | } |
2285 | ||
54e2264e KO |
2286 | static int bch2_truncate_finish_fn(struct bch_inode_info *inode, |
2287 | struct bch_inode_unpacked *bi, | |
2288 | void *p) | |
2289 | { | |
54e2264e | 2290 | bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; |
54e2264e KO |
2291 | return 0; |
2292 | } | |
2293 | ||
2294 | static int bch2_truncate_start_fn(struct bch_inode_info *inode, | |
2295 | struct bch_inode_unpacked *bi, void *p) | |
2296 | { | |
2297 | u64 *new_i_size = p; | |
2298 | ||
2299 | bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; | |
2300 | bi->bi_size = *new_i_size; | |
2301 | return 0; | |
2302 | } | |
2303 | ||
68a507a2 KO |
2304 | int bch2_truncate(struct mnt_idmap *idmap, |
2305 | struct bch_inode_info *inode, struct iattr *iattr) | |
1c6fdbd8 KO |
2306 | { |
2307 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
2308 | struct address_space *mapping = inode->v.i_mapping; | |
e0541a93 | 2309 | struct bch_inode_unpacked inode_u; |
54e2264e | 2310 | u64 new_i_size = iattr->ia_size; |
2e87eae1 | 2311 | s64 i_sectors_delta = 0; |
1c6fdbd8 KO |
2312 | int ret = 0; |
2313 | ||
68a507a2 | 2314 | /* |
78d66ab1 DR |
2315 | * If the truncate call will change the size of the file, the |
2316 | * cmtimes should be updated. If the size will not change, we | |
2317 | * do not need to update the cmtimes. | |
68a507a2 | 2318 | */ |
78d66ab1 DR |
2319 | if (iattr->ia_size != inode->v.i_size) { |
2320 | if (!(iattr->ia_valid & ATTR_MTIME)) | |
2321 | ktime_get_coarse_real_ts64(&iattr->ia_mtime); | |
2322 | if (!(iattr->ia_valid & ATTR_CTIME)) | |
2323 | ktime_get_coarse_real_ts64(&iattr->ia_ctime); | |
2324 | iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; | |
2325 | } | |
68a507a2 | 2326 | |
1c6fdbd8 KO |
2327 | inode_dio_wait(&inode->v); |
2328 | bch2_pagecache_block_get(&inode->ei_pagecache_lock); | |
2329 | ||
b97bbd4e | 2330 | ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); |
c45d473d KO |
2331 | if (ret) |
2332 | goto err; | |
2333 | ||
2334 | /* | |
2335 | * check this before the next assertion; on filesystem error our normal |
2336 | * invariants are a bit broken (truncate has to truncate the page cache | |
2337 | * before the inode). | |
2338 | */ | |
2339 | ret = bch2_journal_error(&c->journal); | |
e0541a93 KO |
2340 | if (ret) |
2341 | goto err; | |
1c6fdbd8 | 2342 | |
33c74e41 KO |
2343 | WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && |
2344 | inode->v.i_size < inode_u.bi_size); | |
1c6fdbd8 | 2345 | |
e0541a93 | 2346 | if (iattr->ia_size > inode->v.i_size) { |
68a507a2 | 2347 | ret = bch2_extend(idmap, inode, &inode_u, iattr); |
54e2264e | 2348 | goto err; |
1c6fdbd8 KO |
2349 | } |
2350 | ||
68a507a2 KO |
2351 | iattr->ia_valid &= ~ATTR_SIZE; |
2352 | ||
1c6fdbd8 KO |
2353 | ret = bch2_truncate_page(inode, iattr->ia_size); |
2354 | if (unlikely(ret)) | |
54e2264e | 2355 | goto err; |
1c6fdbd8 | 2356 | |
6cc3535d KO |
2357 | /* |
2358 | * When extending, we're going to write the new i_size to disk | |
2359 | * immediately so we need to flush anything above the current on disk | |
2360 | * i_size first: | |
2361 | * | |
2362 | * Also, when extending we need to flush the page that i_size currently | |
2363 | * straddles - if it's mapped to userspace, we need to force userspace |
2364 | * to redirty it and call .mkwrite -> set_page_dirty |
2365 | * again to allocate the part of the page that was extended. | |
2366 | */ | |
e0541a93 | 2367 | if (iattr->ia_size > inode_u.bi_size) |
1c6fdbd8 | 2368 | ret = filemap_write_and_wait_range(mapping, |
e0541a93 | 2369 | inode_u.bi_size, |
1c6fdbd8 KO |
2370 | iattr->ia_size - 1); |
2371 | else if (iattr->ia_size & (PAGE_SIZE - 1)) | |
2372 | ret = filemap_write_and_wait_range(mapping, | |
2373 | round_down(iattr->ia_size, PAGE_SIZE), | |
2374 | iattr->ia_size - 1); | |
2375 | if (ret) | |
54e2264e | 2376 | goto err; |
1c6fdbd8 | 2377 | |
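/*
 * Write the reduced i_size with I_SIZE_DIRTY set before deleting
 * extents, so that if we crash partway through the fpunch below,
 * recovery can finish the truncate; bch2_truncate_finish_fn clears the
 * flag once the extents are gone:
 */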
54e2264e KO |
2378 | mutex_lock(&inode->ei_update_lock); |
2379 | ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, | |
2380 | &new_i_size, 0); | |
2381 | mutex_unlock(&inode->ei_update_lock); | |
1c6fdbd8 | 2382 | |
1c6fdbd8 | 2383 | if (unlikely(ret)) |
54e2264e | 2384 | goto err; |
1c6fdbd8 KO |
2385 | |
2386 | truncate_setsize(&inode->v, iattr->ia_size); | |
2387 | ||
2e87eae1 | 2388 | ret = bch2_fpunch(c, inode->v.i_ino, |
a99b1caf | 2389 | round_up(iattr->ia_size, block_bytes(c)) >> 9, |
2e87eae1 KO |
2390 | U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); |
2391 | i_sectors_acct(c, inode, NULL, i_sectors_delta); | |
2392 | ||
1c6fdbd8 | 2393 | if (unlikely(ret)) |
54e2264e | 2394 | goto err; |
1c6fdbd8 | 2395 | |
54e2264e | 2396 | mutex_lock(&inode->ei_update_lock); |
68a507a2 | 2397 | ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); |
54e2264e | 2398 | mutex_unlock(&inode->ei_update_lock); |
68a507a2 KO |
2399 | |
2400 | ret = bch2_setattr_nonsize(idmap, inode, iattr); | |
54e2264e | 2401 | err: |
1c6fdbd8 KO |
2402 | bch2_pagecache_block_put(&inode->ei_pagecache_lock); |
2403 | return ret; | |
1c6fdbd8 KO |
2404 | } |
2405 | ||
2406 | /* fallocate: */ | |
2407 | ||
050197b1 KO |
2408 | static int inode_update_times_fn(struct bch_inode_info *inode, |
2409 | struct bch_inode_unpacked *bi, void *p) | |
2410 | { | |
2411 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
2412 | ||
2413 | bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); | |
2414 | return 0; | |
2415 | } | |
2416 | ||
2e87eae1 | 2417 | static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) |
1c6fdbd8 KO |
2418 | { |
2419 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
a99b1caf KO |
2420 | u64 discard_start = round_up(offset, block_bytes(c)) >> 9; |
2421 | u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; | |
1c6fdbd8 KO |
2422 | int ret = 0; |
2423 | ||
2424 | inode_lock(&inode->v); | |
2425 | inode_dio_wait(&inode->v); | |
2426 | bch2_pagecache_block_get(&inode->ei_pagecache_lock); | |
2427 | ||
2428 | ret = __bch2_truncate_page(inode, | |
2429 | offset >> PAGE_SHIFT, | |
2430 | offset, offset + len); | |
2431 | if (unlikely(ret)) | |
2432 | goto err; | |
2433 | ||
2434 | if (offset >> PAGE_SHIFT != | |
2435 | (offset + len) >> PAGE_SHIFT) { | |
2436 | ret = __bch2_truncate_page(inode, | |
2437 | (offset + len) >> PAGE_SHIFT, | |
2438 | offset, offset + len); | |
2439 | if (unlikely(ret)) | |
2440 | goto err; | |
2441 | } | |
2442 | ||
2443 | truncate_pagecache_range(&inode->v, offset, offset + len - 1); | |
2444 | ||
2e87eae1 KO |
2445 | if (discard_start < discard_end) { |
2446 | s64 i_sectors_delta = 0; | |
2447 | ||
2448 | ret = bch2_fpunch(c, inode->v.i_ino, | |
2449 | discard_start, discard_end, | |
2450 | &inode->ei_journal_seq, | |
2451 | &i_sectors_delta); | |
2452 | i_sectors_acct(c, inode, NULL, i_sectors_delta); | |
2453 | } | |
050197b1 KO |
2454 | |
2455 | mutex_lock(&inode->ei_update_lock); | |
2456 | ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, | |
2457 | ATTR_MTIME|ATTR_CTIME) ?: ret; | |
2458 | mutex_unlock(&inode->ei_update_lock); | |
1c6fdbd8 KO |
2459 | err: |
2460 | bch2_pagecache_block_put(&inode->ei_pagecache_lock); | |
2461 | inode_unlock(&inode->v); | |
2462 | ||
2463 | return ret; | |
2464 | } | |
2465 | ||
2e87eae1 | 2466 | static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, |
5f786787 KO |
2467 | loff_t offset, loff_t len, |
2468 | bool insert) | |
1c6fdbd8 KO |
2469 | { |
2470 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
2471 | struct address_space *mapping = inode->v.i_mapping; | |
07a1006a | 2472 | struct bkey_buf copy; |
d69f41d6 | 2473 | struct btree_trans trans; |
3d495595 | 2474 | struct btree_iter *src, *dst, *del; |
5f786787 KO |
2475 | loff_t shift, new_size; |
2476 | u64 src_start; | |
50dc0f69 | 2477 | int ret = 0; |
1c6fdbd8 KO |
2478 | |
2479 | if ((offset | len) & (block_bytes(c) - 1)) | |
2480 | return -EINVAL; | |
2481 | ||
1c6fdbd8 KO |
2482 | /* |
2483 | * We need i_mutex to keep the page cache consistent with the extents | |
2484 | * btree, and the btree consistent with i_size - we don't need outside | |
2485 | * locking for the extents btree itself, because we're using linked | |
2486 | * iterators | |
2487 | */ | |
2488 | inode_lock(&inode->v); | |
2489 | inode_dio_wait(&inode->v); | |
2490 | bch2_pagecache_block_get(&inode->ei_pagecache_lock); | |
2491 | ||
5f786787 KO |
2492 | if (insert) { |
2493 | ret = -EFBIG; | |
2494 | if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) | |
2495 | goto err; | |
1c6fdbd8 | 2496 | |
5f786787 KO |
2497 | ret = -EINVAL; |
2498 | if (offset >= inode->v.i_size) | |
2499 | goto err; | |
1c6fdbd8 | 2500 | |
5f786787 KO |
2501 | src_start = U64_MAX; |
2502 | shift = len; | |
2503 | } else { | |
2504 | ret = -EINVAL; | |
2505 | if (offset + len >= inode->v.i_size) | |
2506 | goto err; | |
1c6fdbd8 | 2507 | |
5f786787 KO |
2508 | src_start = offset + len; |
2509 | shift = -len; | |
2510 | } | |
2511 | ||
2512 | new_size = inode->v.i_size + shift; | |
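/*
 * FALLOC_FL_INSERT_RANGE: walk extents backwards from the end of the
 * file and shift them right by len. FALLOC_FL_COLLAPSE_RANGE: punch
 * out [offset, offset + len) and walk forwards, shifting extents left
 * by len:
 */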
1c6fdbd8 | 2513 | |
5f786787 | 2514 | ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); |
63095894 KO |
2515 | if (ret) |
2516 | goto err; | |
2517 | ||
5f786787 KO |
2518 | if (insert) { |
2519 | i_size_write(&inode->v, new_size); | |
2520 | mutex_lock(&inode->ei_update_lock); | |
2521 | ret = bch2_write_inode_size(c, inode, new_size, | |
2522 | ATTR_MTIME|ATTR_CTIME); | |
2523 | mutex_unlock(&inode->ei_update_lock); | |
2524 | } else { | |
2e87eae1 KO |
2525 | s64 i_sectors_delta = 0; |
2526 | ||
2527 | ret = bch2_fpunch(c, inode->v.i_ino, | |
2528 | offset >> 9, (offset + len) >> 9, | |
2529 | &inode->ei_journal_seq, | |
2530 | &i_sectors_delta); | |
2531 | i_sectors_acct(c, inode, NULL, i_sectors_delta); | |
2532 | ||
5f786787 KO |
2533 | if (ret) |
2534 | goto err; | |
2535 | } | |
8ef231bd | 2536 | |
50dc0f69 | 2537 | bch2_bkey_buf_init(©); |
f7beb4ca | 2538 | bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); |
41f8b09e | 2539 | src = bch2_trans_get_iter(&trans, BTREE_ID_extents, |
5f786787 | 2540 | POS(inode->v.i_ino, src_start >> 9), |
63095894 | 2541 | BTREE_ITER_INTENT); |
5f786787 | 2542 | dst = bch2_trans_copy_iter(&trans, src); |
3d495595 | 2543 | del = bch2_trans_copy_iter(&trans, src); |
5f786787 | 2544 | |
50dc0f69 | 2545 | while (ret == 0 || ret == -EINTR) { |
63095894 KO |
2546 | struct disk_reservation disk_res = |
2547 | bch2_disk_reservation_init(c, 0); | |
63095894 KO |
2548 | struct bkey_i delete; |
2549 | struct bkey_s_c k; | |
2550 | struct bpos next_pos; | |
5f786787 KO |
2551 | struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); |
2552 | struct bpos atomic_end; | |
2d594dfb | 2553 | unsigned trigger_flags = 0; |
1c6fdbd8 | 2554 | |
700c25b3 KO |
2555 | bch2_trans_begin(&trans); |
2556 | ||
5f786787 KO |
2557 | k = insert |
2558 | ? bch2_btree_iter_peek_prev(src) | |
2559 | : bch2_btree_iter_peek(src); | |
63095894 | 2560 | if ((ret = bkey_err(k))) |
50dc0f69 | 2561 | continue; |
8ef231bd | 2562 | |
63095894 KO |
2563 | if (!k.k || k.k->p.inode != inode->v.i_ino) |
2564 | break; | |
1c6fdbd8 | 2565 | |
5f786787 KO |
2566 | if (insert && |
2567 | bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) | |
2568 | break; | |
2569 | reassemble: | |
07a1006a | 2570 | bch2_bkey_buf_reassemble(©, c, k); |
5f786787 KO |
2571 | |
2572 | if (insert && | |
283eda57 | 2573 | bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) |
35189e09 | 2574 | bch2_cut_front(move_pos, copy.k); |
1c6fdbd8 | 2575 | |
35189e09 KO |
2576 | copy.k->k.p.offset += shift >> 9; |
2577 | bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); | |
1c6fdbd8 | 2578 | |
9f6bd307 | 2579 | ret = bch2_extent_atomic_end(&trans, dst, copy.k, &atomic_end); |
3c7f3b7a | 2580 | if (ret) |
50dc0f69 | 2581 | continue; |
e2d9912c | 2582 | |
35189e09 | 2583 | if (bkey_cmp(atomic_end, copy.k->k.p)) { |
5f786787 KO |
2584 | if (insert) { |
2585 | move_pos = atomic_end; | |
2586 | move_pos.offset -= shift >> 9; | |
2587 | goto reassemble; | |
2588 | } else { | |
085ab693 | 2589 | bch2_cut_back(atomic_end, copy.k); |
5f786787 KO |
2590 | } |
2591 | } | |
2592 | ||
63095894 | 2593 | bkey_init(&delete.k); |
283eda57 KO |
2594 | delete.k.p = copy.k->k.p; |
2595 | delete.k.size = copy.k->k.size; | |
2596 | delete.k.p.offset -= shift >> 9; | |
3d495595 | 2597 | bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); |
1c6fdbd8 | 2598 | |
5f786787 | 2599 | next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; |
1c6fdbd8 | 2600 | |
35189e09 | 2601 | if (copy.k->k.size == k.k->size) { |
63095894 KO |
2602 | /* |
2603 | * If we're moving the entire extent, we can skip | |
2604 | * running triggers: | |
2605 | */ | |
2d594dfb | 2606 | trigger_flags |= BTREE_TRIGGER_NORUN; |
63095894 KO |
2607 | } else { |
2608 | /* We might end up splitting compressed extents: */ | |
2609 | unsigned nr_ptrs = | |
4de77495 | 2610 | bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); |
63095894 KO |
2611 | |
2612 | ret = bch2_disk_reservation_get(c, &disk_res, | |
35189e09 | 2613 | copy.k->k.size, nr_ptrs, |
63095894 KO |
2614 | BCH_DISK_RESERVATION_NOFAIL); |
2615 | BUG_ON(ret); | |
2616 | } | |
2617 | ||
8c3f6da9 KO |
2618 | ret = bch2_btree_iter_traverse(del) ?: |
2619 | bch2_trans_update(&trans, del, &delete, trigger_flags) ?: | |
24326cd1 KO |
2620 | bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: |
2621 | bch2_trans_commit(&trans, &disk_res, | |
2622 | &inode->ei_journal_seq, | |
2623 | BTREE_INSERT_NOFAIL); | |
1c6fdbd8 | 2624 | bch2_disk_reservation_put(c, &disk_res); |
50dc0f69 | 2625 | |
63095894 KO |
2626 | if (!ret) |
2627 | bch2_btree_iter_set_pos(src, next_pos); | |
1c6fdbd8 | 2628 | } |
50dc0f69 KO |
2629 | bch2_trans_iter_put(&trans, del); |
2630 | bch2_trans_iter_put(&trans, dst); | |
2631 | bch2_trans_iter_put(&trans, src); | |
2632 | bch2_trans_exit(&trans); | |
2633 | bch2_bkey_buf_exit(©, c); | |
2634 | ||
2635 | if (ret) | |
2636 | goto err; | |
1c6fdbd8 | 2637 | |
5f786787 KO |
2638 | if (!insert) { |
2639 | i_size_write(&inode->v, new_size); | |
2640 | mutex_lock(&inode->ei_update_lock); | |
2641 | ret = bch2_write_inode_size(c, inode, new_size, | |
2642 | ATTR_MTIME|ATTR_CTIME); | |
2643 | mutex_unlock(&inode->ei_update_lock); | |
2644 | } | |
1c6fdbd8 KO |
2645 | err: |
2646 | bch2_pagecache_block_put(&inode->ei_pagecache_lock); | |
2647 | inode_unlock(&inode->v); | |
2648 | return ret; | |
2649 | } | |
2650 | ||
694015c2 KO |
2651 | static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, |
2652 | u64 start_sector, u64 end_sector) | |
1c6fdbd8 | 2653 | { |
1c6fdbd8 | 2654 | struct bch_fs *c = inode->v.i_sb->s_fs_info; |
190fa7af KO |
2655 | struct btree_trans trans; |
2656 | struct btree_iter *iter; | |
694015c2 | 2657 | struct bpos end_pos = POS(inode->v.i_ino, end_sector); |
9a3df993 | 2658 | unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; |
694015c2 | 2659 | int ret = 0; |
1c6fdbd8 | 2660 | |
f7beb4ca | 2661 | bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); |
1c6fdbd8 | 2662 | |
41f8b09e | 2663 | iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, |
694015c2 | 2664 | POS(inode->v.i_ino, start_sector), |
190fa7af | 2665 | BTREE_ITER_SLOTS|BTREE_ITER_INTENT); |
1c6fdbd8 | 2666 | |
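/*
 * Walk every slot in the range, holes included: sufficient existing
 * reservations and (unless zeroing) existing data are skipped;
 * everything else is replaced with a KEY_TYPE_reservation extent:
 */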
50dc0f69 | 2667 | while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { |
2e87eae1 | 2668 | s64 i_sectors_delta = 0; |
1c6fdbd8 | 2669 | struct disk_reservation disk_res = { 0 }; |
190fa7af | 2670 | struct quota_res quota_res = { 0 }; |
1c6fdbd8 KO |
2671 | struct bkey_i_reservation reservation; |
2672 | struct bkey_s_c k; | |
694015c2 | 2673 | unsigned sectors; |
1c6fdbd8 | 2674 | |
163e885a | 2675 | bch2_trans_begin(&trans); |
a8abd3a7 | 2676 | |
190fa7af | 2677 | k = bch2_btree_iter_peek_slot(iter); |
0f238367 KO |
2678 | if ((ret = bkey_err(k))) |
2679 | goto bkey_err; | |
1c6fdbd8 KO |
2680 | |
2681 | /* already reserved */ | |
26609b61 | 2682 | if (k.k->type == KEY_TYPE_reservation && |
1c6fdbd8 | 2683 | bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { |
8b3e9bd6 | 2684 | bch2_btree_iter_advance(iter); |
1c6fdbd8 KO |
2685 | continue; |
2686 | } | |
2687 | ||
190fa7af KO |
2688 | if (bkey_extent_is_data(k.k) && |
2689 | !(mode & FALLOC_FL_ZERO_RANGE)) { | |
8b3e9bd6 | 2690 | bch2_btree_iter_advance(iter); |
190fa7af | 2691 | continue; |
1c6fdbd8 KO |
2692 | } |
2693 | ||
2694 | bkey_reservation_init(&reservation.k_i); | |
26609b61 | 2695 | reservation.k.type = KEY_TYPE_reservation; |
1c6fdbd8 KO |
2696 | reservation.k.p = k.k->p; |
2697 | reservation.k.size = k.k->size; | |
2698 | ||
085ab693 KO |
2699 | bch2_cut_front(iter->pos, &reservation.k_i); |
2700 | bch2_cut_back(end_pos, &reservation.k_i); | |
1c6fdbd8 KO |
2701 | |
2702 | sectors = reservation.k.size; | |
4de77495 | 2703 | reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); |
1c6fdbd8 KO |
2704 | |
2705 | if (!bkey_extent_is_allocation(k.k)) { | |
2706 | ret = bch2_quota_reservation_add(c, inode, | |
190fa7af | 2707 | "a_res, |
1c6fdbd8 KO |
2708 | sectors, true); |
2709 | if (unlikely(ret)) | |
0f238367 | 2710 | goto bkey_err; |
1c6fdbd8 KO |
2711 | } |
2712 | ||
2713 | if (reservation.v.nr_replicas < replicas || | |
4de77495 | 2714 | bch2_bkey_sectors_compressed(k)) { |
1c6fdbd8 KO |
2715 | ret = bch2_disk_reservation_get(c, &disk_res, sectors, |
2716 | replicas, 0); | |
2717 | if (unlikely(ret)) | |
0f238367 | 2718 | goto bkey_err; |
1c6fdbd8 KO |
2719 | |
2720 | reservation.v.nr_replicas = disk_res.nr_replicas; | |
2721 | } | |
2722 | ||
2e87eae1 KO |
2723 | ret = bch2_extent_update(&trans, iter, &reservation.k_i, |
2724 | &disk_res, &inode->ei_journal_seq, | |
a6336910 | 2725 | 0, &i_sectors_delta, true); |
2e87eae1 | 2726 | i_sectors_acct(c, inode, "a_res, i_sectors_delta); |
0f238367 | 2727 | bkey_err: |
190fa7af | 2728 | bch2_quota_reservation_put(c, inode, "a_res); |
1c6fdbd8 | 2729 | bch2_disk_reservation_put(c, &disk_res); |
1c6fdbd8 KO |
2730 | if (ret == -EINTR) |
2731 | ret = 0; | |
1c6fdbd8 | 2732 | } |
50dc0f69 | 2733 | bch2_trans_iter_put(&trans, iter); |
694015c2 KO |
2734 | bch2_trans_exit(&trans); |
2735 | return ret; | |
2736 | } | |
50dc0f69 | 2737 | |
694015c2 KO |
2738 | static long bchfs_fallocate(struct bch_inode_info *inode, int mode, |
2739 | loff_t offset, loff_t len) | |
2740 | { | |
2741 | struct address_space *mapping = inode->v.i_mapping; | |
2742 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
2743 | loff_t end = offset + len; | |
2744 | loff_t block_start = round_down(offset, block_bytes(c)); | |
2745 | loff_t block_end = round_up(end, block_bytes(c)); | |
2746 | int ret; | |
2747 | ||
2748 | inode_lock(&inode->v); | |
2749 | inode_dio_wait(&inode->v); | |
2750 | bch2_pagecache_block_get(&inode->ei_pagecache_lock); | |
2751 | ||
2752 | if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { | |
2753 | ret = inode_newsize_ok(&inode->v, end); | |
2754 | if (ret) | |
2755 | goto err; | |
2756 | } | |
2757 | ||
2758 | if (mode & FALLOC_FL_ZERO_RANGE) { | |
2759 | ret = __bch2_truncate_page(inode, | |
2760 | offset >> PAGE_SHIFT, | |
2761 | offset, end); | |
2762 | ||
2763 | if (!ret && | |
2764 | offset >> PAGE_SHIFT != end >> PAGE_SHIFT) | |
2765 | ret = __bch2_truncate_page(inode, | |
2766 | end >> PAGE_SHIFT, | |
2767 | offset, end); | |
2768 | ||
2769 | if (unlikely(ret)) | |
2770 | goto err; | |
2771 | ||
2772 | truncate_pagecache_range(&inode->v, offset, end - 1); | |
2773 | } | |
2774 | ||
2775 | ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); | |
50dc0f69 KO |
2776 | if (ret) |
2777 | goto err; | |
1c6fdbd8 | 2778 | |
e0541a93 KO |
2779 | /* |
2780 | * Do we need to extend the file? | |
2781 | * | |
2782 | * If we zeroed up to the end of the file, we dropped whatever writes | |
2783 | * were going to write out the current i_size, so we have to extend | |
2784 | * manually even if FL_KEEP_SIZE was set: | |
2785 | */ | |
2786 | if (end >= inode->v.i_size && | |
2787 | (!(mode & FALLOC_FL_KEEP_SIZE) || | |
2788 | (mode & FALLOC_FL_ZERO_RANGE))) { | |
1c6fdbd8 | 2789 | |
e0541a93 KO |
2790 | /* |
2791 | * Sync existing appends before extending i_size, | |
2792 | * as in bch2_extend(): | |
2793 | */ | |
1c6fdbd8 | 2794 | ret = filemap_write_and_wait_range(mapping, |
694015c2 | 2795 | inode->ei_inode.bi_size, S64_MAX); |
1c6fdbd8 KO |
2796 | if (ret) |
2797 | goto err; | |
2798 | ||
e0541a93 KO |
2799 | if (mode & FALLOC_FL_KEEP_SIZE) |
2800 | end = inode->v.i_size; | |
2801 | else | |
2802 | i_size_write(&inode->v, end); | |
2803 | ||
2804 | mutex_lock(&inode->ei_update_lock); | |
2805 | ret = bch2_write_inode_size(c, inode, end, 0); | |
2806 | mutex_unlock(&inode->ei_update_lock); | |
1c6fdbd8 | 2807 | } |
1c6fdbd8 KO |
2808 | err: |
2809 | bch2_pagecache_block_put(&inode->ei_pagecache_lock); | |
2810 | inode_unlock(&inode->v); | |
2811 | return ret; | |
2812 | } | |
2813 | ||
2814 | long bch2_fallocate_dispatch(struct file *file, int mode, | |
2815 | loff_t offset, loff_t len) | |
2816 | { | |
2817 | struct bch_inode_info *inode = file_bch_inode(file); | |
2a9101a9 KO |
2818 | struct bch_fs *c = inode->v.i_sb->s_fs_info; |
2819 | long ret; | |
1c6fdbd8 | 2820 | |
2a9101a9 KO |
2821 | if (!percpu_ref_tryget(&c->writes)) |
2822 | return -EROFS; | |
5f786787 | 2823 | |
2a9101a9 KO |
2824 | if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) |
2825 | ret = bchfs_fallocate(inode, mode, offset, len); | |
2826 | else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) | |
2827 | ret = bchfs_fpunch(inode, offset, len); | |
2828 | else if (mode == FALLOC_FL_INSERT_RANGE) | |
2829 | ret = bchfs_fcollapse_finsert(inode, offset, len, true); | |
2830 | else if (mode == FALLOC_FL_COLLAPSE_RANGE) | |
2831 | ret = bchfs_fcollapse_finsert(inode, offset, len, false); | |
2832 | else | |
2833 | ret = -EOPNOTSUPP; | |
2834 | ||
2835 | percpu_ref_put(&c->writes); | |
1c6fdbd8 | 2836 | |
2a9101a9 | 2837 | return ret; |
1c6fdbd8 KO |
2838 | } |
2839 | ||
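/*
 * The source extents are about to become reflinked, i.e. shared, so they can
 * no longer be overwritten in place: clear the cached per-sector nr_replicas
 * counts so that future writes to these pages take out new disk reservations:
 */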
76426098 KO |
2840 | static void mark_range_unallocated(struct bch_inode_info *inode, |
2841 | loff_t start, loff_t end) | |
2842 | { | |
2843 | pgoff_t index = start >> PAGE_SHIFT; | |
2844 | pgoff_t end_index = (end - 1) >> PAGE_SHIFT; | |
2845 | struct folio_batch fbatch; | |
2846 | unsigned i, j; | |
2847 | ||
2848 | folio_batch_init(&fbatch); | |
2849 | ||
2850 | while (filemap_get_folios(inode->v.i_mapping, | |
2851 | &index, end_index, &fbatch)) { | |
2852 | for (i = 0; i < folio_batch_count(&fbatch); i++) { | |
2853 | struct folio *folio = fbatch.folios[i]; | |
2854 | struct bch_page_state *s; | |
2855 | ||
2856 | folio_lock(folio); | |
2857 | s = bch2_page_state(&folio->page); | |
2858 | ||
3826ee0b KO |
2859 | if (s) { |
2860 | spin_lock(&s->lock); | |
76426098 KO |
2861 | for (j = 0; j < PAGE_SECTORS; j++) |
2862 | s->s[j].nr_replicas = 0; | |
3826ee0b KO |
2863 | spin_unlock(&s->lock); |
2864 | } | |
76426098 KO |
2865 | |
2866 | folio_unlock(folio); | |
2867 | } | |
2868 | folio_batch_release(&fbatch); | |
2869 | cond_resched(); | |
2870 | } | |
2871 | } | |
2872 | ||
2873 | loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, | |
2874 | struct file *file_dst, loff_t pos_dst, | |
2875 | loff_t len, unsigned remap_flags) | |
2876 | { | |
2877 | struct bch_inode_info *src = file_bch_inode(file_src); | |
2878 | struct bch_inode_info *dst = file_bch_inode(file_dst); | |
2879 | struct bch_fs *c = src->v.i_sb->s_fs_info; | |
2e87eae1 | 2880 | s64 i_sectors_delta = 0; |
677fc056 | 2881 | u64 aligned_len; |
76426098 | 2882 | loff_t ret = 0; |
76426098 KO |
2883 | |
2884 | if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) | |
2885 | return -EINVAL; | |
2886 | ||
2887 | if (remap_flags & REMAP_FILE_DEDUP) | |
2888 | return -EOPNOTSUPP; | |
2889 | ||
2890 | if ((pos_src & (block_bytes(c) - 1)) || | |
2891 | (pos_dst & (block_bytes(c) - 1))) | |
2892 | return -EINVAL; | |
2893 | ||
2894 | if (src == dst && | |
2895 | abs(pos_src - pos_dst) < len) | |
2896 | return -EINVAL; | |
2897 | ||
2898 | bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); | |
2899 | ||
2e87eae1 KO |
2900 | file_update_time(file_dst); |
2901 | ||
76426098 KO |
2902 | inode_dio_wait(&src->v); |
2903 | inode_dio_wait(&dst->v); | |
2904 | ||
2905 | ret = generic_remap_file_range_prep(file_src, pos_src, | |
2906 | file_dst, pos_dst, | |
2907 | &len, remap_flags); | |
2908 | if (ret < 0 || len == 0) | |
2e87eae1 | 2909 | goto err; |
76426098 | 2910 | |
677fc056 | 2911 | aligned_len = round_up((u64) len, block_bytes(c)); |
76426098 KO |
2912 | |
2913 | ret = write_invalidate_inode_pages_range(dst->v.i_mapping, | |
677fc056 | 2914 | pos_dst, pos_dst + len - 1); |
76426098 | 2915 | if (ret) |
2e87eae1 | 2916 | goto err; |
76426098 KO |
2917 | |
2918 | mark_range_unallocated(src, pos_src, pos_src + aligned_len); | |
2919 | ||
2e87eae1 | 2920 | ret = bch2_remap_range(c, |
76426098 KO |
2921 | POS(dst->v.i_ino, pos_dst >> 9), |
2922 | POS(src->v.i_ino, pos_src >> 9), | |
2923 | aligned_len >> 9, | |
2e87eae1 KO |
2924 | &dst->ei_journal_seq, |
2925 | pos_dst + len, &i_sectors_delta); | |
2926 | if (ret < 0) | |
2927 | goto err; | |
76426098 | 2928 | |
2e87eae1 KO |
2929 | /* |
2930 | * due to alignment, we might have remapped slightly more than requested |
2931 | */ | |
677fc056 | 2932 | ret = min((u64) ret << 9, (u64) len); |
2e87eae1 KO |
2933 | |
2934 | /* XXX get a quota reservation */ | |
2935 | i_sectors_acct(c, dst, NULL, i_sectors_delta); | |
2936 | ||
2937 | spin_lock(&dst->v.i_lock); | |
677fc056 KO |
2938 | if (pos_dst + ret > dst->v.i_size) |
2939 | i_size_write(&dst->v, pos_dst + ret); | |
2e87eae1 | 2940 | spin_unlock(&dst->v.i_lock); |
e7084c9c KO |
2941 | |
2942 | if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || | |
2943 | IS_SYNC(file_inode(file_dst))) && | |
2944 | !c->opts.journal_flush_disabled) | |
2945 | ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); | |
2e87eae1 | 2946 | err: |
76426098 KO |
2947 | bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); |
2948 | ||
2949 | return ret; | |
2950 | } | |
2951 | ||
1c6fdbd8 KO |
2952 | /* fseek: */ |
2953 | ||
543ef2eb | 2954 | static int folio_data_offset(struct folio *folio, unsigned offset) |
1c6fdbd8 | 2955 | { |
f57a6a5d KO |
2956 | struct bch_page_state *s = bch2_page_state(&folio->page); |
2957 | unsigned i; | |
2958 | ||
543ef2eb KO |
2959 | if (s) |
2960 | for (i = offset >> 9; i < PAGE_SECTORS; i++) | |
2961 | if (s->s[i].state >= SECTOR_DIRTY) | |
2962 | return i << 9; | |
f81b648d | 2963 | |
543ef2eb | 2964 | return -1; |
1c6fdbd8 KO |
2965 | } |
2966 | ||
543ef2eb | 2967 | static loff_t bch2_seek_pagecache_data(struct inode *vinode, |
1c6fdbd8 KO |
2968 | loff_t start_offset, |
2969 | loff_t end_offset) | |
2970 | { | |
2971 | struct folio_batch fbatch; | |
2972 | pgoff_t start_index = start_offset >> PAGE_SHIFT; | |
2973 | pgoff_t end_index = end_offset >> PAGE_SHIFT; | |
2974 | pgoff_t index = start_index; | |
2975 | unsigned i; | |
543ef2eb KO |
2976 | loff_t ret; |
2977 | int offset; | |
1c6fdbd8 KO |
2978 | |
2979 | folio_batch_init(&fbatch); | |
2980 | ||
2981 | while (filemap_get_folios(vinode->i_mapping, | |
2982 | &index, end_index, &fbatch)) { | |
2983 | for (i = 0; i < folio_batch_count(&fbatch); i++) { | |
2984 | struct folio *folio = fbatch.folios[i]; | |
2985 | ||
2986 | folio_lock(folio); | |
543ef2eb KO |
2987 | offset = folio_data_offset(folio, |
2988 | folio->index == start_index | |
2989 | ? start_offset & (PAGE_SIZE - 1) | |
2990 | : 0); | |
2991 | if (offset >= 0) { | |
2992 | ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + | |
2993 | offset, | |
2994 | start_offset, end_offset); | |
1c6fdbd8 KO |
2995 | folio_unlock(folio); |
2996 | folio_batch_release(&fbatch); | |
543ef2eb | 2997 | return ret; |
1c6fdbd8 KO |
2998 | } |
2999 | folio_unlock(folio); | |
3000 | } | |
3001 | folio_batch_release(&fbatch); | |
3002 | cond_resched(); | |
3003 | } | |
3004 | ||
3005 | return end_offset; | |
3006 | } | |
3007 | ||
3008 | static loff_t bch2_seek_data(struct file *file, u64 offset) | |
3009 | { | |
3010 | struct bch_inode_info *inode = file_bch_inode(file); | |
3011 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
424eb881 KO |
3012 | struct btree_trans trans; |
3013 | struct btree_iter *iter; | |
1c6fdbd8 KO |
3014 | struct bkey_s_c k; |
3015 | u64 isize, next_data = MAX_LFS_FILESIZE; | |
3016 | int ret; | |
3017 | ||
3018 | isize = i_size_read(&inode->v); | |
3019 | if (offset >= isize) | |
3020 | return -ENXIO; | |
3021 | ||
20bceecb | 3022 | bch2_trans_init(&trans, c, 0, 0); |
424eb881 | 3023 | |
41f8b09e | 3024 | for_each_btree_key(&trans, iter, BTREE_ID_extents, |
94f651e2 | 3025 | POS(inode->v.i_ino, offset >> 9), 0, k, ret) { |
1c6fdbd8 KO |
3026 | if (k.k->p.inode != inode->v.i_ino) { |
3027 | break; | |
3028 | } else if (bkey_extent_is_data(k.k)) { | |
3029 | next_data = max(offset, bkey_start_offset(k.k) << 9); | |
3030 | break; | |
3031 | } else if (k.k->p.offset >> 9 > isize) | |
3032 | break; | |
3033 | } | |
50dc0f69 | 3034 | bch2_trans_iter_put(&trans, iter); |
1c6fdbd8 | 3035 | |
94f651e2 | 3036 | ret = bch2_trans_exit(&trans) ?: ret; |
1c6fdbd8 KO |
3037 | if (ret) |
3038 | return ret; | |
3039 | ||
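/*
 * The btree only knows about written extents - data that's only dirty
 * in the page cache may be closer to offset, so check that too:
 */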
3040 | if (next_data > offset) | |
543ef2eb | 3041 | next_data = bch2_seek_pagecache_data(&inode->v, |
1c6fdbd8 KO |
3042 | offset, next_data); |
3043 | ||
e10d3094 | 3044 | if (next_data >= isize) |
1c6fdbd8 KO |
3045 | return -ENXIO; |
3046 | ||
3047 | return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); | |
3048 | } | |
3049 | ||
543ef2eb | 3050 | static int __page_hole_offset(struct page *page, unsigned offset) |
1c6fdbd8 | 3051 | { |
543ef2eb KO |
3052 | struct bch_page_state *s = bch2_page_state(page); |
3053 | unsigned i; | |
3054 | ||
3055 | if (!s) | |
3056 | return 0; | |
3057 | ||
3058 | for (i = offset >> 9; i < PAGE_SECTORS; i++) | |
3059 | if (s->s[i].state < SECTOR_DIRTY) | |
3060 | return i << 9; | |
3061 | ||
3062 | return -1; | |
3063 | } | |
3064 | ||
3065 | static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) | |
3066 | { | |
3067 | pgoff_t index = offset >> PAGE_SHIFT; | |
1c6fdbd8 | 3068 | struct page *page; |
543ef2eb KO |
3069 | int pg_offset; |
3070 | loff_t ret = -1; | |
1c6fdbd8 KO |
3071 | |
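/*
 * A page absent from the page cache is entirely a hole, as far as the
 * page cache is concerned:
 */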
3072 | page = find_lock_page(mapping, index); | |
3073 | if (!page) | |
543ef2eb KO |
3074 | return offset; |
3075 | ||
3076 | pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); | |
3077 | if (pg_offset >= 0) | |
3078 | ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; | |
1c6fdbd8 | 3079 | |
1c6fdbd8 KO |
3080 | unlock_page(page); |
3081 | ||
3082 | return ret; | |
3083 | } | |
3084 | ||
543ef2eb | 3085 | static loff_t bch2_seek_pagecache_hole(struct inode *vinode, |
1c6fdbd8 KO |
3086 | loff_t start_offset, |
3087 | loff_t end_offset) | |
3088 | { | |
3089 | struct address_space *mapping = vinode->i_mapping; | |
543ef2eb | 3090 | loff_t offset = start_offset, hole; |
1c6fdbd8 | 3091 | |
543ef2eb KO |
3092 | while (offset < end_offset) { |
3093 | hole = page_hole_offset(mapping, offset); | |
3094 | if (hole >= 0 && hole <= end_offset) | |
3095 | return max(start_offset, hole); | |
3096 | ||
3097 | offset += PAGE_SIZE; | |
3098 | offset &= PAGE_MASK; | |
3099 | } | |
1c6fdbd8 KO |
3100 | |
3101 | return end_offset; | |
3102 | } | |
3103 | ||
3104 | static loff_t bch2_seek_hole(struct file *file, u64 offset) | |
3105 | { | |
3106 | struct bch_inode_info *inode = file_bch_inode(file); | |
3107 | struct bch_fs *c = inode->v.i_sb->s_fs_info; | |
424eb881 KO |
3108 | struct btree_trans trans; |
3109 | struct btree_iter *iter; | |
1c6fdbd8 KO |
3110 | struct bkey_s_c k; |
3111 | u64 isize, next_hole = MAX_LFS_FILESIZE; | |
3112 | int ret; | |
3113 | ||
3114 | isize = i_size_read(&inode->v); | |
3115 | if (offset >= isize) | |
3116 | return -ENXIO; | |
3117 | ||
20bceecb | 3118 | bch2_trans_init(&trans, c, 0, 0); |
424eb881 | 3119 | |
41f8b09e | 3120 | for_each_btree_key(&trans, iter, BTREE_ID_extents, |
1c6fdbd8 | 3121 | POS(inode->v.i_ino, offset >> 9), |
94f651e2 | 3122 | BTREE_ITER_SLOTS, k, ret) { |
1c6fdbd8 | 3123 | if (k.k->p.inode != inode->v.i_ino) { |
543ef2eb | 3124 | next_hole = bch2_seek_pagecache_hole(&inode->v, |
1c6fdbd8 KO |
3125 | offset, MAX_LFS_FILESIZE); |
3126 | break; | |
3127 | } else if (!bkey_extent_is_data(k.k)) { | |
543ef2eb | 3128 | next_hole = bch2_seek_pagecache_hole(&inode->v, |
1c6fdbd8 KO |
3129 | max(offset, bkey_start_offset(k.k) << 9), |
3130 | k.k->p.offset << 9); | |
3131 | ||
3132 | if (next_hole < k.k->p.offset << 9) | |
3133 | break; | |
3134 | } else { | |
3135 | offset = max(offset, bkey_start_offset(k.k) << 9); | |
3136 | } | |
3137 | } | |
50dc0f69 | 3138 | bch2_trans_iter_put(&trans, iter); |
1c6fdbd8 | 3139 | |
94f651e2 | 3140 | ret = bch2_trans_exit(&trans) ?: ret; |
1c6fdbd8 KO |
3141 | if (ret) |
3142 | return ret; | |
3143 | ||
3144 | if (next_hole > isize) | |
3145 | next_hole = isize; | |
3146 | ||
3147 | return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); | |
3148 | } | |
3149 | ||
3150 | loff_t bch2_llseek(struct file *file, loff_t offset, int whence) | |
3151 | { | |
3152 | switch (whence) { | |
3153 | case SEEK_SET: | |
3154 | case SEEK_CUR: | |
3155 | case SEEK_END: | |
3156 | return generic_file_llseek(file, offset, whence); | |
3157 | case SEEK_DATA: | |
3158 | return bch2_seek_data(file, offset); | |
3159 | case SEEK_HOLE: | |
3160 | return bch2_seek_hole(file, offset); | |
3161 | } | |
3162 | ||
3163 | return -EINVAL; | |
3164 | } | |
3165 | ||
3166 | void bch2_fs_fsio_exit(struct bch_fs *c) | |
3167 | { | |
3168 | bioset_exit(&c->dio_write_bioset); | |
3169 | bioset_exit(&c->dio_read_bioset); | |
3170 | bioset_exit(&c->writepage_bioset); | |
3171 | } | |
3172 | ||
3173 | int bch2_fs_fsio_init(struct bch_fs *c) | |
3174 | { | |
3175 | int ret = 0; | |
3176 | ||
3177 | pr_verbose_init(c->opts, ""); | |
3178 | ||
3179 | if (bioset_init(&c->writepage_bioset, | |
9a3df993 | 3180 | 4, offsetof(struct bch_writepage_io, op.wbio.bio), |
1c6fdbd8 KO |
3181 | BIOSET_NEED_BVECS) || |
3182 | bioset_init(&c->dio_read_bioset, | |
3183 | 4, offsetof(struct dio_read, rbio.bio), | |
3184 | BIOSET_NEED_BVECS) || | |
3185 | bioset_init(&c->dio_write_bioset, | |
9a3df993 | 3186 | 4, offsetof(struct dio_write, op.wbio.bio), |
1c6fdbd8 KO |
3187 | BIOSET_NEED_BVECS)) |
3188 | ret = -ENOMEM; | |
3189 | ||
3190 | pr_verbose_init(c->opts, "ret %i", ret); | |
3191 | return ret; | |
3192 | } | |
3193 | ||
3194 | #endif /* NO_BCACHEFS_FS */ |