// SPDX-License-Identifier: GPL-2.0
/*
 * dax: direct host memory access
 * Copyright (C) 2020 Red Hat, Inc.
 */

#include "fuse_i.h"

#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/pfn_t.h>
#include <linux/iomap.h>
#include <linux/interval_tree.h>

/*
 * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
 * map_alignment values 4KB and 64KB.
 */
#define FUSE_DAX_SHIFT 21
#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
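
/*
 * Illustrative example (not in the original source): with FUSE_DAX_SHIFT
 * of 21, each DAX range covers 2MiB (FUSE_DAX_SZ = 0x200000), i.e. 512
 * pages with a 4KB PAGE_SIZE. A file offset maps to a range index by
 * shifting: pos = 0x654321 gives start_idx = pos >> FUSE_DAX_SHIFT = 3,
 * the range covering file offsets 0x600000 through 0x7fffff.
 */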

/** Translation information for file offsets to DAX window offsets */
struct fuse_dax_mapping {
        /* Will connect in fcd->free_ranges to keep track of free memory */
        struct list_head list;

        /* For interval tree in file/inode */
        struct interval_tree_node itn;

        /** Position in DAX window */
        u64 window_offset;

        /** Length of mapping, in bytes */
        loff_t length;

        /* Is this mapping read-only or read-write */
        bool writable;
};

/* Per-inode dax map */
struct fuse_inode_dax {
        /* Semaphore to protect modifications to the dmap tree */
        struct rw_semaphore sem;

        /* Sorted rb tree of struct fuse_dax_mapping elements */
        struct rb_root_cached tree;
        unsigned long nr;
};

struct fuse_conn_dax {
        /* DAX device */
        struct dax_device *dev;

        /* Lock protecting accesses to members of this structure */
        spinlock_t lock;

        /* DAX Window Free Ranges */
        long nr_free_ranges;
        struct list_head free_ranges;
};

static inline struct fuse_dax_mapping *
node_to_dmap(struct interval_tree_node *node)
{
        if (!node)
                return NULL;

        return container_of(node, struct fuse_dax_mapping, itn);
}

static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
{
        struct fuse_dax_mapping *dmap;

        spin_lock(&fcd->lock);
        dmap = list_first_entry_or_null(&fcd->free_ranges,
                                        struct fuse_dax_mapping, list);
        if (dmap) {
                list_del_init(&dmap->list);
                WARN_ON(fcd->nr_free_ranges <= 0);
                fcd->nr_free_ranges--;
        }
        spin_unlock(&fcd->lock);
        return dmap;
}

/* This assumes fcd->lock is held */
static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
                                    struct fuse_dax_mapping *dmap)
{
        list_add_tail(&dmap->list, &fcd->free_ranges);
        fcd->nr_free_ranges++;
}

static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
                                  struct fuse_dax_mapping *dmap)
{
        /* Return fuse_dax_mapping to free list */
        spin_lock(&fcd->lock);
        __dmap_add_to_free_pool(fcd, dmap);
        spin_unlock(&fcd->lock);
}

static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
                                  struct fuse_dax_mapping *dmap, bool writable,
                                  bool upgrade)
{
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_conn_dax *fcd = fc->dax;
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_setupmapping_in inarg;
        loff_t offset = start_idx << FUSE_DAX_SHIFT;
        FUSE_ARGS(args);
        ssize_t err;

        WARN_ON(fcd->nr_free_ranges < 0);

        /* Ask fuse daemon to setup mapping */
        memset(&inarg, 0, sizeof(inarg));
        inarg.foffset = offset;
        inarg.fh = -1;
        inarg.moffset = dmap->window_offset;
        inarg.len = FUSE_DAX_SZ;
        inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
        if (writable)
                inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
        args.opcode = FUSE_SETUPMAPPING;
        args.nodeid = fi->nodeid;
        args.in_numargs = 1;
        args.in_args[0].size = sizeof(inarg);
        args.in_args[0].value = &inarg;
        err = fuse_simple_request(fc, &args);
        if (err < 0)
                return err;
        dmap->writable = writable;
        if (!upgrade) {
                dmap->itn.start = dmap->itn.last = start_idx;
                /* Protected by fi->dax->sem */
                interval_tree_insert(&dmap->itn, &fi->dax->tree);
                fi->dax->nr++;
        }
        return 0;
}

static int fuse_send_removemapping(struct inode *inode,
                                   struct fuse_removemapping_in *inargp,
                                   struct fuse_removemapping_one *remove_one)
{
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_conn *fc = get_fuse_conn(inode);
        FUSE_ARGS(args);

        args.opcode = FUSE_REMOVEMAPPING;
        args.nodeid = fi->nodeid;
        args.in_numargs = 2;
        args.in_args[0].size = sizeof(*inargp);
        args.in_args[0].value = inargp;
        args.in_args[1].size = inargp->count * sizeof(*remove_one);
        args.in_args[1].value = remove_one;
        return fuse_simple_request(fc, &args);
}

static int dmap_removemapping_list(struct inode *inode, unsigned int num,
                                   struct list_head *to_remove)
{
        struct fuse_removemapping_one *remove_one, *ptr;
        struct fuse_removemapping_in inarg;
        struct fuse_dax_mapping *dmap;
        int ret = 0, i = 0, nr_alloc;

        nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
        remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
        if (!remove_one)
                return -ENOMEM;

        ptr = remove_one;
        list_for_each_entry(dmap, to_remove, list) {
                ptr->moffset = dmap->window_offset;
                ptr->len = dmap->length;
                ptr++;
                i++;
                num--;
                if (i >= nr_alloc || num == 0) {
                        memset(&inarg, 0, sizeof(inarg));
                        inarg.count = i;
                        ret = fuse_send_removemapping(inode, &inarg,
                                                      remove_one);
                        if (ret)
                                goto out;
                        ptr = remove_one;
                        i = 0;
                }
        }
out:
        kfree(remove_one);
        return ret;
}
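
/*
 * Illustrative note on the batching above: at most
 * FUSE_REMOVEMAPPING_MAX_ENTRY entries are sent per FUSE_REMOVEMAPPING
 * request. Assuming, for the sake of example, that the limit were 128 and
 * num == 300, the loop would issue three requests with inarg.count of
 * 128, 128 and 44, reusing the same remove_one array for each batch.
 */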

/*
 * Cleanup dmap entry and add back to free list. This should be called
 * with fcd->lock held.
 */
static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
                                         struct fuse_dax_mapping *dmap)
{
        pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
                 dmap->itn.start, dmap->itn.last, dmap->window_offset,
                 dmap->length);
        dmap->itn.start = dmap->itn.last = 0;
        __dmap_add_to_free_pool(fcd, dmap);
}

/*
 * Free inode dmap entries whose range falls inside [start, end].
 * Does not take any locks. For now it should only be called from the
 * evict_inode() path, where we know all dmap entries can be reclaimed.
 */
static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
                                     struct inode *inode,
                                     loff_t start, loff_t end)
{
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_dax_mapping *dmap, *n;
        int err, num = 0;
        LIST_HEAD(to_remove);
        unsigned long start_idx = start >> FUSE_DAX_SHIFT;
        unsigned long end_idx = end >> FUSE_DAX_SHIFT;
        struct interval_tree_node *node;

        while (1) {
                node = interval_tree_iter_first(&fi->dax->tree, start_idx,
                                                end_idx);
                if (!node)
                        break;
                dmap = node_to_dmap(node);
                interval_tree_remove(&dmap->itn, &fi->dax->tree);
                num++;
                list_add(&dmap->list, &to_remove);
        }

        /* Nothing to remove */
        if (list_empty(&to_remove))
                return;

        WARN_ON(fi->dax->nr < num);
        fi->dax->nr -= num;
        err = dmap_removemapping_list(inode, num, &to_remove);
        if (err && err != -ENOTCONN) {
                pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
                        start, end);
        }
        spin_lock(&fcd->lock);
        list_for_each_entry_safe(dmap, n, &to_remove, list) {
                list_del_init(&dmap->list);
                dmap_reinit_add_to_free_pool(fcd, dmap);
        }
        spin_unlock(&fcd->lock);
}

/*
 * This is called from evict_inode(), when the inode is going away, so this
 * function does not take any locks (like fi->dax->sem) while traversing the
 * fuse inode interval tree. If that lock were taken, lockdep would complain
 * about a potential deadlock with the fs_reclaim lock.
 */
void fuse_dax_inode_cleanup(struct inode *inode)
{
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);

        /*
         * fuse_evict_inode() has already called truncate_inode_pages_final()
         * before we arrive here. So we should not have to worry about any
         * pages/exception entries still associated with the inode.
         */
        inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
        WARN_ON(fi->dax->nr);
}

static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
{
        iomap->addr = IOMAP_NULL_ADDR;
        iomap->length = length;
        iomap->type = IOMAP_HOLE;
}

static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
                            struct iomap *iomap, struct fuse_dax_mapping *dmap,
                            unsigned int flags)
{
        loff_t offset, len;
        loff_t i_size = i_size_read(inode);

        offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
        len = min(length, dmap->length - offset);

        /* If length is beyond end of file, truncate further */
        if (pos + len > i_size)
                len = i_size - pos;

        if (len > 0) {
                iomap->addr = dmap->window_offset + offset;
                iomap->length = len;
                if (flags & IOMAP_FAULT)
                        iomap->length = ALIGN(len, PAGE_SIZE);
                iomap->type = IOMAP_MAPPED;
        } else {
                /* A mapping beyond end of file is a hole */
                fuse_fill_iomap_hole(iomap, length);
        }
}
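
/*
 * Worked example (illustrative, not from the original source): suppose a
 * read at pos = 0x610000 for length = 0x1000 hits a dmap with itn.start = 3
 * and window_offset = 0x400000. Then offset = 0x610000 - (3 << 21) =
 * 0x10000, so iomap->addr becomes 0x400000 + 0x10000 = 0x410000 within the
 * DAX window, and iomap->length = min(0x1000, dmap->length - offset) =
 * 0x1000, assuming i_size is large enough that no truncation to EOF kicks
 * in.
 */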

static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
                                      loff_t length, unsigned int flags,
                                      struct iomap *iomap)
{
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_conn_dax *fcd = fc->dax;
        struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
        int ret;
        bool writable = flags & IOMAP_WRITE;
        unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
        struct interval_tree_node *node;

        alloc_dmap = alloc_dax_mapping(fcd);
        if (!alloc_dmap)
                return -EIO;

        /*
         * Take the write lock so that only one caller can try to set up a
         * mapping while the others wait.
         */
        down_write(&fi->dax->sem);
        /*
         * We dropped the lock. Check again whether somebody else set up
         * the mapping already.
         */
        node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
        if (node) {
                dmap = node_to_dmap(node);
                fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
                dmap_add_to_free_pool(fcd, alloc_dmap);
                up_write(&fi->dax->sem);
                return 0;
        }

        /* Setup one mapping */
        ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
                                     writable, false);
        if (ret < 0) {
                dmap_add_to_free_pool(fcd, alloc_dmap);
                up_write(&fi->dax->sem);
                return ret;
        }
        fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
        up_write(&fi->dax->sem);
        return 0;
}

static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
                                    loff_t length, unsigned int flags,
                                    struct iomap *iomap)
{
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_dax_mapping *dmap;
        int ret;
        unsigned long idx = pos >> FUSE_DAX_SHIFT;
        struct interval_tree_node *node;

        /*
         * Take the exclusive lock so that only one caller can try to set up
         * the mapping while the others wait.
         */
        down_write(&fi->dax->sem);
        node = interval_tree_iter_first(&fi->dax->tree, idx, idx);

        /* We are holding either the inode lock or i_mmap_sem, and that
         * should ensure that the dmap can't be reclaimed or truncated and
         * should still be there in the tree despite the fact that we dropped
         * and re-acquired the lock.
         */
        ret = -EIO;
        if (WARN_ON(!node))
                goto out_err;

        dmap = node_to_dmap(node);

        /* Maybe another thread already upgraded the mapping while we were
         * not holding the lock.
         */
        if (dmap->writable) {
                ret = 0;
                goto out_fill_iomap;
        }

        ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
                                     true);
        if (ret < 0)
                goto out_err;
out_fill_iomap:
        fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
out_err:
        up_write(&fi->dax->sem);
        return ret;
}

/* This is just for DAX and the mapping is ephemeral, do not use it for other
 * purposes since there is no block device with a permanent mapping.
 */
static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                            unsigned int flags, struct iomap *iomap,
                            struct iomap *srcmap)
{
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_dax_mapping *dmap;
        bool writable = flags & IOMAP_WRITE;
        unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
        struct interval_tree_node *node;

        /* We don't support FIEMAP */
        if (WARN_ON(flags & IOMAP_REPORT))
                return -EIO;

        iomap->offset = pos;
        iomap->flags = 0;
        iomap->bdev = NULL;
        iomap->dax_dev = fc->dax->dev;

        /*
         * Both the read/write and mmap paths can race here. So we need
         * something to make sure that if we are setting up a mapping, the
         * other path waits.
         *
         * For now, use a semaphore for this. It probably needs to be
         * optimized later.
         */
        down_read(&fi->dax->sem);
        node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
        if (node) {
                dmap = node_to_dmap(node);
                if (writable && !dmap->writable) {
                        /* Upgrade the read-only mapping to read-write. This
                         * requires the exclusive fi->dax->sem lock as we
                         * don't want two threads trying to do this
                         * simultaneously for the same dmap. So drop the
                         * shared lock and acquire the exclusive one.
                         */
                        up_read(&fi->dax->sem);
                        pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
                                 __func__, pos, length);
                        return fuse_upgrade_dax_mapping(inode, pos, length,
                                                        flags, iomap);
                } else {
                        fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
                        up_read(&fi->dax->sem);
                        return 0;
                }
        } else {
                up_read(&fi->dax->sem);
                pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
                         __func__, pos, length);
                if (pos >= i_size_read(inode))
                        goto iomap_hole;

                return fuse_setup_new_dax_mapping(inode, pos, length, flags,
                                                  iomap);
        }

        /*
         * If a read beyond end of file happens, fs code seems to return
         * it as a hole.
         */
iomap_hole:
        fuse_fill_iomap_hole(iomap, length);
        pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
                 __func__, pos, length, iomap->length);
        return 0;
}
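
/*
 * Summary of fuse_iomap_begin() outcomes: the interval-tree lookup yields
 * one of four cases: (1) a mapping with sufficient permissions exists --
 * fill the iomap from it; (2) a read-only mapping exists but a write is
 * requested -- upgrade it to read-write; (3) no mapping exists and pos is
 * within i_size -- grab a free range and ask the daemon to set up a new
 * mapping; (4) no mapping exists and pos is at or beyond i_size -- report
 * a hole.
 */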

static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                          ssize_t written, unsigned int flags,
                          struct iomap *iomap)
{
        /* DAX writes beyond end-of-file aren't handled using iomap, so the
         * file size is unchanged and there is nothing to do here.
         */
        return 0;
}

static const struct iomap_ops fuse_iomap_ops = {
        .iomap_begin = fuse_iomap_begin,
        .iomap_end = fuse_iomap_end,
};

static void fuse_wait_dax_page(struct inode *inode)
{
        struct fuse_inode *fi = get_fuse_inode(inode);

        up_write(&fi->i_mmap_sem);
        schedule();
        down_write(&fi->i_mmap_sem);
}

/* Should be called with fi->i_mmap_sem lock held exclusively */
static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
                                    loff_t start, loff_t end)
{
        struct page *page;

        page = dax_layout_busy_page_range(inode->i_mapping, start, end);
        if (!page)
                return 0;

        *retry = true;
        return ___wait_var_event(&page->_refcount,
                        atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
                        0, 0, fuse_wait_dax_page(inode));
}

/* dmap_end == 0 leads to unmapping of whole file */
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
                           u64 dmap_end)
{
        bool retry;
        int ret;

        do {
                retry = false;
                ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
                                               dmap_end);
        } while (ret == 0 && retry);

        return ret;
}
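
/*
 * Usage sketch (illustrative; the real callers live elsewhere in fuse): a
 * truncate path is expected to hold fi->i_mmap_sem exclusively and drain
 * DAX page references before changing the file layout, roughly:
 *
 *      down_write(&fi->i_mmap_sem);
 *      ret = fuse_dax_break_layouts(inode, 0, 0);  // 0, 0 => whole file
 *      if (!ret)
 *              truncate_pagecache(inode, new_size);
 *      up_write(&fi->i_mmap_sem);
 *
 * The retry loop above re-checks because fuse_wait_dax_page() drops and
 * re-takes i_mmap_sem while waiting for busy pages.
 */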

ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock_shared(inode))
                        return -EAGAIN;
        } else {
                inode_lock_shared(inode);
        }

        ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
        inode_unlock_shared(inode);

        /* TODO file_accessed(iocb->f_filp) */
        return ret;
}

static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        return (iov_iter_rw(from) == WRITE &&
                ((iocb->ki_pos) >= i_size_read(inode) ||
                 (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
}

static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
        ssize_t ret;

        ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
        if (ret < 0)
                return ret;

        fuse_invalidate_attr(inode);
        fuse_write_update_size(inode, iocb->ki_pos);
        return ret;
}

ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock(inode))
                        return -EAGAIN;
        } else {
                inode_lock(inode);
        }

        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        ret = file_remove_privs(iocb->ki_filp);
        if (ret)
                goto out;
        /* TODO file_update_time() but we don't want metadata I/O */

        /* Do not use dax for file-extending writes, as the write and the
         * on-disk i_size increase are not atomic otherwise.
         */
        if (file_extending_write(iocb, from))
                ret = fuse_dax_direct_write(iocb, from);
        else
                ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);

out:
        inode_unlock(inode);

        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
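
/*
 * Example of the dispatch above (illustrative): with i_size = 4096, a
 * 512-byte write at ki_pos = 4096 (or any write ending past i_size) is a
 * file-extending write and goes through fuse_dax_direct_write(), which
 * then bumps i_size via fuse_write_update_size(); a 512-byte write at
 * ki_pos = 0 stays within i_size and takes the DAX path through
 * dax_iomap_rw().
 */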

static int fuse_dax_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        struct fuse_conn *fc = get_fuse_conn(inode);

        return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
}

static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
                                   enum page_entry_size pe_size, bool write)
{
        vm_fault_t ret;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct super_block *sb = inode->i_sb;
        pfn_t pfn;

        if (write)
                sb_start_pagefault(sb);

        /*
         * We need to serialize against not only truncate but also against
         * fuse dax memory range reclaim. While a range is being reclaimed,
         * we do not want any read/write/mmap to make progress and try
         * to populate page cache or access memory we are trying to free.
         */
        down_read(&get_fuse_inode(inode)->i_mmap_sem);
        ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fuse_iomap_ops);

        if (ret & VM_FAULT_NEEDDSYNC)
                ret = dax_finish_sync_fault(vmf, pe_size, pfn);
        up_read(&get_fuse_inode(inode)->i_mmap_sem);

        if (write)
                sb_end_pagefault(sb);

        return ret;
}

static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
{
        return __fuse_dax_fault(vmf, PE_SIZE_PTE,
                                vmf->flags & FAULT_FLAG_WRITE);
}

static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
                                      enum page_entry_size pe_size)
{
        return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
}

static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
{
        return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
}

static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
{
        return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
}

static const struct vm_operations_struct fuse_dax_vm_ops = {
        .fault = fuse_dax_fault,
        .huge_fault = fuse_dax_huge_fault,
        .page_mkwrite = fuse_dax_page_mkwrite,
        .pfn_mkwrite = fuse_dax_pfn_mkwrite,
};

int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);
        vma->vm_ops = &fuse_dax_vm_ops;
        vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        return 0;
}

static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
{
        struct fuse_dax_mapping *range, *temp;

        /* Free all allocated elements */
        list_for_each_entry_safe(range, temp, mem_list, list) {
                list_del(&range->list);
                kfree(range);
        }
}

void fuse_dax_conn_free(struct fuse_conn *fc)
{
        if (fc->dax) {
                fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
                kfree(fc->dax);
        }
}

static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
{
        long nr_pages, nr_ranges;
        void *kaddr;
        pfn_t pfn;
        struct fuse_dax_mapping *range;
        int ret, id;
        size_t dax_size = -1;
        unsigned long i;

        INIT_LIST_HEAD(&fcd->free_ranges);
        id = dax_read_lock();
        nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
                                     &pfn);
        dax_read_unlock(id);
        if (nr_pages < 0) {
                pr_debug("dax_direct_access() returned %ld\n", nr_pages);
                return nr_pages;
        }

        nr_ranges = nr_pages / FUSE_DAX_PAGES;
        pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
                 __func__, nr_pages, nr_ranges);

        for (i = 0; i < nr_ranges; i++) {
                range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
                ret = -ENOMEM;
                if (!range)
                        goto out_err;

                /* TODO: This offset only works if the virtio-fs driver does
                 * not have any memory hidden at the beginning of the window.
                 * This needs better handling.
                 */
                range->window_offset = i * FUSE_DAX_SZ;
                range->length = FUSE_DAX_SZ;
                list_add_tail(&range->list, &fcd->free_ranges);
        }

        fcd->nr_free_ranges = nr_ranges;
        return 0;
out_err:
        /* Free all allocated elements */
        fuse_free_dax_mem_ranges(&fcd->free_ranges);
        return ret;
}
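
/*
 * Illustrative example of the carving above (not from the original
 * source): a 1GiB DAX window reported by dax_direct_access() as 262144
 * 4KB pages is split into nr_ranges = 262144 / FUSE_DAX_PAGES = 512 free
 * ranges of 2MiB each, with window offsets 0x0, 0x200000, 0x400000, ...
 */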

int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
{
        struct fuse_conn_dax *fcd;
        int err;

        if (!dax_dev)
                return 0;

        fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
        if (!fcd)
                return -ENOMEM;

        spin_lock_init(&fcd->lock);
        fcd->dev = dax_dev;
        err = fuse_dax_mem_range_init(fcd);
        if (err) {
                kfree(fcd);
                return err;
        }

        fc->dax = fcd;
        return 0;
}

bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
{
        struct fuse_conn *fc = get_fuse_conn_super(sb);

        fi->dax = NULL;
        if (fc->dax) {
                fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
                if (!fi->dax)
                        return false;

                init_rwsem(&fi->dax->sem);
                fi->dax->tree = RB_ROOT_CACHED;
        }

        return true;
}

static const struct address_space_operations fuse_dax_file_aops = {
        .writepages = fuse_dax_writepages,
        .direct_IO = noop_direct_IO,
        .set_page_dirty = noop_set_page_dirty,
        .invalidatepage = noop_invalidatepage,
};

void fuse_dax_inode_init(struct inode *inode)
{
        struct fuse_conn *fc = get_fuse_conn(inode);

        if (!fc->dax)
                return;

        inode->i_flags |= S_DAX;
        inode->i_data.a_ops = &fuse_dax_file_aops;
}

bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
{
        if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
                pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
                        map_alignment, FUSE_DAX_SZ);
                return false;
        }
        return true;
}
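
/*
 * Worked example (illustrative): map_alignment is the log2 alignment the
 * server advertises in FUSE_INIT, so 12 (4KB) and 16 (64KB) both satisfy
 * map_alignment <= FUSE_DAX_SHIFT (21) and 2MiB ranges stay properly
 * aligned; a hypothetical server demanding 4MiB alignment
 * (map_alignment = 22) would fail this check.
 */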