Commit | Line | Data |
---|---|---|
ab68f262 DW |
1 | /* |
2 | * Copyright(c) 2016 Intel Corporation. All rights reserved. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of version 2 of the GNU General Public License as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but | |
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | */ | |
13 | #include <linux/pagemap.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/device.h> | |
50d34394 | 16 | #include <linux/magic.h> |
3bc52c45 | 17 | #include <linux/mount.h> |
ab68f262 | 18 | #include <linux/pfn_t.h> |
3bc52c45 | 19 | #include <linux/hash.h> |
ba09c01d | 20 | #include <linux/cdev.h> |
ab68f262 DW |
21 | #include <linux/slab.h> |
22 | #include <linux/dax.h> | |
23 | #include <linux/fs.h> | |
24 | #include <linux/mm.h> | |
ccdb07f6 | 25 | #include "dax.h" |
ab68f262 | 26 | |
ba09c01d | 27 | static dev_t dax_devt; |
956a4cd2 | 28 | DEFINE_STATIC_SRCU(dax_srcu); |
ab68f262 DW |
29 | static struct class *dax_class; |
30 | static DEFINE_IDA(dax_minor_ida); | |
ba09c01d DW |
31 | static int nr_dax = CONFIG_NR_DEV_DAX; |
32 | module_param(nr_dax, int, S_IRUGO); | |
3bc52c45 DW |
33 | static struct vfsmount *dax_mnt; |
34 | static struct kmem_cache *dax_cache __read_mostly; | |
35 | static struct super_block *dax_superblock __read_mostly; | |
ba09c01d | 36 | MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); |
ab68f262 DW |
37 | |
38 | /** | |
39 | * struct dax_region - mapping infrastructure for dax devices | |
40 | * @id: kernel-wide unique region for a memory range | |
41 | * @base: linear address corresponding to @res | |
42 | * @kref: to pin while other agents have a need to do lookups | |
43 | * @dev: parent device backing this region | |
44 | * @align: allocation and mapping alignment for child dax devices | |
45 | * @res: physical address range of the region | |
46 | * @pfn_flags: identify whether the pfns are paged back or not | |
47 | */ | |
48 | struct dax_region { | |
49 | int id; | |
50 | struct ida ida; | |
51 | void *base; | |
52 | struct kref kref; | |
53 | struct device *dev; | |
54 | unsigned int align; | |
55 | struct resource res; | |
56 | unsigned long pfn_flags; | |
57 | }; | |
58 | ||
59 | /** | |
5f0694b3 | 60 | * struct dev_dax - instance data for a subdivision of a dax region |
ab68f262 DW |
61 | * @region - parent region |
62 | * @dev - device backing the character device | |
ba09c01d | 63 | * @cdev - core chardev data |
956a4cd2 | 64 | * @alive - !alive + srcu grace period == no new mappings can be established |
ab68f262 DW |
65 | * @id - child id in the region |
66 | * @num_resources - number of physical address extents in this device | |
67 | * @res - array of physical address ranges | |
68 | */ | |
5f0694b3 | 69 | struct dev_dax { |
ab68f262 | 70 | struct dax_region *region; |
3bc52c45 | 71 | struct inode *inode; |
ebd84d72 | 72 | struct device dev; |
ba09c01d | 73 | struct cdev cdev; |
dee41079 | 74 | bool alive; |
ab68f262 DW |
75 | int id; |
76 | int num_resources; | |
77 | struct resource res[0]; | |
78 | }; | |
79 | ||
d7fe1a67 DW |
80 | static ssize_t id_show(struct device *dev, |
81 | struct device_attribute *attr, char *buf) | |
82 | { | |
83 | struct dax_region *dax_region; | |
84 | ssize_t rc = -ENXIO; | |
85 | ||
86 | device_lock(dev); | |
87 | dax_region = dev_get_drvdata(dev); | |
88 | if (dax_region) | |
89 | rc = sprintf(buf, "%d\n", dax_region->id); | |
90 | device_unlock(dev); | |
91 | ||
92 | return rc; | |
93 | } | |
94 | static DEVICE_ATTR_RO(id); | |
95 | ||
96 | static ssize_t region_size_show(struct device *dev, | |
97 | struct device_attribute *attr, char *buf) | |
98 | { | |
99 | struct dax_region *dax_region; | |
100 | ssize_t rc = -ENXIO; | |
101 | ||
102 | device_lock(dev); | |
103 | dax_region = dev_get_drvdata(dev); | |
104 | if (dax_region) | |
105 | rc = sprintf(buf, "%llu\n", (unsigned long long) | |
106 | resource_size(&dax_region->res)); | |
107 | device_unlock(dev); | |
108 | ||
109 | return rc; | |
110 | } | |
111 | static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, | |
112 | region_size_show, NULL); | |
113 | ||
114 | static ssize_t align_show(struct device *dev, | |
115 | struct device_attribute *attr, char *buf) | |
116 | { | |
117 | struct dax_region *dax_region; | |
118 | ssize_t rc = -ENXIO; | |
119 | ||
120 | device_lock(dev); | |
121 | dax_region = dev_get_drvdata(dev); | |
122 | if (dax_region) | |
123 | rc = sprintf(buf, "%u\n", dax_region->align); | |
124 | device_unlock(dev); | |
125 | ||
126 | return rc; | |
127 | } | |
128 | static DEVICE_ATTR_RO(align); | |
129 | ||
130 | static struct attribute *dax_region_attributes[] = { | |
131 | &dev_attr_region_size.attr, | |
132 | &dev_attr_align.attr, | |
133 | &dev_attr_id.attr, | |
134 | NULL, | |
135 | }; | |
136 | ||
137 | static const struct attribute_group dax_region_attribute_group = { | |
138 | .name = "dax_region", | |
139 | .attrs = dax_region_attributes, | |
140 | }; | |
141 | ||
142 | static const struct attribute_group *dax_region_attribute_groups[] = { | |
143 | &dax_region_attribute_group, | |
144 | NULL, | |
145 | }; | |
146 | ||
3bc52c45 | 147 | static struct inode *dax_alloc_inode(struct super_block *sb) |
ab68f262 | 148 | { |
3bc52c45 DW |
149 | return kmem_cache_alloc(dax_cache, GFP_KERNEL); |
150 | } | |
ab68f262 | 151 | |
3bc52c45 DW |
152 | static void dax_i_callback(struct rcu_head *head) |
153 | { | |
154 | struct inode *inode = container_of(head, struct inode, i_rcu); | |
155 | ||
156 | kmem_cache_free(dax_cache, inode); | |
ab68f262 DW |
157 | } |
158 | ||
3bc52c45 | 159 | static void dax_destroy_inode(struct inode *inode) |
ab68f262 | 160 | { |
3bc52c45 | 161 | call_rcu(&inode->i_rcu, dax_i_callback); |
ab68f262 | 162 | } |
ab68f262 | 163 | |
3bc52c45 DW |
164 | static const struct super_operations dax_sops = { |
165 | .statfs = simple_statfs, | |
166 | .alloc_inode = dax_alloc_inode, | |
167 | .destroy_inode = dax_destroy_inode, | |
168 | .drop_inode = generic_delete_inode, | |
169 | }; | |
170 | ||
171 | static struct dentry *dax_mount(struct file_system_type *fs_type, | |
172 | int flags, const char *dev_name, void *data) | |
ab68f262 | 173 | { |
3bc52c45 DW |
174 | return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); |
175 | } | |
ab68f262 | 176 | |
3bc52c45 DW |
177 | static struct file_system_type dax_type = { |
178 | .name = "dax", | |
179 | .mount = dax_mount, | |
180 | .kill_sb = kill_anon_super, | |
181 | }; | |
182 | ||
183 | static int dax_test(struct inode *inode, void *data) | |
184 | { | |
185 | return inode->i_cdev == data; | |
186 | } | |
187 | ||
188 | static int dax_set(struct inode *inode, void *data) | |
189 | { | |
190 | inode->i_cdev = data; | |
191 | return 0; | |
192 | } | |
193 | ||
194 | static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) | |
195 | { | |
196 | struct inode *inode; | |
197 | ||
198 | inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), | |
199 | dax_test, dax_set, cdev); | |
200 | ||
201 | if (!inode) | |
202 | return NULL; | |
203 | ||
204 | if (inode->i_state & I_NEW) { | |
205 | inode->i_mode = S_IFCHR; | |
206 | inode->i_flags = S_DAX; | |
207 | inode->i_rdev = devt; | |
208 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); | |
209 | unlock_new_inode(inode); | |
210 | } | |
211 | return inode; | |
212 | } | |
213 | ||
/* Slab constructor for dax_cache objects. */
static void init_once(void *inode)
{
	struct inode *obj = inode;

	inode_init_once(obj);
}
218 | ||
219 | static int dax_inode_init(void) | |
220 | { | |
221 | int rc; | |
222 | ||
223 | dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, | |
224 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | |
225 | SLAB_MEM_SPREAD|SLAB_ACCOUNT), | |
226 | init_once); | |
227 | if (!dax_cache) | |
228 | return -ENOMEM; | |
229 | ||
230 | rc = register_filesystem(&dax_type); | |
231 | if (rc) | |
232 | goto err_register_fs; | |
233 | ||
234 | dax_mnt = kern_mount(&dax_type); | |
235 | if (IS_ERR(dax_mnt)) { | |
236 | rc = PTR_ERR(dax_mnt); | |
237 | goto err_mount; | |
238 | } | |
239 | dax_superblock = dax_mnt->mnt_sb; | |
240 | ||
241 | return 0; | |
242 | ||
243 | err_mount: | |
244 | unregister_filesystem(&dax_type); | |
245 | err_register_fs: | |
246 | kmem_cache_destroy(dax_cache); | |
247 | ||
248 | return rc; | |
ab68f262 DW |
249 | } |
250 | ||
3bc52c45 DW |
251 | static void dax_inode_exit(void) |
252 | { | |
253 | kern_unmount(dax_mnt); | |
254 | unregister_filesystem(&dax_type); | |
255 | kmem_cache_destroy(dax_cache); | |
256 | } | |
257 | ||
ab68f262 DW |
258 | static void dax_region_free(struct kref *kref) |
259 | { | |
260 | struct dax_region *dax_region; | |
261 | ||
262 | dax_region = container_of(kref, struct dax_region, kref); | |
263 | kfree(dax_region); | |
264 | } | |
265 | ||
266 | void dax_region_put(struct dax_region *dax_region) | |
ab68f262 | 267 | { |
ab68f262 | 268 | kref_put(&dax_region->kref, dax_region_free); |
ab68f262 | 269 | } |
ab68f262 | 270 | EXPORT_SYMBOL_GPL(dax_region_put); |
ab68f262 | 271 | |
d7fe1a67 DW |
272 | static void dax_region_unregister(void *region) |
273 | { | |
274 | struct dax_region *dax_region = region; | |
275 | ||
276 | sysfs_remove_groups(&dax_region->dev->kobj, | |
277 | dax_region_attribute_groups); | |
278 | dax_region_put(dax_region); | |
279 | } | |
280 | ||
ab68f262 DW |
281 | struct dax_region *alloc_dax_region(struct device *parent, int region_id, |
282 | struct resource *res, unsigned int align, void *addr, | |
283 | unsigned long pfn_flags) | |
284 | { | |
285 | struct dax_region *dax_region; | |
286 | ||
d7fe1a67 DW |
287 | /* |
288 | * The DAX core assumes that it can store its private data in | |
289 | * parent->driver_data. This WARN is a reminder / safeguard for | |
290 | * developers of device-dax drivers. | |
291 | */ | |
292 | if (dev_get_drvdata(parent)) { | |
293 | dev_WARN(parent, "dax core failed to setup private data\n"); | |
294 | return NULL; | |
295 | } | |
296 | ||
9d2d01a0 DW |
297 | if (!IS_ALIGNED(res->start, align) |
298 | || !IS_ALIGNED(resource_size(res), align)) | |
299 | return NULL; | |
ab68f262 | 300 | |
9d2d01a0 | 301 | dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); |
ab68f262 DW |
302 | if (!dax_region) |
303 | return NULL; | |
304 | ||
d7fe1a67 | 305 | dev_set_drvdata(parent, dax_region); |
ab68f262 DW |
306 | memcpy(&dax_region->res, res, sizeof(*res)); |
307 | dax_region->pfn_flags = pfn_flags; | |
308 | kref_init(&dax_region->kref); | |
309 | dax_region->id = region_id; | |
310 | ida_init(&dax_region->ida); | |
311 | dax_region->align = align; | |
312 | dax_region->dev = parent; | |
313 | dax_region->base = addr; | |
d7fe1a67 DW |
314 | if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { |
315 | kfree(dax_region); | |
316 | return NULL;; | |
317 | } | |
ab68f262 | 318 | |
d7fe1a67 DW |
319 | kref_get(&dax_region->kref); |
320 | if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) | |
321 | return NULL; | |
ab68f262 DW |
322 | return dax_region; |
323 | } | |
324 | EXPORT_SYMBOL_GPL(alloc_dax_region); | |
325 | ||
5f0694b3 | 326 | static struct dev_dax *to_dev_dax(struct device *dev) |
ebd84d72 | 327 | { |
5f0694b3 | 328 | return container_of(dev, struct dev_dax, dev); |
ebd84d72 DW |
329 | } |
330 | ||
ab68f262 DW |
331 | static ssize_t size_show(struct device *dev, |
332 | struct device_attribute *attr, char *buf) | |
333 | { | |
5f0694b3 | 334 | struct dev_dax *dev_dax = to_dev_dax(dev); |
ab68f262 DW |
335 | unsigned long long size = 0; |
336 | int i; | |
337 | ||
5f0694b3 DW |
338 | for (i = 0; i < dev_dax->num_resources; i++) |
339 | size += resource_size(&dev_dax->res[i]); | |
ab68f262 DW |
340 | |
341 | return sprintf(buf, "%llu\n", size); | |
342 | } | |
343 | static DEVICE_ATTR_RO(size); | |
344 | ||
5f0694b3 | 345 | static struct attribute *dev_dax_attributes[] = { |
ab68f262 DW |
346 | &dev_attr_size.attr, |
347 | NULL, | |
348 | }; | |
349 | ||
5f0694b3 DW |
350 | static const struct attribute_group dev_dax_attribute_group = { |
351 | .attrs = dev_dax_attributes, | |
ab68f262 DW |
352 | }; |
353 | ||
354 | static const struct attribute_group *dax_attribute_groups[] = { | |
5f0694b3 | 355 | &dev_dax_attribute_group, |
ab68f262 DW |
356 | NULL, |
357 | }; | |
358 | ||
5f0694b3 | 359 | static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, |
dee41079 DW |
360 | const char *func) |
361 | { | |
5f0694b3 DW |
362 | struct dax_region *dax_region = dev_dax->region; |
363 | struct device *dev = &dev_dax->dev; | |
dee41079 DW |
364 | unsigned long mask; |
365 | ||
5f0694b3 | 366 | if (!dev_dax->alive) |
dee41079 DW |
367 | return -ENXIO; |
368 | ||
4cb19355 | 369 | /* prevent private mappings from being established */ |
325896ff | 370 | if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { |
dee41079 DW |
371 | dev_info(dev, "%s: %s: fail, attempted private mapping\n", |
372 | current->comm, func); | |
373 | return -EINVAL; | |
374 | } | |
375 | ||
376 | mask = dax_region->align - 1; | |
377 | if (vma->vm_start & mask || vma->vm_end & mask) { | |
378 | dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", | |
379 | current->comm, func, vma->vm_start, vma->vm_end, | |
380 | mask); | |
381 | return -EINVAL; | |
382 | } | |
383 | ||
384 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV | |
385 | && (vma->vm_flags & VM_DONTCOPY) == 0) { | |
386 | dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", | |
387 | current->comm, func); | |
388 | return -EINVAL; | |
389 | } | |
390 | ||
391 | if (!vma_is_dax(vma)) { | |
392 | dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", | |
393 | current->comm, func); | |
394 | return -EINVAL; | |
395 | } | |
396 | ||
397 | return 0; | |
398 | } | |
399 | ||
5f0694b3 | 400 | static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, |
dee41079 DW |
401 | unsigned long size) |
402 | { | |
403 | struct resource *res; | |
404 | phys_addr_t phys; | |
405 | int i; | |
406 | ||
5f0694b3 DW |
407 | for (i = 0; i < dev_dax->num_resources; i++) { |
408 | res = &dev_dax->res[i]; | |
dee41079 DW |
409 | phys = pgoff * PAGE_SIZE + res->start; |
410 | if (phys >= res->start && phys <= res->end) | |
411 | break; | |
412 | pgoff -= PHYS_PFN(resource_size(res)); | |
413 | } | |
414 | ||
5f0694b3 DW |
415 | if (i < dev_dax->num_resources) { |
416 | res = &dev_dax->res[i]; | |
dee41079 DW |
417 | if (phys + size - 1 <= res->end) |
418 | return phys; | |
419 | } | |
420 | ||
421 | return -1; | |
422 | } | |
423 | ||
5f0694b3 | 424 | static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) |
dee41079 | 425 | { |
5f0694b3 | 426 | struct device *dev = &dev_dax->dev; |
dee41079 DW |
427 | struct dax_region *dax_region; |
428 | int rc = VM_FAULT_SIGBUS; | |
429 | phys_addr_t phys; | |
430 | pfn_t pfn; | |
0134ed4f | 431 | unsigned int fault_size = PAGE_SIZE; |
dee41079 | 432 | |
5f0694b3 | 433 | if (check_vma(dev_dax, vmf->vma, __func__)) |
dee41079 DW |
434 | return VM_FAULT_SIGBUS; |
435 | ||
5f0694b3 | 436 | dax_region = dev_dax->region; |
dee41079 | 437 | if (dax_region->align > PAGE_SIZE) { |
76202620 OH |
438 | dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", |
439 | __func__, dax_region->align, fault_size); | |
dee41079 DW |
440 | return VM_FAULT_SIGBUS; |
441 | } | |
442 | ||
0134ed4f DJ |
443 | if (fault_size != dax_region->align) |
444 | return VM_FAULT_SIGBUS; | |
445 | ||
5f0694b3 | 446 | phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); |
dee41079 | 447 | if (phys == -1) { |
52084f89 | 448 | dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, |
dee41079 DW |
449 | vmf->pgoff); |
450 | return VM_FAULT_SIGBUS; | |
451 | } | |
452 | ||
453 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
454 | ||
11bac800 | 455 | rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); |
dee41079 DW |
456 | |
457 | if (rc == -ENOMEM) | |
458 | return VM_FAULT_OOM; | |
459 | if (rc < 0 && rc != -EBUSY) | |
460 | return VM_FAULT_SIGBUS; | |
461 | ||
462 | return VM_FAULT_NOPAGE; | |
463 | } | |
464 | ||
5f0694b3 | 465 | static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) |
dee41079 | 466 | { |
d8a849e1 | 467 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
5f0694b3 | 468 | struct device *dev = &dev_dax->dev; |
dee41079 DW |
469 | struct dax_region *dax_region; |
470 | phys_addr_t phys; | |
471 | pgoff_t pgoff; | |
472 | pfn_t pfn; | |
0134ed4f | 473 | unsigned int fault_size = PMD_SIZE; |
dee41079 | 474 | |
5f0694b3 | 475 | if (check_vma(dev_dax, vmf->vma, __func__)) |
dee41079 DW |
476 | return VM_FAULT_SIGBUS; |
477 | ||
5f0694b3 | 478 | dax_region = dev_dax->region; |
dee41079 | 479 | if (dax_region->align > PMD_SIZE) { |
76202620 OH |
480 | dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", |
481 | __func__, dax_region->align, fault_size); | |
dee41079 DW |
482 | return VM_FAULT_SIGBUS; |
483 | } | |
484 | ||
485 | /* dax pmd mappings require pfn_t_devmap() */ | |
486 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
76202620 | 487 | dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); |
dee41079 DW |
488 | return VM_FAULT_SIGBUS; |
489 | } | |
490 | ||
0134ed4f DJ |
491 | if (fault_size < dax_region->align) |
492 | return VM_FAULT_SIGBUS; | |
493 | else if (fault_size > dax_region->align) | |
494 | return VM_FAULT_FALLBACK; | |
495 | ||
496 | /* if we are outside of the VMA */ | |
497 | if (pmd_addr < vmf->vma->vm_start || | |
498 | (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) | |
499 | return VM_FAULT_SIGBUS; | |
500 | ||
f4200391 | 501 | pgoff = linear_page_index(vmf->vma, pmd_addr); |
5f0694b3 | 502 | phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); |
dee41079 | 503 | if (phys == -1) { |
52084f89 | 504 | dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, |
dee41079 DW |
505 | pgoff); |
506 | return VM_FAULT_SIGBUS; | |
507 | } | |
508 | ||
509 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
510 | ||
f4200391 | 511 | return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, |
d8a849e1 | 512 | vmf->flags & FAULT_FLAG_WRITE); |
dee41079 DW |
513 | } |
514 | ||
9557feee | 515 | #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD |
5f0694b3 | 516 | static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) |
9557feee DJ |
517 | { |
518 | unsigned long pud_addr = vmf->address & PUD_MASK; | |
5f0694b3 | 519 | struct device *dev = &dev_dax->dev; |
9557feee DJ |
520 | struct dax_region *dax_region; |
521 | phys_addr_t phys; | |
522 | pgoff_t pgoff; | |
523 | pfn_t pfn; | |
70b085b0 DJ |
524 | unsigned int fault_size = PUD_SIZE; |
525 | ||
9557feee | 526 | |
5f0694b3 | 527 | if (check_vma(dev_dax, vmf->vma, __func__)) |
9557feee DJ |
528 | return VM_FAULT_SIGBUS; |
529 | ||
5f0694b3 | 530 | dax_region = dev_dax->region; |
9557feee | 531 | if (dax_region->align > PUD_SIZE) { |
76202620 OH |
532 | dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", |
533 | __func__, dax_region->align, fault_size); | |
9557feee DJ |
534 | return VM_FAULT_SIGBUS; |
535 | } | |
536 | ||
537 | /* dax pud mappings require pfn_t_devmap() */ | |
538 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
76202620 | 539 | dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); |
9557feee DJ |
540 | return VM_FAULT_SIGBUS; |
541 | } | |
542 | ||
70b085b0 DJ |
543 | if (fault_size < dax_region->align) |
544 | return VM_FAULT_SIGBUS; | |
545 | else if (fault_size > dax_region->align) | |
546 | return VM_FAULT_FALLBACK; | |
547 | ||
548 | /* if we are outside of the VMA */ | |
549 | if (pud_addr < vmf->vma->vm_start || | |
550 | (pud_addr + PUD_SIZE) > vmf->vma->vm_end) | |
551 | return VM_FAULT_SIGBUS; | |
552 | ||
9557feee | 553 | pgoff = linear_page_index(vmf->vma, pud_addr); |
5f0694b3 | 554 | phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); |
9557feee | 555 | if (phys == -1) { |
52084f89 | 556 | dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, |
9557feee DJ |
557 | pgoff); |
558 | return VM_FAULT_SIGBUS; | |
559 | } | |
560 | ||
561 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
562 | ||
563 | return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, | |
564 | vmf->flags & FAULT_FLAG_WRITE); | |
565 | } | |
566 | #else | |
5f0694b3 | 567 | static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) |
9557feee DJ |
568 | { |
569 | return VM_FAULT_FALLBACK; | |
570 | } | |
571 | #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ | |
572 | ||
5f0694b3 | 573 | static int dev_dax_huge_fault(struct vm_fault *vmf, |
c791ace1 | 574 | enum page_entry_size pe_size) |
dee41079 | 575 | { |
956a4cd2 | 576 | int rc, id; |
f4200391 | 577 | struct file *filp = vmf->vma->vm_file; |
5f0694b3 | 578 | struct dev_dax *dev_dax = filp->private_data; |
dee41079 | 579 | |
5f0694b3 | 580 | dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, |
d8a849e1 | 581 | current->comm, (vmf->flags & FAULT_FLAG_WRITE) |
f4200391 | 582 | ? "write" : "read", |
76202620 | 583 | vmf->vma->vm_start, vmf->vma->vm_end, pe_size); |
dee41079 | 584 | |
956a4cd2 | 585 | id = srcu_read_lock(&dax_srcu); |
c791ace1 DJ |
586 | switch (pe_size) { |
587 | case PE_SIZE_PTE: | |
5f0694b3 | 588 | rc = __dev_dax_pte_fault(dev_dax, vmf); |
a2d58167 | 589 | break; |
c791ace1 | 590 | case PE_SIZE_PMD: |
5f0694b3 | 591 | rc = __dev_dax_pmd_fault(dev_dax, vmf); |
9557feee | 592 | break; |
c791ace1 | 593 | case PE_SIZE_PUD: |
5f0694b3 | 594 | rc = __dev_dax_pud_fault(dev_dax, vmf); |
a2d58167 DJ |
595 | break; |
596 | default: | |
54eafcc9 | 597 | rc = VM_FAULT_SIGBUS; |
a2d58167 | 598 | } |
956a4cd2 | 599 | srcu_read_unlock(&dax_srcu, id); |
dee41079 DW |
600 | |
601 | return rc; | |
602 | } | |
603 | ||
5f0694b3 | 604 | static int dev_dax_fault(struct vm_fault *vmf) |
c791ace1 | 605 | { |
5f0694b3 | 606 | return dev_dax_huge_fault(vmf, PE_SIZE_PTE); |
c791ace1 DJ |
607 | } |
608 | ||
5f0694b3 DW |
609 | static const struct vm_operations_struct dax_vm_ops = { |
610 | .fault = dev_dax_fault, | |
611 | .huge_fault = dev_dax_huge_fault, | |
dee41079 DW |
612 | }; |
613 | ||
af69f51e | 614 | static int dax_mmap(struct file *filp, struct vm_area_struct *vma) |
dee41079 | 615 | { |
5f0694b3 | 616 | struct dev_dax *dev_dax = filp->private_data; |
dee41079 DW |
617 | int rc; |
618 | ||
5f0694b3 | 619 | dev_dbg(&dev_dax->dev, "%s\n", __func__); |
dee41079 | 620 | |
5f0694b3 | 621 | rc = check_vma(dev_dax, vma, __func__); |
dee41079 DW |
622 | if (rc) |
623 | return rc; | |
624 | ||
5f0694b3 | 625 | vma->vm_ops = &dax_vm_ops; |
dee41079 DW |
626 | vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; |
627 | return 0; | |
043a9255 DW |
628 | } |
629 | ||
630 | /* return an unmapped area aligned to the dax region specified alignment */ | |
af69f51e | 631 | static unsigned long dax_get_unmapped_area(struct file *filp, |
043a9255 DW |
632 | unsigned long addr, unsigned long len, unsigned long pgoff, |
633 | unsigned long flags) | |
634 | { | |
635 | unsigned long off, off_end, off_align, len_align, addr_align, align; | |
5f0694b3 | 636 | struct dev_dax *dev_dax = filp ? filp->private_data : NULL; |
043a9255 DW |
637 | struct dax_region *dax_region; |
638 | ||
5f0694b3 | 639 | if (!dev_dax || addr) |
043a9255 DW |
640 | goto out; |
641 | ||
5f0694b3 | 642 | dax_region = dev_dax->region; |
043a9255 DW |
643 | align = dax_region->align; |
644 | off = pgoff << PAGE_SHIFT; | |
645 | off_end = off + len; | |
646 | off_align = round_up(off, align); | |
647 | ||
648 | if ((off_end <= off_align) || ((off_end - off_align) < align)) | |
649 | goto out; | |
650 | ||
651 | len_align = len + align; | |
652 | if ((off + len_align) < off) | |
653 | goto out; | |
dee41079 | 654 | |
043a9255 DW |
655 | addr_align = current->mm->get_unmapped_area(filp, addr, len_align, |
656 | pgoff, flags); | |
657 | if (!IS_ERR_VALUE(addr_align)) { | |
658 | addr_align += (off - addr_align) & (align - 1); | |
659 | return addr_align; | |
660 | } | |
661 | out: | |
662 | return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); | |
663 | } | |
664 | ||
af69f51e | 665 | static int dax_open(struct inode *inode, struct file *filp) |
043a9255 | 666 | { |
5f0694b3 | 667 | struct dev_dax *dev_dax; |
043a9255 | 668 | |
5f0694b3 DW |
669 | dev_dax = container_of(inode->i_cdev, struct dev_dax, cdev); |
670 | dev_dbg(&dev_dax->dev, "%s\n", __func__); | |
671 | inode->i_mapping = dev_dax->inode->i_mapping; | |
672 | inode->i_mapping->host = dev_dax->inode; | |
3bc52c45 | 673 | filp->f_mapping = inode->i_mapping; |
5f0694b3 | 674 | filp->private_data = dev_dax; |
ebd84d72 | 675 | inode->i_flags = S_DAX; |
043a9255 | 676 | |
043a9255 DW |
677 | return 0; |
678 | } | |
dee41079 | 679 | |
af69f51e | 680 | static int dax_release(struct inode *inode, struct file *filp) |
043a9255 | 681 | { |
5f0694b3 | 682 | struct dev_dax *dev_dax = filp->private_data; |
043a9255 | 683 | |
5f0694b3 | 684 | dev_dbg(&dev_dax->dev, "%s\n", __func__); |
043a9255 | 685 | return 0; |
dee41079 DW |
686 | } |
687 | ||
ab68f262 DW |
688 | static const struct file_operations dax_fops = { |
689 | .llseek = noop_llseek, | |
690 | .owner = THIS_MODULE, | |
af69f51e DW |
691 | .open = dax_open, |
692 | .release = dax_release, | |
693 | .get_unmapped_area = dax_get_unmapped_area, | |
694 | .mmap = dax_mmap, | |
ab68f262 DW |
695 | }; |
696 | ||
5f0694b3 | 697 | static void dev_dax_release(struct device *dev) |
043a9255 | 698 | { |
5f0694b3 DW |
699 | struct dev_dax *dev_dax = to_dev_dax(dev); |
700 | struct dax_region *dax_region = dev_dax->region; | |
043a9255 | 701 | |
5f0694b3 | 702 | ida_simple_remove(&dax_region->ida, dev_dax->id); |
ebd84d72 DW |
703 | ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); |
704 | dax_region_put(dax_region); | |
5f0694b3 DW |
705 | iput(dev_dax->inode); |
706 | kfree(dev_dax); | |
ebd84d72 DW |
707 | } |
708 | ||
5f0694b3 | 709 | static void kill_dev_dax(struct dev_dax *dev_dax) |
ebd84d72 | 710 | { |
043a9255 | 711 | /* |
5f0694b3 | 712 | * Note, rcu is not protecting the liveness of dev_dax, rcu is |
043a9255 | 713 | * ensuring that any fault handlers that might have seen |
5f0694b3 | 714 | * dev_dax->alive == true, have completed. Any fault handlers |
956a4cd2 | 715 | * that start after synchronize_srcu() has started will abort |
5f0694b3 | 716 | * upon seeing dev_dax->alive == false. |
043a9255 | 717 | */ |
5f0694b3 | 718 | dev_dax->alive = false; |
956a4cd2 | 719 | synchronize_srcu(&dax_srcu); |
5f0694b3 | 720 | unmap_mapping_range(dev_dax->inode->i_mapping, 0, 0, 1); |
ed01e50a DW |
721 | } |
722 | ||
5f0694b3 | 723 | static void unregister_dev_dax(void *dev) |
ed01e50a | 724 | { |
5f0694b3 | 725 | struct dev_dax *dev_dax = to_dev_dax(dev); |
ed01e50a DW |
726 | |
727 | dev_dbg(dev, "%s\n", __func__); | |
728 | ||
5f0694b3 DW |
729 | kill_dev_dax(dev_dax); |
730 | cdev_device_del(&dev_dax->cdev, dev); | |
92a3fa07 | 731 | put_device(dev); |
043a9255 DW |
732 | } |
733 | ||
5f0694b3 | 734 | struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, |
d76911ee | 735 | struct resource *res, int count) |
043a9255 DW |
736 | { |
737 | struct device *parent = dax_region->dev; | |
5f0694b3 | 738 | struct dev_dax *dev_dax; |
9d2d01a0 | 739 | int rc = 0, minor, i; |
043a9255 | 740 | struct device *dev; |
ba09c01d | 741 | struct cdev *cdev; |
043a9255 DW |
742 | dev_t dev_t; |
743 | ||
5f0694b3 DW |
744 | dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); |
745 | if (!dev_dax) | |
d76911ee | 746 | return ERR_PTR(-ENOMEM); |
043a9255 | 747 | |
9d2d01a0 DW |
748 | for (i = 0; i < count; i++) { |
749 | if (!IS_ALIGNED(res[i].start, dax_region->align) | |
750 | || !IS_ALIGNED(resource_size(&res[i]), | |
751 | dax_region->align)) { | |
752 | rc = -EINVAL; | |
753 | break; | |
754 | } | |
5f0694b3 DW |
755 | dev_dax->res[i].start = res[i].start; |
756 | dev_dax->res[i].end = res[i].end; | |
9d2d01a0 DW |
757 | } |
758 | ||
759 | if (i < count) | |
760 | goto err_id; | |
761 | ||
5f0694b3 DW |
762 | dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); |
763 | if (dev_dax->id < 0) { | |
764 | rc = dev_dax->id; | |
043a9255 DW |
765 | goto err_id; |
766 | } | |
767 | ||
768 | minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); | |
769 | if (minor < 0) { | |
770 | rc = minor; | |
771 | goto err_minor; | |
772 | } | |
773 | ||
bc0a0fe9 | 774 | dev_t = MKDEV(MAJOR(dax_devt), minor); |
5f0694b3 DW |
775 | dev = &dev_dax->dev; |
776 | dev_dax->inode = dax_inode_get(&dev_dax->cdev, dev_t); | |
777 | if (!dev_dax->inode) { | |
3bc52c45 DW |
778 | rc = -ENOMEM; |
779 | goto err_inode; | |
780 | } | |
781 | ||
5f0694b3 | 782 | /* from here on we're committed to teardown via dev_dax_release() */ |
ebd84d72 | 783 | device_initialize(dev); |
ba09c01d | 784 | |
5f0694b3 | 785 | cdev = &dev_dax->cdev; |
ba09c01d DW |
786 | cdev_init(cdev, &dax_fops); |
787 | cdev->owner = parent->driver->owner; | |
ba09c01d | 788 | |
5f0694b3 DW |
789 | dev_dax->num_resources = count; |
790 | dev_dax->alive = true; | |
791 | dev_dax->region = dax_region; | |
ba09c01d DW |
792 | kref_get(&dax_region->kref); |
793 | ||
ebd84d72 DW |
794 | dev->devt = dev_t; |
795 | dev->class = dax_class; | |
796 | dev->parent = parent; | |
797 | dev->groups = dax_attribute_groups; | |
5f0694b3 DW |
798 | dev->release = dev_dax_release; |
799 | dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); | |
92a3fa07 LG |
800 | |
801 | rc = cdev_device_add(cdev, dev); | |
ebd84d72 | 802 | if (rc) { |
5f0694b3 | 803 | kill_dev_dax(dev_dax); |
ebd84d72 | 804 | put_device(dev); |
d76911ee | 805 | return ERR_PTR(rc); |
ebd84d72 | 806 | } |
043a9255 | 807 | |
5f0694b3 | 808 | rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); |
d76911ee DW |
809 | if (rc) |
810 | return ERR_PTR(rc); | |
811 | ||
5f0694b3 | 812 | return dev_dax; |
043a9255 | 813 | |
3bc52c45 | 814 | err_inode: |
ba09c01d | 815 | ida_simple_remove(&dax_minor_ida, minor); |
043a9255 | 816 | err_minor: |
5f0694b3 | 817 | ida_simple_remove(&dax_region->ida, dev_dax->id); |
043a9255 | 818 | err_id: |
5f0694b3 | 819 | kfree(dev_dax); |
043a9255 | 820 | |
d76911ee | 821 | return ERR_PTR(rc); |
043a9255 | 822 | } |
5f0694b3 | 823 | EXPORT_SYMBOL_GPL(devm_create_dev_dax); |
043a9255 | 824 | |
ab68f262 DW |
825 | static int __init dax_init(void) |
826 | { | |
827 | int rc; | |
828 | ||
3bc52c45 DW |
829 | rc = dax_inode_init(); |
830 | if (rc) | |
ab68f262 | 831 | return rc; |
3bc52c45 | 832 | |
ba09c01d DW |
833 | nr_dax = max(nr_dax, 256); |
834 | rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); | |
835 | if (rc) | |
3bc52c45 | 836 | goto err_chrdev; |
ab68f262 DW |
837 | |
838 | dax_class = class_create(THIS_MODULE, "dax"); | |
839 | if (IS_ERR(dax_class)) { | |
3bc52c45 DW |
840 | rc = PTR_ERR(dax_class); |
841 | goto err_class; | |
ab68f262 DW |
842 | } |
843 | ||
844 | return 0; | |
3bc52c45 DW |
845 | |
846 | err_class: | |
847 | unregister_chrdev_region(dax_devt, nr_dax); | |
848 | err_chrdev: | |
849 | dax_inode_exit(); | |
850 | return rc; | |
ab68f262 DW |
851 | } |
852 | ||
853 | static void __exit dax_exit(void) | |
854 | { | |
855 | class_destroy(dax_class); | |
ba09c01d | 856 | unregister_chrdev_region(dax_devt, nr_dax); |
ab68f262 | 857 | ida_destroy(&dax_minor_ida); |
3bc52c45 | 858 | dax_inode_exit(); |
ab68f262 DW |
859 | } |
860 | ||
861 | MODULE_AUTHOR("Intel Corporation"); | |
862 | MODULE_LICENSE("GPL v2"); | |
863 | subsys_initcall(dax_init); | |
864 | module_exit(dax_exit); |