Commit | Line | Data |
---|---|---|
ab68f262 DW |
1 | /* |
2 | * Copyright(c) 2016 Intel Corporation. All rights reserved. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of version 2 of the GNU General Public License as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but | |
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | */ | |
13 | #include <linux/pagemap.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/device.h> | |
16 | #include <linux/pfn_t.h> | |
17 | #include <linux/slab.h> | |
18 | #include <linux/dax.h> | |
19 | #include <linux/fs.h> | |
20 | #include <linux/mm.h> | |
ccdb07f6 | 21 | #include "dax.h" |
ab68f262 DW |
22 | |
/* dynamically allocated char-device major for all dax devices (see dax_init) */
static int dax_major;
/* device class under which dax character devices are created */
static struct class *dax_class;
/* minor-number allocator shared by all dax devices, across regions */
static DEFINE_IDA(dax_minor_ida);
26 | ||
/**
 * struct dax_region - mapping infrastructure for dax devices
 * @id: kernel-wide unique region for a memory range
 * @ida: allocator for child dax device ids within this region
 * @base: linear address corresponding to @res
 * @kref: to pin while other agents have a need to do lookups
 * @dev: parent device backing this region
 * @align: allocation and mapping alignment for child dax devices
 * @res: physical address range of the region
 * @pfn_flags: identify whether the pfns are paged back or not
 */
struct dax_region {
	int id;
	struct ida ida;
	void *base;
	struct kref kref;
	struct device *dev;
	unsigned int align;
	struct resource res;
	unsigned long pfn_flags;
};
47 | ||
/**
 * struct dax_dev - subdivision of a dax region
 * @region - parent region
 * @dev - device backing the character device
 * @kref - enable this data to be tracked in filp->private_data
 * @alive - !alive + rcu grace period == no new mappings can be established
 * @id - child id in the region
 * @num_resources - number of physical address extents in this device
 * @res - array of physical address ranges
 */
struct dax_dev {
	struct dax_region *region;
	struct device *dev;
	struct kref kref;
	bool alive;
	int id;
	int num_resources;
	/* zero-length trailing array: storage for @num_resources entries is
	 * allocated together with the struct in devm_create_dax_dev() */
	struct resource res[0];
};
67 | ||
68 | static void dax_region_free(struct kref *kref) | |
69 | { | |
70 | struct dax_region *dax_region; | |
71 | ||
72 | dax_region = container_of(kref, struct dax_region, kref); | |
73 | kfree(dax_region); | |
74 | } | |
75 | ||
/* drop a reference on @dax_region; frees it on the final put */
void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);
81 | ||
82 | static void dax_dev_free(struct kref *kref) | |
83 | { | |
84 | struct dax_dev *dax_dev; | |
85 | ||
86 | dax_dev = container_of(kref, struct dax_dev, kref); | |
87 | dax_region_put(dax_dev->region); | |
88 | kfree(dax_dev); | |
89 | } | |
90 | ||
/* drop a reference on @dax_dev; frees it (and unpins the region) on final put */
static void dax_dev_put(struct dax_dev *dax_dev)
{
	kref_put(&dax_dev->kref, dax_dev_free);
}
95 | ||
96 | struct dax_region *alloc_dax_region(struct device *parent, int region_id, | |
97 | struct resource *res, unsigned int align, void *addr, | |
98 | unsigned long pfn_flags) | |
99 | { | |
100 | struct dax_region *dax_region; | |
101 | ||
102 | dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); | |
103 | ||
104 | if (!dax_region) | |
105 | return NULL; | |
106 | ||
107 | memcpy(&dax_region->res, res, sizeof(*res)); | |
108 | dax_region->pfn_flags = pfn_flags; | |
109 | kref_init(&dax_region->kref); | |
110 | dax_region->id = region_id; | |
111 | ida_init(&dax_region->ida); | |
112 | dax_region->align = align; | |
113 | dax_region->dev = parent; | |
114 | dax_region->base = addr; | |
115 | ||
116 | return dax_region; | |
117 | } | |
118 | EXPORT_SYMBOL_GPL(alloc_dax_region); | |
119 | ||
120 | static ssize_t size_show(struct device *dev, | |
121 | struct device_attribute *attr, char *buf) | |
122 | { | |
123 | struct dax_dev *dax_dev = dev_get_drvdata(dev); | |
124 | unsigned long long size = 0; | |
125 | int i; | |
126 | ||
127 | for (i = 0; i < dax_dev->num_resources; i++) | |
128 | size += resource_size(&dax_dev->res[i]); | |
129 | ||
130 | return sprintf(buf, "%llu\n", size); | |
131 | } | |
132 | static DEVICE_ATTR_RO(size); | |
133 | ||
/* attributes published for every dax device (currently just "size") */
static struct attribute *dax_device_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dax_device_attribute_group = {
	.attrs = dax_device_attributes,
};

/* passed to device_create_with_groups() in devm_create_dax_dev() */
static const struct attribute_group *dax_attribute_groups[] = {
	&dax_device_attribute_group,
	NULL,
};
147 | ||
dee41079 DW |
148 | static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, |
149 | const char *func) | |
150 | { | |
151 | struct dax_region *dax_region = dax_dev->region; | |
152 | struct device *dev = dax_dev->dev; | |
153 | unsigned long mask; | |
154 | ||
155 | if (!dax_dev->alive) | |
156 | return -ENXIO; | |
157 | ||
158 | /* prevent private / writable mappings from being established */ | |
159 | if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) { | |
160 | dev_info(dev, "%s: %s: fail, attempted private mapping\n", | |
161 | current->comm, func); | |
162 | return -EINVAL; | |
163 | } | |
164 | ||
165 | mask = dax_region->align - 1; | |
166 | if (vma->vm_start & mask || vma->vm_end & mask) { | |
167 | dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", | |
168 | current->comm, func, vma->vm_start, vma->vm_end, | |
169 | mask); | |
170 | return -EINVAL; | |
171 | } | |
172 | ||
173 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV | |
174 | && (vma->vm_flags & VM_DONTCOPY) == 0) { | |
175 | dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", | |
176 | current->comm, func); | |
177 | return -EINVAL; | |
178 | } | |
179 | ||
180 | if (!vma_is_dax(vma)) { | |
181 | dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", | |
182 | current->comm, func); | |
183 | return -EINVAL; | |
184 | } | |
185 | ||
186 | return 0; | |
187 | } | |
188 | ||
189 | static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, | |
190 | unsigned long size) | |
191 | { | |
192 | struct resource *res; | |
193 | phys_addr_t phys; | |
194 | int i; | |
195 | ||
196 | for (i = 0; i < dax_dev->num_resources; i++) { | |
197 | res = &dax_dev->res[i]; | |
198 | phys = pgoff * PAGE_SIZE + res->start; | |
199 | if (phys >= res->start && phys <= res->end) | |
200 | break; | |
201 | pgoff -= PHYS_PFN(resource_size(res)); | |
202 | } | |
203 | ||
204 | if (i < dax_dev->num_resources) { | |
205 | res = &dax_dev->res[i]; | |
206 | if (phys + size - 1 <= res->end) | |
207 | return phys; | |
208 | } | |
209 | ||
210 | return -1; | |
211 | } | |
212 | ||
213 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |
214 | struct vm_fault *vmf) | |
215 | { | |
216 | unsigned long vaddr = (unsigned long) vmf->virtual_address; | |
217 | struct device *dev = dax_dev->dev; | |
218 | struct dax_region *dax_region; | |
219 | int rc = VM_FAULT_SIGBUS; | |
220 | phys_addr_t phys; | |
221 | pfn_t pfn; | |
222 | ||
223 | if (check_vma(dax_dev, vma, __func__)) | |
224 | return VM_FAULT_SIGBUS; | |
225 | ||
226 | dax_region = dax_dev->region; | |
227 | if (dax_region->align > PAGE_SIZE) { | |
228 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
229 | return VM_FAULT_SIGBUS; | |
230 | } | |
231 | ||
232 | phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); | |
233 | if (phys == -1) { | |
234 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
235 | vmf->pgoff); | |
236 | return VM_FAULT_SIGBUS; | |
237 | } | |
238 | ||
239 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
240 | ||
241 | rc = vm_insert_mixed(vma, vaddr, pfn); | |
242 | ||
243 | if (rc == -ENOMEM) | |
244 | return VM_FAULT_OOM; | |
245 | if (rc < 0 && rc != -EBUSY) | |
246 | return VM_FAULT_SIGBUS; | |
247 | ||
248 | return VM_FAULT_NOPAGE; | |
249 | } | |
250 | ||
251 | static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |
252 | { | |
253 | int rc; | |
254 | struct file *filp = vma->vm_file; | |
255 | struct dax_dev *dax_dev = filp->private_data; | |
256 | ||
257 | dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, | |
258 | current->comm, (vmf->flags & FAULT_FLAG_WRITE) | |
259 | ? "write" : "read", vma->vm_start, vma->vm_end); | |
260 | rcu_read_lock(); | |
261 | rc = __dax_dev_fault(dax_dev, vma, vmf); | |
262 | rcu_read_unlock(); | |
263 | ||
264 | return rc; | |
265 | } | |
266 | ||
267 | static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, | |
268 | struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, | |
269 | unsigned int flags) | |
270 | { | |
271 | unsigned long pmd_addr = addr & PMD_MASK; | |
272 | struct device *dev = dax_dev->dev; | |
273 | struct dax_region *dax_region; | |
274 | phys_addr_t phys; | |
275 | pgoff_t pgoff; | |
276 | pfn_t pfn; | |
277 | ||
278 | if (check_vma(dax_dev, vma, __func__)) | |
279 | return VM_FAULT_SIGBUS; | |
280 | ||
281 | dax_region = dax_dev->region; | |
282 | if (dax_region->align > PMD_SIZE) { | |
283 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
284 | return VM_FAULT_SIGBUS; | |
285 | } | |
286 | ||
287 | /* dax pmd mappings require pfn_t_devmap() */ | |
288 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
289 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
290 | return VM_FAULT_SIGBUS; | |
291 | } | |
292 | ||
293 | pgoff = linear_page_index(vma, pmd_addr); | |
294 | phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); | |
295 | if (phys == -1) { | |
296 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
297 | pgoff); | |
298 | return VM_FAULT_SIGBUS; | |
299 | } | |
300 | ||
301 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
302 | ||
303 | return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, | |
304 | flags & FAULT_FLAG_WRITE); | |
305 | } | |
306 | ||
307 | static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |
308 | pmd_t *pmd, unsigned int flags) | |
309 | { | |
310 | int rc; | |
311 | struct file *filp = vma->vm_file; | |
312 | struct dax_dev *dax_dev = filp->private_data; | |
313 | ||
314 | dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, | |
315 | current->comm, (flags & FAULT_FLAG_WRITE) | |
316 | ? "write" : "read", vma->vm_start, vma->vm_end); | |
317 | ||
318 | rcu_read_lock(); | |
319 | rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); | |
320 | rcu_read_unlock(); | |
321 | ||
322 | return rc; | |
323 | } | |
324 | ||
/* vma duplicated (fork/split): take an extra dax_dev reference */
static void dax_dev_vm_open(struct vm_area_struct *vma)
{
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s\n", __func__);
	kref_get(&dax_dev->kref);
}
333 | ||
/* vma torn down: drop the reference taken in mmap/vm_open */
static void dax_dev_vm_close(struct vm_area_struct *vma)
{
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s\n", __func__);
	dax_dev_put(dax_dev);
}
342 | ||
/* vma callbacks installed by dax_dev_mmap() */
static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.pmd_fault = dax_dev_pmd_fault,
	.open = dax_dev_vm_open,
	.close = dax_dev_vm_close,
};
349 | ||
/*
 * mmap() entry point: validate the vma, pin the dax_dev for the life
 * of the mapping (released via dax_dev_vm_close()), and install the
 * dax vm_ops.
 */
static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dax_dev *dax_dev = filp->private_data;
	int rc;

	dev_dbg(dax_dev->dev, "%s\n", __func__);

	rc = check_vma(dax_dev, vma, __func__);
	if (rc)
		return rc;

	kref_get(&dax_dev->kref);
	vma->vm_ops = &dax_dev_vm_ops;
	/* VM_MIXEDMAP for pfn inserts; VM_HUGEPAGE to invite pmd faults */
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
366 | ||
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_dev_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	/* no device context, or caller proposed an address: fall through */
	if (!dax_dev || addr)
		goto out;

	dax_region = dax_dev->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	/*
	 * Skip the alignment dance when the request cannot contain a
	 * full @align-sized extent past the first aligned file offset.
	 */
	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	/* over-allocate by @align so the result can be shifted; bail on
	 * arithmetic overflow */
	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		/* shift so that (addr - off) is a multiple of @align */
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
401 | ||
402 | static int __match_devt(struct device *dev, const void *data) | |
403 | { | |
404 | const dev_t *devt = data; | |
405 | ||
406 | return dev->devt == *devt; | |
407 | } | |
408 | ||
/* look up the dax device for @dev_t; returns a device reference or NULL */
static struct device *dax_dev_find(dev_t dev_t)
{
	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
}
413 | ||
414 | static int dax_dev_open(struct inode *inode, struct file *filp) | |
415 | { | |
416 | struct dax_dev *dax_dev = NULL; | |
417 | struct device *dev; | |
418 | ||
419 | dev = dax_dev_find(inode->i_rdev); | |
420 | if (!dev) | |
421 | return -ENXIO; | |
422 | ||
423 | device_lock(dev); | |
424 | dax_dev = dev_get_drvdata(dev); | |
425 | if (dax_dev) { | |
426 | dev_dbg(dev, "%s\n", __func__); | |
427 | filp->private_data = dax_dev; | |
428 | kref_get(&dax_dev->kref); | |
429 | inode->i_flags = S_DAX; | |
430 | } | |
431 | device_unlock(dev); | |
432 | ||
433 | if (!dax_dev) { | |
434 | put_device(dev); | |
435 | return -ENXIO; | |
436 | } | |
437 | return 0; | |
438 | } | |
dee41079 | 439 | |
043a9255 DW |
440 | static int dax_dev_release(struct inode *inode, struct file *filp) |
441 | { | |
442 | struct dax_dev *dax_dev = filp->private_data; | |
443 | struct device *dev = dax_dev->dev; | |
444 | ||
445 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
446 | dax_dev_put(dax_dev); | |
447 | put_device(dev); | |
448 | ||
449 | return 0; | |
dee41079 DW |
450 | } |
451 | ||
/* character device file operations for /dev/daxX.Y */
static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_dev_open,
	.release = dax_dev_release,
	.get_unmapped_area = dax_dev_get_unmapped_area,
	.mmap = dax_dev_mmap,
};
460 | ||
/*
 * devm teardown action registered by devm_create_dax_dev(): kill the
 * device, wait out in-flight fault handlers, then unwind registration
 * and id allocations.
 */
static void unregister_dax_dev(void *_dev)
{
	struct device *dev = _dev;
	struct dax_dev *dax_dev = dev_get_drvdata(dev);
	struct dax_region *dax_region = dax_dev->region;

	dev_dbg(dev, "%s\n", __func__);

	/*
	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
	 * ensuring that any fault handlers that might have seen
	 * dax_dev->alive == true, have completed. Any fault handlers
	 * that start after synchronize_rcu() has started will abort
	 * upon seeing dax_dev->alive == false.
	 */
	dax_dev->alive = false;
	synchronize_rcu();

	/* hold dev across unregister so dev->devt stays valid below */
	get_device(dev);
	device_unregister(dev);
	ida_simple_remove(&dax_region->ida, dax_dev->id);
	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
	put_device(dev);
	/* drop the creation-time reference from devm_create_dax_dev() */
	dax_dev_put(dax_dev);
}
486 | ||
/**
 * devm_create_dax_dev - create and register a dax character device
 * @dax_region: parent region; pinned for the lifetime of the device
 * @res: array of physical address extents backing the device
 * @count: number of entries in @res
 *
 * Allocates a dax_dev with trailing storage for @count resources,
 * assigns region-local and minor ids, and registers the char device.
 * Teardown is hooked to the region's parent device via devm
 * (unregister_dax_dev()). Returns 0 on success or a negative errno.
 */
int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
		int count)
{
	struct device *parent = dax_region->dev;
	struct dax_dev *dax_dev;
	struct device *dev;
	int rc, minor;
	dev_t dev_t;

	/* one allocation covers the struct plus the res[] flexible tail */
	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
	if (!dax_dev)
		return -ENOMEM;
	memcpy(dax_dev->res, res, sizeof(*res) * count);
	dax_dev->num_resources = count;
	kref_init(&dax_dev->kref);
	dax_dev->alive = true;
	dax_dev->region = dax_region;
	/* pin the region; released by dax_dev_free() on the final put */
	kref_get(&dax_region->kref);

	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
	if (dax_dev->id < 0) {
		rc = dax_dev->id;
		goto err_id;
	}

	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
	if (minor < 0) {
		rc = minor;
		goto err_minor;
	}

	dev_t = MKDEV(dax_major, minor);
	dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev,
			dax_attribute_groups, "dax%d.%d", dax_region->id,
			dax_dev->id);
	if (IS_ERR(dev)) {
		rc = PTR_ERR(dev);
		goto err_create;
	}
	dax_dev->dev = dev;

	/* on failure this runs unregister_dax_dev(), so no goto unwind */
	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
	if (rc)
		return rc;

	return 0;

 err_create:
	ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
	ida_simple_remove(&dax_region->ida, dax_dev->id);
 err_id:
	dax_dev_put(dax_dev);

	return rc;
}
EXPORT_SYMBOL_GPL(devm_create_dax_dev);
544 | ||
ab68f262 DW |
545 | static int __init dax_init(void) |
546 | { | |
547 | int rc; | |
548 | ||
549 | rc = register_chrdev(0, "dax", &dax_fops); | |
550 | if (rc < 0) | |
551 | return rc; | |
552 | dax_major = rc; | |
553 | ||
554 | dax_class = class_create(THIS_MODULE, "dax"); | |
555 | if (IS_ERR(dax_class)) { | |
556 | unregister_chrdev(dax_major, "dax"); | |
557 | return PTR_ERR(dax_class); | |
558 | } | |
559 | ||
560 | return 0; | |
561 | } | |
562 | ||
/* module exit: tear down in reverse order of dax_init() */
static void __exit dax_exit(void)
{
	class_destroy(dax_class);
	unregister_chrdev(dax_major, "dax");
	/* all minors must already be released via unregister_dax_dev() */
	ida_destroy(&dax_minor_ida);
}
569 | ||
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
/* subsys_initcall: register before device drivers that create dax regions */
subsys_initcall(dax_init);
module_exit(dax_exit);