Commit | Line | Data |
---|---|---|
ab68f262 DW |
1 | /* |
2 | * Copyright(c) 2016 Intel Corporation. All rights reserved. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of version 2 of the GNU General Public License as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but | |
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | */ | |
13 | #include <linux/pagemap.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/device.h> | |
16 | #include <linux/pfn_t.h> | |
17 | #include <linux/slab.h> | |
18 | #include <linux/dax.h> | |
19 | #include <linux/fs.h> | |
20 | #include <linux/mm.h> | |
21 | ||
/* chrdev major number allocated by register_chrdev() in dax_init() */
static int dax_major;
/* device class under which "dax%d.%d" character devices are created */
static struct class *dax_class;
/* allocator of char-device minor numbers, shared across all regions */
static DEFINE_IDA(dax_minor_ida);
25 | ||
/**
 * struct dax_region - mapping infrastructure for dax devices
 * @id: kernel-wide unique region for a memory range
 * @ida: allocator of child (dax_dev) ids within this region
 * @base: linear address corresponding to @res
 * @kref: to pin while other agents have a need to do lookups
 * @dev: parent device backing this region
 * @align: allocation and mapping alignment for child dax devices
 * @res: physical address range of the region
 * @pfn_flags: identify whether the pfns are paged back or not
 */
struct dax_region {
	int id;
	struct ida ida;
	void *base;
	struct kref kref;
	struct device *dev;
	unsigned int align;
	struct resource res;
	unsigned long pfn_flags;
};
46 | ||
47 | /** | |
48 | * struct dax_dev - subdivision of a dax region | |
49 | * @region - parent region | |
50 | * @dev - device backing the character device | |
51 | * @kref - enable this data to be tracked in filp->private_data | |
dee41079 | 52 | * @alive - !alive + rcu grace period == no new mappings can be established |
ab68f262 DW |
53 | * @id - child id in the region |
54 | * @num_resources - number of physical address extents in this device | |
55 | * @res - array of physical address ranges | |
56 | */ | |
57 | struct dax_dev { | |
58 | struct dax_region *region; | |
59 | struct device *dev; | |
60 | struct kref kref; | |
dee41079 | 61 | bool alive; |
ab68f262 DW |
62 | int id; |
63 | int num_resources; | |
64 | struct resource res[0]; | |
65 | }; | |
66 | ||
67 | static void dax_region_free(struct kref *kref) | |
68 | { | |
69 | struct dax_region *dax_region; | |
70 | ||
71 | dax_region = container_of(kref, struct dax_region, kref); | |
72 | kfree(dax_region); | |
73 | } | |
74 | ||
/* drop a reference taken by alloc_dax_region() or devm_create_dax_dev() */
void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);
80 | ||
81 | static void dax_dev_free(struct kref *kref) | |
82 | { | |
83 | struct dax_dev *dax_dev; | |
84 | ||
85 | dax_dev = container_of(kref, struct dax_dev, kref); | |
86 | dax_region_put(dax_dev->region); | |
87 | kfree(dax_dev); | |
88 | } | |
89 | ||
/* drop a dax_dev reference; the last put releases via dax_dev_free() */
static void dax_dev_put(struct dax_dev *dax_dev)
{
	kref_put(&dax_dev->kref, dax_dev_free);
}
94 | ||
95 | struct dax_region *alloc_dax_region(struct device *parent, int region_id, | |
96 | struct resource *res, unsigned int align, void *addr, | |
97 | unsigned long pfn_flags) | |
98 | { | |
99 | struct dax_region *dax_region; | |
100 | ||
101 | dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); | |
102 | ||
103 | if (!dax_region) | |
104 | return NULL; | |
105 | ||
106 | memcpy(&dax_region->res, res, sizeof(*res)); | |
107 | dax_region->pfn_flags = pfn_flags; | |
108 | kref_init(&dax_region->kref); | |
109 | dax_region->id = region_id; | |
110 | ida_init(&dax_region->ida); | |
111 | dax_region->align = align; | |
112 | dax_region->dev = parent; | |
113 | dax_region->base = addr; | |
114 | ||
115 | return dax_region; | |
116 | } | |
117 | EXPORT_SYMBOL_GPL(alloc_dax_region); | |
118 | ||
119 | static ssize_t size_show(struct device *dev, | |
120 | struct device_attribute *attr, char *buf) | |
121 | { | |
122 | struct dax_dev *dax_dev = dev_get_drvdata(dev); | |
123 | unsigned long long size = 0; | |
124 | int i; | |
125 | ||
126 | for (i = 0; i < dax_dev->num_resources; i++) | |
127 | size += resource_size(&dax_dev->res[i]); | |
128 | ||
129 | return sprintf(buf, "%llu\n", size); | |
130 | } | |
131 | static DEVICE_ATTR_RO(size); | |
132 | ||
/* per-device sysfs attributes (currently just "size") */
static struct attribute *dax_device_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dax_device_attribute_group = {
	.attrs = dax_device_attributes,
};

/* group list handed to device_create_with_groups() at device creation */
static const struct attribute_group *dax_attribute_groups[] = {
	&dax_device_attribute_group,
	NULL,
};
146 | ||
/*
 * devm teardown action: stop new mappings from being established, wait
 * for in-flight fault handlers, then unwind everything set up by
 * devm_create_dax_dev().
 */
static void unregister_dax_dev(void *_dev)
{
	struct device *dev = _dev;
	struct dax_dev *dax_dev = dev_get_drvdata(dev);
	struct dax_region *dax_region = dax_dev->region;

	dev_dbg(dev, "%s\n", __func__);

	/*
	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
	 * ensuring that any fault handlers that might have seen
	 * dax_dev->alive == true, have completed. Any fault handlers
	 * that start after synchronize_rcu() has started will abort
	 * upon seeing dax_dev->alive == false.
	 */
	dax_dev->alive = false;
	synchronize_rcu();

	/* pin @dev so dev->devt stays readable after device_unregister() */
	get_device(dev);
	device_unregister(dev);
	ida_simple_remove(&dax_region->ida, dax_dev->id);
	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
	put_device(dev);
	dax_dev_put(dax_dev);
}
172 | ||
/**
 * devm_create_dax_dev - create a character device for a dax region
 * @dax_region: parent region
 * @res: array of physical address extents backing the device
 * @count: number of entries in @res
 *
 * Allocates a region-local id and a chrdev minor, creates the
 * "dax%d.%d" device, and registers a devm action so the device is torn
 * down when @dax_region->dev is unbound.  Returns 0 or negative errno.
 */
int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
		int count)
{
	struct device *parent = dax_region->dev;
	struct dax_dev *dax_dev;
	struct device *dev;
	int rc, minor;
	dev_t dev_t;

	/* dax_dev ends in a variable-length array of @count resources */
	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
	if (!dax_dev)
		return -ENOMEM;
	memcpy(dax_dev->res, res, sizeof(*res) * count);
	dax_dev->num_resources = count;
	kref_init(&dax_dev->kref);
	dax_dev->alive = true;
	dax_dev->region = dax_region;
	kref_get(&dax_region->kref);	/* dropped by dax_dev_free() */

	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
	if (dax_dev->id < 0) {
		rc = dax_dev->id;
		goto err_id;
	}

	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
	if (minor < 0) {
		rc = minor;
		goto err_minor;
	}

	dev_t = MKDEV(dax_major, minor);
	dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev,
			dax_attribute_groups, "dax%d.%d", dax_region->id,
			dax_dev->id);
	if (IS_ERR(dev)) {
		rc = PTR_ERR(dev);
		goto err_create;
	}
	dax_dev->dev = dev;

	/* on failure the action already ran, so no local unwind is needed */
	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
	if (rc)
		return rc;

	return 0;

err_create:
	ida_simple_remove(&dax_minor_ida, minor);
err_minor:
	ida_simple_remove(&dax_region->ida, dax_dev->id);
err_id:
	dax_dev_put(dax_dev);

	return rc;
}
EXPORT_SYMBOL_GPL(devm_create_dax_dev);
230 | ||
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_dev_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	/* a caller-supplied hint address defeats alignment; fall through */
	if (!dax_dev || addr)
		goto out;

	dax_region = dax_dev->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	/* request too small to contain an @align aligned chunk? fall through */
	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	/* over-allocate so the result can be shifted into alignment */
	len_align = len + align;
	if ((off + len_align) < off)	/* overflow check */
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		/* shift so addr_align is congruent to off modulo align */
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
265 | ||
266 | static int __match_devt(struct device *dev, const void *data) | |
267 | { | |
268 | const dev_t *devt = data; | |
269 | ||
270 | return dev->devt == *devt; | |
271 | } | |
272 | ||
/*
 * Look up the dax device registered for @dev_t.  On success the
 * returned device carries an elevated reference count that the caller
 * must drop with put_device().
 */
static struct device *dax_dev_find(dev_t dev_t)
{
	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
}
277 | ||
static int dax_dev_open(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev = NULL;
	struct device *dev;

	dev = dax_dev_find(inode->i_rdev);
	if (!dev)
		return -ENXIO;

	/*
	 * Under the device lock, re-check liveness via drvdata and take
	 * our own dax_dev reference before any fault can use it.
	 * NOTE(review): assumes drvdata is unavailable once the device
	 * is being torn down -- verify against device core behavior.
	 */
	device_lock(dev);
	dax_dev = dev_get_drvdata(dev);
	if (dax_dev) {
		dev_dbg(dev, "%s\n", __func__);
		filp->private_data = dax_dev;
		kref_get(&dax_dev->kref);	/* dropped in dax_dev_release() */
		inode->i_flags = S_DAX;
	}
	device_unlock(dev);

	if (!dax_dev) {
		/* lost a race with teardown: drop the find reference */
		put_device(dev);
		return -ENXIO;
	}
	return 0;
}
303 | ||
static int dax_dev_release(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev = filp->private_data;
	/* save @dev first: dax_dev_put() below may free dax_dev */
	struct device *dev = dax_dev->dev;

	dev_dbg(dax_dev->dev, "%s\n", __func__);
	dax_dev_put(dax_dev);	/* pairs with kref_get() in dax_dev_open() */
	put_device(dev);	/* pairs with dax_dev_find() in dax_dev_open() */

	return 0;
}
315 | ||
316 | static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |
317 | const char *func) | |
318 | { | |
319 | struct dax_region *dax_region = dax_dev->region; | |
320 | struct device *dev = dax_dev->dev; | |
321 | unsigned long mask; | |
322 | ||
323 | if (!dax_dev->alive) | |
324 | return -ENXIO; | |
325 | ||
326 | /* prevent private / writable mappings from being established */ | |
327 | if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) { | |
328 | dev_info(dev, "%s: %s: fail, attempted private mapping\n", | |
329 | current->comm, func); | |
330 | return -EINVAL; | |
331 | } | |
332 | ||
333 | mask = dax_region->align - 1; | |
334 | if (vma->vm_start & mask || vma->vm_end & mask) { | |
335 | dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", | |
336 | current->comm, func, vma->vm_start, vma->vm_end, | |
337 | mask); | |
338 | return -EINVAL; | |
339 | } | |
340 | ||
341 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV | |
342 | && (vma->vm_flags & VM_DONTCOPY) == 0) { | |
343 | dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", | |
344 | current->comm, func); | |
345 | return -EINVAL; | |
346 | } | |
347 | ||
348 | if (!vma_is_dax(vma)) { | |
349 | dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", | |
350 | current->comm, func); | |
351 | return -EINVAL; | |
352 | } | |
353 | ||
354 | return 0; | |
355 | } | |
356 | ||
/*
 * Translate a file page offset into a physical address within one of
 * the device's extents, verifying that @size bytes starting there fit
 * inside that extent.  Returns -1 (all-ones phys_addr_t) on failure.
 */
static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	phys_addr_t phys;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++) {
		res = &dax_dev->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		/* not in this extent: rebase pgoff against the next one */
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dax_dev->num_resources) {
		res = &dax_dev->res[i];
		/* ensure the whole [phys, phys + size) range fits */
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}
380 | ||
/* handle a pte-sized fault; caller holds rcu_read_lock() */
static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
		struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long) vmf->virtual_address;
	struct device *dev = dax_dev->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;

	if (check_vma(dax_dev, vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	/* a single-page fault cannot satisfy a larger region alignment */
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vma, vaddr, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	/* -EBUSY: another thread already installed this pte; not an error */
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}
418 | ||
/* pte fault entry point: runs __dax_dev_fault() under rcu */
static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int rc;
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			? "write" : "read", vma->vm_start, vma->vm_end);
	/* pairs with synchronize_rcu() in unregister_dax_dev() */
	rcu_read_lock();
	rc = __dax_dev_fault(dax_dev, vma, vmf);
	rcu_read_unlock();

	return rc;
}
434 | ||
435 | static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, | |
436 | struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, | |
437 | unsigned int flags) | |
438 | { | |
439 | unsigned long pmd_addr = addr & PMD_MASK; | |
440 | struct device *dev = dax_dev->dev; | |
441 | struct dax_region *dax_region; | |
442 | phys_addr_t phys; | |
443 | pgoff_t pgoff; | |
444 | pfn_t pfn; | |
445 | ||
446 | if (check_vma(dax_dev, vma, __func__)) | |
447 | return VM_FAULT_SIGBUS; | |
448 | ||
449 | dax_region = dax_dev->region; | |
450 | if (dax_region->align > PMD_SIZE) { | |
451 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
452 | return VM_FAULT_SIGBUS; | |
453 | } | |
454 | ||
455 | /* dax pmd mappings require pfn_t_devmap() */ | |
456 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
457 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
458 | return VM_FAULT_SIGBUS; | |
459 | } | |
460 | ||
461 | pgoff = linear_page_index(vma, pmd_addr); | |
462 | phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); | |
463 | if (phys == -1) { | |
464 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
465 | pgoff); | |
466 | return VM_FAULT_SIGBUS; | |
467 | } | |
468 | ||
469 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
470 | ||
471 | return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, | |
472 | flags & FAULT_FLAG_WRITE); | |
473 | } | |
474 | ||
/* pmd fault entry point: runs __dax_dev_pmd_fault() under rcu */
static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, unsigned int flags)
{
	int rc;
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (flags & FAULT_FLAG_WRITE)
			? "write" : "read", vma->vm_start, vma->vm_end);

	/* pairs with synchronize_rcu() in unregister_dax_dev() */
	rcu_read_lock();
	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
	rcu_read_unlock();

	return rc;
}
492 | ||
493 | static void dax_dev_vm_open(struct vm_area_struct *vma) | |
494 | { | |
495 | struct file *filp = vma->vm_file; | |
496 | struct dax_dev *dax_dev = filp->private_data; | |
497 | ||
498 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
499 | kref_get(&dax_dev->kref); | |
500 | } | |
501 | ||
502 | static void dax_dev_vm_close(struct vm_area_struct *vma) | |
503 | { | |
504 | struct file *filp = vma->vm_file; | |
505 | struct dax_dev *dax_dev = filp->private_data; | |
506 | ||
507 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
508 | dax_dev_put(dax_dev); | |
509 | } | |
510 | ||
/* vm operations installed on every mapping made via dax_dev_mmap() */
static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.pmd_fault = dax_dev_pmd_fault,
	.open = dax_dev_vm_open,
	.close = dax_dev_vm_close,
};
517 | ||
518 | static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) | |
519 | { | |
520 | struct dax_dev *dax_dev = filp->private_data; | |
521 | int rc; | |
522 | ||
523 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
524 | ||
525 | rc = check_vma(dax_dev, vma, __func__); | |
526 | if (rc) | |
527 | return rc; | |
528 | ||
529 | kref_get(&dax_dev->kref); | |
530 | vma->vm_ops = &dax_dev_vm_ops; | |
531 | vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; | |
532 | return 0; | |
533 | ||
534 | } | |
535 | ||
/* file operations for the "dax%d.%d" character devices */
static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_dev_open,
	.release = dax_dev_release,
	.get_unmapped_area = dax_dev_get_unmapped_area,
	.mmap = dax_dev_mmap,
};
544 | ||
545 | static int __init dax_init(void) | |
546 | { | |
547 | int rc; | |
548 | ||
549 | rc = register_chrdev(0, "dax", &dax_fops); | |
550 | if (rc < 0) | |
551 | return rc; | |
552 | dax_major = rc; | |
553 | ||
554 | dax_class = class_create(THIS_MODULE, "dax"); | |
555 | if (IS_ERR(dax_class)) { | |
556 | unregister_chrdev(dax_major, "dax"); | |
557 | return PTR_ERR(dax_class); | |
558 | } | |
559 | ||
560 | return 0; | |
561 | } | |
562 | ||
/* module teardown: reverse of dax_init(), plus minor-ida cleanup */
static void __exit dax_exit(void)
{
	class_destroy(dax_class);
	unregister_chrdev(dax_major, "dax");
	ida_destroy(&dax_minor_ida);
}
569 | ||
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
/*
 * NOTE(review): registered at subsys_initcall rather than module_init,
 * presumably so the dax core is available before region-provider
 * drivers probe -- confirm against the callers of alloc_dax_region().
 */
subsys_initcall(dax_init);
module_exit(dax_exit);