device-dax: Start defining a dax bus model
[linux-2.6-block.git] / drivers / dax / device.c
CommitLineData
51cf784c
DW
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */
ab68f262
DW
3#include <linux/pagemap.h>
4#include <linux/module.h>
5#include <linux/device.h>
6#include <linux/pfn_t.h>
ba09c01d 7#include <linux/cdev.h>
ab68f262
DW
8#include <linux/slab.h>
9#include <linux/dax.h>
10#include <linux/fs.h>
11#include <linux/mm.h>
ef842302 12#include <linux/mman.h>
efebc711 13#include "dax-private.h"
51cf784c 14#include "bus.h"
ab68f262 15
/*
 * Device class assigned to every dev_dax device created by
 * devm_create_dev_dax(); created in dax_init(), destroyed in dax_exit().
 */
static struct class *dax_class;
ab68f262 17
5f0694b3 18static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
dee41079
DW
19 const char *func)
20{
5f0694b3
DW
21 struct dax_region *dax_region = dev_dax->region;
22 struct device *dev = &dev_dax->dev;
dee41079
DW
23 unsigned long mask;
24
7b6be844 25 if (!dax_alive(dev_dax->dax_dev))
dee41079
DW
26 return -ENXIO;
27
4cb19355 28 /* prevent private mappings from being established */
325896ff 29 if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
5a14e91d
JM
30 dev_info_ratelimited(dev,
31 "%s: %s: fail, attempted private mapping\n",
dee41079
DW
32 current->comm, func);
33 return -EINVAL;
34 }
35
36 mask = dax_region->align - 1;
37 if (vma->vm_start & mask || vma->vm_end & mask) {
5a14e91d
JM
38 dev_info_ratelimited(dev,
39 "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
dee41079
DW
40 current->comm, func, vma->vm_start, vma->vm_end,
41 mask);
42 return -EINVAL;
43 }
44
45 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
46 && (vma->vm_flags & VM_DONTCOPY) == 0) {
5a14e91d
JM
47 dev_info_ratelimited(dev,
48 "%s: %s: fail, dax range requires MADV_DONTFORK\n",
dee41079
DW
49 current->comm, func);
50 return -EINVAL;
51 }
52
53 if (!vma_is_dax(vma)) {
5a14e91d
JM
54 dev_info_ratelimited(dev,
55 "%s: %s: fail, vma is not DAX capable\n",
dee41079
DW
56 current->comm, func);
57 return -EINVAL;
58 }
59
60 return 0;
61}
62
efebc711 63/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
73616367 64__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
dee41079
DW
65 unsigned long size)
66{
753a0850
DW
67 struct resource *res = &dev_dax->region->res;
68 phys_addr_t phys;
dee41079 69
753a0850
DW
70 phys = pgoff * PAGE_SIZE + res->start;
71 if (phys >= res->start && phys <= res->end) {
dee41079
DW
72 if (phys + size - 1 <= res->end)
73 return phys;
74 }
75
76 return -1;
77}
78
226ab561 79static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
2232c638 80 struct vm_fault *vmf, pfn_t *pfn)
dee41079 81{
5f0694b3 82 struct device *dev = &dev_dax->dev;
dee41079 83 struct dax_region *dax_region;
dee41079 84 phys_addr_t phys;
0134ed4f 85 unsigned int fault_size = PAGE_SIZE;
dee41079 86
5f0694b3 87 if (check_vma(dev_dax, vmf->vma, __func__))
dee41079
DW
88 return VM_FAULT_SIGBUS;
89
5f0694b3 90 dax_region = dev_dax->region;
dee41079 91 if (dax_region->align > PAGE_SIZE) {
6daaca52
DW
92 dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
93 dax_region->align, fault_size);
dee41079
DW
94 return VM_FAULT_SIGBUS;
95 }
96
0134ed4f
DJ
97 if (fault_size != dax_region->align)
98 return VM_FAULT_SIGBUS;
99
73616367 100 phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
dee41079 101 if (phys == -1) {
6daaca52 102 dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", vmf->pgoff);
dee41079
DW
103 return VM_FAULT_SIGBUS;
104 }
105
2232c638 106 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
dee41079 107
2232c638 108 return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
dee41079
DW
109}
110
226ab561 111static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
2232c638 112 struct vm_fault *vmf, pfn_t *pfn)
dee41079 113{
d8a849e1 114 unsigned long pmd_addr = vmf->address & PMD_MASK;
5f0694b3 115 struct device *dev = &dev_dax->dev;
dee41079
DW
116 struct dax_region *dax_region;
117 phys_addr_t phys;
118 pgoff_t pgoff;
0134ed4f 119 unsigned int fault_size = PMD_SIZE;
dee41079 120
5f0694b3 121 if (check_vma(dev_dax, vmf->vma, __func__))
dee41079
DW
122 return VM_FAULT_SIGBUS;
123
5f0694b3 124 dax_region = dev_dax->region;
dee41079 125 if (dax_region->align > PMD_SIZE) {
6daaca52
DW
126 dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
127 dax_region->align, fault_size);
dee41079
DW
128 return VM_FAULT_SIGBUS;
129 }
130
131 /* dax pmd mappings require pfn_t_devmap() */
132 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
6daaca52 133 dev_dbg(dev, "region lacks devmap flags\n");
dee41079
DW
134 return VM_FAULT_SIGBUS;
135 }
136
0134ed4f
DJ
137 if (fault_size < dax_region->align)
138 return VM_FAULT_SIGBUS;
139 else if (fault_size > dax_region->align)
140 return VM_FAULT_FALLBACK;
141
142 /* if we are outside of the VMA */
143 if (pmd_addr < vmf->vma->vm_start ||
144 (pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
145 return VM_FAULT_SIGBUS;
146
f4200391 147 pgoff = linear_page_index(vmf->vma, pmd_addr);
73616367 148 phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
dee41079 149 if (phys == -1) {
6daaca52 150 dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff);
dee41079
DW
151 return VM_FAULT_SIGBUS;
152 }
153
2232c638 154 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
dee41079 155
2232c638 156 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
d8a849e1 157 vmf->flags & FAULT_FLAG_WRITE);
dee41079
DW
158}
159
9557feee 160#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
226ab561 161static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
2232c638 162 struct vm_fault *vmf, pfn_t *pfn)
9557feee
DJ
163{
164 unsigned long pud_addr = vmf->address & PUD_MASK;
5f0694b3 165 struct device *dev = &dev_dax->dev;
9557feee
DJ
166 struct dax_region *dax_region;
167 phys_addr_t phys;
168 pgoff_t pgoff;
70b085b0
DJ
169 unsigned int fault_size = PUD_SIZE;
170
9557feee 171
5f0694b3 172 if (check_vma(dev_dax, vmf->vma, __func__))
9557feee
DJ
173 return VM_FAULT_SIGBUS;
174
5f0694b3 175 dax_region = dev_dax->region;
9557feee 176 if (dax_region->align > PUD_SIZE) {
6daaca52
DW
177 dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
178 dax_region->align, fault_size);
9557feee
DJ
179 return VM_FAULT_SIGBUS;
180 }
181
182 /* dax pud mappings require pfn_t_devmap() */
183 if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
6daaca52 184 dev_dbg(dev, "region lacks devmap flags\n");
9557feee
DJ
185 return VM_FAULT_SIGBUS;
186 }
187
70b085b0
DJ
188 if (fault_size < dax_region->align)
189 return VM_FAULT_SIGBUS;
190 else if (fault_size > dax_region->align)
191 return VM_FAULT_FALLBACK;
192
193 /* if we are outside of the VMA */
194 if (pud_addr < vmf->vma->vm_start ||
195 (pud_addr + PUD_SIZE) > vmf->vma->vm_end)
196 return VM_FAULT_SIGBUS;
197
9557feee 198 pgoff = linear_page_index(vmf->vma, pud_addr);
73616367 199 phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
9557feee 200 if (phys == -1) {
6daaca52 201 dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff);
9557feee
DJ
202 return VM_FAULT_SIGBUS;
203 }
204
2232c638 205 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
9557feee 206
2232c638 207 return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
9557feee
DJ
208 vmf->flags & FAULT_FLAG_WRITE);
209}
210#else
226ab561 211static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
2232c638 212 struct vm_fault *vmf, pfn_t *pfn)
9557feee
DJ
213{
214 return VM_FAULT_FALLBACK;
215}
216#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
217
226ab561 218static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
c791ace1 219 enum page_entry_size pe_size)
dee41079 220{
f4200391 221 struct file *filp = vmf->vma->vm_file;
2232c638 222 unsigned long fault_size;
36bdac1e
SJ
223 vm_fault_t rc = VM_FAULT_SIGBUS;
224 int id;
2232c638 225 pfn_t pfn;
5f0694b3 226 struct dev_dax *dev_dax = filp->private_data;
dee41079 227
6daaca52
DW
228 dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
229 (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read",
76202620 230 vmf->vma->vm_start, vmf->vma->vm_end, pe_size);
dee41079 231
7b6be844 232 id = dax_read_lock();
c791ace1
DJ
233 switch (pe_size) {
234 case PE_SIZE_PTE:
2232c638
DW
235 fault_size = PAGE_SIZE;
236 rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
a2d58167 237 break;
c791ace1 238 case PE_SIZE_PMD:
2232c638
DW
239 fault_size = PMD_SIZE;
240 rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
9557feee 241 break;
c791ace1 242 case PE_SIZE_PUD:
2232c638
DW
243 fault_size = PUD_SIZE;
244 rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
a2d58167
DJ
245 break;
246 default:
54eafcc9 247 rc = VM_FAULT_SIGBUS;
a2d58167 248 }
2232c638
DW
249
250 if (rc == VM_FAULT_NOPAGE) {
251 unsigned long i;
35de2995 252 pgoff_t pgoff;
2232c638
DW
253
254 /*
255 * In the device-dax case the only possibility for a
256 * VM_FAULT_NOPAGE result is when device-dax capacity is
257 * mapped. No need to consider the zero page, or racing
258 * conflicting mappings.
259 */
35de2995
DW
260 pgoff = linear_page_index(vmf->vma, vmf->address
261 & ~(fault_size - 1));
2232c638
DW
262 for (i = 0; i < fault_size / PAGE_SIZE; i++) {
263 struct page *page;
264
265 page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
266 if (page->mapping)
267 continue;
268 page->mapping = filp->f_mapping;
35de2995 269 page->index = pgoff + i;
2232c638
DW
270 }
271 }
7b6be844 272 dax_read_unlock(id);
dee41079
DW
273
274 return rc;
275}
276
226ab561 277static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
c791ace1 278{
5f0694b3 279 return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
c791ace1
DJ
280}
281
9702cffd
DW
282static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
283{
284 struct file *filp = vma->vm_file;
285 struct dev_dax *dev_dax = filp->private_data;
286 struct dax_region *dax_region = dev_dax->region;
287
288 if (!IS_ALIGNED(addr, dax_region->align))
289 return -EINVAL;
290 return 0;
291}
292
c1d53b92
DW
293static unsigned long dev_dax_pagesize(struct vm_area_struct *vma)
294{
295 struct file *filp = vma->vm_file;
296 struct dev_dax *dev_dax = filp->private_data;
297 struct dax_region *dax_region = dev_dax->region;
298
299 return dax_region->align;
300}
301
5f0694b3
DW
302static const struct vm_operations_struct dax_vm_ops = {
303 .fault = dev_dax_fault,
304 .huge_fault = dev_dax_huge_fault,
9702cffd 305 .split = dev_dax_split,
c1d53b92 306 .pagesize = dev_dax_pagesize,
dee41079
DW
307};
308
af69f51e 309static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
dee41079 310{
5f0694b3 311 struct dev_dax *dev_dax = filp->private_data;
7b6be844 312 int rc, id;
dee41079 313
6daaca52 314 dev_dbg(&dev_dax->dev, "trace\n");
dee41079 315
7b6be844
DW
316 /*
317 * We lock to check dax_dev liveness and will re-check at
318 * fault time.
319 */
320 id = dax_read_lock();
5f0694b3 321 rc = check_vma(dev_dax, vma, __func__);
7b6be844 322 dax_read_unlock(id);
dee41079
DW
323 if (rc)
324 return rc;
325
5f0694b3 326 vma->vm_ops = &dax_vm_ops;
e1fb4a08 327 vma->vm_flags |= VM_HUGEPAGE;
dee41079 328 return 0;
043a9255
DW
329}
330
331/* return an unmapped area aligned to the dax region specified alignment */
af69f51e 332static unsigned long dax_get_unmapped_area(struct file *filp,
043a9255
DW
333 unsigned long addr, unsigned long len, unsigned long pgoff,
334 unsigned long flags)
335{
336 unsigned long off, off_end, off_align, len_align, addr_align, align;
5f0694b3 337 struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
043a9255
DW
338 struct dax_region *dax_region;
339
5f0694b3 340 if (!dev_dax || addr)
043a9255
DW
341 goto out;
342
5f0694b3 343 dax_region = dev_dax->region;
043a9255
DW
344 align = dax_region->align;
345 off = pgoff << PAGE_SHIFT;
346 off_end = off + len;
347 off_align = round_up(off, align);
348
349 if ((off_end <= off_align) || ((off_end - off_align) < align))
350 goto out;
351
352 len_align = len + align;
353 if ((off + len_align) < off)
354 goto out;
dee41079 355
043a9255
DW
356 addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
357 pgoff, flags);
358 if (!IS_ERR_VALUE(addr_align)) {
359 addr_align += (off - addr_align) & (align - 1);
360 return addr_align;
361 }
362 out:
363 return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
364}
365
41c9b1be
DJ
366static const struct address_space_operations dev_dax_aops = {
367 .set_page_dirty = noop_set_page_dirty,
368 .invalidatepage = noop_invalidatepage,
369};
370
af69f51e 371static int dax_open(struct inode *inode, struct file *filp)
043a9255 372{
7b6be844
DW
373 struct dax_device *dax_dev = inode_dax(inode);
374 struct inode *__dax_inode = dax_inode(dax_dev);
375 struct dev_dax *dev_dax = dax_get_private(dax_dev);
043a9255 376
6daaca52 377 dev_dbg(&dev_dax->dev, "trace\n");
7b6be844
DW
378 inode->i_mapping = __dax_inode->i_mapping;
379 inode->i_mapping->host = __dax_inode;
41c9b1be 380 inode->i_mapping->a_ops = &dev_dax_aops;
3bc52c45 381 filp->f_mapping = inode->i_mapping;
5660e13d 382 filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
5f0694b3 383 filp->private_data = dev_dax;
ebd84d72 384 inode->i_flags = S_DAX;
043a9255 385
043a9255
DW
386 return 0;
387}
dee41079 388
af69f51e 389static int dax_release(struct inode *inode, struct file *filp)
043a9255 390{
5f0694b3 391 struct dev_dax *dev_dax = filp->private_data;
043a9255 392
6daaca52 393 dev_dbg(&dev_dax->dev, "trace\n");
043a9255 394 return 0;
dee41079
DW
395}
396
ab68f262
DW
397static const struct file_operations dax_fops = {
398 .llseek = noop_llseek,
399 .owner = THIS_MODULE,
af69f51e
DW
400 .open = dax_open,
401 .release = dax_release,
402 .get_unmapped_area = dax_get_unmapped_area,
403 .mmap = dax_mmap,
ef842302 404 .mmap_supported_flags = MAP_SYNC,
ab68f262
DW
405};
406
5f0694b3 407static void dev_dax_release(struct device *dev)
043a9255 408{
5f0694b3
DW
409 struct dev_dax *dev_dax = to_dev_dax(dev);
410 struct dax_region *dax_region = dev_dax->region;
7b6be844 411 struct dax_device *dax_dev = dev_dax->dax_dev;
043a9255 412
ebd84d72 413 dax_region_put(dax_region);
7b6be844 414 put_dax(dax_dev);
5f0694b3 415 kfree(dev_dax);
ebd84d72
DW
416}
417
753a0850 418struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, int id)
043a9255
DW
419{
420 struct device *parent = dax_region->dev;
7b6be844 421 struct dax_device *dax_dev;
5f0694b3 422 struct dev_dax *dev_dax;
7b6be844 423 struct inode *inode;
043a9255 424 struct device *dev;
ba09c01d 425 struct cdev *cdev;
753a0850 426 int rc;
43fe51e1 427
753a0850 428 dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL);
5f0694b3 429 if (!dev_dax)
d76911ee 430 return ERR_PTR(-ENOMEM);
043a9255 431
6568b08b
DW
432 /*
433 * No 'host' or dax_operations since there is no access to this
434 * device outside of mmap of the resulting character device.
435 */
436 dax_dev = alloc_dax(dev_dax, NULL, NULL);
43fe51e1
DW
437 if (!dax_dev) {
438 rc = -ENOMEM;
21b9e979 439 goto err;
43fe51e1 440 }
3bc52c45 441
7b6be844 442 /* from here on we're committed to teardown via dax_dev_release() */
5f0694b3 443 dev = &dev_dax->dev;
ebd84d72 444 device_initialize(dev);
ba09c01d 445
7b6be844
DW
446 inode = dax_inode(dax_dev);
447 cdev = inode->i_cdev;
ba09c01d
DW
448 cdev_init(cdev, &dax_fops);
449 cdev->owner = parent->driver->owner;
ba09c01d 450
7b6be844 451 dev_dax->dax_dev = dax_dev;
5f0694b3 452 dev_dax->region = dax_region;
ba09c01d
DW
453 kref_get(&dax_region->kref);
454
7b6be844 455 dev->devt = inode->i_rdev;
ebd84d72
DW
456 dev->class = dax_class;
457 dev->parent = parent;
458 dev->groups = dax_attribute_groups;
5f0694b3 459 dev->release = dev_dax_release;
bbb3be17 460 dev_set_name(dev, "dax%d.%d", dax_region->id, id);
92a3fa07
LG
461
462 rc = cdev_device_add(cdev, dev);
ebd84d72 463 if (rc) {
5f0694b3 464 kill_dev_dax(dev_dax);
ebd84d72 465 put_device(dev);
d76911ee 466 return ERR_PTR(rc);
ebd84d72 467 }
043a9255 468
5f0694b3 469 rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
d76911ee
DW
470 if (rc)
471 return ERR_PTR(rc);
472
5f0694b3 473 return dev_dax;
043a9255 474
21b9e979 475 err:
5f0694b3 476 kfree(dev_dax);
043a9255 477
d76911ee 478 return ERR_PTR(rc);
043a9255 479}
5f0694b3 480EXPORT_SYMBOL_GPL(devm_create_dev_dax);
043a9255 481
ab68f262
DW
482static int __init dax_init(void)
483{
ab68f262 484 dax_class = class_create(THIS_MODULE, "dax");
7b6be844 485 return PTR_ERR_OR_ZERO(dax_class);
ab68f262
DW
486}
487
488static void __exit dax_exit(void)
489{
490 class_destroy(dax_class);
ab68f262
DW
491}
492
493MODULE_AUTHOR("Intel Corporation");
494MODULE_LICENSE("GPL v2");
495subsys_initcall(dax_init);
496module_exit(dax_exit);