Merge tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdim...
index 08656a2f2aa6ca9c76c966f44fc5b08bba5b12a6..641192808bb69054a634b6f5b229c2ca25f308bf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
        }
 }
 
+static struct page *dax_busy_page(void *entry)
+{
+       unsigned long pfn;
+
+       for_each_mapped_pfn(entry, pfn) {
+               struct page *page = pfn_to_page(pfn);
+
+               if (page_ref_count(page) > 1)
+                       return page;
+       }
+       return NULL;
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ restart:
        return entry;
 }
 
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page_ref_count() == 1. A filesystem uses this interface to determine
+ * if any page in the mapping is busy, e.g. pinned for DMA or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * against mapping_mapped() becoming true.
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+       pgoff_t indices[PAGEVEC_SIZE];
+       struct page *page = NULL;
+       struct pagevec pvec;
+       pgoff_t index, end;
+       unsigned i;
+
+       /*
+        * In the 'limited' case get_user_pages() for dax is disabled.
+        */
+       if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+               return NULL;
+
+       if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+               return NULL;
+
+       pagevec_init(&pvec);
+       index = 0;
+       end = -1;
+
+       /*
+        * If we race get_user_pages_fast() here either we'll see the
+        * elevated page count in the pagevec_lookup and wait, or
+        * get_user_pages_fast() will see that the page it took a reference
+        * against is no longer mapped in the page tables and bail to the
+        * get_user_pages() slow path.  The slow path is protected by
+        * pte_lock() and pmd_lock(). New references are not taken without
+        * holding those locks, and unmap_mapping_range() will not zero the
+        * pte or pmd without holding the respective lock, so we are
+        * guaranteed to either see new references or prevent new
+        * references from being established.
+        */
+       unmap_mapping_range(mapping, 0, 0, 1);
+
+       while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+                               min(end - index, (pgoff_t)PAGEVEC_SIZE),
+                               indices)) {
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *pvec_ent = pvec.pages[i];
+                       void *entry;
+
+                       index = indices[i];
+                       if (index >= end)
+                               break;
+
+                       if (!radix_tree_exceptional_entry(pvec_ent))
+                               continue;
+
+                       xa_lock_irq(&mapping->i_pages);
+                       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+                       if (entry)
+                               page = dax_busy_page(entry);
+                       put_unlocked_mapping_entry(mapping, index, entry);
+                       xa_unlock_irq(&mapping->i_pages);
+                       if (page)
+                               break;
+               }
+               pagevec_remove_exceptionals(&pvec);
+               pagevec_release(&pvec);
+               index++;
+
+               if (page)
+                       break;
+       }
+       return page;
+}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
                                          pgoff_t index, bool trunc)
 {
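
To make the calling convention above concrete, here is a minimal, illustrative
sketch of a filesystem-side caller, assuming the caller already holds the
locks that block establishment of new mappings. The my_fs_* names are
hypothetical; a real user (e.g. XFS) additionally drops and retakes its locks
while waiting so that the path holding the page reference can make progress.

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/wait_bit.h>

/*
 * Hypothetical helper: sleep until the pinned page's reference count
 * returns to 1, the idle count for ZONE_DEVICE pages.  Assumes the
 * device driver wakes waiters on &page->_refcount when the pinning
 * reference is dropped.
 */
static void my_fs_wait_for_page_idle(struct page *page)
{
        wait_var_event(&page->_refcount,
                       atomic_read(&page->_refcount) == 1);
}

/* Hypothetical caller: returns 0 if the layout may change, 1 to retry. */
static int my_fs_break_dax_layouts(struct inode *inode)
{
        struct page *page;

        page = dax_layout_busy_page(inode->i_mapping);
        if (!page)
                return 0;       /* nothing is pinned */

        /* A page is still pinned, e.g. by get_user_pages() for DMA. */
        my_fs_wait_for_page_idle(page);
        return 1;               /* caller should revalidate and retry */
}
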
@@ -912,7 +1009,6 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
        unsigned long vaddr = vmf->address;
        vm_fault_t ret = VM_FAULT_NOPAGE;
        struct page *zero_page;
-       void *entry2;
        pfn_t pfn;
 
        zero_page = ZERO_PAGE(0);
@@ -922,13 +1018,8 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
        }
 
        pfn = page_to_pfn_t(zero_page);
-       entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-                       RADIX_DAX_ZERO_PAGE, false);
-       if (IS_ERR(entry2)) {
-               ret = VM_FAULT_SIGBUS;
-               goto out;
-       }
-
+       dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
+                       false);
        ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
 out:
        trace_dax_load_hole(inode, vmf, ret);
@@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
        struct iov_iter *iter = data;
        loff_t end = pos + length, done = 0;
        ssize_t ret = 0;
+       size_t xfer;
        int id;
 
        if (iov_iter_rw(iter) == READ) {
@@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                 * vfs_write(), depending on which operation we are doing.
                 */
                if (iov_iter_rw(iter) == WRITE)
-                       map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+                       xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                        map_len, iter);
                else
-                       map_len = copy_to_iter(kaddr, map_len, iter);
-               if (map_len <= 0) {
-                       ret = map_len ? map_len : -EFAULT;
-                       break;
-               }
+                       xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+                                       map_len, iter);
 
-               pos += map_len;
-               length -= map_len;
-               done += map_len;
+               pos += xfer;
+               length -= xfer;
+               done += xfer;
+
+               if (xfer == 0)
+                       ret = -EFAULT;
+               if (xfer < map_len)
+                       break;
        }
        dax_read_unlock(id);
 
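
A note on the xfer accounting above: dax_copy_from_iter() and the
dax_copy_to_iter() call used here follow the copy_to_iter() convention and
return the number of bytes actually transferred rather than a negative errno,
so a short (or zero) return signals a fault while any partial progress is
still added to 'done'.  A hedged sketch of a driver-side callback under that
convention (the my_pmem_* names are hypothetical):

#include <linux/dax.h>
#include <linux/uio.h>

/*
 * Hypothetical ->copy_to_iter() implementation: like copy_to_iter(),
 * it returns the number of bytes copied, possibly short, never a
 * negative error code; this is why dax_iomap_actor() accounts 'xfer'
 * before deciding whether to report -EFAULT.
 */
static size_t my_pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
                void *addr, size_t bytes, struct iov_iter *i)
{
        return copy_to_iter(addr, bytes, i);
}

static const struct dax_operations my_pmem_dax_ops = {
        /* .direct_access and .copy_from_iter elided for brevity */
        .copy_to_iter   = my_pmem_copy_to_iter,
};
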
@@ -1240,10 +1334,6 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
                entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
                                                 0, write && !sync);
-               if (IS_ERR(entry)) {
-                       error = PTR_ERR(entry);
-                       goto error_finish_iomap;
-               }
 
                /*
                 * If we are doing synchronous page fault and inode needs fsync,
@@ -1324,8 +1414,6 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
        pfn = page_to_pfn_t(zero_page);
        ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
                        RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
-       if (IS_ERR(ret))
-               goto fallback;
 
        ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (!pmd_none(*(vmf->pmd))) {
@@ -1447,8 +1535,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
                entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
                                                RADIX_DAX_PMD, write && !sync);
-               if (IS_ERR(entry))
-                       goto finish_iomap;
 
                /*
                 * If we are doing synchronous page fault and inode needs fsync,