index 5ae8e11ad78677ef3103fc569d0959742a380370..a8732fbed381a45bbce44fcdf0731ccfdc1a09ba 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,7 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
 #include "internal.h"
 
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
        }
 }
 
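+/*
+ * Unlock the exceptional (DAX) radix tree entry at @index that was previously
+ * locked via lock_slot(), and wake up anybody waiting for the entry lock.
+ */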
+static void dax_unlock_mapping_entry(struct address_space *mapping,
+                                    pgoff_t index)
+{
+       void *entry, **slot;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+                        !slot_locked(mapping, slot))) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return;
+       }
+       unlock_slot(mapping, slot);
+       spin_unlock_irq(&mapping->tree_lock);
+       dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
 static void put_locked_mapping_entry(struct address_space *mapping,
                                     pgoff_t index, void *entry)
 {
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
                __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-       void *entry, **slot;
-
-       spin_lock_irq(&mapping->tree_lock);
-       entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
-       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
-                        !slot_locked(mapping, slot))) {
-               spin_unlock_irq(&mapping->tree_lock);
-               return;
-       }
-       unlock_slot(mapping, slot);
-       spin_unlock_irq(&mapping->tree_lock);
-       dax_wake_mapping_entry_waiter(mapping, index, entry, false);
-}
-
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
        /* This will replace locked radix tree entry with a hole page */
        page = find_or_create_page(mapping, vmf->pgoff,
                                   vmf->gfp_mask | __GFP_ZERO);
-       if (!page) {
-               put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+       if (!page)
                return VM_FAULT_OOM;
-       }
        vmf->page = page;
        return VM_FAULT_LOCKED;
 }
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
        return new_entry;
 }
 
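+/* Return the virtual address in @vma that maps file offset @pgoff */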
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+       unsigned long address;
+
+       address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+       return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+                                     pgoff_t index, unsigned long pfn)
+{
+       struct vm_area_struct *vma;
+       pte_t *ptep;
+       pte_t pte;
+       spinlock_t *ptl;
+       bool changed;
+
+       i_mmap_lock_read(mapping);
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+               unsigned long address;
+
+               cond_resched();
+
+               if (!(vma->vm_flags & VM_SHARED))
+                       continue;
+
+               address = pgoff_address(index, vma);
+               changed = false;
+               if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+                       continue;
+               if (pfn != pte_pfn(*ptep))
+                       goto unlock;
+               if (!pte_dirty(*ptep) && !pte_write(*ptep))
+                       goto unlock;
+
+               flush_cache_page(vma, address, pfn);
+               pte = ptep_clear_flush(vma, address, ptep);
+               pte = pte_wrprotect(pte);
+               pte = pte_mkclean(pte);
+               set_pte_at(vma->vm_mm, address, ptep, pte);
+               changed = true;
+unlock:
+               pte_unmap_unlock(ptep, ptl);
+
+               if (changed)
+                       mmu_notifier_invalidate_page(vma->vm_mm, address);
+       }
+       i_mmap_unlock_read(mapping);
+}
+
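+/*
+ * Write back the caches for a single dirty radix tree entry. The entry is
+ * kept locked while its mappings are write-protected and the cache lines are
+ * flushed, so racing page faults wait on the entry lock and the dirty tag can
+ * be safely cleared once the flush has completed.
+ */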
 static int dax_writeback_one(struct block_device *bdev,
                struct address_space *mapping, pgoff_t index, void *entry)
 {
        struct radix_tree_root *page_tree = &mapping->page_tree;
-       struct radix_tree_node *node;
        struct blk_dax_ctl dax;
-       void **slot;
+       void *entry2, **slot;
        int ret = 0;
 
-       spin_lock_irq(&mapping->tree_lock);
        /*
-        * Regular page slots are stabilized by the page lock even
-        * without the tree itself locked.  These unlocked entries
-        * need verification under the tree lock.
+        * A page got tagged dirty in DAX mapping? Something is seriously
+        * wrong.
         */
-       if (!__radix_tree_lookup(page_tree, index, &node, &slot))
-               goto unlock;
-       if (*slot != entry)
-               goto unlock;
-
-       /* another fsync thread may have already written back this entry */
-       if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
-               goto unlock;
+       if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+               return -EIO;
 
+       spin_lock_irq(&mapping->tree_lock);
+       entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+       /* Entry got punched out / reallocated? */
+       if (!entry2 || !radix_tree_exceptional_entry(entry2))
+               goto put_unlocked;
+       /*
+        * Entry got reallocated elsewhere? No need to writeback. We have to
+        * compare sectors as we must not bail out due to difference in lockbit
+        * or entry type.
+        */
+       if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+               goto put_unlocked;
        if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
                                dax_is_zero_entry(entry))) {
                ret = -EIO;
-               goto unlock;
+               goto put_unlocked;
        }
 
+       /* Another fsync thread may have already written back this entry */
+       if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+               goto put_unlocked;
+       /* Lock the entry to serialize with page faults */
+       entry = lock_slot(mapping, slot);
+       /*
+        * We can clear the tag now but we have to be careful so that concurrent
+        * dax_writeback_one() calls for the same index cannot finish before we
+        * actually flush the caches. This is achieved as the calls will look
+        * at the entry only under tree_lock and once they do that they will
+        * see the entry locked and wait for it to unlock.
+        */
+       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+       spin_unlock_irq(&mapping->tree_lock);
+
        /*
         * Even if dax_writeback_mapping_range() was given a wbc->range_start
         * in the middle of a PMD, the 'index' we are given will be aligned to
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
         */
        dax.sector = dax_radix_sector(entry);
        dax.size = PAGE_SIZE << dax_radix_order(entry);
-       spin_unlock_irq(&mapping->tree_lock);
 
        /*
         * We cannot hold tree_lock while calling dax_map_atomic() because it
         * eventually calls cond_resched().
         */
        ret = dax_map_atomic(bdev, &dax);
-       if (ret < 0)
+       if (ret < 0) {
+               put_locked_mapping_entry(mapping, index, entry);
                return ret;
+       }
 
        if (WARN_ON_ONCE(ret < dax.size)) {
                ret = -EIO;
                goto unmap;
        }
 
+       dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
        wb_cache_pmem(dax.addr, dax.size);
-
+       /*
+        * After we have flushed the cache, we can clear the dirty tag. There
+        * cannot be new dirty data in the pfn after the flush has completed as
+        * the pfn mappings are writeprotected and fault waits for mapping
+        * entry lock.
+        */
        spin_lock_irq(&mapping->tree_lock);
-       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
        spin_unlock_irq(&mapping->tree_lock);
  unmap:
        dax_unmap_atomic(bdev, &dax);
+       put_locked_mapping_entry(mapping, index, entry);
        return ret;
 
- unlock:
+ put_unlocked:
+       put_unlocked_mapping_entry(mapping, index, entry2);
        spin_unlock_irq(&mapping->tree_lock);
        return ret;
 }
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
                struct block_device *bdev, sector_t sector, size_t size,
                void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       unsigned long vaddr = (unsigned long)vmf->virtual_address;
+       unsigned long vaddr = vmf->address;
        struct blk_dax_ctl dax = {
                .sector = sector,
                .size = size,
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
-       void *entry;
+       void *entry, **slot;
        pgoff_t index = vmf->pgoff;
 
        spin_lock_irq(&mapping->tree_lock);
-       entry = get_unlocked_mapping_entry(mapping, index, NULL);
-       if (!entry || !radix_tree_exceptional_entry(entry))
-               goto out;
+       entry = get_unlocked_mapping_entry(mapping, index, &slot);
+       if (!entry || !radix_tree_exceptional_entry(entry)) {
+               if (entry)
+                       put_unlocked_mapping_entry(mapping, index, entry);
+               spin_unlock_irq(&mapping->tree_lock);
+               return VM_FAULT_NOPAGE;
+       }
        radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-       put_unlocked_mapping_entry(mapping, index, entry);
-out:
+       entry = lock_slot(mapping, slot);
        spin_unlock_irq(&mapping->tree_lock);
+       /*
+        * If we race with somebody updating the PTE and finish_mkwrite_fault()
+        * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+        * the fault in either case.
+        */
+       finish_mkwrite_fault(vmf);
+       put_locked_mapping_entry(mapping, index, entry);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 {
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct inode *inode = mapping->host;
-       unsigned long vaddr = (unsigned long)vmf->virtual_address;
+       unsigned long vaddr = vmf->address;
        loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
        sector_t sector;
        struct iomap iomap = { 0 };
        unsigned flags = IOMAP_FAULT;
        int error, major = 0;
-       int locked_status = 0;
+       int vmf_ret = 0;
        void *entry;
 
        /*
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
                if (error)
                        goto finish_iomap;
-               if (!radix_tree_exceptional_entry(entry)) {
-                       vmf->page = entry;
-                       locked_status = VM_FAULT_LOCKED;
-               } else {
-                       vmf->entry = entry;
-                       locked_status = VM_FAULT_DAX_LOCKED;
-               }
+
+               __SetPageUptodate(vmf->cow_page);
+               vmf_ret = finish_fault(vmf);
+               if (!vmf_ret)
+                       vmf_ret = VM_FAULT_DONE_COW;
                goto finish_iomap;
        }
 
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-                       locked_status = dax_load_hole(mapping, entry, vmf);
+                       vmf_ret = dax_load_hole(mapping, entry, vmf);
                        break;
                }
                /*FALLTHRU*/
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
  finish_iomap:
        if (ops->iomap_end) {
-               if (error) {
+               if (error || (vmf_ret & VM_FAULT_ERROR)) {
                        /* keep previous error */
                        ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
                                        &iomap);
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                }
        }
  unlock_entry:
-       if (!locked_status || error)
+       if (vmf_ret != VM_FAULT_LOCKED || error)
                put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
        if (error == -ENOMEM)
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        /* -EBUSY is fine, somebody else faulted on the same PTE */
        if (error < 0 && error != -EBUSY)
                return VM_FAULT_SIGBUS | major;
-       if (locked_status) {
+       if (vmf_ret) {
                WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-               return locked_status;
+               return vmf_ret;
        }
        return VM_FAULT_NOPAGE | major;
 }