Merge tag 'for-linus-20190617' of git://git.kernel.dk/linux-block
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 596ac98051c5a50a460e8fb638e967932c5a86a5..0789a762ce2f49add4f8823c38ba366d88bda9e1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
        return ret;
 }
 
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+       struct rb_node *rb = rb_first(&sis->swap_extent_root);
+       return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+       struct rb_node *rb = rb_next(&se->rb_node);
+       return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
 /*
  * swapon tells the device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
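
The two helpers above replace the old first_swap_extent/list walk with the
standard rbtree traversal idiom. Note that first_se() depends on rb_entry()
turning a NULL rb_node into a NULL extent pointer, which holds as long as
rb_node is the first member of struct swap_extent (an assumption about the
swap.h side of this series). A minimal sketch of a full traversal built
from them:

	struct swap_extent *se;

	/* Visit every extent of the device, in ascending page order. */
	for (se = first_se(sis); se; se = next_se(se))
		pr_debug("pages %lu+%lu map to block %llu\n",
			 se->start_page, se->nr_pages,
			 (unsigned long long)se->start_block);

This is exactly the shape discard_swap() takes below, except that it steps
past the swap header inside the first extent before entering the loop.
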
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
        int err = 0;
 
        /* Do not discard the swap header page! */
-       se = &si->first_swap_extent;
+       se = first_se(si);
        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
        if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
                cond_resched();
        }
 
-       list_for_each_entry(se, &si->first_swap_extent.list, list) {
+       for (se = next_se(se); se; se = next_se(se)) {
                start_block = se->start_block << (PAGE_SHIFT - 9);
                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
        return err;             /* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+       struct swap_extent *se;
+       struct rb_node *rb;
+
+       rb = sis->swap_extent_root.rb_node;
+       while (rb) {
+               se = rb_entry(rb, struct swap_extent, rb_node);
+               if (offset < se->start_page)
+                       rb = rb->rb_left;
+               else if (offset >= se->start_page + se->nr_pages)
+                       rb = rb->rb_right;
+               else
+                       return se;
+       }
+       /* It *must* be present */
+       BUG();
+}
+
 /*
  * swap allocation tells the device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
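
offset_to_swap_extent() is a textbook rbtree range lookup: the extents are
disjoint and sorted by start_page, so each comparison discards a whole
subtree and the search is O(log n), where the list walk it replaces was
O(n) softened by the curr_swap_extent cache (which this series deletes).
The BUG() is fair because swap setup guarantees that every in-range offset
is covered by some extent, so falling off the tree means corrupted state.
A hedged sketch of the typical call site, essentially what map_swap_entry()
reduces to further down:

	/* Page offset within the device -> block number (page-size units). */
	struct swap_extent *se = offset_to_swap_extent(sis, offset);
	sector_t block = se->start_block + (offset - se->start_page);
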
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
 static void discard_swap_cluster(struct swap_info_struct *si,
                                 pgoff_t start_page, pgoff_t nr_pages)
 {
-       struct swap_extent *se = si->curr_swap_extent;
-       int found_extent = 0;
+       struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
        while (nr_pages) {
-               if (se->start_page <= start_page &&
-                   start_page < se->start_page + se->nr_pages) {
-                       pgoff_t offset = start_page - se->start_page;
-                       sector_t start_block = se->start_block + offset;
-                       sector_t nr_blocks = se->nr_pages - offset;
-
-                       if (nr_blocks > nr_pages)
-                               nr_blocks = nr_pages;
-                       start_page += nr_blocks;
-                       nr_pages -= nr_blocks;
-
-                       if (!found_extent++)
-                               si->curr_swap_extent = se;
-
-                       start_block <<= PAGE_SHIFT - 9;
-                       nr_blocks <<= PAGE_SHIFT - 9;
-                       if (blkdev_issue_discard(si->bdev, start_block,
-                                   nr_blocks, GFP_NOIO, 0))
-                               break;
-               }
+               pgoff_t offset = start_page - se->start_page;
+               sector_t start_block = se->start_block + offset;
+               sector_t nr_blocks = se->nr_pages - offset;
+
+               if (nr_blocks > nr_pages)
+                       nr_blocks = nr_pages;
+               start_page += nr_blocks;
+               nr_pages -= nr_blocks;
+
+               start_block <<= PAGE_SHIFT - 9;
+               nr_blocks <<= PAGE_SHIFT - 9;
+               if (blkdev_issue_discard(si->bdev, start_block,
+                                       nr_blocks, GFP_NOIO, 0))
+                       break;
 
-               se = list_next_entry(se, list);
+               se = next_se(se);
        }
 }
 
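The shifts convert page-granular values into the 512-byte sector units
that blkdev_issue_discard() expects: one page is 2^(PAGE_SHIFT - 9)
sectors. A worked example with hypothetical numbers, assuming 4 KiB pages
(PAGE_SHIFT == 12, so 8 sectors per page):

	sector_t start_block = 100;	/* page-size block 100 */
	sector_t nr_blocks = 16;	/* 16 pages to discard */

	start_block <<= PAGE_SHIFT - 9;	/* 100 * 8 = sector 800 */
	nr_blocks <<= PAGE_SHIFT - 9;	/* 16 * 8 = 128 sectors */
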
@@ -1079,12 +1104,11 @@ fail:
 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
 {
        struct swap_info_struct *p;
-       unsigned long offset, type;
+       unsigned long offset;
 
        if (!entry.val)
                goto out;
-       type = swp_type(entry);
-       p = swap_type_to_swap_info(type);
+       p = swp_swap_info(entry);
        if (!p)
                goto bad_nofile;
        if (!(p->flags & SWP_USED))
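
__swap_info_get() now goes through swp_swap_info() rather than open-coding
swp_type() plus swap_type_to_swap_info(). For context, and as an assumption
about code not shown in this diff, swp_swap_info() should be nothing more
than this wrapper elsewhere in swapfile.c:

	struct swap_info_struct *swp_swap_info(swp_entry_t entry)
	{
		return swap_type_to_swap_info(swp_type(entry));
	}
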
@@ -1187,6 +1211,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
        return usage;
 }
 
+/*
+ * Check whether the swap entry is valid in the swap device.  If so,
+ * return a pointer to the swap_info_struct, and keep the swap entry
+ * valid by preventing the swap device from being swapped off, until
+ * put_swap_device() is called.  Otherwise return NULL.
+ *
+ * The entire RCU read-side critical section must come either before
+ * the call to synchronize_rcu() in enable_swap_info() or swapoff(),
+ * or after its return.  So if "si->flags & SWP_VALID" is true inside
+ * the critical section, then si->swap_map, si->cluster_info, etc.
+ * remain valid throughout that section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() unless something else prevents swapoff, such
+ * as the page lock or the page table lock.  The caller must be
+ * prepared for that.  For example, the following situation is
+ * possible.
+ *
+ *   CPU1                              CPU2
+ *   do_swap_page()
+ *     ...                             swapoff+swapon
+ *     __read_swap_cache_async()
+ *       swapcache_prepare()
+ *         __swap_duplicate()
+ *           // check swap_map
+ *     // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map needs to be checked before
+ * being changed, partly because the specified swap entry may be for
+ * another swap device which has been swapped off.  And in
+ * do_swap_page(), after the page is read from the swap device, the
+ * PTE is verified, with the page table locked, not to have changed,
+ * to detect a device that was swapped off, or off and then on again.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+       struct swap_info_struct *si;
+       unsigned long offset;
+
+       if (!entry.val)
+               goto out;
+       si = swp_swap_info(entry);
+       if (!si)
+               goto bad_nofile;
+
+       rcu_read_lock();
+       if (!(si->flags & SWP_VALID))
+               goto unlock_out;
+       offset = swp_offset(entry);
+       if (offset >= si->max)
+               goto unlock_out;
+
+       return si;
+bad_nofile:
+       pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+       return NULL;
+unlock_out:
+       rcu_read_unlock();
+       return NULL;
+}
+
 static unsigned char __swap_entry_free(struct swap_info_struct *p,
                                       swp_entry_t entry, unsigned char usage)
 {
@@ -1358,11 +1445,18 @@ int page_swapcount(struct page *page)
        return count;
 }
 
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
 {
+       struct swap_info_struct *si;
        pgoff_t offset = swp_offset(entry);
+       int count = 0;
 
-       return swap_count(si->swap_map[offset]);
+       si = get_swap_device(entry);
+       if (si) {
+               count = swap_count(si->swap_map[offset]);
+               put_swap_device(si);
+       }
+       return count;
 }
 
 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1387,9 +1481,11 @@ int __swp_swapcount(swp_entry_t entry)
        int count = 0;
        struct swap_info_struct *si;
 
-       si = __swap_info_get(entry);
-       if (si)
+       si = get_swap_device(entry);
+       if (si) {
                count = swap_swapcount(si, entry);
+               put_swap_device(si);
+       }
        return count;
 }
 
@@ -1684,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
                        return type;
                }
                if (bdev == sis->bdev) {
-                       struct swap_extent *se = &sis->first_swap_extent;
+                       struct swap_extent *se = first_se(sis);
 
                        if (se->start_block == offset) {
                                if (bdev_p)
@@ -2161,7 +2257,6 @@ static void drain_mmlist(void)
 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
        struct swap_info_struct *sis;
-       struct swap_extent *start_se;
        struct swap_extent *se;
        pgoff_t offset;
 
@@ -2169,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
        *bdev = sis->bdev;
 
        offset = swp_offset(entry);
-       start_se = sis->curr_swap_extent;
-       se = start_se;
-
-       for ( ; ; ) {
-               if (se->start_page <= offset &&
-                               offset < (se->start_page + se->nr_pages)) {
-                       return se->start_block + (offset - se->start_page);
-               }
-               se = list_next_entry(se, list);
-               sis->curr_swap_extent = se;
-               BUG_ON(se == start_se);         /* It *must* be present */
-       }
+       se = offset_to_swap_extent(sis, offset);
+       return se->start_block + (offset - se->start_page);
 }
 
 /*
@@ -2198,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-       while (!list_empty(&sis->first_swap_extent.list)) {
-               struct swap_extent *se;
+       while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+               struct rb_node *rb = sis->swap_extent_root.rb_node;
+               struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
 
-               se = list_first_entry(&sis->first_swap_extent.list,
-                               struct swap_extent, list);
-               list_del(&se->list);
+               rb_erase(rb, &sis->swap_extent_root);
                kfree(se);
        }
 
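Erasing the root node repeatedly is correct, but rb_erase() keeps
rebalancing a tree that is about to vanish. Since nothing else can reach
the tree at this point, the cheaper idiom would be the post-order
destruction helper from <linux/rbtree.h>; a hedged alternative sketch:

	struct swap_extent *se, *next;

	/* Post-order walk: each node may be freed as it is visited. */
	rbtree_postorder_for_each_entry_safe(se, next,
					     &sis->swap_extent_root, rb_node)
		kfree(se);
	sis->swap_extent_root = RB_ROOT;
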
@@ -2219,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 
 /*
  * Add a block range (and the corresponding page range) into this swapdev's
- * extent list.  The extent list is kept sorted in page order.
+ * extent tree.
  *
  * This function rather assumes that it is called in ascending page order.
  */
@@ -2227,20 +2311,21 @@ int
 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block)
 {
+       struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
        struct swap_extent *se;
        struct swap_extent *new_se;
-       struct list_head *lh;
-
-       if (start_page == 0) {
-               se = &sis->first_swap_extent;
-               sis->curr_swap_extent = se;
-               se->start_page = 0;
-               se->nr_pages = nr_pages;
-               se->start_block = start_block;
-               return 1;
-       } else {
-               lh = sis->first_swap_extent.list.prev;  /* Highest extent */
-               se = list_entry(lh, struct swap_extent, list);
+
+       /*
+        * Place the new node rightmost, since the function is
+        * called in ascending page order.
+        */
+       while (*link) {
+               parent = *link;
+               link = &parent->rb_right;
+       }
+
+       if (parent) {
+               se = rb_entry(parent, struct swap_extent, rb_node);
                BUG_ON(se->start_page + se->nr_pages != start_page);
                if (se->start_block + se->nr_pages == start_block) {
                        /* Merge it */
@@ -2249,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                }
        }
 
-       /*
-        * No merge.  Insert a new extent, preserving ordering.
-        */
+       /* No merge, insert a new extent. */
        new_se = kmalloc(sizeof(*se), GFP_KERNEL);
        if (new_se == NULL)
                return -ENOMEM;
@@ -2259,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
        new_se->nr_pages = nr_pages;
        new_se->start_block = start_block;
 
-       list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+       rb_link_node(&new_se->rb_node, parent, link);
+       rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
        return 1;
 }
 EXPORT_SYMBOL_GPL(add_swap_extent);
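
Because setup calls this in ascending page order, insertion always lands
at the rightmost position, and a range whose blocks continue the rightmost
extent is merged instead of allocating a node. A worked example with
hypothetical numbers (start_block is in page-size units here, matching the
merge test above):

	add_swap_extent(sis, 0, 8, 100);	/* new: pages 0..7 -> blocks 100..107 */
	add_swap_extent(sis, 8, 8, 108);	/* merged: pages 0..15 -> blocks 100..115 */
	add_swap_extent(sis, 16, 8, 200);	/* blocks not contiguous: new extent */

So a well-laid-out swap file or partition still collapses into a single
extent, and the tree only grows where the backing blocks fragment.
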
@@ -2335,9 +2419,9 @@ static int swap_node(struct swap_info_struct *p)
        return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
 }
 
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
-                               unsigned char *swap_map,
-                               struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+                           unsigned char *swap_map,
+                           struct swap_cluster_info *cluster_info)
 {
        int i;
 
@@ -2362,7 +2446,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
        }
        p->swap_map = swap_map;
        p->cluster_info = cluster_info;
-       p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+       p->flags |= SWP_WRITEOK | SWP_VALID;
        atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;
 
@@ -2389,7 +2477,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
        frontswap_init(p->type, frontswap_map);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
-        _enable_swap_info(p, prio, swap_map, cluster_info);
+       setup_swap_info(p, prio, swap_map, cluster_info);
+       spin_unlock(&p->lock);
+       spin_unlock(&swap_lock);
+       /*
+        * Guarantee that swap_map, cluster_info, etc. are valid
+        * between get/put_swap_device() if the SWP_VALID bit is set.
+        */
+       synchronize_rcu();
+       spin_lock(&swap_lock);
+       spin_lock(&p->lock);
+       _enable_swap_info(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 }
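
enable_swap_info() now publishes the device in two phases:
setup_swap_info() initializes swap_map, cluster_info and the priority
lists, the locks are dropped, synchronize_rcu() waits out any reader that
could have seen the half-initialized state, and only then does
_enable_swap_info() set SWP_VALID (together with SWP_WRITEOK). Paired with
the flag check under rcu_read_lock() in get_swap_device(), this is the
classic RCU publish handshake; schematically (a sketch, not literal code):

	writer (enable_swap_info)	reader (get_swap_device)
	-------------------------	------------------------
	setup_swap_info(p, ...);	rcu_read_lock();
	synchronize_rcu();		if (si->flags & SWP_VALID)
	p->flags |= SWP_VALID;			/* si->swap_map etc. usable */
					rcu_read_unlock();

If a reader observes SWP_VALID inside its critical section, the fields
written by setup_swap_info() stay valid for the rest of that section.
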
@@ -2398,7 +2496,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
 {
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
-       _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+       setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+       _enable_swap_info(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 }
@@ -2501,6 +2600,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
        reenable_swap_slots_cache_unlock();
 
+       spin_lock(&swap_lock);
+       spin_lock(&p->lock);
+       p->flags &= ~SWP_VALID;         /* mark swap device as invalid */
+       spin_unlock(&p->lock);
+       spin_unlock(&swap_lock);
+       /*
+        * wait for swap operations protected by get/put_swap_device()
+        * to complete
+        */
+       synchronize_rcu();
+
        flush_work(&p->discard_work);
 
        destroy_swap_extents(p);
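
swapoff mirrors the handshake in reverse: clear SWP_VALID under both
locks, let synchronize_rcu() drain every critical section that might still
have seen the flag set, and only then proceed to tear the device down (the
swap_map itself is freed later in this function). Schematically:

	swapoff (writer)		reader (get_swap_device)
	----------------		------------------------
	p->flags &= ~SWP_VALID;		rcu_read_lock();
	synchronize_rcu();		... si->swap_map still valid ...
	/* free swap_map, etc. */	rcu_read_unlock();

By the time the structures are freed, every reader that returned
successfully from get_swap_device() has passed through put_swap_device().
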
@@ -2749,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
                 * would be relying on p->type to remain valid.
                 */
        }
-       INIT_LIST_HEAD(&p->first_swap_extent.list);
+       p->swap_extent_root = RB_ROOT;
        plist_node_init(&p->list, 0);
        for_each_node(i)
                plist_node_init(&p->avail_lists[i], 0);
@@ -3265,17 +3375,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
        unsigned char has_cache;
        int err = -EINVAL;
 
-       if (non_swap_entry(entry))
-               goto out;
-
-       p = swp_swap_info(entry);
+       p = get_swap_device(entry);
        if (!p)
-               goto bad_file;
-
-       offset = swp_offset(entry);
-       if (unlikely(offset >= p->max))
                goto out;
 
+       offset = swp_offset(entry);
        ci = lock_cluster_or_swap_info(p, offset);
 
        count = p->swap_map[offset];
@@ -3321,11 +3425,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 unlock_out:
        unlock_cluster_or_swap_info(p, ci);
 out:
+       if (p)
+               put_swap_device(p);
        return err;
-
-bad_file:
-       pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
-       goto out;
 }
 
 /*
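
The removed non_swap_entry() and offset checks are not lost;
get_swap_device() subsumes them. The offset test (offset >= si->max) now
runs under the RCU lock, and non-swap entries (migration, hwpoison,
device-private) encode types at or above MAX_SWAPFILES, so swp_swap_info()
should resolve them to NULL through a type-range check. As an assumption
about code outside this diff, that check likely looks like:

	static struct swap_info_struct *swap_type_to_swap_info(int type)
	{
		if (type >= READ_ONCE(nr_swapfiles))
			return NULL;	/* also covers non-swap entry types */

		smp_rmb();	/* pairs with smp_wmb() in alloc_swap_info() */
		return READ_ONCE(swap_info[type]);
	}
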
@@ -3417,6 +3519,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        struct page *list_page;
        pgoff_t offset;
        unsigned char count;
+       int ret = 0;
 
        /*
         * When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3424,15 +3527,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
         */
        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
 
-       si = swap_info_get(entry);
+       si = get_swap_device(entry);
        if (!si) {
                /*
                 * An acceptable race has occurred since the failing
-                * __swap_duplicate(): the swap entry has been freed,
-                * perhaps even the whole swap_map cleared for swapoff.
+                * __swap_duplicate(): the swap device may have been swapped off
                 */
                goto outer;
        }
+       spin_lock(&si->lock);
 
        offset = swp_offset(entry);
 
@@ -3450,9 +3553,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
        }
 
        if (!page) {
-               unlock_cluster(ci);
-               spin_unlock(&si->lock);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
 
        /*
@@ -3504,10 +3606,11 @@ out_unlock_cont:
 out:
        unlock_cluster(ci);
        spin_unlock(&si->lock);
+       put_swap_device(si);
 outer:
        if (page)
                __free_page(page);
-       return 0;
+       return ret;
 }
 
 /*
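
Two closing notes on add_swap_count_continuation(). First, unlike the
swap_info_get() it replaces, get_swap_device() does not return with
si->lock held, hence the explicit spin_lock(&si->lock) added above.
Second, every exit now funnels through a single cleanup tail, so the RCU
pin is always dropped and -ENOMEM propagates through the same path; pieced
together from the hunks above, that tail reads (a reconstruction, not a
new hunk):

	out:
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		put_swap_device(si);
	outer:
		if (page)
			__free_page(page);
		return ret;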