Merge branch 'work.mount0' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 2136f6ad97d32dffc4eefde314b34b19c4ae8f87..8840299420e0b287f3cd81733a1c13a0c7957fb0 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/balloon_compaction.h>
 #include <linux/vmw_vmci_defs.h>
 #include <linux/vmw_vmci_api.h>
@@ -40,6 +41,15 @@ MODULE_ALIAS("dmi:*:svnVMware*:*");
 MODULE_ALIAS("vmware_vmmemctl");
 MODULE_LICENSE("GPL");
 
+static bool __read_mostly vmwballoon_shrinker_enable;
+module_param(vmwballoon_shrinker_enable, bool, 0444);
+MODULE_PARM_DESC(vmwballoon_shrinker_enable,
+       "Enable non-cooperative out-of-memory protection. Disabled by default as it may degrade performance.");
+
+/* Delay in seconds after shrink before inflation. */
+#define VMBALLOON_SHRINK_DELAY         (5)
+
+/* Maximum number of refused pages we accumulate during an inflation cycle */
 #define VMW_BALLOON_MAX_REFUSED                16
 
 /* Magic number for the balloon mount-point */
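
As an aside on the new vmwballoon_shrinker_enable parameter above: with 0444 permissions it is read-only at runtime, so the shrinker can only be enabled at load time. Assuming the driver is built as the vmw_balloon module, that would look roughly like:

  modprobe vmw_balloon vmwballoon_shrinker_enable=1
  # or, for a built-in driver, on the kernel command line:
  vmw_balloon.vmwballoon_shrinker_enable=1
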
@@ -217,18 +227,20 @@ enum vmballoon_stat_general {
        VMW_BALLOON_STAT_TIMER,
        VMW_BALLOON_STAT_DOORBELL,
        VMW_BALLOON_STAT_RESET,
-       VMW_BALLOON_STAT_LAST = VMW_BALLOON_STAT_RESET
+       VMW_BALLOON_STAT_SHRINK,
+       VMW_BALLOON_STAT_SHRINK_FREE,
+       VMW_BALLOON_STAT_LAST = VMW_BALLOON_STAT_SHRINK_FREE
 };
 
 #define VMW_BALLOON_STAT_NUM           (VMW_BALLOON_STAT_LAST + 1)
 
-
 static DEFINE_STATIC_KEY_TRUE(vmw_balloon_batching);
 static DEFINE_STATIC_KEY_FALSE(balloon_stat_enabled);
 
 struct vmballoon_ctl {
        struct list_head pages;
        struct list_head refused_pages;
+       struct list_head prealloc_pages;
        unsigned int n_refused_pages;
        unsigned int n_pages;
        enum vmballoon_page_size_type page_size;
@@ -321,6 +333,15 @@ struct vmballoon {
         */
        struct page *page;
 
+       /**
+        * @shrink_timeout: timeout until the next inflation.
+        *
+        * After a shrink event, indicates the time in jiffies after which
+        * inflation is allowed again. It can be written concurrently with reads,
+        * so accesses must use READ_ONCE()/WRITE_ONCE().
+        */
+       unsigned long shrink_timeout;
+
        /* statistics */
        struct vmballoon_stats *stats;
 
@@ -361,6 +382,20 @@ struct vmballoon {
         * Lock ordering: @conf_sem -> @comm_lock .
         */
        spinlock_t comm_lock;
+
+       /**
+        * @shrinker: shrinker interface that is used to avoid over-inflation.
+        */
+       struct shrinker shrinker;
+
+       /**
+        * @shrinker_registered: whether the shrinker was registered.
+        *
+        * The shrinker interface does not gracefully handle the removal of a
+        * shrinker that was never registered. This indication simplifies the
+        * unregistration process.
+        */
+       bool shrinker_registered;
 };
 
 static struct vmballoon balloon;
@@ -635,15 +670,25 @@ static int vmballoon_alloc_page_list(struct vmballoon *b,
        unsigned int i;
 
        for (i = 0; i < req_n_pages; i++) {
-               if (ctl->page_size == VMW_BALLOON_2M_PAGE)
-                       page = alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN|
+               /*
+                * First check whether we have pages that were allocated earlier.
+                * This happens when a 2MB page was rejected by the hypervisor
+                * during inflation and then split into 4KB pages.
+                */
+               if (!list_empty(&ctl->prealloc_pages)) {
+                       page = list_first_entry(&ctl->prealloc_pages,
+                                               struct page, lru);
+                       list_del(&page->lru);
+               } else {
+                       if (ctl->page_size == VMW_BALLOON_2M_PAGE)
+                               page = alloc_pages(__GFP_HIGHMEM|__GFP_NOWARN|
                                        __GFP_NOMEMALLOC, VMW_BALLOON_2M_ORDER);
-               else
-                       page = balloon_page_alloc();
+                       else
+                               page = balloon_page_alloc();
 
-               /* Update statistics */
-               vmballoon_stats_page_inc(b, VMW_BALLOON_PAGE_STAT_ALLOC,
-                                        ctl->page_size);
+                       vmballoon_stats_page_inc(b, VMW_BALLOON_PAGE_STAT_ALLOC,
+                                                ctl->page_size);
+               }
 
                if (page) {
                        vmballoon_mark_page_offline(page, ctl->page_size);
@@ -889,7 +934,8 @@ static void vmballoon_release_page_list(struct list_head *page_list,
                __free_pages(page, vmballoon_page_order(page_size));
        }
 
-       *n_pages = 0;
+       if (n_pages)
+               *n_pages = 0;
 }
 
 
@@ -935,6 +981,10 @@ static int64_t vmballoon_change(struct vmballoon *b)
            size - target < vmballoon_page_in_frames(VMW_BALLOON_2M_PAGE))
                return 0;
 
+       /* If an out-of-memory event recently occurred, inflation is disallowed. */
+       if (target > size && time_before(jiffies, READ_ONCE(b->shrink_timeout)))
+               return 0;
+
        return target - size;
 }
 
@@ -1017,6 +1067,32 @@ static void vmballoon_dequeue_page_list(struct vmballoon *b,
        *n_pages = i;
 }
 
+/**
+ * vmballoon_split_refused_pages() - Split refused 2MB pages into 4KB pages.
+ *
+ * If inflation of a 2MB page was denied by the hypervisor, the refusal is
+ * likely caused by one or a few of its 4KB pages. Such 2MB pages may keep
+ * being allocated and then refused. To prevent this, split the refused pages
+ * into 4KB pages and add them to the @prealloc_pages list.
+ *
+ * @ctl: pointer to the &struct vmballoon_ctl, which defines the operation.
+ */
+static void vmballoon_split_refused_pages(struct vmballoon_ctl *ctl)
+{
+       struct page *page, *tmp;
+       unsigned int i, order;
+
+       order = vmballoon_page_order(ctl->page_size);
+
+       list_for_each_entry_safe(page, tmp, &ctl->refused_pages, lru) {
+               list_del(&page->lru);
+               split_page(page, order);
+               for (i = 0; i < (1 << order); i++)
+                       list_add(&page[i].lru, &ctl->prealloc_pages);
+       }
+       ctl->n_refused_pages = 0;
+}
+
 /**
  * vmballoon_inflate() - Inflate the balloon towards its target size.
  *
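
For readers unfamiliar with split_page(): vmballoon_split_refused_pages() relies on the fact that a non-compound higher-order allocation can be split into (1 << order) independent order-0 pages, each with its own reference count. A minimal illustrative sketch of that behavior (hypothetical helper, not taken from this driver):

  #include <linux/gfp.h>
  #include <linux/mm.h>

  /* Allocate a higher-order page, split it, then free the pieces one by one. */
  static void example_split_and_free(unsigned int order)
  {
          struct page *page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, order);
          unsigned int i;

          if (!page)
                  return;

          split_page(page, order);        /* each sub-page now has its own refcount */

          for (i = 0; i < (1U << order); i++)
                  __free_page(&page[i]);  /* sub-pages can be freed individually */
  }
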
@@ -1028,6 +1104,7 @@ static void vmballoon_inflate(struct vmballoon *b)
        struct vmballoon_ctl ctl = {
                .pages = LIST_HEAD_INIT(ctl.pages),
                .refused_pages = LIST_HEAD_INIT(ctl.refused_pages),
+               .prealloc_pages = LIST_HEAD_INIT(ctl.prealloc_pages),
                .page_size = b->max_page_size,
                .op = VMW_BALLOON_INFLATE
        };
@@ -1075,10 +1152,10 @@ static void vmballoon_inflate(struct vmballoon *b)
                                break;
 
                        /*
-                        * Ignore errors from locking as we now switch to 4k
-                        * pages and we might get different errors.
+                        * Split the refused pages into 4KB pages. This also
+                        * empties the refused pages list.
                         */
-                       vmballoon_release_refused_pages(b, &ctl);
+                       vmballoon_split_refused_pages(&ctl);
                        ctl.page_size--;
                }
 
@@ -1092,6 +1169,8 @@ static void vmballoon_inflate(struct vmballoon *b)
         */
        if (ctl.n_refused_pages != 0)
                vmballoon_release_refused_pages(b, &ctl);
+
+       vmballoon_release_page_list(&ctl.prealloc_pages, NULL, ctl.page_size);
 }
 
 /**
@@ -1430,6 +1509,90 @@ static void vmballoon_work(struct work_struct *work)
 
 }
 
+/**
+ * vmballoon_shrinker_scan() - deflate the balloon due to memory pressure.
+ * @shrinker: pointer to the balloon shrinker.
+ * @sc: page reclaim information.
+ *
+ * Returns: number of pages that were freed during deflation.
+ */
+static unsigned long vmballoon_shrinker_scan(struct shrinker *shrinker,
+                                            struct shrink_control *sc)
+{
+       struct vmballoon *b = &balloon;
+       unsigned long deflated_frames;
+
+       pr_debug("%s - size: %llu", __func__, atomic64_read(&b->size));
+
+       vmballoon_stats_gen_inc(b, VMW_BALLOON_STAT_SHRINK);
+
+       /*
+        * If the lock is also contended for read, we cannot easily reclaim and
+        * we bail out.
+        */
+       if (!down_read_trylock(&b->conf_sem))
+               return 0;
+
+       deflated_frames = vmballoon_deflate(b, sc->nr_to_scan, true);
+
+       vmballoon_stats_gen_add(b, VMW_BALLOON_STAT_SHRINK_FREE,
+                               deflated_frames);
+
+       /*
+        * Delay future inflation for some time to mitigate situations in which
+        * the balloon continuously grows and shrinks. Use WRITE_ONCE() since the
+        * access is asynchronous.
+        */
+       WRITE_ONCE(b->shrink_timeout, jiffies + HZ * VMBALLOON_SHRINK_DELAY);
+
+       up_read(&b->conf_sem);
+
+       return deflated_frames;
+}
+
+/**
+ * vmballoon_shrinker_count() - return the number of ballooned pages.
+ * @shrinker: pointer to the balloon shrinker.
+ * @sc: page reclaim information.
+ *
+ * Returns: number of 4k pages that are allocated for the balloon and can
+ *         therefore be reclaimed under pressure.
+ */
+static unsigned long vmballoon_shrinker_count(struct shrinker *shrinker,
+                                             struct shrink_control *sc)
+{
+       struct vmballoon *b = &balloon;
+
+       return atomic64_read(&b->size);
+}
+
+static void vmballoon_unregister_shrinker(struct vmballoon *b)
+{
+       if (b->shrinker_registered)
+               unregister_shrinker(&b->shrinker);
+       b->shrinker_registered = false;
+}
+
+static int vmballoon_register_shrinker(struct vmballoon *b)
+{
+       int r;
+
+       /* Do nothing if the shrinker is not enabled */
+       if (!vmwballoon_shrinker_enable)
+               return 0;
+
+       b->shrinker.scan_objects = vmballoon_shrinker_scan;
+       b->shrinker.count_objects = vmballoon_shrinker_count;
+       b->shrinker.seeks = DEFAULT_SEEKS;
+
+       r = register_shrinker(&b->shrinker);
+
+       if (r == 0)
+               b->shrinker_registered = true;
+
+       return r;
+}
+
 /*
  * DEBUGFS Interface
  */
@@ -1447,6 +1610,8 @@ static const char * const vmballoon_stat_names[] = {
        [VMW_BALLOON_STAT_TIMER]                = "timer",
        [VMW_BALLOON_STAT_DOORBELL]             = "doorbell",
        [VMW_BALLOON_STAT_RESET]                = "reset",
+       [VMW_BALLOON_STAT_SHRINK]               = "shrink",
+       [VMW_BALLOON_STAT_SHRINK_FREE]          = "shrinkFree"
 };
 
 static int vmballoon_enable_stats(struct vmballoon *b)
@@ -1535,19 +1700,10 @@ static int vmballoon_debug_show(struct seq_file *f, void *offset)
 
 DEFINE_SHOW_ATTRIBUTE(vmballoon_debug);
 
-static int __init vmballoon_debugfs_init(struct vmballoon *b)
+static void __init vmballoon_debugfs_init(struct vmballoon *b)
 {
-       int error;
-
        b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
                                           &vmballoon_debug_fops);
-       if (IS_ERR(b->dbg_entry)) {
-               error = PTR_ERR(b->dbg_entry);
-               pr_err("failed to create debugfs entry, error: %d\n", error);
-               return error;
-       }
-
-       return 0;
 }
 
 static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
@@ -1560,9 +1716,8 @@ static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
 
 #else
 
-static inline int vmballoon_debugfs_init(struct vmballoon *b)
+static inline void vmballoon_debugfs_init(struct vmballoon *b)
 {
-       return 0;
 }
 
 static inline void vmballoon_debugfs_exit(struct vmballoon *b)
@@ -1574,22 +1729,15 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b)
 
 #ifdef CONFIG_BALLOON_COMPACTION
 
-static struct dentry *vmballoon_mount(struct file_system_type *fs_type,
-                                     int flags, const char *dev_name,
-                                     void *data)
+static int vmballoon_init_fs_context(struct fs_context *fc)
 {
-       static const struct dentry_operations ops = {
-               .d_dname = simple_dname,
-       };
-
-       return mount_pseudo(fs_type, "balloon-vmware:", NULL, &ops,
-                           BALLOON_VMW_MAGIC);
+       return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM;
 }
 
 static struct file_system_type vmballoon_fs = {
-       .name           = "balloon-vmware",
-       .mount          = vmballoon_mount,
-       .kill_sb        = kill_anon_super,
+       .name                   = "balloon-vmware",
+       .init_fs_context        = vmballoon_init_fs_context,
+       .kill_sb                = kill_anon_super,
 };
 
 static struct vfsmount *vmballoon_mnt;
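
For context on the pseudo filesystem conversion above: init_pseudo() sets up the filesystem context with sane pseudo-fs defaults and returns NULL only if allocation fails, which is what the one-line -ENOMEM mapping in vmballoon_init_fs_context() reflects. A minimal sketch of the general pattern (hypothetical names and magic value, not from this driver):

  #include <linux/errno.h>
  #include <linux/fs_context.h>
  #include <linux/pseudo_fs.h>

  #define EXAMPLE_FS_MAGIC 0x45584d50   /* hypothetical magic value */

  static int example_init_fs_context(struct fs_context *fc)
  {
          struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLE_FS_MAGIC);

          if (!ctx)
                  return -ENOMEM;

          /* The defaults are usable as-is; override ctx->ops or ctx->dops if needed. */
          return 0;
  }
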
@@ -1780,7 +1928,7 @@ static int __init vmballoon_init(void)
 
        INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
 
-       error = vmballoon_debugfs_init(&balloon);
+       error = vmballoon_register_shrinker(&balloon);
        if (error)
                goto fail;
 
@@ -1803,8 +1951,11 @@ static int __init vmballoon_init(void)
 
        queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
 
+       vmballoon_debugfs_init(&balloon);
+
        return 0;
 fail:
+       vmballoon_unregister_shrinker(&balloon);
        vmballoon_compaction_deinit(&balloon);
        return error;
 }
@@ -1819,6 +1970,7 @@ late_initcall(vmballoon_init);
 
 static void __exit vmballoon_exit(void)
 {
+       vmballoon_unregister_shrinker(&balloon);
        vmballoon_vmci_cleanup(&balloon);
        cancel_delayed_work_sync(&balloon.dwork);