memcg: generate file modified notifications on "memory.events"

[linux-2.6-block.git] / include / linux / memcontrol.h
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 73b02b0a8f609ac757de6ee59b23bcf8b0e87396..c83c699a6605b982ca8fbec56f9510d44e2c9cbd 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,11 @@
  #include <linux/vm_event_item.h>
  #include <linux/hardirq.h>
  #include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
  
  struct mem_cgroup;
  struct page;
@@ -67,12 +72,225 @@ enum mem_cgroup_events_index {
         MEMCG_NR_EVENTS,
  };
  
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+       MEM_CGROUP_TARGET_THRESH,
+       MEM_CGROUP_TARGET_SOFTLIMIT,
+       MEM_CGROUP_TARGET_NUMAINFO,
+       MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+       /* Currently active and new sockets should be assigned to cgroups */
+       MEMCG_SOCK_ACTIVE,
+       /* It was ever activated; we must disarm static keys on destruction */
+       MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+       struct page_counter     memory_allocated;       /* Current allocated memory. */
+       struct percpu_counter   sockets_allocated;      /* Current number of sockets. */
+       int                     memory_pressure;
+       long                    sysctl_mem[3];
+       unsigned long           flags;
+       /*
+        * memcg field is used to find which memcg we belong directly
+        * Each memcg struct can hold more than one cg_proto, so container_of
+        * won't really cut.
+        *
+        * The elegant solution would be having an inverse function to
+        * proto_cgroup in struct proto, but that means polluting the structure
+        * for everybody, instead of just for memcg users.
+        */
+       struct mem_cgroup       *memcg;
+};
+
  #ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+       long count[MEM_CGROUP_STAT_NSTATS];
+       unsigned long events[MEMCG_NR_EVENTS];
+       unsigned long nr_page_events;
+       unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+       struct mem_cgroup *position;
+       /* scan generation, increased every round-trip */
+       unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+       struct lruvec           lruvec;
+       unsigned long           lru_size[NR_LRU_LISTS];
+
+       struct mem_cgroup_reclaim_iter  iter[DEF_PRIORITY + 1];
+
+       struct rb_node          tree_node;      /* RB tree node */
+       unsigned long           usage_in_excess;/* Set to the value by which */
+                                               /* the soft limit is exceeded*/
+       bool                    on_tree;
+       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
+                                               /* use container_of        */
+};
+
+struct mem_cgroup_per_node {
+       struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+       struct eventfd_ctx *eventfd;
+       unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+       /* An array index points to threshold just below or equal to usage. */
+       int current_threshold;
+       /* Size of entries[] */
+       unsigned int size;
+       /* Array of thresholds */
+       struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+       /* Primary thresholds array */
+       struct mem_cgroup_threshold_ary *primary;
+       /*
+        * Spare threshold array.
+        * This is needed to make mem_cgroup_unregister_event() "never fail".
+        * It must be able to store at least primary->size - 1 entries.
+        */
+       struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+       struct cgroup_subsys_state css;
+
+       /* Accounted resources */
+       struct page_counter memory;
+       struct page_counter memsw;
+       struct page_counter kmem;
+
+       /* Normal memory consumption range */
+       unsigned long low;
+       unsigned long high;
+
+       unsigned long soft_limit;
+
+       /* vmpressure notifications */
+       struct vmpressure vmpressure;
+
+       /* css_online() has been completed */
+       int initialized;
+
+       /*
+        * Should the accounting and control be hierarchical, per subtree?
+        */
+       bool use_hierarchy;
+
+       /* protected by memcg_oom_lock */
+       bool            oom_lock;
+       int             under_oom;
+
+       int     swappiness;
+       /* OOM-Killer disable */
+       int             oom_kill_disable;
+
+       /* handle for "memory.events" */
+       struct cgroup_file events_file;
+
+       /* protect arrays of thresholds */
+       struct mutex thresholds_lock;
+
+       /* thresholds for memory usage. RCU-protected */
+       struct mem_cgroup_thresholds thresholds;
+
+       /* thresholds for mem+swap usage. RCU-protected */
+       struct mem_cgroup_thresholds memsw_thresholds;
+
+       /* For oom notifier event fd */
+       struct list_head oom_notify;
+
+       /*
+        * Should we move charges of a task when a task is moved into this
+        * mem_cgroup ? And what type of charges should we move ?
+        */
+       unsigned long move_charge_at_immigrate;
+       /*
+        * set > 0 if pages under this cgroup are moving to other cgroup.
+        */
+       atomic_t                moving_account;
+       /* taken only while moving_account > 0 */
+       spinlock_t              move_lock;
+       struct task_struct      *move_lock_task;
+       unsigned long           move_lock_flags;
+       /*
+        * percpu counter.
+        */
+       struct mem_cgroup_stat_cpu __percpu *stat;
+       spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+       struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
+       int kmemcg_id;
+       bool kmem_acct_activated;
+       bool kmem_acct_active;
+#endif
+
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t      scan_nodes;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+#endif
+
+       /* List of events which userspace want to receive */
+       struct list_head event_list;
+       spinlock_t event_list_lock;
+
+       struct mem_cgroup_per_node *nodeinfo[0];
+       /* WARNING: nodeinfo must be the last member here */
+};
  extern struct cgroup_subsys_state *mem_cgroup_root_css;
  
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                        enum mem_cgroup_events_index idx,
-                      unsigned int nr);
+                      unsigned int nr)
+{
+       this_cpu_add(memcg->stat->events[idx], nr);
+       cgroup_file_notify(&memcg->events_file);
+}
  
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
  
@@ -90,15 +308,29 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
  struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
  
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
-                             struct mem_cgroup *root);
  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
+       return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
  
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
-extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+                                  struct mem_cgroup *,
+                                  struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
  
-extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+                             struct mem_cgroup *root)
+{
+       if (root == memcg)
+               return true;
+       if (!root->use_hierarchy)
+               return false;
+       return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
  
  static inline bool mm_match_cgroup(struct mm_struct *mm,
                                    struct mem_cgroup *memcg)
@@ -114,24 +346,66 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
         return match;
  }
  
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
  
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-                                  struct mem_cgroup *,
-                                  struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+       return !cgroup_subsys_enabled(memory_cgrp_subsys);
+}
  
  /*
   * For memory reclaim.
   */
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
-extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
-                                       struct task_struct *p);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+               int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup *memcg;
+
+       if (mem_cgroup_disabled())
+               return true;
+
+       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       memcg = mz->memcg;
+
+       return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+       unsigned long inactive_ratio;
+       unsigned long inactive;
+       unsigned long active;
+       unsigned long gb;
+
+       inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+       active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+       gb = (inactive + active) >> (30 - PAGE_SHIFT);
+       if (gb)
+               inactive_ratio = int_sqrt(10 * gb);
+       else
+               inactive_ratio = 1;
+
+       return inactive * inactive_ratio < active;
+}
+
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+                               struct task_struct *p);
  
  static inline void mem_cgroup_oom_enable(void)
  {
@@ -156,18 +430,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
  extern int do_swap_account;
  #endif
  
-static inline bool mem_cgroup_disabled(void)
-{
-       if (memory_cgrp_subsys.disabled)
-               return true;
-       return false;
-}
-
  struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-                                enum mem_cgroup_stat_index idx, int val);
  void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
  
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+                                enum mem_cgroup_stat_index idx, int val)
+{
+       VM_BUG_ON(!rcu_read_lock_held());
+
+       if (memcg)
+               this_cpu_add(memcg->stat->count[idx], val);
+}
+
  static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
                                             enum mem_cgroup_stat_index idx)
  {
@@ -184,13 +466,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                                 gfp_t gfp_mask,
                                                 unsigned long *total_scanned);
  
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
  static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
                                              enum vm_event_item idx)
  {
+       struct mem_cgroup *memcg;
+
         if (mem_cgroup_disabled())
                 return;
-       __mem_cgroup_count_vm_event(mm, idx);
+
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+       if (unlikely(!memcg))
+               goto out;
+
+       switch (idx) {
+       case PGFAULT:
+               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+               break;
+       case PGMAJFAULT:
+               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+               break;
+       default:
+               BUG();
+       }
+out:
+       rcu_read_unlock();
  }
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void mem_cgroup_split_huge_fixup(struct page *head);
@@ -199,8 +499,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
  #else /* CONFIG_MEMCG */
  struct mem_cgroup;
  
-#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
-
  static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                                      enum mem_cgroup_events_index idx,
                                      unsigned int nr)
@@ -258,11 +556,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
         return &zone->lruvec;
  }
  
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       return NULL;
-}
-
  static inline bool mm_match_cgroup(struct mm_struct *mm,
                 struct mem_cgroup *memcg)
  {
@@ -275,12 +568,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
         return true;
  }
  
-static inline struct cgroup_subsys_state
-               *mem_cgroup_css(struct mem_cgroup *memcg)
-{
-       return NULL;
-}
-
  static inline struct mem_cgroup *
  mem_cgroup_iter(struct mem_cgroup *root,
                 struct mem_cgroup *prev,
@@ -428,8 +715,8 @@ static inline void sock_release_memcg(struct sock *sk)
  extern struct static_key memcg_kmem_enabled_key;
  
  extern int memcg_nr_cache_ids;
-extern void memcg_get_cache_ids(void);
-extern void memcg_put_cache_ids(void);
+void memcg_get_cache_ids(void);
+void memcg_put_cache_ids(void);
  
  /*
   * Helper macro to loop through all memcg-specific caches. Callers must still
@@ -444,7 +731,10 @@ static inline bool memcg_kmem_enabled(void)
         return static_key_false(&memcg_kmem_enabled_key);
  }
  
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+       return memcg->kmem_acct_active;
+}
  
  /*
   * In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +753,15 @@ void __memcg_kmem_commit_charge(struct page *page,
                                        struct mem_cgroup *memcg, int order);
  void __memcg_kmem_uncharge_pages(struct page *page, int order);
  
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+       return memcg ? memcg->kmemcg_id : -1;
+}
  
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
  void __memcg_kmem_put_cache(struct kmem_cache *cachep);