rcu: Create rcutree plugins to handle hotplug CPU for multi-level trees
author Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Thu, 27 Aug 2009 21:58:16 +0000 (14:58 -0700)
committer Ingo Molnar <mingo@elte.hu>
Sat, 29 Aug 2009 13:34:39 +0000 (15:34 +0200)
When offlining CPUs from a multi-level tree, there is the
possibility of offlining the last CPU from a given node when
there are preempted RCU read-side critical sections that
started life on one of the CPUs on that node.

In this case, the corresponding tasks will be enqueued via the
task_struct's rcu_node_entry list_head onto one of the
rcu_node's blocked_tasks[] lists.  These tasks need to be moved
somewhere else so that they will prevent the current grace
period from ending. That somewhere is the root rcu_node.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <20090827215816.GA30472@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
include/linux/init_task.h
include/linux/sched.h
kernel/rcutree.c
kernel/rcutree_plugin.h

index 79d4baee31b681b65e2d2513a5279fb12459a59f..9e7f2e8fc66ee415a6178b5f4c2d26ca0efc8933 100644 (file)
@@ -98,7 +98,7 @@ extern struct group_info init_groups;
 #define INIT_TASK_RCU_PREEMPT(tsk)                                     \
        .rcu_read_lock_nesting = 0,                                     \
        .rcu_read_unlock_special = 0,                                   \
-       .rcu_blocked_cpu = -1,                                          \
+       .rcu_blocked_node = NULL,                                       \
        .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
index bfca26d63b13e689ae9b17ca395111d72280c919..3fe03151a8e65c4806d899d80426baf52d69ea81 100644 (file)
@@ -1208,7 +1208,7 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
        int rcu_read_lock_nesting;
        char rcu_read_unlock_special;
-       int rcu_blocked_cpu;
+       void *rcu_blocked_node;
        struct list_head rcu_node_entry;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
@@ -1735,7 +1735,7 @@ static inline void rcu_copy_process(struct task_struct *p)
 {
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special = 0;
-       p->rcu_blocked_cpu = -1;
+       p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
index fee6316a867371e64375fba1fefe0005adf90b23..d903e2f2b840162d45282f0d46f106f94b8197e7 100644 (file)
@@ -81,6 +81,7 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 extern long rcu_batches_completed_sched(void);
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
 static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
                          struct rcu_node *rnp, unsigned long flags);
 static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
@@ -876,6 +877,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
                        break;
                }
+               rcu_preempt_offline_tasks(rsp, rnp);
                mask = rnp->grpmask;
                spin_unlock(&rnp->lock);        /* irqs remain disabled. */
                rnp = rnp->parent;
index 201334cdc200c4bef39c37b02de57239bfa149af..04343bee646db630b2c73dc5a2049dad4e467ed8 100644 (file)
@@ -92,7 +92,7 @@ static void rcu_preempt_qs(int cpu)
                rnp = rdp->mynode;
                spin_lock(&rnp->lock);
                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
-               t->rcu_blocked_cpu = cpu;
+               t->rcu_blocked_node = (void *)rnp;
 
                /*
                 * If this CPU has already checked in, then this task
@@ -170,12 +170,21 @@ static void rcu_read_unlock_special(struct task_struct *t)
        if (special & RCU_READ_UNLOCK_BLOCKED) {
                t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
 
-               /* Remove this task from the list it blocked on. */
-               rnp = rcu_preempt_state.rda[t->rcu_blocked_cpu]->mynode;
-               spin_lock(&rnp->lock);
+               /*
+                * Remove this task from the list it blocked on.  The
+                * task can migrate while we acquire the lock, but at
+                * most one time.  So at most two passes through loop.
+                */
+               for (;;) {
+                       rnp = (struct rcu_node *)t->rcu_blocked_node;
+                       spin_lock(&rnp->lock);
+                       if (rnp == (struct rcu_node *)t->rcu_blocked_node)
+                               break;
+                       spin_unlock(&rnp->lock);
+               }
                empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
                list_del_init(&t->rcu_node_entry);
-               t->rcu_blocked_cpu = -1;
+               t->rcu_blocked_node = NULL;
 
                /*
                 * If this was the last task on the current list, and if
@@ -261,6 +270,47 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+/*
+ * Handle tasklist migration for case in which all CPUs covered by the
+ * specified rcu_node have gone offline.  Move the blocked tasks up to
+ * the root rcu_node.  The reason for not just moving them to the
+ * immediate parent is to remove the need for rcu_read_unlock_special()
+ * to make more than two attempts to acquire the target rcu_node's lock.
+ *
+ * The caller must hold rnp->lock with irqs disabled.
+ */
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+                                     struct rcu_node *rnp)
+{
+       int i;
+       struct list_head *lp;
+       struct list_head *lp_root;
+       struct rcu_node *rnp_root = rcu_get_root(rsp);
+       struct task_struct *tp;
+
+       if (rnp == rnp_root)
+               return;  /* Shouldn't happen: at least one CPU online. */
+
+       /*
+        * Move tasks up to root rcu_node.  Rely on the fact that the
+        * root rcu_node can be at most one ahead of the rest of the
+        * rcu_nodes in terms of gpnum value.  This fact allows us to
+        * move the blocked_tasks[] array directly, element by element.
+        */
+       for (i = 0; i < 2; i++) {
+               lp = &rnp->blocked_tasks[i];
+               lp_root = &rnp_root->blocked_tasks[i];
+               while (!list_empty(lp)) {
+                       tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
+                       spin_lock(&rnp_root->lock); /* irqs already disabled */
+                       list_del(&tp->rcu_node_entry);
+                       tp->rcu_blocked_node = rnp_root;
+                       list_add(&tp->rcu_node_entry, lp_root);
+                       spin_unlock(&rnp_root->lock); /* irqs remain disabled */
+               }
+       }
+}
+
 /*
  * Do CPU-offline processing for preemptable RCU.
  */
@@ -409,6 +459,15 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+/*
+ * Because preemptable RCU does not exist, it never needs to migrate tasks
+ * blocked within RCU read-side critical sections: intentionally a no-op.
+ */
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+                                     struct rcu_node *rnp)
+{
+}
+
 /*
  * Because preemptable RCU does not exist, it never needs CPU-offline
  * processing.