Merge branches 'fixes.2015.07.22a' and 'initexp.2015.08.04a' into HEAD
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b363536a04c26049b6bace2b4b936..9f75f25cc5d92667c27d70dd1b1a6091b42fbceb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
 
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
 
 /*
  * In order to export the rcu_state name to the tracing tools, it
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);
 static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
 module_param(rcu_fanout_leaf, int, 0444);
 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
-       NUM_RCU_LVL_0,
-       NUM_RCU_LVL_1,
-       NUM_RCU_LVL_2,
-       NUM_RCU_LVL_3,
-       NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 
 /*
@@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
         * It is illegal to enter an extended quiescent state while
         * in an RCU read-side critical section.
         */
-       rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
-                          "Illegal idle entry in RCU read-side critical section.");
-       rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
-                          "Illegal idle entry in RCU-bh read-side critical section.");
-       rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
-                          "Illegal idle entry in RCU-sched read-side critical section.");
+       RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+                        "Illegal idle entry in RCU read-side critical section.");
+       RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+                        "Illegal idle entry in RCU-bh read-side critical section.");
+       RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+                        "Illegal idle entry in RCU-sched read-side critical section.");
 }
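
Note that RCU_LOCKDEP_WARN() takes its condition in the opposite sense from the old rcu_lockdep_assert(): it complains when the condition is true, which is why the negations were dropped from the checks above. A simplified sketch of the macro, assuming the companion rcupdate.h change follows the usual one-shot lockdep-splat pattern (details may differ):

	#define RCU_LOCKDEP_WARN(c, s)					\
		do {							\
			static bool __warned;	/* One splat per site. */ \
			if (debug_lockdep_rcu_enabled() &&		\
			    !__warned && (c)) {				\
				__warned = true;			\
				lockdep_rcu_suspicious(__FILE__,	\
						       __LINE__, s);	\
			}						\
		} while (0)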
 
 /*
@@ -701,7 +698,7 @@ void rcu_idle_enter(void)
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
 /**
  * rcu_user_enter - inform RCU that we are resuming userspace.
  *
@@ -714,7 +711,7 @@ void rcu_user_enter(void)
 {
        rcu_eqs_enter(1);
 }
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
 
 /**
  * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +825,7 @@ void rcu_idle_exit(void)
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
 
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
 /**
  * rcu_user_exit - inform RCU that we are exiting userspace.
  *
@@ -839,7 +836,7 @@ void rcu_user_exit(void)
 {
        rcu_eqs_exit(1);
 }
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
 
 /**
  * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void)
 {
        bool ret;
 
-       preempt_disable();
+       preempt_disable_notrace();
        ret = __rcu_is_watching();
-       preempt_enable();
+       preempt_enable_notrace();
        return ret;
 }
 EXPORT_SYMBOL_GPL(rcu_is_watching);
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
        j = jiffies;
        gpa = READ_ONCE(rsp->gp_activity);
        if (j - gpa > 2 * HZ)
-               pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+               pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
                       rsp->name, j - gpa,
-                      rsp->gpnum, rsp->completed, rsp->gp_flags);
+                      rsp->gpnum, rsp->completed,
+                      rsp->gp_flags, rsp->gp_state,
+                      rsp->gp_kthread ? rsp->gp_kthread->state : 0);
 }
 
 /*
@@ -1905,6 +1904,26 @@ static int rcu_gp_init(struct rcu_state *rsp)
        return 1;
 }
 
+/*
+ * Helper function for wait_event_interruptible_timeout() wakeup
+ * at force-quiescent-state time.
+ */
+static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
+{
+       struct rcu_node *rnp = rcu_get_root(rsp);
+
+       /* Someone like call_rcu() requested a force-quiescent-state scan. */
+       *gfp = READ_ONCE(rsp->gp_flags);
+       if (*gfp & RCU_GP_FLAG_FQS)
+               return true;
+
+       /* The current grace period has completed. */
+       if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+               return true;
+
+       return false;
+}
+
 /*
  * Do one round of quiescent-state forcing.
  */
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                        wait_event_interruptible(rsp->gp_wq,
                                                 READ_ONCE(rsp->gp_flags) &
                                                 RCU_GP_FLAG_INIT);
+                       rsp->gp_state = RCU_GP_DONE_GPS;
                        /* Locking provides needed memory barrier. */
                        if (rcu_gp_init(rsp))
                                break;
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                               TPS("fqswait"));
                        rsp->gp_state = RCU_GP_WAIT_FQS;
                        ret = wait_event_interruptible_timeout(rsp->gp_wq,
-                                       ((gf = READ_ONCE(rsp->gp_flags)) &
-                                        RCU_GP_FLAG_FQS) ||
-                                       (!READ_ONCE(rnp->qsmask) &&
-                                        !rcu_preempt_blocked_readers_cgp(rnp)),
-                                       j);
+                                       rcu_gp_fqs_check_wake(rsp, &gf), j);
+                       rsp->gp_state = RCU_GP_DOING_FQS;
                        /* Locking provides needed memory barriers. */
                        /* If grace period done, leave loop. */
                        if (!READ_ONCE(rnp->qsmask) &&
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
                }
 
                /* Handle grace-period end. */
+               rsp->gp_state = RCU_GP_CLEANUP;
                rcu_gp_cleanup(rsp);
+               rsp->gp_state = RCU_GP_CLEANED;
        }
 }
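
The ->gp_state assignments sprinkled through this function feed the starvation report above, recording where the grace-period kthread last made progress. Assuming the companion tree.h patch defines the progression along these lines (names taken from this diff; numeric values illustrative):

	/* Grace-period kthread states, in order of progression (sketch). */
	#define RCU_GP_WAIT_GPS  1	/* Wait for grace-period start. */
	#define RCU_GP_DONE_GPS  2	/* Wait for GP start is done. */
	#define RCU_GP_WAIT_FQS  3	/* Wait for force-quiescent-state time. */
	#define RCU_GP_DOING_FQS 4	/* Wait for FQS time is done. */
	#define RCU_GP_CLEANUP   5	/* Grace-period cleanup started. */
	#define RCU_GP_CLEANED   6	/* Grace-period cleanup complete. */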
 
@@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void)
  */
 void synchronize_sched(void)
 {
-       rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-                          !lock_is_held(&rcu_lock_map) &&
-                          !lock_is_held(&rcu_sched_lock_map),
-                          "Illegal synchronize_sched() in RCU-sched read-side critical section");
+       RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+                        lock_is_held(&rcu_lock_map) ||
+                        lock_is_held(&rcu_sched_lock_map),
+                        "Illegal synchronize_sched() in RCU-sched read-side critical section");
        if (rcu_blocking_is_gp())
                return;
        if (rcu_gp_is_expedited())
@@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
  */
 void synchronize_rcu_bh(void)
 {
-       rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-                          !lock_is_held(&rcu_lock_map) &&
-                          !lock_is_held(&rcu_sched_lock_map),
-                          "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+       RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+                        lock_is_held(&rcu_lock_map) ||
+                        lock_is_held(&rcu_sched_lock_map),
+                        "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
        if (rcu_blocking_is_gp())
                return;
        if (rcu_gp_is_expedited())
@@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
 {
        /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
+        * Any prior manipulation of RCU-protected data must happen
+        * before the load from ->gpnum.
+        */
+       smp_mb();  /* ^^^ */
+
+       /*
+        * Make sure this load happens before the purportedly
+        * time-consuming work between get_state_synchronize_sched()
+        * and cond_synchronize_sched().
+        */
+       return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return.  Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.  But
+ * counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+       unsigned long newstate;
+
+       /*
+        * Ensure that this load happens before any RCU-destructive
+        * actions the caller might carry out after we return.
         */
-       smp_mb(); /* See above comment block. */
+       newstate = smp_load_acquire(&rcu_sched_state.completed);
+       if (ULONG_CMP_GE(oldstate, newstate))
+               synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
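
A sketch of the intended usage pattern for this pair, with hypothetical caller code: snapshot first, do the slow work, and pay for a grace period only if none elapsed on its own in the meantime.

	/* Hypothetical caller; helper names are placeholders. */
	unsigned long gp_snap;

	remove_object(p);			/* Unpublish the object. */
	gp_snap = get_state_synchronize_sched();
	do_something_time_consuming();		/* Unrelated slow work. */
	cond_synchronize_sched(gp_snap);	/* Often a no-op by now. */
	kfree(p);				/* Pre-existing readers are done. */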
+
+/* Adjust sequence number for start of update-side operation. */
+static void rcu_seq_start(unsigned long *sp)
+{
+       WRITE_ONCE(*sp, *sp + 1);
+       smp_mb(); /* Ensure update-side operation after counter increment. */
+       WARN_ON_ONCE(!(*sp & 0x1));
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static void rcu_seq_end(unsigned long *sp)
+{
+       smp_mb(); /* Ensure update-side operation before counter increment. */
+       WRITE_ONCE(*sp, *sp + 1);
+       WARN_ON_ONCE(*sp & 0x1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static unsigned long rcu_seq_snap(unsigned long *sp)
+{
+       unsigned long s;
+
+       smp_mb(); /* Caller's modifications seen first by other CPUs. */
+       s = (READ_ONCE(*sp) + 3) & ~0x1;
+       smp_mb(); /* Above access must not bleed into critical section. */
+       return s;
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+       return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
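
The bottom bit therefore means "update in progress", and rcu_seq_snap() returns the first counter value guaranteeing a full update after the caller's modifications. A worked example with hypothetical counter values:

	/*
	 * *sp == 4 (even, idle):        snap = (4 + 3) & ~0x1 = 6.
	 *   One full operation (4 -> 5 at start, 5 -> 6 at end) makes
	 *   rcu_seq_done() return true.
	 * *sp == 5 (odd, in progress):  snap = (5 + 3) & ~0x1 = 8.
	 *   The in-flight operation (ending at 6) may predate the
	 *   caller's updates, so a further full operation
	 *   (6 -> 7 -> 8) is required.
	 */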
+
+/* Wrapper functions for expedited grace periods.  */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+       rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+       rcu_seq_end(&rsp->expedited_sequence);
+       smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+       return rcu_seq_snap(&rsp->expedited_sequence);
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+       return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
+                              struct rcu_data *rdp,
+                              atomic_long_t *stat, unsigned long s)
+{
+       if (rcu_exp_gp_seq_done(rsp, s)) {
+               if (rnp)
+                       mutex_unlock(&rnp->exp_funnel_mutex);
+               else if (rdp)
+                       mutex_unlock(&rdp->exp_funnel_mutex);
+               /* Ensure test happens before caller kfree(). */
+               smp_mb__before_atomic(); /* ^^^ */
+               atomic_long_inc(stat);
+               return true;
+       }
+       return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods.  Returns a
+ * pointer to the root rcu_node structure, or NULL if some other
+ * task did the expedited grace period for us.
+ */
+static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+       struct rcu_data *rdp;
+       struct rcu_node *rnp0;
+       struct rcu_node *rnp1 = NULL;
+
+       /*
+        * First try directly acquiring the root lock in order to reduce
+        * latency in the common case where expedited grace periods are
+        * rare.  We check mutex_is_locked() to avoid pathological levels of
+        * memory contention on ->exp_funnel_mutex in the heavy-load case.
+        */
+       rnp0 = rcu_get_root(rsp);
+       if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
+               if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
+                       if (sync_exp_work_done(rsp, rnp0, NULL,
+                                              &rsp->expedited_workdone0, s))
+                               return NULL;
+                       return rnp0;
+               }
+       }
+
+       /*
+        * Each pass through the following loop works its way up the
+        * rcu_node tree, returning early if some other task has already
+        * done our work, and otherwise falling through while holding the
+        * root rcu_node's ->exp_funnel_mutex.  The mapping from CPU to
+        * rcu_node structure can be inexact, as it is just promoting
+        * locality and is not strictly needed for correctness.
+        */
+       rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+       if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+               return NULL;
+       mutex_lock(&rdp->exp_funnel_mutex);
+       rnp0 = rdp->mynode;
+       for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+               if (sync_exp_work_done(rsp, rnp1, rdp,
+                                      &rsp->expedited_workdone2, s))
+                       return NULL;
+               mutex_lock(&rnp0->exp_funnel_mutex);
+               if (rnp1)
+                       mutex_unlock(&rnp1->exp_funnel_mutex);
+               else
+                       mutex_unlock(&rdp->exp_funnel_mutex);
+               rnp1 = rnp0;
+       }
+       if (sync_exp_work_done(rsp, rnp1, rdp,
+                              &rsp->expedited_workdone3, s))
+               return NULL;
+       return rnp1;
+}
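
Ignoring the work-done checks, the loop above is hand-over-hand locking up the tree: at most one funnel mutex is held at any instant. A sketch of one task's walk through a two-level tree (hypothetical):

	/*
	 *   mutex_lock(&rdp->exp_funnel_mutex);	// per-CPU level
	 *   mutex_lock(&leaf->exp_funnel_mutex);	// leaf rcu_node
	 *   mutex_unlock(&rdp->exp_funnel_mutex);
	 *   mutex_lock(&root->exp_funnel_mutex);	// root rcu_node
	 *   mutex_unlock(&leaf->exp_funnel_mutex);
	 *   // ... return holding root->exp_funnel_mutex.
	 *
	 * Concurrent requests thus queue ("funnel") at different levels
	 * rather than all hammering the root mutex, and any waiter that
	 * observes rcu_exp_gp_seq_done() bails out without ever reaching
	 * the root.
	 */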
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+       struct rcu_data *rdp = data;
+       struct rcu_state *rsp = rdp->rsp;
+
+       /* We are here: If we are last, do the wakeup. */
+       rdp->exp_done = true;
+       if (atomic_dec_and_test(&rsp->expedited_need_qs))
+               wake_up(&rsp->expedited_wq);
        return 0;
 }
 
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+       int cpu;
+       unsigned long jiffies_stall;
+       unsigned long jiffies_start;
+       struct rcu_data *rdp;
+       int ret;
+
+       jiffies_stall = rcu_jiffies_till_stall_check();
+       jiffies_start = jiffies;
+
+       for (;;) {
+               ret = wait_event_interruptible_timeout(
+                               rsp->expedited_wq,
+                               !atomic_read(&rsp->expedited_need_qs),
+                               jiffies_stall);
+               if (ret > 0)
+                       return;
+               if (ret < 0) {
+                       /* Hit a signal, disable CPU stall warnings. */
+                       wait_event(rsp->expedited_wq,
+                                  !atomic_read(&rsp->expedited_need_qs));
+                       return;
+               }
+               pr_err("INFO: %s detected expedited stalls on CPUs: {",
+                      rsp->name);
+               for_each_online_cpu(cpu) {
+                       rdp = per_cpu_ptr(rsp->rda, cpu);
+
+                       if (rdp->exp_done)
+                               continue;
+                       pr_cont(" %d", cpu);
+               }
+               pr_cont(" } %lu jiffies s: %lu\n",
+                       jiffies - jiffies_start, rsp->expedited_sequence);
+               for_each_online_cpu(cpu) {
+                       rdp = per_cpu_ptr(rsp->rda, cpu);
+
+                       if (rdp->exp_done)
+                               continue;
+                       dump_cpu_task(cpu);
+               }
+               jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+       }
+}
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * restructure your code to batch your updates, and then use a single
  * synchronize_sched() instead.
  *
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word.  Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs.  If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period.  We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done.  If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot.  In this case, our work is
- * done for us, and we can simply return.  Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
  */
 void synchronize_sched_expedited(void)
 {
-       cpumask_var_t cm;
-       bool cma = false;
        int cpu;
-       long firstsnap, s, snap;
-       int trycount = 0;
+       unsigned long s;
+       struct rcu_node *rnp;
        struct rcu_state *rsp = &rcu_sched_state;
 
-       /*
-        * If we are in danger of counter wrap, just do synchronize_sched().
-        * By allowing sync_sched_expedited_started to advance no more than
-        * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
-        * that more than 3.5 billion CPUs would be required to force a
-        * counter wrap on a 32-bit system.  Quite a few more CPUs would of
-        * course be required on a 64-bit system.
-        */
-       if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
-                        (ulong)atomic_long_read(&rsp->expedited_done) +
-                        ULONG_MAX / 8)) {
-               wait_rcu_gp(call_rcu_sched);
-               atomic_long_inc(&rsp->expedited_wrap);
-               return;
-       }
+       /* Take a snapshot of the sequence number.  */
+       s = rcu_exp_gp_seq_snap(rsp);
 
-       /*
-        * Take a ticket.  Note that atomic_inc_return() implies a
-        * full memory barrier.
-        */
-       snap = atomic_long_inc_return(&rsp->expedited_start);
-       firstsnap = snap;
        if (!try_get_online_cpus()) {
                /* CPU hotplug operation in flight, fall back to normal GP. */
                wait_rcu_gp(call_rcu_sched);
@@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void)
        }
        WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-       /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-       cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-       if (cma) {
-               cpumask_copy(cm, cpu_online_mask);
-               cpumask_clear_cpu(raw_smp_processor_id(), cm);
-               for_each_cpu(cpu, cm) {
-                       struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-                       if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-                               cpumask_clear_cpu(cpu, cm);
-               }
-               if (cpumask_weight(cm) == 0)
-                       goto all_cpus_idle;
+       rnp = exp_funnel_lock(rsp, s);
+       if (rnp == NULL) {
+               put_online_cpus();
+               return;  /* Someone else did our work for us. */
        }
 
-       /*
-        * Each pass through the following loop attempts to force a
-        * context switch on each CPU.
-        */
-       while (try_stop_cpus(cma ? cm : cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               atomic_long_inc(&rsp->expedited_tryfail);
-
-               /* Check to see if someone else did our work for us. */
-               s = atomic_long_read(&rsp->expedited_done);
-               if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-                       /* ensure test happens before caller kfree */
-                       smp_mb__before_atomic(); /* ^^^ */
-                       atomic_long_inc(&rsp->expedited_workdone1);
-                       free_cpumask_var(cm);
-                       return;
-               }
+       rcu_exp_gp_seq_start(rsp);
 
-               /* No joy, try again later.  Or just synchronize_sched(). */
-               if (trycount++ < 10) {
-                       udelay(trycount * num_online_cpus());
-               } else {
-                       wait_rcu_gp(call_rcu_sched);
-                       atomic_long_inc(&rsp->expedited_normal);
-                       free_cpumask_var(cm);
-                       return;
-               }
+       /* Stop each CPU that is online, non-idle, and not us. */
+       init_waitqueue_head(&rsp->expedited_wq);
+       atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
+       for_each_online_cpu(cpu) {
+               struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+               struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
-               /* Recheck to see if someone else did our work for us. */
-               s = atomic_long_read(&rsp->expedited_done);
-               if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-                       /* ensure test happens before caller kfree */
-                       smp_mb__before_atomic(); /* ^^^ */
-                       atomic_long_inc(&rsp->expedited_workdone2);
-                       free_cpumask_var(cm);
-                       return;
-               }
+               rdp->exp_done = false;
 
-               /*
-                * Refetching sync_sched_expedited_started allows later
-                * callers to piggyback on our grace period.  We retry
-                * after they started, so our grace period works for them,
-                * and they started after our first try, so their grace
-                * period works for us.
-                */
-               if (!try_get_online_cpus()) {
-                       /* CPU hotplug operation in flight, use normal GP. */
-                       wait_rcu_gp(call_rcu_sched);
-                       atomic_long_inc(&rsp->expedited_normal);
-                       free_cpumask_var(cm);
-                       return;
-               }
-               snap = atomic_long_read(&rsp->expedited_start);
-               smp_mb(); /* ensure read is before try_stop_cpus(). */
+               /* Skip our CPU and any idle CPUs. */
+               if (raw_smp_processor_id() == cpu ||
+                   !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+                       continue;
+               atomic_inc(&rsp->expedited_need_qs);
+               stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+                                   rdp, &rdp->exp_stop_work);
        }
-       atomic_long_inc(&rsp->expedited_stoppedcpus);
 
-all_cpus_idle:
-       free_cpumask_var(cm);
+       /* Remove extra count and, if necessary, wait for CPUs to stop. */
+       if (!atomic_dec_and_test(&rsp->expedited_need_qs))
+               synchronize_sched_expedited_wait(rsp);
 
-       /*
-        * Everyone up to our most recent fetch is covered by our grace
-        * period.  Update the counter, but only if our work is still
-        * relevant -- which it won't be if someone who started later
-        * than we did already did their update.
-        */
-       do {
-               atomic_long_inc(&rsp->expedited_done_tries);
-               s = atomic_long_read(&rsp->expedited_done);
-               if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
-                       /* ensure test happens before caller kfree */
-                       smp_mb__before_atomic(); /* ^^^ */
-                       atomic_long_inc(&rsp->expedited_done_lost);
-                       break;
-               }
-       } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
-       atomic_long_inc(&rsp->expedited_done_exit);
+       rcu_exp_gp_seq_end(rsp);
+       mutex_unlock(&rnp->exp_funnel_mutex);
 
        put_online_cpus();
 }
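
The extra count set just before the stop-work loop is what makes the wakeup race-free; a hypothetical timeline with two stopped CPUs:

	/*
	 *   atomic_set(..., 1);		// need_qs = 1 (extra count)
	 *   queue stop work on CPU a, b	// need_qs = 3
	 *   CPU a reports			// need_qs = 2
	 *   initiator drops extra count	// need_qs = 1, so it waits
	 *   CPU b reports			// need_qs = 0 -> wake_up()
	 *
	 * Without the initial count of one, CPU a's decrement could reach
	 * zero, and the wakeup fire, while stop requests were still being
	 * queued for other CPUs.
	 */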
@@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
        struct rcu_state *rsp = rdp->rsp;
 
        if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
-               _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+               _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
                complete(&rsp->barrier_completion);
        } else {
-               _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+               _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
        }
 }
 
@@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type)
        struct rcu_state *rsp = type;
        struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
 
-       _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+       _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
        atomic_inc(&rsp->barrier_cpu_count);
        rsp->call(&rdp->barrier_head, rcu_barrier_callback);
 }
@@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
 {
        int cpu;
        struct rcu_data *rdp;
-       unsigned long snap = READ_ONCE(rsp->n_barrier_done);
-       unsigned long snap_done;
+       unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
 
-       _rcu_barrier_trace(rsp, "Begin", -1, snap);
+       _rcu_barrier_trace(rsp, "Begin", -1, s);
 
        /* Take mutex to serialize concurrent rcu_barrier() requests. */
        mutex_lock(&rsp->barrier_mutex);
 
-       /*
-        * Ensure that all prior references, including to ->n_barrier_done,
-        * are ordered before the _rcu_barrier() machinery.
-        */
-       smp_mb();  /* See above block comment. */
-
-       /*
-        * Recheck ->n_barrier_done to see if others did our work for us.
-        * This means checking ->n_barrier_done for an even-to-odd-to-even
-        * transition.  The "if" expression below therefore rounds the old
-        * value up to the next even number and adds two before comparing.
-        */
-       snap_done = rsp->n_barrier_done;
-       _rcu_barrier_trace(rsp, "Check", -1, snap_done);
-
-       /*
-        * If the value in snap is odd, we needed to wait for the current
-        * rcu_barrier() to complete, then wait for the next one, in other
-        * words, we need the value of snap_done to be three larger than
-        * the value of snap.  On the other hand, if the value in snap is
-        * even, we only had to wait for the next rcu_barrier() to complete,
-        * in other words, we need the value of snap_done to be only two
-        * greater than the value of snap.  The "(snap + 3) & ~0x1" computes
-        * this for us (thank you, Linus!).
-        */
-       if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
-               _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+       /* Did someone else do our work for us? */
+       if (rcu_seq_done(&rsp->barrier_sequence, s)) {
+               _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
                smp_mb(); /* caller's subsequent code after above check. */
                mutex_unlock(&rsp->barrier_mutex);
                return;
        }
 
-       /*
-        * Increment ->n_barrier_done to avoid duplicate work.  Use
-        * WRITE_ONCE() to prevent the compiler from speculating
-        * the increment to precede the early-exit check.
-        */
-       WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
-       WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
-       _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
-       smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+       /* Mark the start of the barrier operation. */
+       rcu_seq_start(&rsp->barrier_sequence);
+       _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
 
        /*
         * Initialize the count to one rather than to zero in order to
@@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
                if (rcu_is_nocb_cpu(cpu)) {
                        if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
                                _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
-                                                  rsp->n_barrier_done);
+                                                  rsp->barrier_sequence);
                        } else {
                                _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
-                                                  rsp->n_barrier_done);
+                                                  rsp->barrier_sequence);
                                smp_mb__before_atomic();
                                atomic_inc(&rsp->barrier_cpu_count);
                                __call_rcu(&rdp->barrier_head,
@@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
                        }
                } else if (READ_ONCE(rdp->qlen)) {
                        _rcu_barrier_trace(rsp, "OnlineQ", cpu,
-                                          rsp->n_barrier_done);
+                                          rsp->barrier_sequence);
                        smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
                } else {
                        _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
-                                          rsp->n_barrier_done);
+                                          rsp->barrier_sequence);
                }
        }
        put_online_cpus();
@@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
        if (atomic_dec_and_test(&rsp->barrier_cpu_count))
                complete(&rsp->barrier_completion);
 
-       /* Increment ->n_barrier_done to prevent duplicate work. */
-       smp_mb(); /* Keep increment after above mechanism. */
-       WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
-       WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
-       _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
-       smp_mb(); /* Keep increment before caller's subsequent code. */
-
        /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
        wait_for_completion(&rsp->barrier_completion);
 
+       /* Mark the end of the barrier operation. */
+       _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+       rcu_seq_end(&rsp->barrier_sequence);
+
        /* Other rcu_barrier() invocations can now safely proceed. */
        mutex_unlock(&rsp->barrier_mutex);
 }
@@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
        rdp->cpu = cpu;
        rdp->rsp = rsp;
+       mutex_init(&rdp->exp_funnel_mutex);
        rcu_boot_init_nocb_percpu_data(rdp);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void)
  * Compute the per-level fanout, either using the exact fanout specified
  * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
  */
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
+static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
 {
        int i;
 
        if (rcu_fanout_exact) {
-               rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+               levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
                for (i = rcu_num_lvls - 2; i >= 0; i--)
-                       rsp->levelspread[i] = RCU_FANOUT;
+                       levelspread[i] = RCU_FANOUT;
        } else {
                int ccur;
                int cprv;
 
                cprv = nr_cpu_ids;
                for (i = rcu_num_lvls - 1; i >= 0; i--) {
-                       ccur = rsp->levelcnt[i];
-                       rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+                       ccur = levelcnt[i];
+                       levelspread[i] = (cprv + ccur - 1) / ccur;
                        cprv = ccur;
                }
        }
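
A worked example of the balanced (!rcu_fanout_exact) case, with hypothetical numbers: nr_cpu_ids = 100 and levelcnt = { 1, 7 } yields, working from the leaves rootward:

	/*
	 *   levelspread[1] = (100 + 7 - 1) / 7 = 15	// CPUs per leaf
	 *   levelspread[0] = (7 + 1 - 1) / 1   = 7	// leaves under root
	 *
	 * That is, seven leaf rcu_node structures of up to 15 CPUs each
	 * hang directly off the root.
	 */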
@@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 static void __init rcu_init_one(struct rcu_state *rsp,
                struct rcu_data __percpu *rda)
 {
-       static const char * const buf[] = {
-               "rcu_node_0",
-               "rcu_node_1",
-               "rcu_node_2",
-               "rcu_node_3" };  /* Match MAX_RCU_LVLS */
-       static const char * const fqs[] = {
-               "rcu_node_fqs_0",
-               "rcu_node_fqs_1",
-               "rcu_node_fqs_2",
-               "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */
+       static const char * const buf[] = RCU_NODE_NAME_INIT;
+       static const char * const fqs[] = RCU_FQS_NAME_INIT;
+       static const char * const exp[] = RCU_EXP_NAME_INIT;
+       static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
        static u8 fl_mask = 0x1;
+
+       int levelcnt[RCU_NUM_LVLS];             /* # nodes in each level. */
+       int levelspread[RCU_NUM_LVLS];          /* kids/node in each level. */
        int cpustride = 1;
        int i;
        int j;
        struct rcu_node *rnp;
 
-       BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
+       BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
 
        /* Silence gcc 4.8 false positive about array index out of range. */
        if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
@@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,
        /* Initialize the level-tracking arrays. */
 
        for (i = 0; i < rcu_num_lvls; i++)
-               rsp->levelcnt[i] = num_rcu_lvl[i];
+               levelcnt[i] = num_rcu_lvl[i];
        for (i = 1; i < rcu_num_lvls; i++)
-               rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
-       rcu_init_levelspread(rsp);
+               rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
+       rcu_init_levelspread(levelspread, levelcnt);
        rsp->flavor_mask = fl_mask;
        fl_mask <<= 1;
 
        /* Initialize the elements themselves, starting from the leaves. */
 
        for (i = rcu_num_lvls - 1; i >= 0; i--) {
-               cpustride *= rsp->levelspread[i];
+               cpustride *= levelspread[i];
                rnp = rsp->level[i];
-               for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+               for (j = 0; j < levelcnt[i]; j++, rnp++) {
                        raw_spin_lock_init(&rnp->lock);
                        lockdep_set_class_and_name(&rnp->lock,
                                                   &rcu_node_class[i], buf[i]);
@@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp,
                                rnp->grpmask = 0;
                                rnp->parent = NULL;
                        } else {
-                               rnp->grpnum = j % rsp->levelspread[i - 1];
+                               rnp->grpnum = j % levelspread[i - 1];
                                rnp->grpmask = 1UL << rnp->grpnum;
                                rnp->parent = rsp->level[i - 1] +
-                                             j / rsp->levelspread[i - 1];
+                                             j / levelspread[i - 1];
                        }
                        rnp->level = i;
                        INIT_LIST_HEAD(&rnp->blkd_tasks);
                        rcu_init_one_nocb(rnp);
+                       mutex_init(&rnp->exp_funnel_mutex);
+                       if (rsp == &rcu_sched_state)
+                               lockdep_set_class_and_name(
+                                       &rnp->exp_funnel_mutex,
+                                       &rcu_exp_sched_class[i], exp_sched[i]);
+                       else
+                               lockdep_set_class_and_name(
+                                       &rnp->exp_funnel_mutex,
+                                       &rcu_exp_class[i], exp[i]);
                }
        }
 
@@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void)
 {
        ulong d;
        int i;
-       int j;
-       int n = nr_cpu_ids;
-       int rcu_capacity[MAX_RCU_LVLS + 1];
+       int rcu_capacity[RCU_NUM_LVLS];
 
        /*
         * Initialize any unspecified boot parameters.
@@ -4100,48 +4215,50 @@ static void __init rcu_init_geometry(void)
        pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
                rcu_fanout_leaf, nr_cpu_ids);
 
-       /*
-        * Compute number of nodes that can be handled an rcu_node tree
-        * with the given number of levels.  Setting rcu_capacity[0] makes
-        * some of the arithmetic easier.
-        */
-       rcu_capacity[0] = 1;
-       rcu_capacity[1] = rcu_fanout_leaf;
-       for (i = 2; i <= MAX_RCU_LVLS; i++)
-               rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
-
        /*
         * The boot-time rcu_fanout_leaf parameter is only permitted
         * to increase the leaf-level fanout, not decrease it.  Of course,
         * the leaf-level fanout cannot exceed the number of bits in
-        * the rcu_node masks.  Finally, the tree must be able to accommodate
-        * the configured number of CPUs.  Complain and fall back to the
-        * compile-time values if these limits are exceeded.
+        * the rcu_node masks.  Complain and fall back to the compile-
+        * time values if these limits are exceeded.
         */
        if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
-           rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
-           n > rcu_capacity[MAX_RCU_LVLS]) {
+           rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+               rcu_fanout_leaf = RCU_FANOUT_LEAF;
                WARN_ON(1);
                return;
        }
 
+       /*
+        * Compute the number of nodes that can be handled by an
+        * rcu_node tree with the given number of levels.
+        */
+       rcu_capacity[0] = rcu_fanout_leaf;
+       for (i = 1; i < RCU_NUM_LVLS; i++)
+               rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
+
+       /*
+        * The tree must be able to accommodate the configured number of CPUs.
+        * If this limit is exceeded then we have a serious problem elsewhere.
+        */
+       if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
+               panic("rcu_init_geometry: rcu_capacity[] is too small");
+
+       /* Calculate the number of levels in the tree. */
+       for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
+       }
+       rcu_num_lvls = i + 1;
+
        /* Calculate the number of rcu_nodes at each level of the tree. */
-       for (i = 1; i <= MAX_RCU_LVLS; i++)
-               if (n <= rcu_capacity[i]) {
-                       for (j = 0; j <= i; j++)
-                               num_rcu_lvl[j] =
-                                       DIV_ROUND_UP(n, rcu_capacity[i - j]);
-                       rcu_num_lvls = i;
-                       for (j = i + 1; j <= MAX_RCU_LVLS; j++)
-                               num_rcu_lvl[j] = 0;
-                       break;
-               }
+       for (i = 0; i < rcu_num_lvls; i++) {
+               int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
+               num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
+       }
 
        /* Calculate the total number of rcu_node structures. */
        rcu_num_nodes = 0;
-       for (i = 0; i <= MAX_RCU_LVLS; i++)
+       for (i = 0; i < rcu_num_lvls; i++)
                rcu_num_nodes += num_rcu_lvl[i];
-       rcu_num_nodes -= n;
 }
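
To make the new geometry computation concrete, a worked example with hypothetical parameters rcu_fanout_leaf = 16, RCU_FANOUT = 64, nr_cpu_ids = 100:

	/*
	 *   rcu_capacity[] = { 16, 1024 }
	 *   100 > 16 but 100 <= 1024		=> rcu_num_lvls = 2
	 *   num_rcu_lvl[0] = DIV_ROUND_UP(100, 1024) = 1	// root
	 *   num_rcu_lvl[1] = DIV_ROUND_UP(100, 16)   = 7	// leaves
	 *   rcu_num_nodes  = 1 + 7 = 8
	 */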
 
 /*