bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI
authorSong Liu <songliubraving@fb.com>
Mon, 5 Oct 2020 16:58:38 +0000 (09:58 -0700)
committerDaniel Borkmann <daniel@iogearbox.net>
Mon, 5 Oct 2020 22:04:11 +0000 (00:04 +0200)
Recent improvements in LOCKDEP highlighted a potential A-A deadlock with
pcpu_freelist in NMI:

./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi

[   18.984807] ================================
[   18.984807] WARNING: inconsistent lock state
[   18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted
[   18.984809] --------------------------------
[   18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[   18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes:
[   18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180
[   18.984813] {INITIAL USE} state was registered at:
[   18.984814]   lock_acquire+0x175/0x7c0
[   18.984814]   _raw_spin_lock+0x2c/0x40
[   18.984815]   __pcpu_freelist_pop+0xe3/0x180
[   18.984815]   pcpu_freelist_pop+0x31/0x40
[   18.984816]   htab_map_alloc+0xbbf/0xf40
[   18.984816]   __do_sys_bpf+0x5aa/0x3ed0
[   18.984817]   do_syscall_64+0x2d/0x40
[   18.984818]   entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   18.984818] irq event stamp: 12
[...]
[   18.984822] other info that might help us debug this:
[   18.984823]  Possible unsafe locking scenario:
[   18.984823]
[   18.984824]        CPU0
[   18.984824]        ----
[   18.984824]   lock(&head->lock);
[   18.984826]   <Interrupt>
[   18.984826]     lock(&head->lock);
[   18.984827]
[   18.984828]  *** DEADLOCK ***
[   18.984828]
[   18.984829] 2 locks held by test_progs/1990:
[...]
[   18.984838]  <NMI>
[   18.984838]  dump_stack+0x9a/0xd0
[   18.984839]  lock_acquire+0x5c9/0x7c0
[   18.984839]  ? lock_release+0x6f0/0x6f0
[   18.984840]  ? __pcpu_freelist_pop+0xe3/0x180
[   18.984840]  _raw_spin_lock+0x2c/0x40
[   18.984841]  ? __pcpu_freelist_pop+0xe3/0x180
[   18.984841]  __pcpu_freelist_pop+0xe3/0x180
[   18.984842]  pcpu_freelist_pop+0x17/0x40
[   18.984842]  ? lock_release+0x6f0/0x6f0
[   18.984843]  __bpf_get_stackid+0x534/0xaf0
[   18.984843]  bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350
[   18.984844]  bpf_overflow_handler+0x12f/0x3f0

This is because pcpu_freelist_head.lock is accessed in both NMI and
non-NMI context. Fix this issue by using raw_spin_trylock() in NMI.

Since NMI interrupts non-NMI context, when NMI context tries to lock the
raw_spinlock, non-NMI context of the same CPU may already have locked a
lock and is blocked from unlocking the lock. For a system with N CPUs,
there could be N NMIs at the same time, and they may block N non-NMI
raw_spinlocks. This is tricky for pcpu_freelist_push(), where unlike
_pop(), failing _push() means leaking memory. This issue is more likely to
trigger in non-SMP system.

Fix this issue with an extra list, pcpu_freelist.extralist. The extralist
is primarily used to take _push() when raw_spin_trylock() failed on all
the per CPU lists. It should be empty most of the time. The following
table summarizes the behavior of pcpu_freelist in NMI and non-NMI:

non-NMI pop():  use _lock(); check per CPU lists first;
                if all per CPU lists are empty, check extralist;
                if extralist is empty, return NULL.

non-NMI push(): use _lock(); only push to per CPU lists.

NMI pop():    use _trylock(); check per CPU lists first;
              if all per CPU lists are locked or empty, check extralist;
              if extralist is locked or empty, return NULL.

NMI push():   use _trylock(); check per CPU lists first;
              if all per CPU lists are locked; try push to extralist;
              if extralist is also locked, keep trying on per CPU lists.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@fb.com
kernel/bpf/percpu_freelist.c
kernel/bpf/percpu_freelist.h

index b367430e611c7d493b478c3507ef4973fd63cc1e..3d897de8906121cfa95a1c34a0b8170f02fba4f8 100644 (file)
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
                raw_spin_lock_init(&head->lock);
                head->first = NULL;
        }
+       raw_spin_lock_init(&s->extralist.lock);
+       s->extralist.first = NULL;
        return 0;
 }
 
@@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
        raw_spin_unlock(&head->lock);
 }
 
+static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
+                                               struct pcpu_freelist_node *node)
+{
+       if (!raw_spin_trylock(&s->extralist.lock))
+               return false;
+
+       pcpu_freelist_push_node(&s->extralist, node);
+       raw_spin_unlock(&s->extralist.lock);
+       return true;
+}
+
+static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
+                                            struct pcpu_freelist_node *node)
+{
+       int cpu, orig_cpu;
+
+       orig_cpu = cpu = raw_smp_processor_id();
+       while (1) {
+               struct pcpu_freelist_head *head;
+
+               head = per_cpu_ptr(s->freelist, cpu);
+               if (raw_spin_trylock(&head->lock)) {
+                       pcpu_freelist_push_node(head, node);
+                       raw_spin_unlock(&head->lock);
+                       return;
+               }
+               cpu = cpumask_next(cpu, cpu_possible_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = 0;
+
+               /* cannot lock any per cpu lock, try extralist */
+               if (cpu == orig_cpu &&
+                   pcpu_freelist_try_push_extra(s, node))
+                       return;
+       }
+}
+
 void __pcpu_freelist_push(struct pcpu_freelist *s,
                        struct pcpu_freelist_node *node)
 {
-       struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
-
-       ___pcpu_freelist_push(head, node);
+       if (in_nmi())
+               ___pcpu_freelist_push_nmi(s, node);
+       else
+               ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
 }
 
 void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -81,7 +121,7 @@ again:
        }
 }
 
-struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 {
        struct pcpu_freelist_head *head;
        struct pcpu_freelist_node *node;
@@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
                if (cpu >= nr_cpu_ids)
                        cpu = 0;
                if (cpu == orig_cpu)
-                       return NULL;
+                       break;
+       }
+
+       /* per cpu lists are all empty, try extralist */
+       raw_spin_lock(&s->extralist.lock);
+       node = s->extralist.first;
+       if (node)
+               s->extralist.first = node->next;
+       raw_spin_unlock(&s->extralist.lock);
+       return node;
+}
+
+static struct pcpu_freelist_node *
+___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
+{
+       struct pcpu_freelist_head *head;
+       struct pcpu_freelist_node *node;
+       int orig_cpu, cpu;
+
+       orig_cpu = cpu = raw_smp_processor_id();
+       while (1) {
+               head = per_cpu_ptr(s->freelist, cpu);
+               if (raw_spin_trylock(&head->lock)) {
+                       node = head->first;
+                       if (node) {
+                               head->first = node->next;
+                               raw_spin_unlock(&head->lock);
+                               return node;
+                       }
+                       raw_spin_unlock(&head->lock);
+               }
+               cpu = cpumask_next(cpu, cpu_possible_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = 0;
+               if (cpu == orig_cpu)
+                       break;
        }
+
+       /* cannot pop from per cpu lists, try extralist */
+       if (!raw_spin_trylock(&s->extralist.lock))
+               return NULL;
+       node = s->extralist.first;
+       if (node)
+               s->extralist.first = node->next;
+       raw_spin_unlock(&s->extralist.lock);
+       return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+       if (in_nmi())
+               return ___pcpu_freelist_pop_nmi(s);
+       return ___pcpu_freelist_pop(s);
 }
 
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
index fbf8a8a2897911e437f5167b090a1724d8665286..3c76553cfe571f8e1ed427bc56154f0c7e000f4e 100644 (file)
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
 
 struct pcpu_freelist {
        struct pcpu_freelist_head __percpu *freelist;
+       struct pcpu_freelist_head extralist;
 };
 
 struct pcpu_freelist_node {