Commit | Line | Data |
---|---|---|
2cdb54c9 MCC |
1 | .. SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | ================================================= | |
4 | Using RCU hlist_nulls to protect list and objects | |
5 | ================================================= | |
6 | ||
7 | This section describes how to use hlist_nulls to | |
8 | protect read-mostly linked lists and | |
9 | objects using SLAB_TYPESAFE_BY_RCU allocations. | |
10 | ||
404147fa | 11 | Please read the basics in listRCU.rst. |
2cdb54c9 | 12 | |
2d9c318b MCC |
13 | Using 'nulls' |
14 | ============= | |
15 | ||
2cdb54c9 | 16 | Using special makers (called 'nulls') is a convenient way |
da82af04 | 17 | to solve following problem. |
2cdb54c9 | 18 | |
da82af04 PM |
19 | Without 'nulls', a typical RCU linked list managing objects which are |
20 | allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use the following | |
21 | algorithms: | |
2cdb54c9 | 22 | |
da82af04 PM |
23 | 1) Lookup algorithm |
24 | ------------------- | |
2cdb54c9 MCC |
25 | |
26 | :: | |
27 | ||
2cdb54c9 | 28 | begin: |
da82af04 | 29 | rcu_read_lock() |
2cdb54c9 MCC |
30 | obj = lockless_lookup(key); |
31 | if (obj) { | |
32 | if (!try_get_ref(obj)) // might fail for free objects | |
33 | goto begin; | |
34 | /* | |
35 | * Because a writer could delete object, and a writer could | |
36 | * reuse these object before the RCU grace period, we | |
37 | * must check key after getting the reference on object | |
38 | */ | |
39 | if (obj->key != key) { // not the object we expected | |
40 | put_ref(obj); | |
da82af04 | 41 | rcu_read_unlock(); |
2cdb54c9 MCC |
42 | goto begin; |
43 | } | |
44 | } | |
45 | rcu_read_unlock(); | |
46 | ||
47 | Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu() | |
48 | but a version with an additional memory barrier (smp_rmb()) | |
49 | ||
50 | :: | |
51 | ||
52 | lockless_lookup(key) | |
53 | { | |
54 | struct hlist_node *node, *next; | |
55 | for (pos = rcu_dereference((head)->first); | |
da82af04 PM |
56 | pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && |
57 | ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); | |
58 | pos = rcu_dereference(next)) | |
2cdb54c9 MCC |
59 | if (obj->key == key) |
60 | return obj; | |
61 | return NULL; | |
62 | } | |
63 | ||
64 | And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb():: | |
65 | ||
66 | struct hlist_node *node; | |
67 | for (pos = rcu_dereference((head)->first); | |
da82af04 PM |
68 | pos && ({ prefetch(pos->next); 1; }) && |
69 | ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); | |
70 | pos = rcu_dereference(pos->next)) | |
2cdb54c9 MCC |
71 | if (obj->key == key) |
72 | return obj; | |
73 | return NULL; | |
74 | ||
75 | Quoting Corey Minyard:: | |
76 | ||
77 | "If the object is moved from one list to another list in-between the | |
78 | time the hash is calculated and the next field is accessed, and the | |
79 | object has moved to the end of a new list, the traversal will not | |
80 | complete properly on the list it should have, since the object will | |
81 | be on the end of the new list and there's not a way to tell it's on a | |
82 | new list and restart the list traversal. I think that this can be | |
83 | solved by pre-fetching the "next" field (with proper barriers) before | |
84 | checking the key." | |
85 | ||
da82af04 PM |
86 | 2) Insertion algorithm |
87 | ---------------------- | |
2cdb54c9 MCC |
88 | |
89 | We need to make sure a reader cannot read the new 'obj->obj_next' value | |
da82af04 | 90 | and previous value of 'obj->key'. Otherwise, an item could be deleted |
2cdb54c9 | 91 | from a chain, and inserted into another chain. If new chain was empty |
da82af04 PM |
92 | before the move, 'next' pointer is NULL, and lockless reader can not |
93 | detect the fact that it missed following items in original chain. | |
2cdb54c9 MCC |
94 | |
95 | :: | |
96 | ||
97 | /* | |
da82af04 PM |
98 | * Please note that new inserts are done at the head of list, |
99 | * not in the middle or end. | |
100 | */ | |
2cdb54c9 MCC |
101 | obj = kmem_cache_alloc(...); |
102 | lock_chain(); // typically a spin_lock() | |
103 | obj->key = key; | |
da82af04 | 104 | atomic_set_release(&obj->refcnt, 1); // key before refcnt |
2cdb54c9 MCC |
105 | hlist_add_head_rcu(&obj->obj_node, list); |
106 | unlock_chain(); // typically a spin_unlock() | |
107 | ||
108 | ||
da82af04 PM |
109 | 3) Removal algorithm |
110 | -------------------- | |
111 | ||
2cdb54c9 MCC |
112 | Nothing special here, we can use a standard RCU hlist deletion. |
113 | But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused | |
114 | very very fast (before the end of RCU grace period) | |
115 | ||
116 | :: | |
117 | ||
118 | if (put_last_reference_on(obj) { | |
119 | lock_chain(); // typically a spin_lock() | |
120 | hlist_del_init_rcu(&obj->obj_node); | |
121 | unlock_chain(); // typically a spin_unlock() | |
122 | kmem_cache_free(cachep, obj); | |
123 | } | |
124 | ||
125 | ||
126 | ||
127 | -------------------------------------------------------------------------- | |
128 | ||
2d9c318b MCC |
129 | Avoiding extra smp_rmb() |
130 | ======================== | |
131 | ||
2cdb54c9 | 132 | With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup() |
da82af04 | 133 | and extra _release() in insert function. |
2cdb54c9 MCC |
134 | |
135 | For example, if we choose to store the slot number as the 'nulls' | |
136 | end-of-list marker for each slot of the hash table, we can detect | |
137 | a race (some writer did a delete and/or a move of an object | |
138 | to another chain) checking the final 'nulls' value if | |
139 | the lookup met the end of chain. If final 'nulls' value | |
140 | is not the slot number, then we must restart the lookup at | |
141 | the beginning. If the object was moved to the same chain, | |
da82af04 | 142 | then the reader doesn't care: It might occasionally |
2cdb54c9 MCC |
143 | scan the list again without harm. |
144 | ||
145 | ||
da82af04 PM |
146 | 1) lookup algorithm |
147 | ------------------- | |
2cdb54c9 MCC |
148 | |
149 | :: | |
150 | ||
151 | head = &table[slot]; | |
2cdb54c9 | 152 | begin: |
da82af04 | 153 | rcu_read_lock(); |
2cdb54c9 MCC |
154 | hlist_nulls_for_each_entry_rcu(obj, node, head, member) { |
155 | if (obj->key == key) { | |
da82af04 PM |
156 | if (!try_get_ref(obj)) { // might fail for free objects |
157 | rcu_read_unlock(); | |
2cdb54c9 | 158 | goto begin; |
da82af04 | 159 | } |
2cdb54c9 MCC |
160 | if (obj->key != key) { // not the object we expected |
161 | put_ref(obj); | |
da82af04 | 162 | rcu_read_unlock(); |
2cdb54c9 MCC |
163 | goto begin; |
164 | } | |
da82af04 PM |
165 | goto out; |
166 | } | |
167 | } | |
168 | ||
169 | // If the nulls value we got at the end of this lookup is | |
170 | // not the expected one, we must restart lookup. | |
171 | // We probably met an item that was moved to another chain. | |
172 | if (get_nulls_value(node) != slot) { | |
173 | put_ref(obj); | |
174 | rcu_read_unlock(); | |
175 | goto begin; | |
2cdb54c9 | 176 | } |
2cdb54c9 MCC |
177 | obj = NULL; |
178 | ||
179 | out: | |
180 | rcu_read_unlock(); | |
181 | ||
da82af04 PM |
182 | 2) Insert algorithm |
183 | ------------------- | |
2cdb54c9 MCC |
184 | |
185 | :: | |
186 | ||
187 | /* | |
da82af04 PM |
188 | * Please note that new inserts are done at the head of list, |
189 | * not in the middle or end. | |
190 | */ | |
2cdb54c9 MCC |
191 | obj = kmem_cache_alloc(cachep); |
192 | lock_chain(); // typically a spin_lock() | |
193 | obj->key = key; | |
da82af04 | 194 | atomic_set_release(&obj->refcnt, 1); // key before refcnt |
2cdb54c9 | 195 | /* |
da82af04 PM |
196 | * insert obj in RCU way (readers might be traversing chain) |
197 | */ | |
2cdb54c9 MCC |
198 | hlist_nulls_add_head_rcu(&obj->obj_node, list); |
199 | unlock_chain(); // typically a spin_unlock() |