Merge branch 'master' of https://github.com/celestinechen/fio
[fio.git] / lib / prio_tree.c
1 /*
2  * lib/prio_tree.c - priority search tree
3  *
4  * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5  *
6  * This file is released under the GPL v2.
7  *
8  * Based on the radix priority search tree proposed by Edward M. McCreight
9  * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10  *
11  * 02Feb2004    Initial version
12  */
13
14 #include <assert.h>
15 #include <stdlib.h>
16 #include <limits.h>
17
18 #include "../compiler/compiler.h"
19 #include "prio_tree.h"
20
21 /*
22  * A clever mix of heap and radix trees forms a radix priority search tree (PST)
23  * which is useful for storing intervals, e.g, we can consider a vma as a closed
24  * interval of file pages [offset_begin, offset_end], and store all vmas that
25  * map a file in a PST. Then, using the PST, we can answer a stabbing query,
26  * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
27  * given input interval X (a set of consecutive file pages), in "O(log n + m)"
28  * time where 'log n' is the height of the PST, and 'm' is the number of stored
29  * intervals (vmas) that overlap (map) with the input interval X (the set of
30  * consecutive file pages).
31  *
32  * In our implementation, we store closed intervals of the form [radix_index,
33  * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
34  * is designed for storing intervals with unique radix indices, i.e., each
35  * interval have different radix_index. However, this limitation can be easily
36  * overcome by using the size, i.e., heap_index - radix_index, as part of the
37  * index, so we index the tree using [(radix_index,size), heap_index].
38  *
39  * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
40  * machine, the maximum height of a PST can be 64. We can use a balanced version
41  * of the priority search tree to optimize the tree height, but the balanced
42  * tree proposed by McCreight is too complex and memory-hungry for our purpose.
43  */
44
45 static void get_index(const struct prio_tree_node *node,
46                       unsigned long *radix, unsigned long *heap)
47 {
48         *radix = node->start;
49         *heap = node->last;
50 }
51
52 static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
53
54 static void fio_init prio_tree_init(void)
55 {
56         unsigned int i;
57
58         for (i = 0; i < FIO_ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
59                 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
60         index_bits_to_maxindex[FIO_ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
61 }
62
63 /*
64  * Maximum heap_index that can be stored in a PST with index_bits bits
65  */
66 static inline unsigned long prio_tree_maxindex(unsigned int bits)
67 {
68         return index_bits_to_maxindex[bits - 1];
69 }
70
71 /*
72  * Extend a priority search tree so that it can store a node with heap_index
73  * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
74  * However, this function is used rarely and the common case performance is
75  * not bad.
76  */
77 static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
78                 struct prio_tree_node *node, unsigned long max_heap_index)
79 {
80         struct prio_tree_node *first = NULL, *prev, *last = NULL;
81
82         if (max_heap_index > prio_tree_maxindex(root->index_bits))
83                 root->index_bits++;
84
85         while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
86                 root->index_bits++;
87
88                 if (prio_tree_empty(root))
89                         continue;
90
91                 if (first == NULL) {
92                         first = root->prio_tree_node;
93                         prio_tree_remove(root, root->prio_tree_node);
94                         INIT_PRIO_TREE_NODE(first);
95                         last = first;
96                 } else {
97                         prev = last;
98                         last = root->prio_tree_node;
99                         prio_tree_remove(root, root->prio_tree_node);
100                         INIT_PRIO_TREE_NODE(last);
101                         prev->left = last;
102                         last->parent = prev;
103                 }
104         }
105
106         INIT_PRIO_TREE_NODE(node);
107
108         if (first) {
109                 node->left = first;
110                 first->parent = node;
111         } else
112                 last = node;
113
114         if (!prio_tree_empty(root)) {
115                 last->left = root->prio_tree_node;
116                 last->left->parent = last;
117         }
118
119         root->prio_tree_node = node;
120         return node;
121 }
122
123 /*
124  * Replace a prio_tree_node with a new node and return the old node
125  */
126 struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
127                 struct prio_tree_node *old, struct prio_tree_node *node)
128 {
129         INIT_PRIO_TREE_NODE(node);
130
131         if (prio_tree_root(old)) {
132                 assert(root->prio_tree_node == old);
133                 /*
134                  * We can reduce root->index_bits here. However, it is complex
135                  * and does not help much to improve performance (IMO).
136                  */
137                 node->parent = node;
138                 root->prio_tree_node = node;
139         } else {
140                 node->parent = old->parent;
141                 if (old->parent->left == old)
142                         old->parent->left = node;
143                 else
144                         old->parent->right = node;
145         }
146
147         if (!prio_tree_left_empty(old)) {
148                 node->left = old->left;
149                 old->left->parent = node;
150         }
151
152         if (!prio_tree_right_empty(old)) {
153                 node->right = old->right;
154                 old->right->parent = node;
155         }
156
157         return old;
158 }
159
160 /*
161  * Insert a prio_tree_node @node into a radix priority search tree @root. The
162  * algorithm typically takes O(log n) time where 'log n' is the number of bits
163  * required to represent the maximum heap_index. In the worst case, the algo
164  * can take O((log n)^2) - check prio_tree_expand.
165  *
166  * If a prior node with same radix_index and heap_index is already found in
167  * the tree, then returns the address of the prior node. Otherwise, inserts
168  * @node into the tree and returns @node.
169  */
170 struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
171                 struct prio_tree_node *node)
172 {
173         struct prio_tree_node *cur, *res = node;
174         unsigned long radix_index, heap_index;
175         unsigned long r_index, h_index, index, mask;
176         int size_flag = 0;
177
178         get_index(node, &radix_index, &heap_index);
179
180         if (prio_tree_empty(root) ||
181                         heap_index > prio_tree_maxindex(root->index_bits))
182                 return prio_tree_expand(root, node, heap_index);
183
184         cur = root->prio_tree_node;
185         mask = 1UL << (root->index_bits - 1);
186
187         while (mask) {
188                 get_index(cur, &r_index, &h_index);
189
190                 if (r_index == radix_index && h_index == heap_index)
191                         return cur;
192
193                 if (h_index < heap_index ||
194                     (h_index == heap_index && r_index > radix_index)) {
195                         struct prio_tree_node *tmp = node;
196                         node = prio_tree_replace(root, cur, node);
197                         cur = tmp;
198                         /* swap indices */
199                         index = r_index;
200                         r_index = radix_index;
201                         radix_index = index;
202                         index = h_index;
203                         h_index = heap_index;
204                         heap_index = index;
205                 }
206
207                 if (size_flag)
208                         index = heap_index - radix_index;
209                 else
210                         index = radix_index;
211
212                 if (index & mask) {
213                         if (prio_tree_right_empty(cur)) {
214                                 INIT_PRIO_TREE_NODE(node);
215                                 cur->right = node;
216                                 node->parent = cur;
217                                 return res;
218                         } else
219                                 cur = cur->right;
220                 } else {
221                         if (prio_tree_left_empty(cur)) {
222                                 INIT_PRIO_TREE_NODE(node);
223                                 cur->left = node;
224                                 node->parent = cur;
225                                 return res;
226                         } else
227                                 cur = cur->left;
228                 }
229
230                 mask >>= 1;
231
232                 if (!mask) {
233                         mask = 1UL << (BITS_PER_LONG - 1);
234                         size_flag = 1;
235                 }
236         }
237         /* Should not reach here */
238         assert(0);
239         return NULL;
240 }
241
242 /*
243  * Remove a prio_tree_node @node from a radix priority search tree @root. The
244  * algorithm takes O(log n) time where 'log n' is the number of bits required
245  * to represent the maximum heap_index.
246  */
247 void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
248 {
249         struct prio_tree_node *cur;
250         unsigned long r_index, h_index_right, h_index_left;
251
252         cur = node;
253
254         while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
255                 if (!prio_tree_left_empty(cur))
256                         get_index(cur->left, &r_index, &h_index_left);
257                 else {
258                         cur = cur->right;
259                         continue;
260                 }
261
262                 if (!prio_tree_right_empty(cur))
263                         get_index(cur->right, &r_index, &h_index_right);
264                 else {
265                         cur = cur->left;
266                         continue;
267                 }
268
269                 /* both h_index_left and h_index_right cannot be 0 */
270                 if (h_index_left >= h_index_right)
271                         cur = cur->left;
272                 else
273                         cur = cur->right;
274         }
275
276         if (prio_tree_root(cur)) {
277                 assert(root->prio_tree_node == cur);
278                 INIT_PRIO_TREE_ROOT(root);
279                 return;
280         }
281
282         if (cur->parent->right == cur)
283                 cur->parent->right = cur->parent;
284         else
285                 cur->parent->left = cur->parent;
286
287         while (cur != node)
288                 cur = prio_tree_replace(root, cur->parent, cur);
289 }
290
291 /*
292  * Following functions help to enumerate all prio_tree_nodes in the tree that
293  * overlap with the input interval X [radix_index, heap_index]. The enumeration
294  * takes O(log n + m) time where 'log n' is the height of the tree (which is
295  * proportional to # of bits required to represent the maximum heap_index) and
296  * 'm' is the number of prio_tree_nodes that overlap the interval X.
297  */
298
299 static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
300                 unsigned long *r_index, unsigned long *h_index)
301 {
302         if (prio_tree_left_empty(iter->cur))
303                 return NULL;
304
305         get_index(iter->cur->left, r_index, h_index);
306
307         if (iter->r_index <= *h_index) {
308                 iter->cur = iter->cur->left;
309                 iter->mask >>= 1;
310                 if (iter->mask) {
311                         if (iter->size_level)
312                                 iter->size_level++;
313                 } else {
314                         if (iter->size_level) {
315                                 assert(prio_tree_left_empty(iter->cur));
316                                 assert(prio_tree_right_empty(iter->cur));
317                                 iter->size_level++;
318                                 iter->mask = ULONG_MAX;
319                         } else {
320                                 iter->size_level = 1;
321                                 iter->mask = 1UL << (BITS_PER_LONG - 1);
322                         }
323                 }
324                 return iter->cur;
325         }
326
327         return NULL;
328 }
329
330 static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
331                 unsigned long *r_index, unsigned long *h_index)
332 {
333         unsigned long value;
334
335         if (prio_tree_right_empty(iter->cur))
336                 return NULL;
337
338         if (iter->size_level)
339                 value = iter->value;
340         else
341                 value = iter->value | iter->mask;
342
343         if (iter->h_index < value)
344                 return NULL;
345
346         get_index(iter->cur->right, r_index, h_index);
347
348         if (iter->r_index <= *h_index) {
349                 iter->cur = iter->cur->right;
350                 iter->mask >>= 1;
351                 iter->value = value;
352                 if (iter->mask) {
353                         if (iter->size_level)
354                                 iter->size_level++;
355                 } else {
356                         if (iter->size_level) {
357                                 assert(prio_tree_left_empty(iter->cur));
358                                 assert(prio_tree_right_empty(iter->cur));
359                                 iter->size_level++;
360                                 iter->mask = ULONG_MAX;
361                         } else {
362                                 iter->size_level = 1;
363                                 iter->mask = 1UL << (BITS_PER_LONG - 1);
364                         }
365                 }
366                 return iter->cur;
367         }
368
369         return NULL;
370 }
371
372 static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
373 {
374         iter->cur = iter->cur->parent;
375         if (iter->mask == ULONG_MAX)
376                 iter->mask = 1UL;
377         else if (iter->size_level == 1)
378                 iter->mask = 1UL;
379         else
380                 iter->mask <<= 1;
381         if (iter->size_level)
382                 iter->size_level--;
383         if (!iter->size_level && (iter->value & iter->mask))
384                 iter->value ^= iter->mask;
385         return iter->cur;
386 }
387
388 static inline int overlap(struct prio_tree_iter *iter,
389                 unsigned long r_index, unsigned long h_index)
390 {
391         return iter->h_index >= r_index && iter->r_index <= h_index;
392 }
393
394 /*
395  * prio_tree_first:
396  *
397  * Get the first prio_tree_node that overlaps with the interval [radix_index,
398  * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
399  * traversal of the tree.
400  */
401 static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
402 {
403         struct prio_tree_root *root;
404         unsigned long r_index, h_index;
405
406         INIT_PRIO_TREE_ITER(iter);
407
408         root = iter->root;
409         if (prio_tree_empty(root))
410                 return NULL;
411
412         get_index(root->prio_tree_node, &r_index, &h_index);
413
414         if (iter->r_index > h_index)
415                 return NULL;
416
417         iter->mask = 1UL << (root->index_bits - 1);
418         iter->cur = root->prio_tree_node;
419
420         while (1) {
421                 if (overlap(iter, r_index, h_index))
422                         return iter->cur;
423
424                 if (prio_tree_left(iter, &r_index, &h_index))
425                         continue;
426
427                 if (prio_tree_right(iter, &r_index, &h_index))
428                         continue;
429
430                 break;
431         }
432         return NULL;
433 }
434
435 /*
436  * prio_tree_next:
437  *
438  * Get the next prio_tree_node that overlaps with the input interval in iter
439  */
440 struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
441 {
442         unsigned long r_index, h_index;
443
444         if (iter->cur == NULL)
445                 return prio_tree_first(iter);
446
447 repeat:
448         while (prio_tree_left(iter, &r_index, &h_index))
449                 if (overlap(iter, r_index, h_index))
450                         return iter->cur;
451
452         while (!prio_tree_right(iter, &r_index, &h_index)) {
453                 while (!prio_tree_root(iter->cur) &&
454                                 iter->cur->parent->right == iter->cur)
455                         prio_tree_parent(iter);
456
457                 if (prio_tree_root(iter->cur))
458                         return NULL;
459
460                 prio_tree_parent(iter);
461         }
462
463         if (overlap(iter, r_index, h_index))
464                 return iter->cur;
465
466         goto repeat;
467 }