btrfs: qgroup: Record possible quota-related extent for qgroup.
fs/btrfs/qgroup.c (linux-2.6-block.git)
1 /*
2  * Copyright (C) 2011 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include <linux/workqueue.h>
26 #include <linux/btrfs.h>
27
28 #include "ctree.h"
29 #include "transaction.h"
30 #include "disk-io.h"
31 #include "locking.h"
32 #include "ulist.h"
33 #include "backref.h"
34 #include "extent_io.h"
35 #include "qgroup.h"
36
37 /* TODO XXX FIXME
38  *  - subvol delete -> delete when ref goes to 0? delete limits also?
39  *  - reorganize keys
40  *  - compressed
41  *  - sync
42  *  - copy also limits on subvol creation
43  *  - limit
44  *  - caches for ulists
45  *  - performance benchmarks
46  *  - check all ioctl parameters
47  */
48
49 /*
50  * one struct for each qgroup, organized in fs_info->qgroup_tree.
51  */
52 struct btrfs_qgroup {
53         u64 qgroupid;
54
55         /*
56          * state
57          */
58         u64 rfer;       /* referenced */
59         u64 rfer_cmpr;  /* referenced compressed */
60         u64 excl;       /* exclusive */
61         u64 excl_cmpr;  /* exclusive compressed */
62
63         /*
64          * limits
65          */
66         u64 lim_flags;  /* which limits are set */
67         u64 max_rfer;
68         u64 max_excl;
69         u64 rsv_rfer;
70         u64 rsv_excl;
71
72         /*
73          * reservation tracking
74          */
75         u64 reserved;
76
77         /*
78          * lists
79          */
80         struct list_head groups;  /* groups this group is member of */
81         struct list_head members; /* groups that are members of this group */
82         struct list_head dirty;   /* dirty groups */
83         struct rb_node node;      /* tree of qgroups */
84
85         /*
86          * temp variables for accounting operations
87          * Refer to qgroup_shared_accounting() for details.
88          */
89         u64 old_refcnt;
90         u64 new_refcnt;
91 };
92
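/*
 * Helpers for the old/new refcnt fields above.  The refcnts are only
 * meaningful relative to the sequence number of the current accounting
 * run: a stored value below @seq is stale from an earlier run and is
 * treated as zero, which spares us an explicit reset of every qgroup
 * between runs.
 */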
93 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
94                                            int mod)
95 {
96         if (qg->old_refcnt < seq)
97                 qg->old_refcnt = seq;
98         qg->old_refcnt += mod;
99 }
100
101 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
102                                            int mod)
103 {
104         if (qg->new_refcnt < seq)
105                 qg->new_refcnt = seq;
106         qg->new_refcnt += mod;
107 }
108
109 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
110 {
111         if (qg->old_refcnt < seq)
112                 return 0;
113         return qg->old_refcnt - seq;
114 }
115
116 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
117 {
118         if (qg->new_refcnt < seq)
119                 return 0;
120         return qg->new_refcnt - seq;
121 }
122
123 /*
124  * glue structure to represent the relations between qgroups.
125  */
126 struct btrfs_qgroup_list {
127         struct list_head next_group;
128         struct list_head next_member;
129         struct btrfs_qgroup *group;
130         struct btrfs_qgroup *member;
131 };
132
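/*
 * Helpers to stash a struct btrfs_qgroup pointer in the u64 aux value
 * of a ulist node and to fetch it back out, used by the temporary
 * ulists built during accounting.
 */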
133 #define ptr_to_u64(x) ((u64)(uintptr_t)x)
134 #define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
135
136 static int
137 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
138                    int init_flags);
139 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
140
141 /* must be called with qgroup_lock held */
142 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
143                                            u64 qgroupid)
144 {
145         struct rb_node *n = fs_info->qgroup_tree.rb_node;
146         struct btrfs_qgroup *qgroup;
147
148         while (n) {
149                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
150                 if (qgroup->qgroupid < qgroupid)
151                         n = n->rb_left;
152                 else if (qgroup->qgroupid > qgroupid)
153                         n = n->rb_right;
154                 else
155                         return qgroup;
156         }
157         return NULL;
158 }
159
160 /* must be called with qgroup_lock held */
161 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
162                                           u64 qgroupid)
163 {
164         struct rb_node **p = &fs_info->qgroup_tree.rb_node;
165         struct rb_node *parent = NULL;
166         struct btrfs_qgroup *qgroup;
167
168         while (*p) {
169                 parent = *p;
170                 qgroup = rb_entry(parent, struct btrfs_qgroup, node);
171
172                 if (qgroup->qgroupid < qgroupid)
173                         p = &(*p)->rb_left;
174                 else if (qgroup->qgroupid > qgroupid)
175                         p = &(*p)->rb_right;
176                 else
177                         return qgroup;
178         }
179
180         qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
181         if (!qgroup)
182                 return ERR_PTR(-ENOMEM);
183
184         qgroup->qgroupid = qgroupid;
185         INIT_LIST_HEAD(&qgroup->groups);
186         INIT_LIST_HEAD(&qgroup->members);
187         INIT_LIST_HEAD(&qgroup->dirty);
188
189         rb_link_node(&qgroup->node, parent, p);
190         rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
191
192         return qgroup;
193 }
194
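/*
 * Unhook @qgroup from all parent/member relation lists, free the glue
 * structures and finally the qgroup itself.  Callers are expected to
 * have already erased the qgroup from fs_info->qgroup_tree.
 */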
195 static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
196 {
197         struct btrfs_qgroup_list *list;
198
199         list_del(&qgroup->dirty);
200         while (!list_empty(&qgroup->groups)) {
201                 list = list_first_entry(&qgroup->groups,
202                                         struct btrfs_qgroup_list, next_group);
203                 list_del(&list->next_group);
204                 list_del(&list->next_member);
205                 kfree(list);
206         }
207
208         while (!list_empty(&qgroup->members)) {
209                 list = list_first_entry(&qgroup->members,
210                                         struct btrfs_qgroup_list, next_member);
211                 list_del(&list->next_group);
212                 list_del(&list->next_member);
213                 kfree(list);
214         }
215         kfree(qgroup);
216 }
217
218 /* must be called with qgroup_lock held */
219 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
220 {
221         struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
222
223         if (!qgroup)
224                 return -ENOENT;
225
226         rb_erase(&qgroup->node, &fs_info->qgroup_tree);
227         __del_qgroup_rb(qgroup);
228         return 0;
229 }
230
231 /* must be called with qgroup_lock held */
232 static int add_relation_rb(struct btrfs_fs_info *fs_info,
233                            u64 memberid, u64 parentid)
234 {
235         struct btrfs_qgroup *member;
236         struct btrfs_qgroup *parent;
237         struct btrfs_qgroup_list *list;
238
239         member = find_qgroup_rb(fs_info, memberid);
240         parent = find_qgroup_rb(fs_info, parentid);
241         if (!member || !parent)
242                 return -ENOENT;
243
244         list = kzalloc(sizeof(*list), GFP_ATOMIC);
245         if (!list)
246                 return -ENOMEM;
247
248         list->group = parent;
249         list->member = member;
250         list_add_tail(&list->next_group, &member->groups);
251         list_add_tail(&list->next_member, &parent->members);
252
253         return 0;
254 }
255
256 /* must be called with qgroup_lock held */
257 static int del_relation_rb(struct btrfs_fs_info *fs_info,
258                            u64 memberid, u64 parentid)
259 {
260         struct btrfs_qgroup *member;
261         struct btrfs_qgroup *parent;
262         struct btrfs_qgroup_list *list;
263
264         member = find_qgroup_rb(fs_info, memberid);
265         parent = find_qgroup_rb(fs_info, parentid);
266         if (!member || !parent)
267                 return -ENOENT;
268
269         list_for_each_entry(list, &member->groups, next_group) {
270                 if (list->group == parent) {
271                         list_del(&list->next_group);
272                         list_del(&list->next_member);
273                         kfree(list);
274                         return 0;
275                 }
276         }
277         return -ENOENT;
278 }
279
280 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
281 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
282                                u64 rfer, u64 excl)
283 {
284         struct btrfs_qgroup *qgroup;
285
286         qgroup = find_qgroup_rb(fs_info, qgroupid);
287         if (!qgroup)
288                 return -EINVAL;
289         if (qgroup->rfer != rfer || qgroup->excl != excl)
290                 return -EINVAL;
291         return 0;
292 }
293 #endif
294
295 /*
296  * The full config is read in one go; it is only called from open_ctree().
297  * It doesn't use any locking, as at this point we're still single-threaded.
298  */
299 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
300 {
301         struct btrfs_key key;
302         struct btrfs_key found_key;
303         struct btrfs_root *quota_root = fs_info->quota_root;
304         struct btrfs_path *path = NULL;
305         struct extent_buffer *l;
306         int slot;
307         int ret = 0;
308         u64 flags = 0;
309         u64 rescan_progress = 0;
310
311         if (!fs_info->quota_enabled)
312                 return 0;
313
314         fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
315         if (!fs_info->qgroup_ulist) {
316                 ret = -ENOMEM;
317                 goto out;
318         }
319
320         path = btrfs_alloc_path();
321         if (!path) {
322                 ret = -ENOMEM;
323                 goto out;
324         }
325
326         /* default this to quota off, in case no status key is found */
327         fs_info->qgroup_flags = 0;
328
329         /*
330          * pass 1: read status, all qgroup infos and limits
331          */
332         key.objectid = 0;
333         key.type = 0;
334         key.offset = 0;
335         ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
336         if (ret)
337                 goto out;
338
339         while (1) {
340                 struct btrfs_qgroup *qgroup;
341
342                 slot = path->slots[0];
343                 l = path->nodes[0];
344                 btrfs_item_key_to_cpu(l, &found_key, slot);
345
346                 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
347                         struct btrfs_qgroup_status_item *ptr;
348
349                         ptr = btrfs_item_ptr(l, slot,
350                                              struct btrfs_qgroup_status_item);
351
352                         if (btrfs_qgroup_status_version(l, ptr) !=
353                             BTRFS_QGROUP_STATUS_VERSION) {
354                                 btrfs_err(fs_info,
355                                  "old qgroup version, quota disabled");
356                                 goto out;
357                         }
358                         if (btrfs_qgroup_status_generation(l, ptr) !=
359                             fs_info->generation) {
360                                 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
361                                 btrfs_err(fs_info,
362                                         "qgroup generation mismatch, "
363                                         "marked as inconsistent");
364                         }
365                         fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
366                                                                           ptr);
367                         rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
368                         goto next1;
369                 }
370
371                 if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
372                     found_key.type != BTRFS_QGROUP_LIMIT_KEY)
373                         goto next1;
374
375                 qgroup = find_qgroup_rb(fs_info, found_key.offset);
376                 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
377                     (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
378                 btrfs_err(fs_info, "inconsistent qgroup config");
379                         flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
380                 }
381                 if (!qgroup) {
382                         qgroup = add_qgroup_rb(fs_info, found_key.offset);
383                         if (IS_ERR(qgroup)) {
384                                 ret = PTR_ERR(qgroup);
385                                 goto out;
386                         }
387                 }
388                 switch (found_key.type) {
389                 case BTRFS_QGROUP_INFO_KEY: {
390                         struct btrfs_qgroup_info_item *ptr;
391
392                         ptr = btrfs_item_ptr(l, slot,
393                                              struct btrfs_qgroup_info_item);
394                         qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
395                         qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
396                         qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
397                         qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
398                         /* generation currently unused */
399                         break;
400                 }
401                 case BTRFS_QGROUP_LIMIT_KEY: {
402                         struct btrfs_qgroup_limit_item *ptr;
403
404                         ptr = btrfs_item_ptr(l, slot,
405                                              struct btrfs_qgroup_limit_item);
406                         qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
407                         qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
408                         qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
409                         qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
410                         qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
411                         break;
412                 }
413                 }
414 next1:
415                 ret = btrfs_next_item(quota_root, path);
416                 if (ret < 0)
417                         goto out;
418                 if (ret)
419                         break;
420         }
421         btrfs_release_path(path);
422
423         /*
424          * pass 2: read all qgroup relations
425          */
426         key.objectid = 0;
427         key.type = BTRFS_QGROUP_RELATION_KEY;
428         key.offset = 0;
429         ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
430         if (ret)
431                 goto out;
432         while (1) {
433                 slot = path->slots[0];
434                 l = path->nodes[0];
435                 btrfs_item_key_to_cpu(l, &found_key, slot);
436
437                 if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
438                         goto next2;
439
440                 if (found_key.objectid > found_key.offset) {
441                         /* parent <- member, not needed to build config */
442                         /* FIXME should we omit the key completely? */
443                         goto next2;
444                 }
445
446                 ret = add_relation_rb(fs_info, found_key.objectid,
447                                       found_key.offset);
448                 if (ret == -ENOENT) {
449                         btrfs_warn(fs_info,
450                                 "orphan qgroup relation 0x%llx->0x%llx",
451                                 found_key.objectid, found_key.offset);
452                         ret = 0;        /* ignore the error */
453                 }
454                 if (ret)
455                         goto out;
456 next2:
457                 ret = btrfs_next_item(quota_root, path);
458                 if (ret < 0)
459                         goto out;
460                 if (ret)
461                         break;
462         }
463 out:
464         fs_info->qgroup_flags |= flags;
465         if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
466                 fs_info->quota_enabled = 0;
467                 fs_info->pending_quota_state = 0;
468         } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
469                    ret >= 0) {
470                 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
471         }
472         btrfs_free_path(path);
473
474         if (ret < 0) {
475                 ulist_free(fs_info->qgroup_ulist);
476                 fs_info->qgroup_ulist = NULL;
477                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
478         }
479
480         return ret < 0 ? ret : 0;
481 }
482
483 /*
484  * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
485  * the first two are in single-threaded paths. For the third one, we have
486  * already set quota_root to NULL with qgroup_lock held, so it is safe to
487  * clean up the in-memory structures without holding qgroup_lock.
488  */
489 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
490 {
491         struct rb_node *n;
492         struct btrfs_qgroup *qgroup;
493
494         while ((n = rb_first(&fs_info->qgroup_tree))) {
495                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
496                 rb_erase(n, &fs_info->qgroup_tree);
497                 __del_qgroup_rb(qgroup);
498         }
499         /*
500          * We call btrfs_free_qgroup_config() both when unmounting the
501          * filesystem and when disabling quota, so set qgroup_ulist
502          * to NULL here to avoid a double free.
503          */
504         ulist_free(fs_info->qgroup_ulist);
505         fs_info->qgroup_ulist = NULL;
506 }
507
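/*
 * Insert one direction of a qgroup relation (objectid = @src,
 * offset = @dst) into the quota tree.  Relations are stored in both
 * directions, so callers add the (src, dst) and (dst, src) items
 * separately.
 */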
508 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
509                                     struct btrfs_root *quota_root,
510                                     u64 src, u64 dst)
511 {
512         int ret;
513         struct btrfs_path *path;
514         struct btrfs_key key;
515
516         path = btrfs_alloc_path();
517         if (!path)
518                 return -ENOMEM;
519
520         key.objectid = src;
521         key.type = BTRFS_QGROUP_RELATION_KEY;
522         key.offset = dst;
523
524         ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
525
526         btrfs_mark_buffer_dirty(path->nodes[0]);
527
528         btrfs_free_path(path);
529         return ret;
530 }
531
532 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
533                                     struct btrfs_root *quota_root,
534                                     u64 src, u64 dst)
535 {
536         int ret;
537         struct btrfs_path *path;
538         struct btrfs_key key;
539
540         path = btrfs_alloc_path();
541         if (!path)
542                 return -ENOMEM;
543
544         key.objectid = src;
545         key.type = BTRFS_QGROUP_RELATION_KEY;
546         key.offset = dst;
547
548         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
549         if (ret < 0)
550                 goto out;
551
552         if (ret > 0) {
553                 ret = -ENOENT;
554                 goto out;
555         }
556
557         ret = btrfs_del_item(trans, quota_root, path);
558 out:
559         btrfs_free_path(path);
560         return ret;
561 }
562
563 static int add_qgroup_item(struct btrfs_trans_handle *trans,
564                            struct btrfs_root *quota_root, u64 qgroupid)
565 {
566         int ret;
567         struct btrfs_path *path;
568         struct btrfs_qgroup_info_item *qgroup_info;
569         struct btrfs_qgroup_limit_item *qgroup_limit;
570         struct extent_buffer *leaf;
571         struct btrfs_key key;
572
573         if (btrfs_test_is_dummy_root(quota_root))
574                 return 0;
575
576         path = btrfs_alloc_path();
577         if (!path)
578                 return -ENOMEM;
579
580         key.objectid = 0;
581         key.type = BTRFS_QGROUP_INFO_KEY;
582         key.offset = qgroupid;
583
584         /*
585          * Avoid a transaction abort by catching -EEXIST here. In that
586          * case, we proceed by re-initializing the existing structure
587          * on disk.
588          */
589
590         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
591                                       sizeof(*qgroup_info));
592         if (ret && ret != -EEXIST)
593                 goto out;
594
595         leaf = path->nodes[0];
596         qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
597                                  struct btrfs_qgroup_info_item);
598         btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
599         btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
600         btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
601         btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
602         btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
603
604         btrfs_mark_buffer_dirty(leaf);
605
606         btrfs_release_path(path);
607
608         key.type = BTRFS_QGROUP_LIMIT_KEY;
609         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
610                                       sizeof(*qgroup_limit));
611         if (ret && ret != -EEXIST)
612                 goto out;
613
614         leaf = path->nodes[0];
615         qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
616                                   struct btrfs_qgroup_limit_item);
617         btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
618         btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
619         btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
620         btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
621         btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
622
623         btrfs_mark_buffer_dirty(leaf);
624
625         ret = 0;
626 out:
627         btrfs_free_path(path);
628         return ret;
629 }
630
631 static int del_qgroup_item(struct btrfs_trans_handle *trans,
632                            struct btrfs_root *quota_root, u64 qgroupid)
633 {
634         int ret;
635         struct btrfs_path *path;
636         struct btrfs_key key;
637
638         path = btrfs_alloc_path();
639         if (!path)
640                 return -ENOMEM;
641
642         key.objectid = 0;
643         key.type = BTRFS_QGROUP_INFO_KEY;
644         key.offset = qgroupid;
645         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
646         if (ret < 0)
647                 goto out;
648
649         if (ret > 0) {
650                 ret = -ENOENT;
651                 goto out;
652         }
653
654         ret = btrfs_del_item(trans, quota_root, path);
655         if (ret)
656                 goto out;
657
658         btrfs_release_path(path);
659
660         key.type = BTRFS_QGROUP_LIMIT_KEY;
661         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
662         if (ret < 0)
663                 goto out;
664
665         if (ret > 0) {
666                 ret = -ENOENT;
667                 goto out;
668         }
669
670         ret = btrfs_del_item(trans, quota_root, path);
671
672 out:
673         btrfs_free_path(path);
674         return ret;
675 }
676
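/*
 * Write the in-memory limit fields of @qgroup back into its
 * BTRFS_QGROUP_LIMIT_KEY item.  Returns -ENOENT if the item does not
 * exist in the quota tree.
 */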
677 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
678                                     struct btrfs_root *root,
679                                     struct btrfs_qgroup *qgroup)
680 {
681         struct btrfs_path *path;
682         struct btrfs_key key;
683         struct extent_buffer *l;
684         struct btrfs_qgroup_limit_item *qgroup_limit;
685         int ret;
686         int slot;
687
688         key.objectid = 0;
689         key.type = BTRFS_QGROUP_LIMIT_KEY;
690         key.offset = qgroup->qgroupid;
691
692         path = btrfs_alloc_path();
693         if (!path)
694                 return -ENOMEM;
695
696         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
697         if (ret > 0)
698                 ret = -ENOENT;
699
700         if (ret)
701                 goto out;
702
703         l = path->nodes[0];
704         slot = path->slots[0];
705         qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
706         btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
707         btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
708         btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
709         btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
710         btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
711
712         btrfs_mark_buffer_dirty(l);
713
714 out:
715         btrfs_free_path(path);
716         return ret;
717 }
718
719 static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
720                                    struct btrfs_root *root,
721                                    struct btrfs_qgroup *qgroup)
722 {
723         struct btrfs_path *path;
724         struct btrfs_key key;
725         struct extent_buffer *l;
726         struct btrfs_qgroup_info_item *qgroup_info;
727         int ret;
728         int slot;
729
730         if (btrfs_test_is_dummy_root(root))
731                 return 0;
732
733         key.objectid = 0;
734         key.type = BTRFS_QGROUP_INFO_KEY;
735         key.offset = qgroup->qgroupid;
736
737         path = btrfs_alloc_path();
738         if (!path)
739                 return -ENOMEM;
740
741         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
742         if (ret > 0)
743                 ret = -ENOENT;
744
745         if (ret)
746                 goto out;
747
748         l = path->nodes[0];
749         slot = path->slots[0];
750         qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
751         btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
752         btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
753         btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
754         btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
755         btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
756
757         btrfs_mark_buffer_dirty(l);
758
759 out:
760         btrfs_free_path(path);
761         return ret;
762 }
763
764 static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
765                                      struct btrfs_fs_info *fs_info,
766                                      struct btrfs_root *root)
767 {
768         struct btrfs_path *path;
769         struct btrfs_key key;
770         struct extent_buffer *l;
771         struct btrfs_qgroup_status_item *ptr;
772         int ret;
773         int slot;
774
775         key.objectid = 0;
776         key.type = BTRFS_QGROUP_STATUS_KEY;
777         key.offset = 0;
778
779         path = btrfs_alloc_path();
780         if (!path)
781                 return -ENOMEM;
782
783         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
784         if (ret > 0)
785                 ret = -ENOENT;
786
787         if (ret)
788                 goto out;
789
790         l = path->nodes[0];
791         slot = path->slots[0];
792         ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
793         btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
794         btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
795         btrfs_set_qgroup_status_rescan(l, ptr,
796                                 fs_info->qgroup_rescan_progress.objectid);
797
798         btrfs_mark_buffer_dirty(l);
799
800 out:
801         btrfs_free_path(path);
802         return ret;
803 }
804
805 /*
806  * called with qgroup_lock held
807  */
808 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
809                                   struct btrfs_root *root)
810 {
811         struct btrfs_path *path;
812         struct btrfs_key key;
813         struct extent_buffer *leaf = NULL;
814         int ret;
815         int nr = 0;
816
817         path = btrfs_alloc_path();
818         if (!path)
819                 return -ENOMEM;
820
821         path->leave_spinning = 1;
822
823         key.objectid = 0;
824         key.offset = 0;
825         key.type = 0;
826
827         while (1) {
828                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
829                 if (ret < 0)
830                         goto out;
831                 leaf = path->nodes[0];
832                 nr = btrfs_header_nritems(leaf);
833                 if (!nr)
834                         break;
835                 /*
836                  * delete the leaf one by one
837                  * since the whole tree is going
838                  * to be deleted.
839                  */
840                 path->slots[0] = 0;
841                 ret = btrfs_del_items(trans, root, path, 0, nr);
842                 if (ret)
843                         goto out;
844
845                 btrfs_release_path(path);
846         }
847         ret = 0;
848 out:
849         root->fs_info->pending_quota_state = 0;
850         btrfs_free_path(path);
851         return ret;
852 }
853
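/*
 * Create the quota tree, seed it with a status item plus one qgroup
 * item per existing subvolume, and mirror that state in the in-memory
 * rbtree.  Only fs_info->pending_quota_state is set here; the fully
 * enabled state follows at transaction commit.
 */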
854 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
855                        struct btrfs_fs_info *fs_info)
856 {
857         struct btrfs_root *quota_root;
858         struct btrfs_root *tree_root = fs_info->tree_root;
859         struct btrfs_path *path = NULL;
860         struct btrfs_qgroup_status_item *ptr;
861         struct extent_buffer *leaf;
862         struct btrfs_key key;
863         struct btrfs_key found_key;
864         struct btrfs_qgroup *qgroup = NULL;
865         int ret = 0;
866         int slot;
867
868         mutex_lock(&fs_info->qgroup_ioctl_lock);
869         if (fs_info->quota_root) {
870                 fs_info->pending_quota_state = 1;
871                 goto out;
872         }
873
874         fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
875         if (!fs_info->qgroup_ulist) {
876                 ret = -ENOMEM;
877                 goto out;
878         }
879
880         /*
881          * initially create the quota tree
882          */
883         quota_root = btrfs_create_tree(trans, fs_info,
884                                        BTRFS_QUOTA_TREE_OBJECTID);
885         if (IS_ERR(quota_root)) {
886                 ret =  PTR_ERR(quota_root);
887                 goto out;
888         }
889
890         path = btrfs_alloc_path();
891         if (!path) {
892                 ret = -ENOMEM;
893                 goto out_free_root;
894         }
895
896         key.objectid = 0;
897         key.type = BTRFS_QGROUP_STATUS_KEY;
898         key.offset = 0;
899
900         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
901                                       sizeof(*ptr));
902         if (ret)
903                 goto out_free_path;
904
905         leaf = path->nodes[0];
906         ptr = btrfs_item_ptr(leaf, path->slots[0],
907                                  struct btrfs_qgroup_status_item);
908         btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
909         btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
910         fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
911                                 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
912         btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
913         btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
914
915         btrfs_mark_buffer_dirty(leaf);
916
917         key.objectid = 0;
918         key.type = BTRFS_ROOT_REF_KEY;
919         key.offset = 0;
920
921         btrfs_release_path(path);
922         ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
923         if (ret > 0)
924                 goto out_add_root;
925         if (ret < 0)
926                 goto out_free_path;
927
928
929         while (1) {
930                 slot = path->slots[0];
931                 leaf = path->nodes[0];
932                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
933
934                 if (found_key.type == BTRFS_ROOT_REF_KEY) {
935                         ret = add_qgroup_item(trans, quota_root,
936                                               found_key.offset);
937                         if (ret)
938                                 goto out_free_path;
939
940                         qgroup = add_qgroup_rb(fs_info, found_key.offset);
941                         if (IS_ERR(qgroup)) {
942                                 ret = PTR_ERR(qgroup);
943                                 goto out_free_path;
944                         }
945                 }
946                 ret = btrfs_next_item(tree_root, path);
947                 if (ret < 0)
948                         goto out_free_path;
949                 if (ret)
950                         break;
951         }
952
953 out_add_root:
954         btrfs_release_path(path);
955         ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
956         if (ret)
957                 goto out_free_path;
958
959         qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
960         if (IS_ERR(qgroup)) {
961                 ret = PTR_ERR(qgroup);
962                 goto out_free_path;
963         }
964         spin_lock(&fs_info->qgroup_lock);
965         fs_info->quota_root = quota_root;
966         fs_info->pending_quota_state = 1;
967         spin_unlock(&fs_info->qgroup_lock);
968 out_free_path:
969         btrfs_free_path(path);
970 out_free_root:
971         if (ret) {
972                 free_extent_buffer(quota_root->node);
973                 free_extent_buffer(quota_root->commit_root);
974                 kfree(quota_root);
975         }
976 out:
977         if (ret) {
978                 ulist_free(fs_info->qgroup_ulist);
979                 fs_info->qgroup_ulist = NULL;
980         }
981         mutex_unlock(&fs_info->qgroup_ioctl_lock);
982         return ret;
983 }
984
985 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
986                         struct btrfs_fs_info *fs_info)
987 {
988         struct btrfs_root *tree_root = fs_info->tree_root;
989         struct btrfs_root *quota_root;
990         int ret = 0;
991
992         mutex_lock(&fs_info->qgroup_ioctl_lock);
993         if (!fs_info->quota_root)
994                 goto out;
995         spin_lock(&fs_info->qgroup_lock);
996         fs_info->quota_enabled = 0;
997         fs_info->pending_quota_state = 0;
998         quota_root = fs_info->quota_root;
999         fs_info->quota_root = NULL;
1000         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1001         spin_unlock(&fs_info->qgroup_lock);
1002
1003         btrfs_free_qgroup_config(fs_info);
1004
1005         ret = btrfs_clean_quota_tree(trans, quota_root);
1006         if (ret)
1007                 goto out;
1008
1009         ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
1010         if (ret)
1011                 goto out;
1012
1013         list_del(&quota_root->dirty_list);
1014
1015         btrfs_tree_lock(quota_root->node);
1016         clean_tree_block(trans, tree_root->fs_info, quota_root->node);
1017         btrfs_tree_unlock(quota_root->node);
1018         btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
1019
1020         free_extent_buffer(quota_root->node);
1021         free_extent_buffer(quota_root->commit_root);
1022         kfree(quota_root);
1023 out:
1024         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1025         return ret;
1026 }
1027
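/*
 * Queue @qgroup on fs_info->dirty_qgroups (unless it is already there)
 * so that its updated counters get written back to the quota tree
 * later, at transaction commit time.
 */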
1028 static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1029                          struct btrfs_qgroup *qgroup)
1030 {
1031         if (list_empty(&qgroup->dirty))
1032                 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1033 }
1034
1035 /*
1036  * The easy accounting: if we are adding/removing the only ref for an extent,
1037  * then this qgroup and all of the parent qgroups get their reference and
1038  * exclusive counts adjusted.
1039  *
1040  * Caller should hold fs_info->qgroup_lock.
1041  */
1042 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1043                                     struct ulist *tmp, u64 ref_root,
1044                                     u64 num_bytes, int sign)
1045 {
1046         struct btrfs_qgroup *qgroup;
1047         struct btrfs_qgroup_list *glist;
1048         struct ulist_node *unode;
1049         struct ulist_iterator uiter;
1050         int ret = 0;
1051
1052         qgroup = find_qgroup_rb(fs_info, ref_root);
1053         if (!qgroup)
1054                 goto out;
1055
1056         qgroup->rfer += sign * num_bytes;
1057         qgroup->rfer_cmpr += sign * num_bytes;
1058
1059         WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1060         qgroup->excl += sign * num_bytes;
1061         qgroup->excl_cmpr += sign * num_bytes;
1062         if (sign > 0)
1063                 qgroup->reserved -= num_bytes;
1064
1065         qgroup_dirty(fs_info, qgroup);
1066
1067         /* Get all of the parent groups that contain this qgroup */
1068         list_for_each_entry(glist, &qgroup->groups, next_group) {
1069                 ret = ulist_add(tmp, glist->group->qgroupid,
1070                                 ptr_to_u64(glist->group), GFP_ATOMIC);
1071                 if (ret < 0)
1072                         goto out;
1073         }
1074
1075         /* Iterate all of the parents and adjust their reference counts */
1076         ULIST_ITER_INIT(&uiter);
1077         while ((unode = ulist_next(tmp, &uiter))) {
1078                 qgroup = u64_to_ptr(unode->aux);
1079                 qgroup->rfer += sign * num_bytes;
1080                 qgroup->rfer_cmpr += sign * num_bytes;
1081                 WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1082                 qgroup->excl += sign * num_bytes;
1083                 if (sign > 0)
1084                         qgroup->reserved -= num_bytes;
1085                 qgroup->excl_cmpr += sign * num_bytes;
1086                 qgroup_dirty(fs_info, qgroup);
1087
1088                 /* Add any parents of the parents */
1089                 list_for_each_entry(glist, &qgroup->groups, next_group) {
1090                         ret = ulist_add(tmp, glist->group->qgroupid,
1091                                         ptr_to_u64(glist->group), GFP_ATOMIC);
1092                         if (ret < 0)
1093                                 goto out;
1094                 }
1095         }
1096         ret = 0;
1097 out:
1098         return ret;
1099 }
1100
1101
1102 /*
1103  * Quick path for updating a qgroup whose refs are all exclusive.
1104  *
1105  * In that case, just updating all parents will be enough.
1106  * Otherwise we need to do a full rescan.
1107  * Caller should also hold fs_info->qgroup_lock.
1108  *
1109  * Return 0 for a quick update; return >0 if a full rescan is needed,
1110  * in which case the INCONSISTENT flag is set.
1111  * Return <0 for other errors.
1112  */
1113 static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1114                                    struct ulist *tmp, u64 src, u64 dst,
1115                                    int sign)
1116 {
1117         struct btrfs_qgroup *qgroup;
1118         int ret = 1;
1119         int err = 0;
1120
1121         qgroup = find_qgroup_rb(fs_info, src);
1122         if (!qgroup)
1123                 goto out;
1124         if (qgroup->excl == qgroup->rfer) {
1125                 ret = 0;
1126                 err = __qgroup_excl_accounting(fs_info, tmp, dst,
1127                                                qgroup->excl, sign);
1128                 if (err < 0) {
1129                         ret = err;
1130                         goto out;
1131                 }
1132         }
1133 out:
1134         if (ret)
1135                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1136         return ret;
1137 }
1138
1139 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1140                               struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1141 {
1142         struct btrfs_root *quota_root;
1143         struct btrfs_qgroup *parent;
1144         struct btrfs_qgroup *member;
1145         struct btrfs_qgroup_list *list;
1146         struct ulist *tmp;
1147         int ret = 0;
1148
1149         /* Check the level of src and dst first */
1150         if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
1151                 return -EINVAL;
1152
1153         tmp = ulist_alloc(GFP_NOFS);
1154         if (!tmp)
1155                 return -ENOMEM;
1156
1157         mutex_lock(&fs_info->qgroup_ioctl_lock);
1158         quota_root = fs_info->quota_root;
1159         if (!quota_root) {
1160                 ret = -EINVAL;
1161                 goto out;
1162         }
1163         member = find_qgroup_rb(fs_info, src);
1164         parent = find_qgroup_rb(fs_info, dst);
1165         if (!member || !parent) {
1166                 ret = -EINVAL;
1167                 goto out;
1168         }
1169
1170         /* first check if such a qgroup relation already exists */
1171         list_for_each_entry(list, &member->groups, next_group) {
1172                 if (list->group == parent) {
1173                         ret = -EEXIST;
1174                         goto out;
1175                 }
1176         }
1177
1178         ret = add_qgroup_relation_item(trans, quota_root, src, dst);
1179         if (ret)
1180                 goto out;
1181
1182         ret = add_qgroup_relation_item(trans, quota_root, dst, src);
1183         if (ret) {
1184                 del_qgroup_relation_item(trans, quota_root, src, dst);
1185                 goto out;
1186         }
1187
1188         spin_lock(&fs_info->qgroup_lock);
1189         ret = add_relation_rb(quota_root->fs_info, src, dst);
1190         if (ret < 0) {
1191                 spin_unlock(&fs_info->qgroup_lock);
1192                 goto out;
1193         }
1194         ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1195         spin_unlock(&fs_info->qgroup_lock);
1196 out:
1197         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1198         ulist_free(tmp);
1199         return ret;
1200 }
1201
1202 static int __del_qgroup_relation(struct btrfs_trans_handle *trans,
1203                                  struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1204 {
1205         struct btrfs_root *quota_root;
1206         struct btrfs_qgroup *parent;
1207         struct btrfs_qgroup *member;
1208         struct btrfs_qgroup_list *list;
1209         struct ulist *tmp;
1210         int ret = 0;
1211         int err;
1212
1213         tmp = ulist_alloc(GFP_NOFS);
1214         if (!tmp)
1215                 return -ENOMEM;
1216
1217         quota_root = fs_info->quota_root;
1218         if (!quota_root) {
1219                 ret = -EINVAL;
1220                 goto out;
1221         }
1222
1223         member = find_qgroup_rb(fs_info, src);
1224         parent = find_qgroup_rb(fs_info, dst);
1225         if (!member || !parent) {
1226                 ret = -EINVAL;
1227                 goto out;
1228         }
1229
1230         /* first check if such a qgroup relation already exists */
1231         list_for_each_entry(list, &member->groups, next_group) {
1232                 if (list->group == parent)
1233                         goto exist;
1234         }
1235         ret = -ENOENT;
1236         goto out;
1237 exist:
1238         ret = del_qgroup_relation_item(trans, quota_root, src, dst);
1239         err = del_qgroup_relation_item(trans, quota_root, dst, src);
1240         if (err && !ret)
1241                 ret = err;
1242
1243         spin_lock(&fs_info->qgroup_lock);
1244         del_relation_rb(fs_info, src, dst);
1245         ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1246         spin_unlock(&fs_info->qgroup_lock);
1247 out:
1248         ulist_free(tmp);
1249         return ret;
1250 }
1251
1252 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
1253                               struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1254 {
1255         int ret = 0;
1256
1257         mutex_lock(&fs_info->qgroup_ioctl_lock);
1258         ret = __del_qgroup_relation(trans, fs_info, src, dst);
1259         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1260
1261         return ret;
1262 }
1263
1264 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
1265                         struct btrfs_fs_info *fs_info, u64 qgroupid)
1266 {
1267         struct btrfs_root *quota_root;
1268         struct btrfs_qgroup *qgroup;
1269         int ret = 0;
1270
1271         mutex_lock(&fs_info->qgroup_ioctl_lock);
1272         quota_root = fs_info->quota_root;
1273         if (!quota_root) {
1274                 ret = -EINVAL;
1275                 goto out;
1276         }
1277         qgroup = find_qgroup_rb(fs_info, qgroupid);
1278         if (qgroup) {
1279                 ret = -EEXIST;
1280                 goto out;
1281         }
1282
1283         ret = add_qgroup_item(trans, quota_root, qgroupid);
1284         if (ret)
1285                 goto out;
1286
1287         spin_lock(&fs_info->qgroup_lock);
1288         qgroup = add_qgroup_rb(fs_info, qgroupid);
1289         spin_unlock(&fs_info->qgroup_lock);
1290
1291         if (IS_ERR(qgroup))
1292                 ret = PTR_ERR(qgroup);
1293 out:
1294         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1295         return ret;
1296 }
1297
1298 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
1299                         struct btrfs_fs_info *fs_info, u64 qgroupid)
1300 {
1301         struct btrfs_root *quota_root;
1302         struct btrfs_qgroup *qgroup;
1303         struct btrfs_qgroup_list *list;
1304         int ret = 0;
1305
1306         mutex_lock(&fs_info->qgroup_ioctl_lock);
1307         quota_root = fs_info->quota_root;
1308         if (!quota_root) {
1309                 ret = -EINVAL;
1310                 goto out;
1311         }
1312
1313         qgroup = find_qgroup_rb(fs_info, qgroupid);
1314         if (!qgroup) {
1315                 ret = -ENOENT;
1316                 goto out;
1317         } else {
1318                 /* check if there are no children of this qgroup */
1319                 if (!list_empty(&qgroup->members)) {
1320                         ret = -EBUSY;
1321                         goto out;
1322                 }
1323         }
1324         ret = del_qgroup_item(trans, quota_root, qgroupid);
1325
1326         while (!list_empty(&qgroup->groups)) {
1327                 list = list_first_entry(&qgroup->groups,
1328                                         struct btrfs_qgroup_list, next_group);
1329                 ret = __del_qgroup_relation(trans, fs_info,
1330                                            qgroupid,
1331                                            list->group->qgroupid);
1332                 if (ret)
1333                         goto out;
1334         }
1335
1336         spin_lock(&fs_info->qgroup_lock);
1337         del_qgroup_rb(quota_root->fs_info, qgroupid);
1338         spin_unlock(&fs_info->qgroup_lock);
1339 out:
1340         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1341         return ret;
1342 }
1343
1344 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
1345                        struct btrfs_fs_info *fs_info, u64 qgroupid,
1346                        struct btrfs_qgroup_limit *limit)
1347 {
1348         struct btrfs_root *quota_root;
1349         struct btrfs_qgroup *qgroup;
1350         int ret = 0;
1351
1352         mutex_lock(&fs_info->qgroup_ioctl_lock);
1353         quota_root = fs_info->quota_root;
1354         if (!quota_root) {
1355                 ret = -EINVAL;
1356                 goto out;
1357         }
1358
1359         qgroup = find_qgroup_rb(fs_info, qgroupid);
1360         if (!qgroup) {
1361                 ret = -ENOENT;
1362                 goto out;
1363         }
1364
1365         spin_lock(&fs_info->qgroup_lock);
1366         if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
1367                 qgroup->max_rfer = limit->max_rfer;
1368         if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
1369                 qgroup->max_excl = limit->max_excl;
1370         if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
1371                 qgroup->rsv_rfer = limit->rsv_rfer;
1372         if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
1373                 qgroup->rsv_excl = limit->rsv_excl;
1374         qgroup->lim_flags |= limit->flags;
1375
1376         spin_unlock(&fs_info->qgroup_lock);
1377
1378         ret = update_qgroup_limit_item(trans, quota_root, qgroup);
1379         if (ret) {
1380                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1381                 btrfs_info(fs_info, "unable to update quota limit for %llu",
1382                        qgroupid);
1383         }
1384
1385 out:
1386         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1387         return ret;
1388 }
1389
1390 static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
1391                            struct btrfs_qgroup_operation *oper2)
1392 {
1393         /*
1394          * Ignore seq and type here, we're looking for any operation
1395          * at all related to this extent on that root.
1396          */
1397         if (oper1->bytenr < oper2->bytenr)
1398                 return -1;
1399         if (oper1->bytenr > oper2->bytenr)
1400                 return 1;
1401         if (oper1->ref_root < oper2->ref_root)
1402                 return -1;
1403         if (oper1->ref_root > oper2->ref_root)
1404                 return 1;
1405         return 0;
1406 }
1407
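/*
 * Check whether any operation for @oper's (bytenr, ref_root) pair is
 * already queued in the operation tree, ignoring seq and type.
 * Returns -EEXIST if one is found, 0 otherwise.
 */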
1408 static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
1409                               struct btrfs_qgroup_operation *oper)
1410 {
1411         struct rb_node *n;
1412         struct btrfs_qgroup_operation *cur;
1413         int cmp;
1414
1415         spin_lock(&fs_info->qgroup_op_lock);
1416         n = fs_info->qgroup_op_tree.rb_node;
1417         while (n) {
1418                 cur = rb_entry(n, struct btrfs_qgroup_operation, n);
1419                 cmp = comp_oper_exist(cur, oper);
1420                 if (cmp < 0) {
1421                         n = n->rb_right;
1422                 } else if (cmp) {
1423                         n = n->rb_left;
1424                 } else {
1425                         spin_unlock(&fs_info->qgroup_op_lock);
1426                         return -EEXIST;
1427                 }
1428         }
1429         spin_unlock(&fs_info->qgroup_op_lock);
1430         return 0;
1431 }
1432
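/*
 * Full comparator for the qgroup_op_tree rbtree: operations are ordered
 * by (bytenr, ref_root, seq, type), in that order.
 */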
1433 static int comp_oper(struct btrfs_qgroup_operation *oper1,
1434                      struct btrfs_qgroup_operation *oper2)
1435 {
1436         if (oper1->bytenr < oper2->bytenr)
1437                 return -1;
1438         if (oper1->bytenr > oper2->bytenr)
1439                 return 1;
1440         if (oper1->ref_root < oper2->ref_root)
1441                 return -1;
1442         if (oper1->ref_root > oper2->ref_root)
1443                 return 1;
1444         if (oper1->seq < oper2->seq)
1445                 return -1;
1446         if (oper1->seq > oper2->seq)
1447                 return 1;
1448         if (oper1->type < oper2->type)
1449                 return -1;
1450         if (oper1->type > oper2->type)
1451                 return 1;
1452         return 0;
1453 }
1454
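/*
 * Link @oper into fs_info->qgroup_op_tree.  Returns -EEXIST if an
 * operation with the same (bytenr, ref_root, seq, type) tuple is
 * already present.
 */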
1455 static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
1456                               struct btrfs_qgroup_operation *oper)
1457 {
1458         struct rb_node **p;
1459         struct rb_node *parent = NULL;
1460         struct btrfs_qgroup_operation *cur;
1461         int cmp;
1462
1463         spin_lock(&fs_info->qgroup_op_lock);
1464         p = &fs_info->qgroup_op_tree.rb_node;
1465         while (*p) {
1466                 parent = *p;
1467                 cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
1468                 cmp = comp_oper(cur, oper);
1469                 if (cmp < 0) {
1470                         p = &(*p)->rb_right;
1471                 } else if (cmp) {
1472                         p = &(*p)->rb_left;
1473                 } else {
1474                         spin_unlock(&fs_info->qgroup_op_lock);
1475                         return -EEXIST;
1476                 }
1477         }
1478         rb_link_node(&oper->n, parent, p);
1479         rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
1480         spin_unlock(&fs_info->qgroup_op_lock);
1481         return 0;
1482 }
1483
1484 /*
1485  * Record a quota operation for processing later on.
1486  * @trans: the transaction we are adding the delayed op to.
1487  * @fs_info: the fs_info for this fs.
1488  * @ref_root: the root of the reference we are acting on.
1489  * @bytenr: the bytenr we are acting on.
1490  * @num_bytes: the number of bytes in the reference.
1491  * @type: the type of operation this is.
1492  * @mod_seq: do we need to get a sequence number for looking up roots.
1493  *
1494  * We just add it to our trans qgroup_ref_list and carry on and process these
1495  * operations in order at some later point.  If the reference root isn't a fs
1496  * root then we don't bother with doing anything.
1497  *
1498  * MUST BE HOLDING THE REF LOCK.
1499  */
1500 int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1501                             struct btrfs_fs_info *fs_info, u64 ref_root,
1502                             u64 bytenr, u64 num_bytes,
1503                             enum btrfs_qgroup_operation_type type, int mod_seq)
1504 {
1505         struct btrfs_qgroup_operation *oper;
1506         int ret;
1507
1508         if (!is_fstree(ref_root) || !fs_info->quota_enabled)
1509                 return 0;
1510
1511         oper = kmalloc(sizeof(*oper), GFP_NOFS);
1512         if (!oper)
1513                 return -ENOMEM;
1514
1515         oper->ref_root = ref_root;
1516         oper->bytenr = bytenr;
1517         oper->num_bytes = num_bytes;
1518         oper->type = type;
1519         oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1520         INIT_LIST_HEAD(&oper->elem.list);
1521         oper->elem.seq = 0;
1522
1523         trace_btrfs_qgroup_record_ref(oper);
1524
1525         if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1526                 /*
1527                  * If any operation for this bytenr/ref_root combo
1528                  * exists, then we know it's not exclusively owned and
1529                  * shouldn't be queued up.
1530                  *
1531                  * This also catches the case where we have a cloned
1532                  * extent that gets queued up multiple times during
1533                  * drop snapshot.
1534                  */
1535                 if (qgroup_oper_exists(fs_info, oper)) {
1536                         kfree(oper);
1537                         return 0;
1538                 }
1539         }
1540
1541         ret = insert_qgroup_oper(fs_info, oper);
1542         if (ret) {
1543                 /* Shouldn't happen so have an assert for developers */
1544                 ASSERT(0);
1545                 kfree(oper);
1546                 return ret;
1547         }
1548         list_add_tail(&oper->list, &trans->qgroup_ref_list);
1549
1550         if (mod_seq)
1551                 btrfs_get_tree_mod_seq(fs_info, &oper->elem);
1552
1553         return 0;
1554 }
1555
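/*
 * Record a possibly quota-related extent: insert @record into the
 * delayed refs' dirty_extent_root rbtree, keyed by bytenr.  If a record
 * for the same bytenr already exists, return it so the caller can free
 * its own copy; return NULL when @record was inserted.
 */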
1556 struct btrfs_qgroup_extent_record
1557 *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
1558                                   struct btrfs_qgroup_extent_record *record)
1559 {
1560         struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1561         struct rb_node *parent_node = NULL;
1562         struct btrfs_qgroup_extent_record *entry;
1563         u64 bytenr = record->bytenr;
1564
1565         while (*p) {
1566                 parent_node = *p;
1567                 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1568                                  node);
1569                 if (bytenr < entry->bytenr)
1570                         p = &(*p)->rb_left;
1571                 else if (bytenr > entry->bytenr)
1572                         p = &(*p)->rb_right;
1573                 else
1574                         return entry;
1575         }
1576
1577         rb_link_node(&record->node, parent_node, p);
1578         rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1579         return NULL;
1580 }
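
/*
 * Note the insert-or-find contract above: a NULL return means @record was
 * linked into the dirty extent tree, which now owns it; a non-NULL return
 * is the entry already present for the same bytenr, and the caller keeps
 * ownership of its own @record.  A hypothetical caller would do:
 *
 *	if (btrfs_qgroup_insert_dirty_extent(delayed_refs, record))
 *		kfree(record);
 */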
1581
1582 /*
1583  * The easy accounting: if we are adding/removing the only ref for an extent
1584  * then this qgroup and all of the parent qgroups get their reference and
1585  * exclusive counts adjusted.
1586  */
1587 static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1588                                   struct btrfs_qgroup_operation *oper)
1589 {
1590         struct ulist *tmp;
1591         int sign = 0;
1592         int ret = 0;
1593
1594         tmp = ulist_alloc(GFP_NOFS);
1595         if (!tmp)
1596                 return -ENOMEM;
1597
1598         spin_lock(&fs_info->qgroup_lock);
1599         if (!fs_info->quota_root)
1600                 goto out;
1601
1602         switch (oper->type) {
1603         case BTRFS_QGROUP_OPER_ADD_EXCL:
1604                 sign = 1;
1605                 break;
1606         case BTRFS_QGROUP_OPER_SUB_EXCL:
1607                 sign = -1;
1608                 break;
1609         default:
1610                 ASSERT(0);
1611         }
1612         ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
1613                                        oper->num_bytes, sign);
1614 out:
1615         spin_unlock(&fs_info->qgroup_lock);
1616         ulist_free(tmp);
1617         return ret;
1618 }
1619
1620 /*
1621  * Walk all of the roots that pointed to our bytenr and adjust their refcnts
1622  * properly.
1623  */
1624 static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
1625                                   u64 root_to_skip, struct ulist *tmp,
1626                                   struct ulist *roots, struct ulist *qgroups,
1627                                   u64 seq, int *old_roots, int rescan)
1628 {
1629         struct ulist_node *unode;
1630         struct ulist_iterator uiter;
1631         struct ulist_node *tmp_unode;
1632         struct ulist_iterator tmp_uiter;
1633         struct btrfs_qgroup *qg;
1634         int ret;
1635
1636         ULIST_ITER_INIT(&uiter);
1637         while ((unode = ulist_next(roots, &uiter))) {
1638                 /* We don't count our current root here */
1639                 if (unode->val == root_to_skip)
1640                         continue;
1641                 qg = find_qgroup_rb(fs_info, unode->val);
1642                 if (!qg)
1643                         continue;
1644                 /*
1645                  * We could have a pending removal of this same ref so we may
1646                  * not have actually found our ref root when doing
1647                  * btrfs_find_all_roots, so we need to keep track of how many
1648                  * old roots we find in case we removed ours and added a
1649                  * different one at the same time.  I don't think this could
1650                  * happen in practice but that sort of thinking leads to pain
1651                  * and suffering and to the dark side.
1652                  */
1653                 (*old_roots)++;
1654
1655                 ulist_reinit(tmp);
1656                 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1657                                 GFP_ATOMIC);
1658                 if (ret < 0)
1659                         return ret;
1660                 ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
1661                 if (ret < 0)
1662                         return ret;
1663                 ULIST_ITER_INIT(&tmp_uiter);
1664                 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1665                         struct btrfs_qgroup_list *glist;
1666                         int mod;
1667
1668                         qg = u64_to_ptr(tmp_unode->aux);
1669                         /*
1670                          * We use this sequence number to keep from having to
1671                          * run the whole list and 0 out the refcnt every time.
1672                          * We basically use the sequence as the known 0 count and
1673                          * then add 1 every time we see a qgroup.  This is how we
1674                          * get how many of the roots actually point up to the
1675                          * upper level qgroups in order to determine exclusive
1676                          * counts.
1677                          *
1678                          * For rescan, none of the extents were recorded before,
1679                          * so we just don't add to old_refcnt.
1680                          */
1681                         if (rescan)
1682                                 mod = 0;
1683                         else
1684                                 mod = 1;
1685                         btrfs_qgroup_update_old_refcnt(qg, seq, mod);
1686                         btrfs_qgroup_update_new_refcnt(qg, seq, 1);
1687                         list_for_each_entry(glist, &qg->groups, next_group) {
1688                                 ret = ulist_add(qgroups, glist->group->qgroupid,
1689                                                 ptr_to_u64(glist->group),
1690                                                 GFP_ATOMIC);
1691                                 if (ret < 0)
1692                                         return ret;
1693                                 ret = ulist_add(tmp, glist->group->qgroupid,
1694                                                 ptr_to_u64(glist->group),
1695                                                 GFP_ATOMIC);
1696                                 if (ret < 0)
1697                                         return ret;
1698                         }
1699                 }
1700         }
1701         return 0;
1702 }
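
/*
 * Worked example of the seq-as-zero trick (see the refcnt helpers at the
 * top of this file): say fs_info->qgroup_seq is 100 when a pass starts.
 * A qgroup whose old_refcnt is still 57 from an earlier pass counts as 0,
 * because 57 < 100.  The first update sets old_refcnt to 100 + mod, a
 * second hit makes it 100 + 2 * mod, and btrfs_qgroup_get_old_refcnt()
 * returns old_refcnt - 100.  No qgroup ever needs an explicit reset.
 */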
1703
1704 /*
1705  * We need to walk forward in our operation tree and account for any roots that
1706  * were deleted after we made this operation.
1707  */
1708 static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
1709                                        struct btrfs_qgroup_operation *oper,
1710                                        struct ulist *tmp,
1711                                        struct ulist *qgroups, u64 seq,
1712                                        int *old_roots)
1713 {
1714         struct ulist_node *unode;
1715         struct ulist_iterator uiter;
1716         struct btrfs_qgroup *qg;
1717         struct btrfs_qgroup_operation *tmp_oper;
1718         struct rb_node *n;
1719         int ret;
1720
1721         ulist_reinit(tmp);
1722
1723         /*
1724          * We only walk forward in the tree since we're only interested in
1725          * removals that happened _after_ our operation.
1726          */
1727         spin_lock(&fs_info->qgroup_op_lock);
1728         n = rb_next(&oper->n);
1729         spin_unlock(&fs_info->qgroup_op_lock);
1730         if (!n)
1731                 return 0;
1732         tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1733         while (tmp_oper->bytenr == oper->bytenr) {
1734                 /*
1735                  * If it's not a removal we don't care, additions work out
1736                  * properly with our refcnt tracking.
1737                  */
1738                 if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
1739                     tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
1740                         goto next;
1741                 qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
1742                 if (!qg)
1743                         goto next;
1744                 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1745                                 GFP_ATOMIC);
1746                 if (ret) {
1747                         if (ret < 0)
1748                                 return ret;
1749                         /*
1750                          * We only want to increase old_roots if this qgroup is
1751                          * not already in the list of qgroups.  If it is already
1752                          * there then that means it must have been re-added or
1753                          * the delete will be discarded because we had an
1754                          * existing ref that we haven't looked up yet.  In this
1755                          * case we don't want to increase old_roots.  So if ret
1756                          * == 1 then we know that this is the first time we've
1757                          * seen this qgroup and we can bump the old_roots.
1758                          */
1759                         (*old_roots)++;
1760                         ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
1761                                         GFP_ATOMIC);
1762                         if (ret < 0)
1763                                 return ret;
1764                 }
1765 next:
1766                 spin_lock(&fs_info->qgroup_op_lock);
1767                 n = rb_next(&tmp_oper->n);
1768                 spin_unlock(&fs_info->qgroup_op_lock);
1769                 if (!n)
1770                         break;
1771                 tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1772         }
1773
1774         /* Ok now process the qgroups we found */
1775         ULIST_ITER_INIT(&uiter);
1776         while ((unode = ulist_next(tmp, &uiter))) {
1777                 struct btrfs_qgroup_list *glist;
1778
1779                 qg = u64_to_ptr(unode->aux);
1780                 btrfs_qgroup_update_old_refcnt(qg, seq, 1);
1781                 btrfs_qgroup_update_new_refcnt(qg, seq, 1);
1782                 list_for_each_entry(glist, &qg->groups, next_group) {
1783                         ret = ulist_add(qgroups, glist->group->qgroupid,
1784                                         ptr_to_u64(glist->group), GFP_ATOMIC);
1785                         if (ret < 0)
1786                                 return ret;
1787                         ret = ulist_add(tmp, glist->group->qgroupid,
1788                                         ptr_to_u64(glist->group), GFP_ATOMIC);
1789                         if (ret < 0)
1790                                 return ret;
1791                 }
1792         }
1793         return 0;
1794 }
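
/*
 * Example: operation N removes root A's ref to a bytenr while operation
 * N + 1, already queued, removes root B's ref to the same bytenr.  When
 * N is processed, B may no longer show up in btrfs_find_all_roots(), so
 * the walk above re-adds B's qgroup to both refcnt sets and bumps
 * old_roots, keeping the exclusive math for N consistent.
 */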
1795
1796 /* Add refcnt for the newly added reference. */
1797 static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
1798                                   struct btrfs_qgroup_operation *oper,
1799                                   struct btrfs_qgroup *qgroup,
1800                                   struct ulist *tmp, struct ulist *qgroups,
1801                                   u64 seq)
1802 {
1803         struct ulist_node *unode;
1804         struct ulist_iterator uiter;
1805         struct btrfs_qgroup *qg;
1806         int ret;
1807
1808         ulist_reinit(tmp);
1809         ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
1810                         GFP_ATOMIC);
1811         if (ret < 0)
1812                 return ret;
1813         ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
1814                         GFP_ATOMIC);
1815         if (ret < 0)
1816                 return ret;
1817         ULIST_ITER_INIT(&uiter);
1818         while ((unode = ulist_next(tmp, &uiter))) {
1819                 struct btrfs_qgroup_list *glist;
1820
1821                 qg = u64_to_ptr(unode->aux);
1822                 if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED)
1823                         btrfs_qgroup_update_new_refcnt(qg, seq, 1);
1824                 else
1825                         btrfs_qgroup_update_old_refcnt(qg, seq, 1);
1826                 list_for_each_entry(glist, &qg->groups, next_group) {
1827                         ret = ulist_add(tmp, glist->group->qgroupid,
1828                                         ptr_to_u64(glist->group), GFP_ATOMIC);
1829                         if (ret < 0)
1830                                 return ret;
1831                         ret = ulist_add(qgroups, glist->group->qgroupid,
1832                                         ptr_to_u64(glist->group), GFP_ATOMIC);
1833                         if (ret < 0)
1834                                 return ret;
1835                 }
1836         }
1837         return 0;
1838 }
1839
1840 #define UPDATE_NEW      0
1841 #define UPDATE_OLD      1
1842 /*
1843  * Walk all of the roots that point to the bytenr and adjust their refcnts.
1844  */
1845 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
1846                                 struct ulist *roots, struct ulist *tmp,
1847                                 struct ulist *qgroups, u64 seq, int update_old)
1848 {
1849         struct ulist_node *unode;
1850         struct ulist_iterator uiter;
1851         struct ulist_node *tmp_unode;
1852         struct ulist_iterator tmp_uiter;
1853         struct btrfs_qgroup *qg;
1854         int ret = 0;
1855
1856         if (!roots)
1857                 return 0;
1858         ULIST_ITER_INIT(&uiter);
1859         while ((unode = ulist_next(roots, &uiter))) {
1860                 qg = find_qgroup_rb(fs_info, unode->val);
1861                 if (!qg)
1862                         continue;
1863
1864                 ulist_reinit(tmp);
1865                 ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1866                                 GFP_ATOMIC);
1867                 if (ret < 0)
1868                         return ret;
1869                 ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
1870                 if (ret < 0)
1871                         return ret;
1872                 ULIST_ITER_INIT(&tmp_uiter);
1873                 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1874                         struct btrfs_qgroup_list *glist;
1875
1876                         qg = u64_to_ptr(tmp_unode->aux);
1877                         if (update_old)
1878                                 btrfs_qgroup_update_old_refcnt(qg, seq, 1);
1879                         else
1880                                 btrfs_qgroup_update_new_refcnt(qg, seq, 1);
1881                         list_for_each_entry(glist, &qg->groups, next_group) {
1882                                 ret = ulist_add(qgroups, glist->group->qgroupid,
1883                                                 ptr_to_u64(glist->group),
1884                                                 GFP_ATOMIC);
1885                                 if (ret < 0)
1886                                         return ret;
1887                                 ret = ulist_add(tmp, glist->group->qgroupid,
1888                                                 ptr_to_u64(glist->group),
1889                                                 GFP_ATOMIC);
1890                                 if (ret < 0)
1891                                         return ret;
1892                         }
1893                 }
1894         }
1895         return 0;
1896 }
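
/*
 * The UPDATE_OLD/UPDATE_NEW values above select which refcnt the walk
 * feeds: passing UPDATE_OLD records the pre-operation set of roots,
 * UPDATE_NEW the post-operation set.  The two counts are then compared
 * per qgroup in qgroup_update_counters() below.
 */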
1897
1898 /*
1899  * Update qgroup rfer/excl counters.
1900  * Rfer update is easy, the code can explain itself.
1901  * Excl update is tricky, the update is split into 2 parts.
1902  * Part 1: Possible exclusive <-> sharing detect:
1903  *      |       A       |       !A      |
1904  *  -------------------------------------
1905  *  B   |       *       |       -       |
1906  *  -------------------------------------
1907  *  !B  |       +       |       **      |
1908  *  -------------------------------------
1909  *
1910  * Conditions:
1911  * A:   cur_old_roots < nr_old_roots    (not exclusive before)
1912  * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
1913  * B:   cur_new_roots < nr_new_roots    (not exclusive now)
1914  * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
1915  *
1916  * Results:
1917  * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
1918  * *: Definitely not changed.           **: Possible unchanged.
1919  *
1920  * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
1921  *
1922  * To make the logic clear, we first use condition A and B to split
1923  * combination into 4 results.
1924  *
1925  * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
1926  * those only one variant may be 0.
1927  *
1928  * Lastly, check result **: since there are 2 variants that may be 0, split
1929  * them again (2x2).
1930  * But this time we don't need to consider other things, the code and logic
1931  * are easy to understand now.
1932  */
1933 static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
1934                                   struct ulist *qgroups,
1935                                   u64 nr_old_roots,
1936                                   u64 nr_new_roots,
1937                                   u64 num_bytes, u64 seq)
1938 {
1939         struct ulist_node *unode;
1940         struct ulist_iterator uiter;
1941         struct btrfs_qgroup *qg;
1942         u64 cur_new_count, cur_old_count;
1943
1944         ULIST_ITER_INIT(&uiter);
1945         while ((unode = ulist_next(qgroups, &uiter))) {
1946                 bool dirty = false;
1947
1948                 qg = u64_to_ptr(unode->aux);
1949                 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
1950                 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
1951
1952                 /* Rfer update part */
1953                 if (cur_old_count == 0 && cur_new_count > 0) {
1954                         qg->rfer += num_bytes;
1955                         qg->rfer_cmpr += num_bytes;
1956                         dirty = true;
1957                 }
1958                 if (cur_old_count > 0 && cur_new_count == 0) {
1959                         qg->rfer -= num_bytes;
1960                         qg->rfer_cmpr -= num_bytes;
1961                         dirty = true;
1962                 }
1963
1964                 /* Excl update part */
1965                 /* Exclusive/none -> shared case */
1966                 if (cur_old_count == nr_old_roots &&
1967                     cur_new_count < nr_new_roots) {
1968                         /* Exclusive -> shared */
1969                         if (cur_old_count != 0) {
1970                                 qg->excl -= num_bytes;
1971                                 qg->excl_cmpr -= num_bytes;
1972                                 dirty = true;
1973                         }
1974                 }
1975
1976                 /* Shared -> exclusive/none case */
1977                 if (cur_old_count < nr_old_roots &&
1978                     cur_new_count == nr_new_roots) {
1979                         /* Shared->exclusive */
1980                         if (cur_new_count != 0) {
1981                                 qg->excl += num_bytes;
1982                                 qg->excl_cmpr += num_bytes;
1983                                 dirty = true;
1984                         }
1985                 }
1986
1987                 /* Exclusive/none -> exclusive/none case */
1988                 if (cur_old_count == nr_old_roots &&
1989                     cur_new_count == nr_new_roots) {
1990                         if (cur_old_count == 0) {
1991                                 /* None -> exclusive/none */
1992
1993                                 if (cur_new_count != 0) {
1994                                         /* None -> exclusive */
1995                                         qg->excl += num_bytes;
1996                                         qg->excl_cmpr += num_bytes;
1997                                         dirty = true;
1998                                 }
1999                                 /* None -> none, nothing changed */
2000                         } else {
2001                                 /* Exclusive -> exclusive/none */
2002
2003                                 if (cur_new_count == 0) {
2004                                         /* Exclusive -> none */
2005                                         qg->excl -= num_bytes;
2006                                         qg->excl_cmpr -= num_bytes;
2007                                         dirty = true;
2008                                 }
2009                                 /* Exclusive -> exclusive, nothing changed */
2010                         }
2011                 }
2012                 if (dirty)
2013                         qgroup_dirty(fs_info, qg);
2014         }
2015         return 0;
2016 }
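
/*
 * Example for the table above: an extent of num_bytes is referenced by
 * roots A and B (nr_old_roots == 2) and B drops its ref
 * (nr_new_roots == 1).  For B's qgroup, cur_old_count == 1 and
 * cur_new_count == 0, so only rfer/rfer_cmpr shrink by num_bytes.  For
 * A's qgroup, cur_old_count == 1 < nr_old_roots while
 * cur_new_count == 1 == nr_new_roots: that is the "+" case, the extent
 * became exclusive to A, so excl/excl_cmpr grow by num_bytes.
 */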
2017
2018 /*
2019  * This adjusts the counters for all referenced qgroups if need be.
2020  */
2021 static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
2022                                   u64 root_to_skip, u64 num_bytes,
2023                                   struct ulist *qgroups, u64 seq,
2024                                   int old_roots, int new_roots, int rescan)
2025 {
2026         struct ulist_node *unode;
2027         struct ulist_iterator uiter;
2028         struct btrfs_qgroup *qg;
2029         u64 cur_new_count, cur_old_count;
2030
2031         ULIST_ITER_INIT(&uiter);
2032         while ((unode = ulist_next(qgroups, &uiter))) {
2033                 bool dirty = false;
2034
2035                 qg = u64_to_ptr(unode->aux);
2036                 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2037                 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2038
2039                 /*
2040                  * Wasn't referenced before but is now, add to the reference
2041                  * counters.
2042                  */
2043                 if (cur_old_count == 0 && cur_new_count > 0) {
2044                         qg->rfer += num_bytes;
2045                         qg->rfer_cmpr += num_bytes;
2046                         dirty = true;
2047                 }
2048
2049                 /*
2050                  * Was referenced before but isn't now, subtract from the
2051                  * reference counters.
2052                  */
2053                 if (cur_old_count > 0 && cur_new_count == 0) {
2054                         qg->rfer -= num_bytes;
2055                         qg->rfer_cmpr -= num_bytes;
2056                         dirty = true;
2057                 }
2058
2059                 /*
2060                  * If our refcount was the same as the roots previously but our
2061                  * new count isn't the same as the number of roots now then we
2062                  * went from having an exclusive reference on this range to not.
2063                  */
2064                 if (old_roots && cur_old_count == old_roots &&
2065                     (cur_new_count != new_roots || new_roots == 0)) {
2066                         WARN_ON(cur_new_count != new_roots && new_roots == 0);
2067                         qg->excl -= num_bytes;
2068                         qg->excl_cmpr -= num_bytes;
2069                         dirty = true;
2070                 }
2071
2072                 /*
2073                  * If we didn't reference all the roots before but now we
2074                  * do, we have an exclusive reference to this range.
2075                  */
2076                 if ((!old_roots || cur_old_count != old_roots) &&
2077                     cur_new_count == new_roots) {
2078                         qg->excl += num_bytes;
2079                         qg->excl_cmpr += num_bytes;
2080                         dirty = true;
2081                 }
2082
2083                 if (dirty)
2084                         qgroup_dirty(fs_info, qg);
2085         }
2086         return 0;
2087 }
2088
2089 /*
2090  * If we removed a data extent and there were other references for that bytenr
2091  * then we need to lookup all referenced roots to make sure we still don't
2092  * reference this bytenr.  If we do then we can just discard this operation.
2093  */
2094 static int check_existing_refs(struct btrfs_trans_handle *trans,
2095                                struct btrfs_fs_info *fs_info,
2096                                struct btrfs_qgroup_operation *oper)
2097 {
2098         struct ulist *roots = NULL;
2099         struct ulist_node *unode;
2100         struct ulist_iterator uiter;
2101         int ret = 0;
2102
2103         ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
2104                                    oper->elem.seq, &roots);
2105         if (ret < 0)
2106                 return ret;
2107         ret = 0;
2108
2109         ULIST_ITER_INIT(&uiter);
2110         while ((unode = ulist_next(roots, &uiter))) {
2111                 if (unode->val == oper->ref_root) {
2112                         ret = 1;
2113                         break;
2114                 }
2115         }
2116         ulist_free(roots);
2117         btrfs_put_tree_mod_seq(fs_info, &oper->elem);
2118
2119         return ret;
2120 }
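
/*
 * Example: an extent reflinked twice within one subvolume has two refs to
 * the same bytenr from a single root.  Deleting one of them queues a
 * removal operation, but the root still shows up in
 * btrfs_find_all_roots(), so the check above returns 1 and the operation
 * is discarded without touching any counters.
 */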
2121
2122 /*
2123  * If we share a reference across multiple roots then we may need to adjust
2124  * various qgroups referenced and exclusive counters.  The basic premise is this
2125  *
2126  * 1) We have seq to represent a 0 count.  Instead of looping through all of the
2127  * qgroups and resetting their refcount to 0 we just constantly bump this
2128  * sequence number to act as the base reference count.  This means that if
2129  * anybody is equal to or below this sequence they were never referenced.  We
2130  * jack this sequence up by the number of roots we found each time in order to
2131  * make sure we don't have any overlap.
2132  *
2133  * 2) We first search all the roots that reference the area _except_ the root
2134  * we're acting on currently.  This makes up the old_refcnt of all the qgroups
2135  * before.
2136  *
2137  * 3) We walk all of the qgroups referenced by the root we are currently acting
2138  * on, and will either adjust old_refcnt in the case of a removal or the
2139  * new_refcnt in the case of an addition.
2140  *
2141  * 4) Finally we walk all the qgroups that are referenced by this range
2142  * including the root we are acting on currently.  We will adjust the counters
2143  * based on the number of roots we had and will have after this operation.
2144  *
2145  * Take this example as an illustration
2146  *
2147  *                      [qgroup 1/0]
2148  *                   /         |          \
2149  *              [qg 0/0]   [qg 0/1]     [qg 0/2]
2150  *                 \          |            /
2151  *                [        extent           ]
2152  *
2153  * Say we are adding a reference that is covered by qg 0/0.  The first step
2154  * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
2155  * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
2156  * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
2157  * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
2158  * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
2159  * reference and thus must add the size to the referenced bytes.  Everything
2160  * else is the same so nothing else changes.
2161  */
2162 static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
2163                                     struct btrfs_fs_info *fs_info,
2164                                     struct btrfs_qgroup_operation *oper)
2165 {
2166         struct ulist *roots = NULL;
2167         struct ulist *qgroups, *tmp;
2168         struct btrfs_qgroup *qgroup;
2169         struct seq_list elem = SEQ_LIST_INIT(elem);
2170         u64 seq;
2171         int old_roots = 0;
2172         int new_roots = 0;
2173         int ret = 0;
2174
2175         if (oper->elem.seq) {
2176                 ret = check_existing_refs(trans, fs_info, oper);
2177                 if (ret < 0)
2178                         return ret;
2179                 if (ret)
2180                         return 0;
2181         }
2182
2183         qgroups = ulist_alloc(GFP_NOFS);
2184         if (!qgroups)
2185                 return -ENOMEM;
2186
2187         tmp = ulist_alloc(GFP_NOFS);
2188         if (!tmp) {
2189                 ulist_free(qgroups);
2190                 return -ENOMEM;
2191         }
2192
2193         btrfs_get_tree_mod_seq(fs_info, &elem);
2194         ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
2195                                    &roots);
2196         btrfs_put_tree_mod_seq(fs_info, &elem);
2197         if (ret < 0) {
2198                 ulist_free(qgroups);
2199                 ulist_free(tmp);
2200                 return ret;
2201         }
2202         spin_lock(&fs_info->qgroup_lock);
2203         qgroup = find_qgroup_rb(fs_info, oper->ref_root);
2204         if (!qgroup)
2205                 goto out;
2206         seq = fs_info->qgroup_seq;
2207
2208         /*
2209          * So roots is the list of all the roots currently pointing at the
2210          * bytenr, including the ref we are adding if we are adding, or not if
2211          * we are removing a ref.  So we pass in the ref_root to skip that root
2212          * in our calculations.  We set old_refcnt and new_refcnt because who the
2213          * hell knows what everything looked like before, and it doesn't matter
2214          * except...
2215          */
2216         ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
2217                                      seq, &old_roots, 0);
2218         if (ret < 0)
2219                 goto out;
2220
2221         /*
2222          * Now adjust the refcounts of the qgroups that care about this
2223          * reference, either the old_count in the case of removal or new_count
2224          * in the case of an addition.
2225          */
2226         ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
2227                                      seq);
2228         if (ret < 0)
2229                 goto out;
2230
2231         /*
2232          * ...in the case of removals.  If we had a removal before we got around
2233          * to processing this operation then we need to find that guy and count
2234          * his references as if they really existed so we don't end up screwing
2235          * up the exclusive counts.  Then whenever we go to process the delete
2236          * everything will be grand and we can account for whatever exclusive
2237          * changes need to be made there.  We also have to pass in old_roots so
2238          * we have an accurate count of the roots as it pertains to this
2239          * operation's view of the world.
2240          */
2241         ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
2242                                           &old_roots);
2243         if (ret < 0)
2244                 goto out;
2245
2246         /*
2247          * We are adding our root, need to adjust up the number of roots,
2248          * otherwise old_roots is the number of roots we want.
2249          */
2250         if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
2251                 new_roots = old_roots + 1;
2252         } else {
2253                 new_roots = old_roots;
2254                 old_roots++;
2255         }
2256
2257         /*
2258          * Bump qgroup_seq to avoid seq overlap
2259          * XXX: This makes qgroup_seq mismatch with oper->seq.
2260          */
2261         fs_info->qgroup_seq += old_roots + 1;
2262
2263
2264         /*
2265          * And now the magic happens, bless Arne for having a pretty elegant
2266          * solution for this.
2267          */
2268         qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
2269                                qgroups, seq, old_roots, new_roots, 0);
2270 out:
2271         spin_unlock(&fs_info->qgroup_lock);
2272         ulist_free(qgroups);
2273         ulist_free(roots);
2274         ulist_free(tmp);
2275         return ret;
2276 }
2277
2278 /*
2279  * Process a reference to a shared subtree. This type of operation is
2280  * queued during snapshot removal when we encounter extents which are
2281  * shared between more than one root.
2282  */
2283 static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
2284                                      struct btrfs_fs_info *fs_info,
2285                                      struct btrfs_qgroup_operation *oper)
2286 {
2287         struct ulist *roots = NULL;
2288         struct ulist_node *unode;
2289         struct ulist_iterator uiter;
2290         struct btrfs_qgroup_list *glist;
2291         struct ulist *parents;
2292         int ret = 0;
2293         int err;
2294         struct btrfs_qgroup *qg;
2295         u64 root_obj = 0;
2296         struct seq_list elem = SEQ_LIST_INIT(elem);
2297
2298         parents = ulist_alloc(GFP_NOFS);
2299         if (!parents)
2300                 return -ENOMEM;
2301
2302         btrfs_get_tree_mod_seq(fs_info, &elem);
2303         ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
2304                                    elem.seq, &roots);
2305         btrfs_put_tree_mod_seq(fs_info, &elem);
2306         if (ret < 0)
2307                 goto out;
2308
2309         if (roots->nnodes != 1)
2310                 goto out;
2311
2312         ULIST_ITER_INIT(&uiter);
2313         unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
2314         /*
2315          * If we find our ref root then that means all refs
2316          * this extent has to the root have not yet been
2317          * deleted. In that case, we do nothing and let the
2318          * last ref for this bytenr drive our update.
2319          *
2320          * This can happen for example if an extent is
2321          * referenced multiple times in a snapshot (clone,
2322          * etc). If we are in the middle of snapshot removal,
2323          * queued updates for such an extent will find the
2324          * root if we have not yet finished removing the
2325          * snapshot.
2326          */
2327         if (unode->val == oper->ref_root)
2328                 goto out;
2329
2330         root_obj = unode->val;
2331         BUG_ON(!root_obj);
2332
2333         spin_lock(&fs_info->qgroup_lock);
2334         qg = find_qgroup_rb(fs_info, root_obj);
2335         if (!qg)
2336                 goto out_unlock;
2337
2338         qg->excl += oper->num_bytes;
2339         qg->excl_cmpr += oper->num_bytes;
2340         qgroup_dirty(fs_info, qg);
2341
2342         /*
2343          * Adjust counts for parent groups. First we find all
2344          * parents, then in the 2nd loop we do the adjustment
2345          * while adding parents of the parents to our ulist.
2346          */
2347         list_for_each_entry(glist, &qg->groups, next_group) {
2348                 err = ulist_add(parents, glist->group->qgroupid,
2349                                 ptr_to_u64(glist->group), GFP_ATOMIC);
2350                 if (err < 0) {
2351                         ret = err;
2352                         goto out_unlock;
2353                 }
2354         }
2355
2356         ULIST_ITER_INIT(&uiter);
2357         while ((unode = ulist_next(parents, &uiter))) {
2358                 qg = u64_to_ptr(unode->aux);
2359                 qg->excl += oper->num_bytes;
2360                 qg->excl_cmpr += oper->num_bytes;
2361                 qgroup_dirty(fs_info, qg);
2362
2363                 /* Add any parents of the parents */
2364                 list_for_each_entry(glist, &qg->groups, next_group) {
2365                         err = ulist_add(parents, glist->group->qgroupid,
2366                                         ptr_to_u64(glist->group), GFP_ATOMIC);
2367                         if (err < 0) {
2368                                 ret = err;
2369                                 goto out_unlock;
2370                         }
2371                 }
2372         }
2373
2374 out_unlock:
2375         spin_unlock(&fs_info->qgroup_lock);
2376
2377 out:
2378         ulist_free(roots);
2379         ulist_free(parents);
2380         return ret;
2381 }
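
/*
 * Example: subvolume A is snapshotted to B and B is then deleted.  A tree
 * block shared by A and B is queued as a SUB_SUBTREE operation during the
 * snapshot removal; by the time it is processed only root A remains in
 * the roots ulist (nnodes == 1), so the block's bytes are promoted to
 * exclusive for A's qgroup and all of its parent qgroups.
 */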
2382
2383 /*
2384  * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
2385  * from the fs. First, all roots referencing the extent are searched, and
2386  * then the space is accounted accordingly to the different roots. The
2387  * accounting algorithm works in 3 steps documented inline.
2388  */
2389 static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
2390                                 struct btrfs_fs_info *fs_info,
2391                                 struct btrfs_qgroup_operation *oper)
2392 {
2393         int ret = 0;
2394
2395         if (!fs_info->quota_enabled)
2396                 return 0;
2397
2398         BUG_ON(!fs_info->quota_root);
2399
2400         mutex_lock(&fs_info->qgroup_rescan_lock);
2401         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2402                 if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
2403                         mutex_unlock(&fs_info->qgroup_rescan_lock);
2404                         return 0;
2405                 }
2406         }
2407         mutex_unlock(&fs_info->qgroup_rescan_lock);
2408
2409         ASSERT(is_fstree(oper->ref_root));
2410
2411         trace_btrfs_qgroup_account(oper);
2412
2413         switch (oper->type) {
2414         case BTRFS_QGROUP_OPER_ADD_EXCL:
2415         case BTRFS_QGROUP_OPER_SUB_EXCL:
2416                 ret = qgroup_excl_accounting(fs_info, oper);
2417                 break;
2418         case BTRFS_QGROUP_OPER_ADD_SHARED:
2419         case BTRFS_QGROUP_OPER_SUB_SHARED:
2420                 ret = qgroup_shared_accounting(trans, fs_info, oper);
2421                 break;
2422         case BTRFS_QGROUP_OPER_SUB_SUBTREE:
2423                 ret = qgroup_subtree_accounting(trans, fs_info, oper);
2424                 break;
2425         default:
2426                 ASSERT(0);
2427         }
2428         return ret;
2429 }
2430
2431 /*
2432  * Needs to be called every time we run delayed refs, even if there is an
2433  * error, in order to clean up outstanding operations.
2434  */
2435 int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
2436                                     struct btrfs_fs_info *fs_info)
2437 {
2438         struct btrfs_qgroup_operation *oper;
2439         int ret = 0;
2440
2441         while (!list_empty(&trans->qgroup_ref_list)) {
2442                 oper = list_first_entry(&trans->qgroup_ref_list,
2443                                         struct btrfs_qgroup_operation, list);
2444                 list_del_init(&oper->list);
2445                 if (!ret && !trans->aborted)
2446                         ret = btrfs_qgroup_account(trans, fs_info, oper);
2447                 spin_lock(&fs_info->qgroup_op_lock);
2448                 rb_erase(&oper->n, &fs_info->qgroup_op_tree);
2449                 spin_unlock(&fs_info->qgroup_op_lock);
2450                 btrfs_put_tree_mod_seq(fs_info, &oper->elem);
2451                 kfree(oper);
2452         }
2453         return ret;
2454 }
2455
2456 /*
2457  * called from commit_transaction. Writes all changed qgroups to disk.
2458  */
2459 int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
2460                       struct btrfs_fs_info *fs_info)
2461 {
2462         struct btrfs_root *quota_root = fs_info->quota_root;
2463         int ret = 0;
2464         int start_rescan_worker = 0;
2465
2466         if (!quota_root)
2467                 goto out;
2468
2469         if (!fs_info->quota_enabled && fs_info->pending_quota_state)
2470                 start_rescan_worker = 1;
2471
2472         fs_info->quota_enabled = fs_info->pending_quota_state;
2473
2474         spin_lock(&fs_info->qgroup_lock);
2475         while (!list_empty(&fs_info->dirty_qgroups)) {
2476                 struct btrfs_qgroup *qgroup;
2477                 qgroup = list_first_entry(&fs_info->dirty_qgroups,
2478                                           struct btrfs_qgroup, dirty);
2479                 list_del_init(&qgroup->dirty);
2480                 spin_unlock(&fs_info->qgroup_lock);
2481                 ret = update_qgroup_info_item(trans, quota_root, qgroup);
2482                 if (ret)
2483                         fs_info->qgroup_flags |=
2484                                         BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2485                 ret = update_qgroup_limit_item(trans, quota_root, qgroup);
2486                 if (ret)
2487                         fs_info->qgroup_flags |=
2488                                         BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2489                 spin_lock(&fs_info->qgroup_lock);
2490         }
2491         if (fs_info->quota_enabled)
2492                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2493         else
2494                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2495         spin_unlock(&fs_info->qgroup_lock);
2496
2497         ret = update_qgroup_status_item(trans, fs_info, quota_root);
2498         if (ret)
2499                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2500
2501         if (!ret && start_rescan_worker) {
2502                 ret = qgroup_rescan_init(fs_info, 0, 1);
2503                 if (!ret) {
2504                         qgroup_rescan_zero_tracking(fs_info);
2505                         btrfs_queue_work(fs_info->qgroup_rescan_workers,
2506                                          &fs_info->qgroup_rescan_work);
2507                 }
2508                 ret = 0;
2509         }
2510
2511 out:
2512
2513         return ret;
2514 }
2515
2516 /*
2517  * copy the accounting information between qgroups. This is necessary when a
2518  * snapshot or a subvolume is created
2519  */
2520 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2521                          struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
2522                          struct btrfs_qgroup_inherit *inherit)
2523 {
2524         int ret = 0;
2525         int i;
2526         u64 *i_qgroups;
2527         struct btrfs_root *quota_root = fs_info->quota_root;
2528         struct btrfs_qgroup *srcgroup;
2529         struct btrfs_qgroup *dstgroup;
2530         u32 level_size = 0;
2531         u64 nums;
2532
2533         mutex_lock(&fs_info->qgroup_ioctl_lock);
2534         if (!fs_info->quota_enabled)
2535                 goto out;
2536
2537         if (!quota_root) {
2538                 ret = -EINVAL;
2539                 goto out;
2540         }
2541
2542         if (inherit) {
2543                 i_qgroups = (u64 *)(inherit + 1);
2544                 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2545                        2 * inherit->num_excl_copies;
2546                 for (i = 0; i < nums; ++i) {
2547                         srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2548                         if (!srcgroup) {
2549                                 ret = -EINVAL;
2550                                 goto out;
2551                         }
2552
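                        /*
                         * The qgroup level lives in the top 16 bits of the
                         * qgroupid (e.g. 1/0 is level 1, id 0).  The new
                         * subvolume gets a level-0 qgroup, so any group we
                         * inherit from must sit at a strictly higher level.
                         */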
2553                         if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
2554                                 ret = -EINVAL;
2555                                 goto out;
2556                         }
2557                         ++i_qgroups;
2558                 }
2559         }
2560
2561         /*
2562          * create a tracking group for the subvol itself
2563          */
2564         ret = add_qgroup_item(trans, quota_root, objectid);
2565         if (ret)
2566                 goto out;
2567
2568         if (srcid) {
2569                 struct btrfs_root *srcroot;
2570                 struct btrfs_key srckey;
2571
2572                 srckey.objectid = srcid;
2573                 srckey.type = BTRFS_ROOT_ITEM_KEY;
2574                 srckey.offset = (u64)-1;
2575                 srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
2576                 if (IS_ERR(srcroot)) {
2577                         ret = PTR_ERR(srcroot);
2578                         goto out;
2579                 }
2580
2581                 rcu_read_lock();
2582                 level_size = srcroot->nodesize;
2583                 rcu_read_unlock();
2584         }
2585
2586         /*
2587          * add qgroup to all inherited groups
2588          */
2589         if (inherit) {
2590                 i_qgroups = (u64 *)(inherit + 1);
2591                 for (i = 0; i < inherit->num_qgroups; ++i) {
2592                         ret = add_qgroup_relation_item(trans, quota_root,
2593                                                        objectid, *i_qgroups);
2594                         if (ret)
2595                                 goto out;
2596                         ret = add_qgroup_relation_item(trans, quota_root,
2597                                                        *i_qgroups, objectid);
2598                         if (ret)
2599                                 goto out;
2600                         ++i_qgroups;
2601                 }
2602         }
2603
2604
2605         spin_lock(&fs_info->qgroup_lock);
2606
2607         dstgroup = add_qgroup_rb(fs_info, objectid);
2608         if (IS_ERR(dstgroup)) {
2609                 ret = PTR_ERR(dstgroup);
2610                 goto unlock;
2611         }
2612
2613         if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2614                 dstgroup->lim_flags = inherit->lim.flags;
2615                 dstgroup->max_rfer = inherit->lim.max_rfer;
2616                 dstgroup->max_excl = inherit->lim.max_excl;
2617                 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2618                 dstgroup->rsv_excl = inherit->lim.rsv_excl;
2619
2620                 ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
2621                 if (ret) {
2622                         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2623                         btrfs_info(fs_info, "unable to update quota limit for %llu",
2624                                dstgroup->qgroupid);
2625                         goto unlock;
2626                 }
2627         }
2628
2629         if (srcid) {
2630                 srcgroup = find_qgroup_rb(fs_info, srcid);
2631                 if (!srcgroup)
2632                         goto unlock;
2633
2634                 /*
2635                  * We call inherit after we clone the root in order to make sure
2636                  * our counts don't go crazy, so at this point the only
2637                  * difference between the two roots should be the root node.
2638                  */
2639                 dstgroup->rfer = srcgroup->rfer;
2640                 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2641                 dstgroup->excl = level_size;
2642                 dstgroup->excl_cmpr = level_size;
2643                 srcgroup->excl = level_size;
2644                 srcgroup->excl_cmpr = level_size;
2645
2646                 /* inherit the limit info */
2647                 dstgroup->lim_flags = srcgroup->lim_flags;
2648                 dstgroup->max_rfer = srcgroup->max_rfer;
2649                 dstgroup->max_excl = srcgroup->max_excl;
2650                 dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2651                 dstgroup->rsv_excl = srcgroup->rsv_excl;
2652
2653                 qgroup_dirty(fs_info, dstgroup);
2654                 qgroup_dirty(fs_info, srcgroup);
2655         }
2656
2657         if (!inherit)
2658                 goto unlock;
2659
2660         i_qgroups = (u64 *)(inherit + 1);
2661         for (i = 0; i < inherit->num_qgroups; ++i) {
2662                 ret = add_relation_rb(quota_root->fs_info, objectid,
2663                                       *i_qgroups);
2664                 if (ret)
2665                         goto unlock;
2666                 ++i_qgroups;
2667         }
2668
2669         for (i = 0; i < inherit->num_ref_copies; ++i) {
2670                 struct btrfs_qgroup *src;
2671                 struct btrfs_qgroup *dst;
2672
2673                 src = find_qgroup_rb(fs_info, i_qgroups[0]);
2674                 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2675
2676                 if (!src || !dst) {
2677                         ret = -EINVAL;
2678                         goto unlock;
2679                 }
2680
2681                 dst->rfer = src->rfer - level_size;
2682                 dst->rfer_cmpr = src->rfer_cmpr - level_size;
2683                 i_qgroups += 2;
2684         }
2685         for (i = 0; i < inherit->num_excl_copies; ++i) {
2686                 struct btrfs_qgroup *src;
2687                 struct btrfs_qgroup *dst;
2688
2689                 src = find_qgroup_rb(fs_info, i_qgroups[0]);
2690                 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2691
2692                 if (!src || !dst) {
2693                         ret = -EINVAL;
2694                         goto unlock;
2695                 }
2696
2697                 dst->excl = src->excl + level_size;
2698                 dst->excl_cmpr = src->excl_cmpr + level_size;
2699                 i_qgroups += 2;
2700         }
2701
2702 unlock:
2703         spin_unlock(&fs_info->qgroup_lock);
2704 out:
2705         mutex_unlock(&fs_info->qgroup_ioctl_lock);
2706         return ret;
2707 }
2708
2709 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
2710 {
2711         struct btrfs_root *quota_root;
2712         struct btrfs_qgroup *qgroup;
2713         struct btrfs_fs_info *fs_info = root->fs_info;
2714         u64 ref_root = root->root_key.objectid;
2715         int ret = 0;
2716         struct ulist_node *unode;
2717         struct ulist_iterator uiter;
2718
2719         if (!is_fstree(ref_root))
2720                 return 0;
2721
2722         if (num_bytes == 0)
2723                 return 0;
2724
2725         spin_lock(&fs_info->qgroup_lock);
2726         quota_root = fs_info->quota_root;
2727         if (!quota_root)
2728                 goto out;
2729
2730         qgroup = find_qgroup_rb(fs_info, ref_root);
2731         if (!qgroup)
2732                 goto out;
2733
2734         /*
2735          * In a first step, we check all affected qgroups to see if any
2736          * limits would be exceeded
2737          */
2738         ulist_reinit(fs_info->qgroup_ulist);
2739         ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2740                         (uintptr_t)qgroup, GFP_ATOMIC);
2741         if (ret < 0)
2742                 goto out;
2743         ULIST_ITER_INIT(&uiter);
2744         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2745                 struct btrfs_qgroup *qg;
2746                 struct btrfs_qgroup_list *glist;
2747
2748                 qg = u64_to_ptr(unode->aux);
2749
2750                 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
2751                     qg->reserved + (s64)qg->rfer + num_bytes >
2752                     qg->max_rfer) {
2753                         ret = -EDQUOT;
2754                         goto out;
2755                 }
2756
2757                 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
2758                     qg->reserved + (s64)qg->excl + num_bytes >
2759                     qg->max_excl) {
2760                         ret = -EDQUOT;
2761                         goto out;
2762                 }
2763
2764                 list_for_each_entry(glist, &qg->groups, next_group) {
2765                         ret = ulist_add(fs_info->qgroup_ulist,
2766                                         glist->group->qgroupid,
2767                                         (uintptr_t)glist->group, GFP_ATOMIC);
2768                         if (ret < 0)
2769                                 goto out;
2770                 }
2771         }
2772         ret = 0;
2773         /*
2774          * no limits exceeded, now record the reservation into all qgroups
2775          */
2776         ULIST_ITER_INIT(&uiter);
2777         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2778                 struct btrfs_qgroup *qg;
2779
2780                 qg = u64_to_ptr(unode->aux);
2781
2782                 qg->reserved += num_bytes;
2783         }
2784
2785 out:
2786         spin_unlock(&fs_info->qgroup_lock);
2787         return ret;
2788 }
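
/*
 * Example: qgroup 0/257 is a member of 1/0, and 1/0 has
 * BTRFS_QGROUP_LIMIT_MAX_RFER set with max_rfer = 1GiB.  A reservation
 * against root 257 walks 0/257 and then 1/0; if
 * reserved + rfer + num_bytes would exceed 1GiB on 1/0, the whole
 * reservation fails with -EDQUOT and nothing is charged to either group.
 */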
2789
2790 void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
2791 {
2792         struct btrfs_root *quota_root;
2793         struct btrfs_qgroup *qgroup;
2794         struct btrfs_fs_info *fs_info = root->fs_info;
2795         struct ulist_node *unode;
2796         struct ulist_iterator uiter;
2797         u64 ref_root = root->root_key.objectid;
2798         int ret = 0;
2799
2800         if (!is_fstree(ref_root))
2801                 return;
2802
2803         if (num_bytes == 0)
2804                 return;
2805
2806         spin_lock(&fs_info->qgroup_lock);
2807
2808         quota_root = fs_info->quota_root;
2809         if (!quota_root)
2810                 goto out;
2811
2812         qgroup = find_qgroup_rb(fs_info, ref_root);
2813         if (!qgroup)
2814                 goto out;
2815
2816         ulist_reinit(fs_info->qgroup_ulist);
2817         ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2818                         (uintptr_t)qgroup, GFP_ATOMIC);
2819         if (ret < 0)
2820                 goto out;
2821         ULIST_ITER_INIT(&uiter);
2822         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2823                 struct btrfs_qgroup *qg;
2824                 struct btrfs_qgroup_list *glist;
2825
2826                 qg = u64_to_ptr(unode->aux);
2827
2828                 qg->reserved -= num_bytes;
2829
2830                 list_for_each_entry(glist, &qg->groups, next_group) {
2831                         ret = ulist_add(fs_info->qgroup_ulist,
2832                                         glist->group->qgroupid,
2833                                         (uintptr_t)glist->group, GFP_ATOMIC);
2834                         if (ret < 0)
2835                                 goto out;
2836                 }
2837         }
2838
2839 out:
2840         spin_unlock(&fs_info->qgroup_lock);
2841 }
2842
2843 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
2844 {
2845         if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
2846                 return;
2847         btrfs_err(trans->root->fs_info,
2848                 "qgroups not uptodate in trans handle %p:  list is%s empty, "
2849                 "seq is %#x.%x",
2850                 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
2851                 (u32)(trans->delayed_ref_elem.seq >> 32),
2852                 (u32)trans->delayed_ref_elem.seq);
2853         BUG();
2854 }
2855
2856 /*
2857  * returns < 0 on error, 0 when more leaves are to be scanned.
2858  * returns 1 when done.
2859  */
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                   struct btrfs_trans_handle *trans, struct ulist *qgroups,
                   struct ulist *tmp, struct extent_buffer *scratch_leaf)
{
        struct btrfs_key found;
        struct ulist *roots = NULL;
        struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
        u64 num_bytes;
        u64 seq;
        int new_roots;
        int slot;
        int ret;

        path->leave_spinning = 1;
        mutex_lock(&fs_info->qgroup_rescan_lock);
        ret = btrfs_search_slot_for_read(fs_info->extent_root,
                                         &fs_info->qgroup_rescan_progress,
                                         path, 1, 0);

        pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
                 fs_info->qgroup_rescan_progress.objectid,
                 fs_info->qgroup_rescan_progress.type,
                 fs_info->qgroup_rescan_progress.offset, ret);

        if (ret) {
                /*
                 * The rescan is about to end, we will not be scanning any
                 * further blocks. We cannot unset the RESCAN flag here, because
                 * we want to commit the transaction if everything went well.
                 * To make the live accounting work in this phase, we set our
                 * scan progress pointer such that every real extent objectid
                 * will be smaller.
                 */
                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
                btrfs_release_path(path);
                mutex_unlock(&fs_info->qgroup_rescan_lock);
                return ret;
        }

        btrfs_item_key_to_cpu(path->nodes[0], &found,
                              btrfs_header_nritems(path->nodes[0]) - 1);
        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;

        btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
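        /*
         * Work on a private copy of the leaf so that the path and the
         * rescan lock can be dropped while the items are accounted.
         */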
        memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
        slot = path->slots[0];
        btrfs_release_path(path);
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
                    found.type != BTRFS_METADATA_ITEM_KEY)
                        continue;
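                /*
                 * For metadata items the key offset holds the tree level,
                 * not a byte count, so the extent size is one tree node.
                 */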
                if (found.type == BTRFS_METADATA_ITEM_KEY)
                        num_bytes = fs_info->extent_root->nodesize;
                else
                        num_bytes = found.offset;

                ulist_reinit(qgroups);
                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
                                           &roots);
                if (ret < 0)
                        goto out;
                spin_lock(&fs_info->qgroup_lock);
                seq = fs_info->qgroup_seq;
                fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */

                new_roots = 0;
                ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
                                             seq, &new_roots, 1);
                if (ret < 0) {
                        spin_unlock(&fs_info->qgroup_lock);
                        ulist_free(roots);
                        goto out;
                }

                ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
                                             seq, 0, new_roots, 1);
                if (ret < 0) {
                        spin_unlock(&fs_info->qgroup_lock);
                        ulist_free(roots);
                        goto out;
                }
                spin_unlock(&fs_info->qgroup_lock);
                ulist_free(roots);
        }
out:
        btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);

        return ret;
}
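
/*
 * Documentation-only sketch (never compiled): the resumable cursor
 * pattern used above, modeled over a plain sorted array of objectids.
 * toy_scan_step() is hypothetical; the kernel keys the cursor by
 * (objectid, type, offset), persists it in qgroup_rescan_progress, and
 * parks it at (u64)-1 once no real extent can sort after it.
 */
#if 0
/* returns < 0 on error, 0 when more items remain, 1 when done */
static int toy_scan_step(const u64 *items, int nr_items, u64 *cursor)
{
        int i;

        for (i = 0; i < nr_items; i++) {
                if (items[i] < *cursor)
                        continue;       /* already accounted last round */
                /* ... account items[i] here ... */
                *cursor = items[i] + 1; /* resume point survives a commit */
                return 0;               /* caller commits, then calls again */
        }
        *cursor = (u64)-1;              /* past every real objectid */
        return 1;
}
#endif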

/*
 * Background worker that walks the whole extent tree, one leaf per
 * transaction, and rebuilds all qgroup counters from scratch. On success
 * any INCONSISTENT state is cleared; on error it is set.
 */
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
                                                     qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
        struct ulist *tmp = NULL, *qgroups = NULL;
        struct extent_buffer *scratch_leaf = NULL;
        int err = -ENOMEM;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                goto out;
        qgroups = ulist_alloc(GFP_NOFS);
        if (!qgroups)
                goto out;
        tmp = ulist_alloc(GFP_NOFS);
        if (!tmp)
                goto out;
        scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
        if (!scratch_leaf)
                goto out;

        err = 0;
        while (!err) {
                trans = btrfs_start_transaction(fs_info->fs_root, 0);
                if (IS_ERR(trans)) {
                        err = PTR_ERR(trans);
                        break;
                }
                if (!fs_info->quota_enabled) {
                        err = -EINTR;
                } else {
                        err = qgroup_rescan_leaf(fs_info, path, trans,
                                                 qgroups, tmp, scratch_leaf);
                }
                if (err > 0)
                        btrfs_commit_transaction(trans, fs_info->fs_root);
                else
                        btrfs_end_transaction(trans, fs_info->fs_root);
        }

out:
        kfree(scratch_leaf);
        ulist_free(qgroups);
        ulist_free(tmp);
        btrfs_free_path(path);

        mutex_lock(&fs_info->qgroup_rescan_lock);
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;

        if (err > 0 &&
            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        } else if (err < 0) {
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        }
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        /*
         * Only update the status item here, since the loop above has
         * already updated the qgroup info.
         */
        trans = btrfs_start_transaction(fs_info->quota_root, 1);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                btrfs_err(fs_info,
                          "failed to start transaction for status update: %d",
                          err);
                goto done;
        }
        ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
        if (ret < 0) {
                err = ret;
                btrfs_err(fs_info, "failed to update qgroup status: %d", err);
        }
        btrfs_end_transaction(trans, fs_info->quota_root);

        if (err >= 0) {
                btrfs_info(fs_info, "qgroup scan completed%s",
                        err > 0 ? " (inconsistency flag cleared)" : "");
        } else {
                btrfs_err(fs_info, "qgroup scan failed with %d", err);
        }

done:
        complete_all(&fs_info->qgroup_rescan_completion);
}

/*
 * Checks that (a) no rescan is already running and (b) quota is enabled;
 * initializes the rescan state (progress key, completion and work item).
 */
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags)
{
        int ret = 0;

        if (!init_flags &&
            (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
             !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
                ret = -EINVAL;
                goto err;
        }

        mutex_lock(&fs_info->qgroup_rescan_lock);
        spin_lock(&fs_info->qgroup_lock);

        if (init_flags) {
                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
                        ret = -EINPROGRESS;
                else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
                        ret = -EINVAL;

                if (ret) {
                        spin_unlock(&fs_info->qgroup_lock);
                        mutex_unlock(&fs_info->qgroup_rescan_lock);
                        goto err;
                }
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        }

        memset(&fs_info->qgroup_rescan_progress, 0,
                sizeof(fs_info->qgroup_rescan_progress));
        fs_info->qgroup_rescan_progress.objectid = progress_objectid;

        spin_unlock(&fs_info->qgroup_lock);
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        init_completion(&fs_info->qgroup_rescan_completion);

        memset(&fs_info->qgroup_rescan_work, 0,
               sizeof(fs_info->qgroup_rescan_work));
        btrfs_init_work(&fs_info->qgroup_rescan_work,
                        btrfs_qgroup_rescan_helper,
                        btrfs_qgroup_rescan_worker, NULL, NULL);

        if (ret) {
err:
                btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
                return ret;
        }

        return 0;
}
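
/*
 * Note on init_flags: btrfs_qgroup_rescan() below passes 1 to start a
 * fresh scan; the mount-time resume path is expected to pass 0 together
 * with the progress objectid saved in the on-disk status item, re-arming
 * a scan that was interrupted by an unmount or crash.
 */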

static void
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
{
        struct rb_node *n;
        struct btrfs_qgroup *qgroup;

        spin_lock(&fs_info->qgroup_lock);
        /* clear all current qgroup tracking information */
        for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                qgroup->rfer = 0;
                qgroup->rfer_cmpr = 0;
                qgroup->excl = 0;
                qgroup->excl_cmpr = 0;
        }
        spin_unlock(&fs_info->qgroup_lock);
}

int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
        int ret = 0;
        struct btrfs_trans_handle *trans;

        ret = qgroup_rescan_init(fs_info, 0, 1);
        if (ret)
                return ret;

        /*
         * We have set the rescan_progress to 0, which means no more
         * delayed refs will be accounted by btrfs_qgroup_account_ref.
         * However, btrfs_qgroup_account_ref may already be running just
         * past its call to btrfs_find_all_roots, in which case it would
         * still do the accounting.
         * To solve this, we commit the transaction, which ensures all
         * delayed refs are run, and only after that do we clear all
         * tracking information for a clean start.
         */

        trans = btrfs_join_transaction(fs_info->fs_root);
        if (IS_ERR(trans)) {
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
                return PTR_ERR(trans);
        }
        ret = btrfs_commit_transaction(trans, fs_info->fs_root);
        if (ret) {
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
                return ret;
        }

        qgroup_rescan_zero_tracking(fs_info);

        btrfs_queue_work(fs_info->qgroup_rescan_workers,
                         &fs_info->qgroup_rescan_work);

        return 0;
}

int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
{
        int running;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_rescan_lock);
        spin_lock(&fs_info->qgroup_lock);
        running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        spin_unlock(&fs_info->qgroup_lock);
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        if (running)
                ret = wait_for_completion_interruptible(
                                        &fs_info->qgroup_rescan_completion);

        return ret;
}
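
/*
 * Documentation-only sketch (never compiled): how a caller such as a
 * "rescan and wait" ioctl path might combine the two helpers above.
 * toy_rescan_and_wait() is hypothetical.
 */
#if 0
static int toy_rescan_and_wait(struct btrfs_fs_info *fs_info)
{
        int ret = btrfs_qgroup_rescan(fs_info);

        if (ret)
                return ret;
        return btrfs_qgroup_wait_for_completion(fs_info);
}
#endif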

/*
 * this is only called from open_ctree where we're still single-threaded,
 * thus locking is omitted here.
 */
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
}