net/sched/sch_cbq.c

   1 /*
   2  * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *
  11  */
  12
  13 #include <linux/module.h>
  14 #include <linux/slab.h>
  15 #include <linux/types.h>
  16 #include <linux/kernel.h>
  17 #include <linux/string.h>
  18 #include <linux/errno.h>
  19 #include <linux/skbuff.h>
  20 #include <net/netlink.h>
  21 #include <net/pkt_sched.h>
  22 #include <net/pkt_cls.h>
  23
  24
  25 /*      Class-Based Queueing (CBQ) algorithm.
  26         =======================================
  27
  28         Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  29                  Management Models for Packet Networks",
  30                  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  31
  32                  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  33
  34                  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  35                  Parameters", 1996
  36
  37                  [4] Sally Floyd and Michael Speer, "Experimental Results
  38                  for Class-Based Queueing", 1998, not published.
  39
  40         -----------------------------------------------------------------------
  41
  42         Algorithm skeleton was taken from NS simulator cbq.cc.
  43         If someone wants to check this code against the LBL version,
  44         he should take into account that ONLY the skeleton was borrowed,
  45         the implementation is different. Particularly:
  46
  47         --- The WRR algorithm is different. Our version looks more
  48         reasonable (I hope) and works when quanta are allowed to be
  49         less than MTU, which is always the case when real time classes
  50         have small rates. Note, that the statement of [3] is
  51         incomplete, delay may actually be estimated even if class
  52         per-round allotment is less than MTU. Namely, if per-round
  53         allotment is W*r_i, and r_1+...+r_k = r < 1
  54
  55         delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  56
  57         In the worst case we have IntServ estimate with D = W*r+k*MTU
  58         and C = MTU*r. The proof (if correct at all) is trivial.
  59
  60
  61         --- It seems that cbq-2.0 is not very accurate. At least, I cannot
  62         interpret some places, which look like wrong translations
  63         from NS. Anyone is advised to find these differences
  64         and explain to me, why I am wrong 8).
  65
  66         --- Linux has no EOI event, so that we cannot estimate true class
  67         idle time. Workaround is to consider the next dequeue event
  68         as sign that previous packet is finished. This is wrong because of
  69         internal device queueing, but on a permanently loaded link it is true.
  70         Moreover, combined with clock integrator, this scheme looks
  71         very close to an ideal solution.  */
  72
  73 struct cbq_sched_data;
  74
  75
  76 struct cbq_class {
  77         struct Qdisc_class_common common;
  78         struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  79
  80 /* Parameters */
  81         unsigned char           priority;       /* class priority */
  82         unsigned char           priority2;      /* priority to be used after overlimit */
  83         unsigned char           ewma_log;       /* time constant for idle time calculation */
  84
  85         u32                     defmap;
  86
  87         /* Link-sharing scheduler parameters */
  88         long                    maxidle;        /* Class parameters: see below. */
  89         long                    offtime;
  90         long                    minidle;
  91         u32                     avpkt;
  92         struct qdisc_rate_table *R_tab;
  93
  94         /* General scheduler (WRR) parameters */
  95         long                    allot;
  96         long                    quantum;        /* Allotment per WRR round */
  97         long                    weight;         /* Relative allotment: see below */
  98
  99         struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
 100         struct cbq_class        *split;         /* Ptr to split node */
 101         struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
 102         struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
 103         struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
 104                                                    parent otherwise */
 105         struct cbq_class        *sibling;       /* Sibling chain */
 106         struct cbq_class        *children;      /* Pointer to children chain */
 107
 108         struct Qdisc            *q;             /* Elementary queueing discipline */
 109
 110
 111 /* Variables */
 112         unsigned char           cpriority;      /* Effective priority */
 113         unsigned char           delayed;
 114         unsigned char           level;          /* level of the class in hierarchy:
 115                                                    0 for leaf classes, and maximal
 116                                                    level of children + 1 for nodes.
 117                                                  */
 118
 119         psched_time_t           last;           /* Last end of service */
 120         psched_time_t           undertime;
 121         long                    avgidle;
 122         long                    deficit;        /* Saved deficit for WRR */
 123         psched_time_t           penalized;
 124         struct gnet_stats_basic_packed bstats;
 125         struct gnet_stats_queue qstats;
 126         struct net_rate_estimator __rcu *rate_est;
 127         struct tc_cbq_xstats    xstats;
 128
 129         struct tcf_proto __rcu  *filter_list;
 130         struct tcf_block        *block;
 131
 132         int                     filters;
 133
 134         struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 135 };
 136
 137 struct cbq_sched_data {
 138         struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 139         int                     nclasses[TC_CBQ_MAXPRIO + 1];
 140         unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 141
 142         struct cbq_class        link;
 143
 144         unsigned int            activemask;
 145         struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 146                                                                    with backlog */
 147
 148 #ifdef CONFIG_NET_CLS_ACT
 149         struct cbq_class        *rx_class;
 150 #endif
 151         struct cbq_class        *tx_class;
 152         struct cbq_class        *tx_borrowed;
 153         int                     tx_len;
 154         psched_time_t           now;            /* Cached timestamp */
 155         unsigned int            pmask;
 156
 157         struct hrtimer          delay_timer;
 158         struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 159                                                    started when CBQ has
 160                                                    backlog, but cannot
 161                                                    transmit just now */
 162         psched_tdiff_t          wd_expires;
 163         int                     toplevel;
 164         u32                     hgenerator;
 165 };
 166
 167
 168 #define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
 169
 170 static inline struct cbq_class *
 171 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 172 {
 173         struct Qdisc_class_common *clc;
 174
 175         clc = qdisc_class_find(&q->clhash, classid);
 176         if (clc == NULL)
 177                 return NULL;
 178         return container_of(clc, struct cbq_class, common);
 179 }
 180
 181 #ifdef CONFIG_NET_CLS_ACT
 182
 183 static struct cbq_class *
 184 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 185 {
 186         struct cbq_class *cl;
 187
 188         for (cl = this->tparent; cl; cl = cl->tparent) {
 189                 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 190
 191                 if (new != NULL && new != this)
 192                         return new;
 193         }
 194         return NULL;
 195 }
 196
 197 #endif
 198
 199 /* Classify packet. The procedure is pretty complicated, but
 200  * it allows us to combine link sharing and priority scheduling
 201  * transparently.
 202  *
 203  * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
 204  * so that it resolves to split nodes. Then packets are classified
 205  * by logical priority, or a more specific classifier may be attached
 206  * to the split node.
 207  */
 208
 209 static struct cbq_class *
 210 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 211 {
 212         struct cbq_sched_data *q = qdisc_priv(sch);
 213         struct cbq_class *head = &q->link;
 214         struct cbq_class **defmap;
 215         struct cbq_class *cl = NULL;
 216         u32 prio = skb->priority;
 217         struct tcf_proto *fl;
 218         struct tcf_result res;
 219
 220         /*
 221          *  Step 1. If skb->priority points to one of our classes, use it.
 222          */
 223         if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 224             (cl = cbq_class_lookup(q, prio)) != NULL)
 225                 return cl;
 226
 227         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 228         for (;;) {
 229                 int result = 0;
 230                 defmap = head->defaults;
 231
 232                 fl = rcu_dereference_bh(head->filter_list);
 233                 /*
 234                  * Step 2+n. Apply classifier.
 235                  */
 236                 result = tcf_classify(skb, fl, &res, true);
 237                 if (!fl || result < 0)
 238                         goto fallback;
 239
 240                 cl = (void *)res.class;
 241                 if (!cl) {
 242                         if (TC_H_MAJ(res.classid))
 243                                 cl = cbq_class_lookup(q, res.classid);
 244                         else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 245                                 cl = defmap[TC_PRIO_BESTEFFORT];
 246
 247                         if (cl == NULL)
 248                                 goto fallback;
 249                 }
 250                 if (cl->level >= head->level)
 251                         goto fallback;
 252 #ifdef CONFIG_NET_CLS_ACT
 253                 switch (result) {
 254                 case TC_ACT_QUEUED:
 255                 case TC_ACT_STOLEN:
 256                 case TC_ACT_TRAP:
 257                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 258                 case TC_ACT_SHOT:
 259                         return NULL;
 260                 case TC_ACT_RECLASSIFY:
 261                         return cbq_reclassify(skb, cl);
 262                 }
 263 #endif
 264                 if (cl->level == 0)
 265                         return cl;
 266
 267                 /*
 268                  * Step 3+n. If classifier selected a link sharing class,
 269                  *         apply agency specific classifier.
 270                  *         Repeat this procdure until we hit a leaf node.
 271                  */
 272                 head = cl;
 273         }
 274
 275 fallback:
 276         cl = head;
 277
 278         /*
 279          * Step 4. No success...
 280          */
 281         if (TC_H_MAJ(prio) == 0 &&
 282             !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 283             !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 284                 return head;
 285
 286         return cl;
 287 }
 288
 289 /*
 290  * A packet has just been enqueued on the empty class.
 291  * cbq_activate_class adds it to the tail of active class list
 292  * of its priority band.
 293  */
 294
 295 static inline void cbq_activate_class(struct cbq_class *cl)
 296 {
 297         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 298         int prio = cl->cpriority;
 299         struct cbq_class *cl_tail;
 300
 301         cl_tail = q->active[prio];
 302         q->active[prio] = cl;
 303
 304         if (cl_tail != NULL) {
 305                 cl->next_alive = cl_tail->next_alive;
 306                 cl_tail->next_alive = cl;
 307         } else {
 308                 cl->next_alive = cl;
 309                 q->activemask |= (1<<prio);
 310         }
 311 }
 312
 313 /*
 314  * Unlink class from active chain.
 315  * Note that this same procedure is done directly in cbq_dequeue*
 316  * during round-robin procedure.
 317  */
 318
 319 static void cbq_deactivate_class(struct cbq_class *this)
 320 {
 321         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 322         int prio = this->cpriority;
 323         struct cbq_class *cl;
 324         struct cbq_class *cl_prev = q->active[prio];
 325
 326         do {
 327                 cl = cl_prev->next_alive;
 328                 if (cl == this) {
 329                         cl_prev->next_alive = cl->next_alive;
 330                         cl->next_alive = NULL;
 331
 332                         if (cl == q->active[prio]) {
 333                                 q->active[prio] = cl_prev;
 334                                 if (cl == q->active[prio]) {
 335                                         q->active[prio] = NULL;
 336                                         q->activemask &= ~(1<<prio);
 337                                         return;
 338                                 }
 339                         }
 340                         return;
 341                 }
 342         } while ((cl_prev = cl) != q->active[prio]);
 343 }
 344
 345 static void
 346 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 347 {
 348         int toplevel = q->toplevel;
 349
 350         if (toplevel > cl->level) {
 351                 psched_time_t now = psched_get_time();
 352
 353                 do {
 354                         if (cl->undertime < now) {
 355                                 q->toplevel = cl->level;
 356                                 return;
 357                         }
 358                 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 359         }
 360 }
 361
 362 static int
 363 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 364             struct sk_buff **to_free)
 365 {
 366         struct cbq_sched_data *q = qdisc_priv(sch);
 367         int uninitialized_var(ret);
 368         struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 369
 370 #ifdef CONFIG_NET_CLS_ACT
 371         q->rx_class = cl;
 372 #endif
 373         if (cl == NULL) {
 374                 if (ret & __NET_XMIT_BYPASS)
 375                         qdisc_qstats_drop(sch);
 376                 __qdisc_drop(skb, to_free);
 377                 return ret;
 378         }
 379
 380         ret = qdisc_enqueue(skb, cl->q, to_free);
 381         if (ret == NET_XMIT_SUCCESS) {
 382                 sch->q.qlen++;
 383                 cbq_mark_toplevel(q, cl);
 384                 if (!cl->next_alive)
 385                         cbq_activate_class(cl);
 386                 return ret;
 387         }
 388
 389         if (net_xmit_drop_count(ret)) {
 390                 qdisc_qstats_drop(sch);
 391                 cbq_mark_toplevel(q, cl);
 392                 cl->qstats.drops++;
 393         }
 394         return ret;
 395 }
 396
 397 /* Overlimit action: penalize leaf class by adding offtime */
 398 static void cbq_overlimit(struct cbq_class *cl)
 399 {
 400         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 401         psched_tdiff_t delay = cl->undertime - q->now;
 402
 403         if (!cl->delayed) {
 404                 delay += cl->offtime;
 405
 406                 /*
 407                  * Class goes to sleep, so that it will have no
 408                  * chance to work avgidle. Let's forgive it 8)
 409                  *
 410                  * BTW cbq-2.0 has a crap in this
 411                  * place, apparently they forgot to shift it by cl->ewma_log.
 412                  */
 413                 if (cl->avgidle < 0)
 414                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 415                 if (cl->avgidle < cl->minidle)
 416                         cl->avgidle = cl->minidle;
 417                 if (delay <= 0)
 418                         delay = 1;
 419                 cl->undertime = q->now + delay;
 420
 421                 cl->xstats.overactions++;
 422                 cl->delayed = 1;
 423         }
 424         if (q->wd_expires == 0 || q->wd_expires > delay)
 425                 q->wd_expires = delay;
 426
 427         /* Dirty work! We must schedule wakeups based on
 428          * real available rate, rather than leaf rate,
 429          * which may be tiny (even zero).
 430          */
 431         if (q->toplevel == TC_CBQ_MAXLEVEL) {
 432                 struct cbq_class *b;
 433                 psched_tdiff_t base_delay = q->wd_expires;
 434
 435                 for (b = cl->borrow; b; b = b->borrow) {
 436                         delay = b->undertime - q->now;
 437                         if (delay < base_delay) {
 438                                 if (delay <= 0)
 439                                         delay = 1;
 440                                 base_delay = delay;
 441                         }
 442                 }
 443
 444                 q->wd_expires = base_delay;
 445         }
 446 }
 447
 448 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 449                                        psched_time_t now)
 450 {
 451         struct cbq_class *cl;
 452         struct cbq_class *cl_prev = q->active[prio];
 453         psched_time_t sched = now;
 454
 455         if (cl_prev == NULL)
 456                 return 0;
 457
 458         do {
 459                 cl = cl_prev->next_alive;
 460                 if (now - cl->penalized > 0) {
 461                         cl_prev->next_alive = cl->next_alive;
 462                         cl->next_alive = NULL;
 463                         cl->cpriority = cl->priority;
 464                         cl->delayed = 0;
 465                         cbq_activate_class(cl);
 466
 467                         if (cl == q->active[prio]) {
 468                                 q->active[prio] = cl_prev;
 469                                 if (cl == q->active[prio]) {
 470                                         q->active[prio] = NULL;
 471                                         return 0;
 472                                 }
 473                         }
 474
 475                         cl = cl_prev->next_alive;
 476                 } else if (sched - cl->penalized > 0)
 477                         sched = cl->penalized;
 478         } while ((cl_prev = cl) != q->active[prio]);
 479
 480         return sched - now;
 481 }
 482
 483 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 484 {
 485         struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 486                                                 delay_timer);
 487         struct Qdisc *sch = q->watchdog.qdisc;
 488         psched_time_t now;
 489         psched_tdiff_t delay = 0;
 490         unsigned int pmask;
 491
 492         now = psched_get_time();
 493
 494         pmask = q->pmask;
 495         q->pmask = 0;
 496
 497         while (pmask) {
 498                 int prio = ffz(~pmask);
 499                 psched_tdiff_t tmp;
 500
 501                 pmask &= ~(1<<prio);
 502
 503                 tmp = cbq_undelay_prio(q, prio, now);
 504                 if (tmp > 0) {
 505                         q->pmask |= 1<<prio;
 506                         if (tmp < delay || delay == 0)
 507                                 delay = tmp;
 508                 }
 509         }
 510
 511         if (delay) {
 512                 ktime_t time;
 513
 514                 time = 0;
 515                 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 516                 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 517         }
 518
 519         __netif_schedule(qdisc_root(sch));
 520         return HRTIMER_NORESTART;
 521 }
 522
 523 /*
 524  * It is mission critical procedure.
 525  *
 526  * We "regenerate" toplevel cutoff, if transmitting class
 527  * has backlog and it is not regulated. It is not part of
 528  * original CBQ description, but looks more reasonable.
 529  * Probably, it is wrong. This question needs further investigation.
 530  */
 531
 532 static inline void
 533 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 534                     struct cbq_class *borrowed)
 535 {
 536         if (cl && q->toplevel >= borrowed->level) {
 537                 if (cl->q->q.qlen > 1) {
 538                         do {
 539                                 if (borrowed->undertime == PSCHED_PASTPERFECT) {
 540                                         q->toplevel = borrowed->level;
 541                                         return;
 542                                 }
 543                         } while ((borrowed = borrowed->borrow) != NULL);
 544                 }
 545 #if 0
 546         /* It is not necessary now. Uncommenting it
 547            will save CPU cycles, but decrease fairness.
 548          */
 549                 q->toplevel = TC_CBQ_MAXLEVEL;
 550 #endif
 551         }
 552 }
 553
 554 static void
 555 cbq_update(struct cbq_sched_data *q)
 556 {
 557         struct cbq_class *this = q->tx_class;
 558         struct cbq_class *cl = this;
 559         int len = q->tx_len;
 560         psched_time_t now;
 561
 562         q->tx_class = NULL;
 563         /* Time integrator. We calculate EOS time
 564          * by adding expected packet transmission time.
 565          */
 566         now = q->now + L2T(&q->link, len);
 567
 568         for ( ; cl; cl = cl->share) {
 569                 long avgidle = cl->avgidle;
 570                 long idle;
 571
 572                 cl->bstats.packets++;
 573                 cl->bstats.bytes += len;
 574
 575                 /*
 576                  * (now - last) is total time between packet right edges.
 577                  * (last_pktlen/rate) is "virtual" busy time, so that
 578                  *
 579                  *      idle = (now - last) - last_pktlen/rate
 580                  */
 581
 582                 idle = now - cl->last;
 583                 if ((unsigned long)idle > 128*1024*1024) {
 584                         avgidle = cl->maxidle;
 585                 } else {
 586                         idle -= L2T(cl, len);
 587
 588                 /* true_avgidle := (1-W)*true_avgidle + W*idle,
 589                  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 590                  * cl->avgidle == true_avgidle/W,
 591                  * hence:
 592                  */
 593                         avgidle += idle - (avgidle>>cl->ewma_log);
 594                 }
 595
 596                 if (avgidle <= 0) {
 597                         /* Overlimit or at-limit */
 598
 599                         if (avgidle < cl->minidle)
 600                                 avgidle = cl->minidle;
 601
 602                         cl->avgidle = avgidle;
 603
 604                         /* Calculate expected time, when this class
 605                          * will be allowed to send.
 606                          * It will occur, when:
 607                          * (1-W)*true_avgidle + W*delay = 0, i.e.
 608                          * idle = (1/W - 1)*(-true_avgidle)
 609                          * or
 610                          * idle = (1 - W)*(-cl->avgidle);
 611                          */
 612                         idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 613
 614                         /*
 615                          * That is not all.
 616                          * To maintain the rate allocated to the class,
 617                          * we add to undertime virtual clock,
 618                          * necessary to complete transmitted packet.
 619                          * (len/phys_bandwidth has been already passed
 620                          * to the moment of cbq_update)
 621                          */
 622
 623                         idle -= L2T(&q->link, len);
 624                         idle += L2T(cl, len);
 625
 626                         cl->undertime = now + idle;
 627                 } else {
 628                         /* Underlimit */
 629
 630                         cl->undertime = PSCHED_PASTPERFECT;
 631                         if (avgidle > cl->maxidle)
 632                                 cl->avgidle = cl->maxidle;
 633                         else
 634                                 cl->avgidle = avgidle;
 635                 }
 636                 if ((s64)(now - cl->last) > 0)
 637                         cl->last = now;
 638         }
 639
 640         cbq_update_toplevel(q, this, q->tx_borrowed);
 641 }
 642
 643 static inline struct cbq_class *
 644 cbq_under_limit(struct cbq_class *cl)
 645 {
 646         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 647         struct cbq_class *this_cl = cl;
 648
 649         if (cl->tparent == NULL)
 650                 return cl;
 651
 652         if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 653                 cl->delayed = 0;
 654                 return cl;
 655         }
 656
 657         do {
 658                 /* It is very suspicious place. Now overlimit
 659                  * action is generated for not bounded classes
 660                  * only if link is completely congested.
 661                  * Though it is in agree with ancestor-only paradigm,
 662                  * it looks very stupid. Particularly,
 663                  * it means that this chunk of code will either
 664                  * never be called or result in strong amplification
 665                  * of burstiness. Dangerous, silly, and, however,
 666                  * no another solution exists.
 667                  */
 668                 cl = cl->borrow;
 669                 if (!cl) {
 670                         this_cl->qstats.overlimits++;
 671                         cbq_overlimit(this_cl);
 672                         return NULL;
 673                 }
 674                 if (cl->level > q->toplevel)
 675                         return NULL;
 676         } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 677
 678         cl->delayed = 0;
 679         return cl;
 680 }
 681
 682 static inline struct sk_buff *
 683 cbq_dequeue_prio(struct Qdisc *sch, int prio)
 684 {
 685         struct cbq_sched_data *q = qdisc_priv(sch);
 686         struct cbq_class *cl_tail, *cl_prev, *cl;
 687         struct sk_buff *skb;
 688         int deficit;
 689
 690         cl_tail = cl_prev = q->active[prio];
 691         cl = cl_prev->next_alive;
 692
 693         do {
 694                 deficit = 0;
 695
 696                 /* Start round */
 697                 do {
 698                         struct cbq_class *borrow = cl;
 699
 700                         if (cl->q->q.qlen &&
 701                             (borrow = cbq_under_limit(cl)) == NULL)
 702                                 goto skip_class;
 703
 704                         if (cl->deficit <= 0) {
 705                                 /* Class exhausted its allotment per
 706                                  * this round. Switch to the next one.
 707                                  */
 708                                 deficit = 1;
 709                                 cl->deficit += cl->quantum;
 710                                 goto next_class;
 711                         }
 712
 713                         skb = cl->q->dequeue(cl->q);
 714
 715                         /* Class did not give us any skb :-(
 716                          * It could occur even if cl->q->q.qlen != 0
 717                          * f.e. if cl->q == "tbf"
 718                          */
 719                         if (skb == NULL)
 720                                 goto skip_class;
 721
 722                         cl->deficit -= qdisc_pkt_len(skb);
 723                         q->tx_class = cl;
 724                         q->tx_borrowed = borrow;
 725                         if (borrow != cl) {
 726 #ifndef CBQ_XSTATS_BORROWS_BYTES
 727                                 borrow->xstats.borrows++;
 728                                 cl->xstats.borrows++;
 729 #else
 730                                 borrow->xstats.borrows += qdisc_pkt_len(skb);
 731                                 cl->xstats.borrows += qdisc_pkt_len(skb);
 732 #endif
 733                         }
 734                         q->tx_len = qdisc_pkt_len(skb);
 735
 736                         if (cl->deficit <= 0) {
 737                                 q->active[prio] = cl;
 738                                 cl = cl->next_alive;
 739                                 cl->deficit += cl->quantum;
 740                         }
 741                         return skb;
 742
 743 skip_class:
 744                         if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 745                                 /* Class is empty or penalized.
 746                                  * Unlink it from active chain.
 747                                  */
 748                                 cl_prev->next_alive = cl->next_alive;
 749                                 cl->next_alive = NULL;
 750
 751                                 /* Did cl_tail point to it? */
 752                                 if (cl == cl_tail) {
 753                                         /* Repair it! */
 754                                         cl_tail = cl_prev;
 755
 756                                         /* Was it the last class in this band? */
 757                                         if (cl == cl_tail) {
 758                                                 /* Kill the band! */
 759                                                 q->active[prio] = NULL;
 760                                                 q->activemask &= ~(1<<prio);
 761                                                 if (cl->q->q.qlen)
 762                                                         cbq_activate_class(cl);
 763                                                 return NULL;
 764                                         }
 765
 766                                         q->active[prio] = cl_tail;
 767                                 }
 768                                 if (cl->q->q.qlen)
 769                                         cbq_activate_class(cl);
 770
 771                                 cl = cl_prev;
 772                         }
 773
 774 next_class:
 775                         cl_prev = cl;
 776                         cl = cl->next_alive;
 777                 } while (cl_prev != cl_tail);
 778         } while (deficit);
 779
 780         q->active[prio] = cl_prev;
 781
 782         return NULL;
 783 }
 784
 785 static inline struct sk_buff *
 786 cbq_dequeue_1(struct Qdisc *sch)
 787 {
 788         struct cbq_sched_data *q = qdisc_priv(sch);
 789         struct sk_buff *skb;
 790         unsigned int activemask;
 791
 792         activemask = q->activemask & 0xFF;
 793         while (activemask) {
 794                 int prio = ffz(~activemask);
 795                 activemask &= ~(1<<prio);
 796                 skb = cbq_dequeue_prio(sch, prio);
 797                 if (skb)
 798                         return skb;
 799         }
 800         return NULL;
 801 }
 802
 803 static struct sk_buff *
 804 cbq_dequeue(struct Qdisc *sch)
 805 {
 806         struct sk_buff *skb;
 807         struct cbq_sched_data *q = qdisc_priv(sch);
 808         psched_time_t now;
 809
 810         now = psched_get_time();
 811
 812         if (q->tx_class)
 813                 cbq_update(q);
 814
 815         q->now = now;
 816
 817         for (;;) {
 818                 q->wd_expires = 0;
 819
 820                 skb = cbq_dequeue_1(sch);
 821                 if (skb) {
 822                         qdisc_bstats_update(sch, skb);
 823                         sch->q.qlen--;
 824                         return skb;
 825                 }
 826
 827                 /* All the classes are overlimit.
 828                  *
 829                  * It is possible, if:
 830                  *
 831                  * 1. Scheduler is empty.
 832                  * 2. Toplevel cutoff inhibited borrowing.
 833                  * 3. Root class is overlimit.
 834                  *
 835                  * Reset 2d and 3d conditions and retry.
 836                  *
 837                  * Note, that NS and cbq-2.0 are buggy, peeking
 838                  * an arbitrary class is appropriate for ancestor-only
 839                  * sharing, but not for toplevel algorithm.
 840                  *
 841                  * Our version is better, but slower, because it requires
 842                  * two passes, but it is unavoidable with top-level sharing.
 843                  */
 844
 845                 if (q->toplevel == TC_CBQ_MAXLEVEL &&
 846                     q->link.undertime == PSCHED_PASTPERFECT)
 847                         break;
 848
 849                 q->toplevel = TC_CBQ_MAXLEVEL;
 850                 q->link.undertime = PSCHED_PASTPERFECT;
 851         }
 852
 853         /* No packets in scheduler or nobody wants to give them to us :-(
 854          * Sigh... start watchdog timer in the last case.
 855          */
 856
 857         if (sch->q.qlen) {
 858                 qdisc_qstats_overlimit(sch);
 859                 if (q->wd_expires)
 860                         qdisc_watchdog_schedule(&q->watchdog,
 861                                                 now + q->wd_expires);
 862         }
 863         return NULL;
 864 }
 865
 866 /* CBQ class maintanance routines */
 867
 868 static void cbq_adjust_levels(struct cbq_class *this)
 869 {
 870         if (this == NULL)
 871                 return;
 872
 873         do {
 874                 int level = 0;
 875                 struct cbq_class *cl;
 876
 877                 cl = this->children;
 878                 if (cl) {
 879                         do {
 880                                 if (cl->level > level)
 881                                         level = cl->level;
 882                         } while ((cl = cl->sibling) != this->children);
 883                 }
 884                 this->level = level + 1;
 885         } while ((this = this->tparent) != NULL);
 886 }
 887
 888 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 889 {
 890         struct cbq_class *cl;
 891         unsigned int h;
 892
 893         if (q->quanta[prio] == 0)
 894                 return;
 895
 896         for (h = 0; h < q->clhash.hashsize; h++) {
 897                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 898                         /* BUGGGG... Beware! This expression suffer of
 899                          * arithmetic overflows!
 900                          */
 901                         if (cl->priority == prio) {
 902                                 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 903                                         q->quanta[prio];
 904                         }
 905                         if (cl->quantum <= 0 ||
 906                             cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 907                                 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 908                                         cl->common.classid, cl->quantum);
 909                                 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 910                         }
 911                 }
 912         }
 913 }
 914
 915 static void cbq_sync_defmap(struct cbq_class *cl)
 916 {
 917         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 918         struct cbq_class *split = cl->split;
 919         unsigned int h;
 920         int i;
 921
 922         if (split == NULL)
 923                 return;
 924
 925         for (i = 0; i <= TC_PRIO_MAX; i++) {
 926                 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 927                         split->defaults[i] = NULL;
 928         }
 929
 930         for (i = 0; i <= TC_PRIO_MAX; i++) {
 931                 int level = split->level;
 932
 933                 if (split->defaults[i])
 934                         continue;
 935
 936                 for (h = 0; h < q->clhash.hashsize; h++) {
 937                         struct cbq_class *c;
 938
 939                         hlist_for_each_entry(c, &q->clhash.hash[h],
 940                                              common.hnode) {
 941                                 if (c->split == split && c->level < level &&
 942                                     c->defmap & (1<<i)) {
 943                                         split->defaults[i] = c;
 944                                         level = c->level;
 945                                 }
 946                         }
 947                 }
 948         }
 949 }
 950
 951 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 952 {
 953         struct cbq_class *split = NULL;
 954
 955         if (splitid == 0) {
 956                 split = cl->split;
 957                 if (!split)
 958                         return;
 959                 splitid = split->common.classid;
 960         }
 961
 962         if (split == NULL || split->common.classid != splitid) {
 963                 for (split = cl->tparent; split; split = split->tparent)
 964                         if (split->common.classid == splitid)
 965                                 break;
 966         }
 967
 968         if (split == NULL)
 969                 return;
 970
 971         if (cl->split != split) {
 972                 cl->defmap = 0;
 973                 cbq_sync_defmap(cl);
 974                 cl->split = split;
 975                 cl->defmap = def & mask;
 976         } else
 977                 cl->defmap = (cl->defmap & ~mask) | (def & mask);
 978
 979         cbq_sync_defmap(cl);
 980 }
 981
 982 static void cbq_unlink_class(struct cbq_class *this)
 983 {
 984         struct cbq_class *cl, **clp;
 985         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 986
 987         qdisc_class_hash_remove(&q->clhash, &this->common);
 988
 989         if (this->tparent) {
 990                 clp = &this->sibling;
 991                 cl = *clp;
 992                 do {
 993                         if (cl == this) {
 994                                 *clp = cl->sibling;
 995                                 break;
 996                         }
 997                         clp = &cl->sibling;
 998                 } while ((cl = *clp) != this->sibling);
 999
1000                 if (this->tparent->children == this) {
1001                         this->tparent->children = this->sibling;
1002                         if (this->sibling == this)
1003                                 this->tparent->children = NULL;
1004                 }
1005         } else {
1006                 WARN_ON(this->sibling != this);
1007         }
1008 }
1009
1010 static void cbq_link_class(struct cbq_class *this)
1011 {
1012         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1013         struct cbq_class *parent = this->tparent;
1014
1015         this->sibling = this;
1016         qdisc_class_hash_insert(&q->clhash, &this->common);
1017
1018         if (parent == NULL)
1019                 return;
1020
1021         if (parent->children == NULL) {
1022                 parent->children = this;
1023         } else {
1024                 this->sibling = parent->children->sibling;
1025                 parent->children->sibling = this;
1026         }
1027 }
1028
1029 static void
1030 cbq_reset(struct Qdisc *sch)
1031 {
1032         struct cbq_sched_data *q = qdisc_priv(sch);
1033         struct cbq_class *cl;
1034         int prio;
1035         unsigned int h;
1036
1037         q->activemask = 0;
1038         q->pmask = 0;
1039         q->tx_class = NULL;
1040         q->tx_borrowed = NULL;
1041         qdisc_watchdog_cancel(&q->watchdog);
1042         hrtimer_cancel(&q->delay_timer);
1043         q->toplevel = TC_CBQ_MAXLEVEL;
1044         q->now = psched_get_time();
1045
1046         for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1047                 q->active[prio] = NULL;
1048
1049         for (h = 0; h < q->clhash.hashsize; h++) {
1050                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1051                         qdisc_reset(cl->q);
1052
1053                         cl->next_alive = NULL;
1054                         cl->undertime = PSCHED_PASTPERFECT;
1055                         cl->avgidle = cl->maxidle;
1056                         cl->deficit = cl->quantum;
1057                         cl->cpriority = cl->priority;
1058                 }
1059         }
1060         sch->q.qlen = 0;
1061 }
1062
1063
1064 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1065 {
1066         if (lss->change & TCF_CBQ_LSS_FLAGS) {
1067                 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1068                 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1069         }
1070         if (lss->change & TCF_CBQ_LSS_EWMA)
1071                 cl->ewma_log = lss->ewma_log;
1072         if (lss->change & TCF_CBQ_LSS_AVPKT)
1073                 cl->avpkt = lss->avpkt;
1074         if (lss->change & TCF_CBQ_LSS_MINIDLE)
1075                 cl->minidle = -(long)lss->minidle;
1076         if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1077                 cl->maxidle = lss->maxidle;
1078                 cl->avgidle = lss->maxidle;
1079         }
1080         if (lss->change & TCF_CBQ_LSS_OFFTIME)
1081                 cl->offtime = lss->offtime;
1082         return 0;
1083 }
1084
1085 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1086 {
1087         q->nclasses[cl->priority]--;
1088         q->quanta[cl->priority] -= cl->weight;
1089         cbq_normalize_quanta(q, cl->priority);
1090 }
1091
1092 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1093 {
1094         q->nclasses[cl->priority]++;
1095         q->quanta[cl->priority] += cl->weight;
1096         cbq_normalize_quanta(q, cl->priority);
1097 }
1098
1099 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1100 {
1101         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1102
1103         if (wrr->allot)
1104                 cl->allot = wrr->allot;
1105         if (wrr->weight)
1106                 cl->weight = wrr->weight;
1107         if (wrr->priority) {
1108                 cl->priority = wrr->priority - 1;
1109                 cl->cpriority = cl->priority;
1110                 if (cl->priority >= cl->priority2)
1111                         cl->priority2 = TC_CBQ_MAXPRIO - 1;
1112         }
1113
1114         cbq_addprio(q, cl);
1115         return 0;
1116 }
1117
1118 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1119 {
1120         cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1121         return 0;
1122 }
1123
1124 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1125         [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1126         [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1127         [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1128         [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1129         [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1130         [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1131         [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1132 };
1133
1134 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1135 {
1136         struct cbq_sched_data *q = qdisc_priv(sch);
1137         struct nlattr *tb[TCA_CBQ_MAX + 1];
1138         struct tc_ratespec *r;
1139         int err;
1140
1141         qdisc_watchdog_init(&q->watchdog, sch);
1142         hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1143         q->delay_timer.function = cbq_undelay;
1144
1145         if (!opt)
1146                 return -EINVAL;
1147
1148         err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1149         if (err < 0)
1150                 return err;
1151
1152         if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1153                 return -EINVAL;
1154
1155         r = nla_data(tb[TCA_CBQ_RATE]);
1156
1157         if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1158                 return -EINVAL;
1159
1160         err = qdisc_class_hash_init(&q->clhash);
1161         if (err < 0)
1162                 goto put_rtab;
1163
1164         q->link.sibling = &q->link;
1165         q->link.common.classid = sch->handle;
1166         q->link.qdisc = sch;
1167         q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1168                                       sch->handle);
1169         if (!q->link.q)
1170                 q->link.q = &noop_qdisc;
1171         else
1172                 qdisc_hash_add(q->link.q, true);
1173
1174         q->link.priority = TC_CBQ_MAXPRIO - 1;
1175         q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1176         q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1177         q->link.allot = psched_mtu(qdisc_dev(sch));
1178         q->link.quantum = q->link.allot;
1179         q->link.weight = q->link.R_tab->rate.rate;
1180
1181         q->link.ewma_log = TC_CBQ_DEF_EWMA;
1182         q->link.avpkt = q->link.allot/2;
1183         q->link.minidle = -0x7FFFFFFF;
1184
1185         q->toplevel = TC_CBQ_MAXLEVEL;
1186         q->now = psched_get_time();
1187
1188         cbq_link_class(&q->link);
1189
1190         if (tb[TCA_CBQ_LSSOPT])
1191                 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1192
1193         cbq_addprio(q, &q->link);
1194         return 0;
1195
1196 put_rtab:
1197         qdisc_put_rtab(q->link.R_tab);
1198         return err;
1199 }
1200
1201 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1202 {
1203         unsigned char *b = skb_tail_pointer(skb);
1204
1205         if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1206                 goto nla_put_failure;
1207         return skb->len;
1208
1209 nla_put_failure:
1210         nlmsg_trim(skb, b);
1211         return -1;
1212 }
1213
1214 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1215 {
1216         unsigned char *b = skb_tail_pointer(skb);
1217         struct tc_cbq_lssopt opt;
1218
1219         opt.flags = 0;
1220         if (cl->borrow == NULL)
1221                 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1222         if (cl->share == NULL)
1223                 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1224         opt.ewma_log = cl->ewma_log;
1225         opt.level = cl->level;
1226         opt.avpkt = cl->avpkt;
1227         opt.maxidle = cl->maxidle;
1228         opt.minidle = (u32)(-cl->minidle);
1229         opt.offtime = cl->offtime;
1230         opt.change = ~0;
1231         if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1232                 goto nla_put_failure;
1233         return skb->len;
1234
1235 nla_put_failure:
1236         nlmsg_trim(skb, b);
1237         return -1;
1238 }
1239
1240 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1241 {
1242         unsigned char *b = skb_tail_pointer(skb);
1243         struct tc_cbq_wrropt opt;
1244
1245         memset(&opt, 0, sizeof(opt));
1246         opt.flags = 0;
1247         opt.allot = cl->allot;
1248         opt.priority = cl->priority + 1;
1249         opt.cpriority = cl->cpriority + 1;
1250         opt.weight = cl->weight;
1251         if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1252                 goto nla_put_failure;
1253         return skb->len;
1254
1255 nla_put_failure:
1256         nlmsg_trim(skb, b);
1257         return -1;
1258 }
1259
1260 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1261 {
1262         unsigned char *b = skb_tail_pointer(skb);
1263         struct tc_cbq_fopt opt;
1264
1265         if (cl->split || cl->defmap) {
1266                 opt.split = cl->split ? cl->split->common.classid : 0;
1267                 opt.defmap = cl->defmap;
1268                 opt.defchange = ~0;
1269                 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1270                         goto nla_put_failure;
1271         }
1272         return skb->len;
1273
1274 nla_put_failure:
1275         nlmsg_trim(skb, b);
1276         return -1;
1277 }
1278
1279 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1280 {
1281         if (cbq_dump_lss(skb, cl) < 0 ||
1282             cbq_dump_rate(skb, cl) < 0 ||
1283             cbq_dump_wrr(skb, cl) < 0 ||
1284             cbq_dump_fopt(skb, cl) < 0)
1285                 return -1;
1286         return 0;
1287 }
1288
1289 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1290 {
1291         struct cbq_sched_data *q = qdisc_priv(sch);
1292         struct nlattr *nest;
1293
1294         nest = nla_nest_start(skb, TCA_OPTIONS);
1295         if (nest == NULL)
1296                 goto nla_put_failure;
1297         if (cbq_dump_attr(skb, &q->link) < 0)
1298                 goto nla_put_failure;
1299         return nla_nest_end(skb, nest);
1300
1301 nla_put_failure:
1302         nla_nest_cancel(skb, nest);
1303         return -1;
1304 }
1305
1306 static int
1307 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1308 {
1309         struct cbq_sched_data *q = qdisc_priv(sch);
1310
1311         q->link.xstats.avgidle = q->link.avgidle;
1312         return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1313 }
1314
1315 static int
1316 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1317                struct sk_buff *skb, struct tcmsg *tcm)
1318 {
1319         struct cbq_class *cl = (struct cbq_class *)arg;
1320         struct nlattr *nest;
1321
1322         if (cl->tparent)
1323                 tcm->tcm_parent = cl->tparent->common.classid;
1324         else
1325                 tcm->tcm_parent = TC_H_ROOT;
1326         tcm->tcm_handle = cl->common.classid;
1327         tcm->tcm_info = cl->q->handle;
1328
1329         nest = nla_nest_start(skb, TCA_OPTIONS);
1330         if (nest == NULL)
1331                 goto nla_put_failure;
1332         if (cbq_dump_attr(skb, cl) < 0)
1333                 goto nla_put_failure;
1334         return nla_nest_end(skb, nest);
1335
1336 nla_put_failure:
1337         nla_nest_cancel(skb, nest);
1338         return -1;
1339 }
1340
1341 static int
1342 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1343         struct gnet_dump *d)
1344 {
1345         struct cbq_sched_data *q = qdisc_priv(sch);
1346         struct cbq_class *cl = (struct cbq_class *)arg;
1347
1348         cl->xstats.avgidle = cl->avgidle;
1349         cl->xstats.undertime = 0;
1350
1351         if (cl->undertime != PSCHED_PASTPERFECT)
1352                 cl->xstats.undertime = cl->undertime - q->now;
1353
1354         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1355                                   d, NULL, &cl->bstats) < 0 ||
1356             gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1357             gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1358                 return -1;
1359
1360         return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1361 }
1362
1363 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1364                      struct Qdisc **old)
1365 {
1366         struct cbq_class *cl = (struct cbq_class *)arg;
1367
1368         if (new == NULL) {
1369                 new = qdisc_create_dflt(sch->dev_queue,
1370                                         &pfifo_qdisc_ops, cl->common.classid);
1371                 if (new == NULL)
1372                         return -ENOBUFS;
1373         }
1374
1375         *old = qdisc_replace(sch, new, &cl->q);
1376         return 0;
1377 }
1378
1379 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1380 {
1381         struct cbq_class *cl = (struct cbq_class *)arg;
1382
1383         return cl->q;
1384 }
1385
1386 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1387 {
1388         struct cbq_class *cl = (struct cbq_class *)arg;
1389
1390         cbq_deactivate_class(cl);
1391 }
1392
1393 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1394 {
1395         struct cbq_sched_data *q = qdisc_priv(sch);
1396
1397         return (unsigned long)cbq_class_lookup(q, classid);
1398 }
1399
1400 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1401 {
1402         struct cbq_sched_data *q = qdisc_priv(sch);
1403
1404         WARN_ON(cl->filters);
1405
1406         tcf_block_put(cl->block);
1407         qdisc_destroy(cl->q);
1408         qdisc_put_rtab(cl->R_tab);
1409         gen_kill_estimator(&cl->rate_est);
1410         if (cl != &q->link)
1411                 kfree(cl);
1412 }
1413
1414 static void cbq_destroy(struct Qdisc *sch)
1415 {
1416         struct cbq_sched_data *q = qdisc_priv(sch);
1417         struct hlist_node *next;
1418         struct cbq_class *cl;
1419         unsigned int h;
1420
1421 #ifdef CONFIG_NET_CLS_ACT
1422         q->rx_class = NULL;
1423 #endif
1424         /*
1425          * Filters must be destroyed first because we don't destroy the
1426          * classes from root to leafs which means that filters can still
1427          * be bound to classes which have been destroyed already. --TGR '04
1428          */
1429         for (h = 0; h < q->clhash.hashsize; h++) {
1430                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1431                         tcf_block_put(cl->block);
1432                         cl->block = NULL;
1433                 }
1434         }
1435         for (h = 0; h < q->clhash.hashsize; h++) {
1436                 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1437                                           common.hnode)
1438                         cbq_destroy_class(sch, cl);
1439         }
1440         qdisc_class_hash_destroy(&q->clhash);
1441 }
1442
1443 static int
1444 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1445                  unsigned long *arg)
1446 {
1447         int err;
1448         struct cbq_sched_data *q = qdisc_priv(sch);
1449         struct cbq_class *cl = (struct cbq_class *)*arg;
1450         struct nlattr *opt = tca[TCA_OPTIONS];
1451         struct nlattr *tb[TCA_CBQ_MAX + 1];
1452         struct cbq_class *parent;
1453         struct qdisc_rate_table *rtab = NULL;
1454
1455         if (opt == NULL)
1456                 return -EINVAL;
1457
1458         err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1459         if (err < 0)
1460                 return err;
1461
1462         if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1463                 return -EOPNOTSUPP;
1464
1465         if (cl) {
1466                 /* Check parent */
1467                 if (parentid) {
1468                         if (cl->tparent &&
1469                             cl->tparent->common.classid != parentid)
1470                                 return -EINVAL;
1471                         if (!cl->tparent && parentid != TC_H_ROOT)
1472                                 return -EINVAL;
1473                 }
1474
1475                 if (tb[TCA_CBQ_RATE]) {
1476                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1477                                               tb[TCA_CBQ_RTAB]);
1478                         if (rtab == NULL)
1479                                 return -EINVAL;
1480                 }
1481
1482                 if (tca[TCA_RATE]) {
1483                         err = gen_replace_estimator(&cl->bstats, NULL,
1484                                                     &cl->rate_est,
1485                                                     NULL,
1486                                                     qdisc_root_sleeping_running(sch),
1487                                                     tca[TCA_RATE]);
1488                         if (err) {
1489                                 qdisc_put_rtab(rtab);
1490                                 return err;
1491                         }
1492                 }
1493
1494                 /* Change class parameters */
1495                 sch_tree_lock(sch);
1496
1497                 if (cl->next_alive != NULL)
1498                         cbq_deactivate_class(cl);
1499
1500                 if (rtab) {
1501                         qdisc_put_rtab(cl->R_tab);
1502                         cl->R_tab = rtab;
1503                 }
1504
1505                 if (tb[TCA_CBQ_LSSOPT])
1506                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1507
1508                 if (tb[TCA_CBQ_WRROPT]) {
1509                         cbq_rmprio(q, cl);
1510                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1511                 }
1512
1513                 if (tb[TCA_CBQ_FOPT])
1514                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1515
1516                 if (cl->q->q.qlen)
1517                         cbq_activate_class(cl);
1518
1519                 sch_tree_unlock(sch);
1520
1521                 return 0;
1522         }
1523
1524         if (parentid == TC_H_ROOT)
1525                 return -EINVAL;
1526
1527         if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1528             tb[TCA_CBQ_LSSOPT] == NULL)
1529                 return -EINVAL;
1530
1531         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1532         if (rtab == NULL)
1533                 return -EINVAL;
1534
1535         if (classid) {
1536                 err = -EINVAL;
1537                 if (TC_H_MAJ(classid ^ sch->handle) ||
1538                     cbq_class_lookup(q, classid))
1539                         goto failure;
1540         } else {
1541                 int i;
1542                 classid = TC_H_MAKE(sch->handle, 0x8000);
1543
1544                 for (i = 0; i < 0x8000; i++) {
1545                         if (++q->hgenerator >= 0x8000)
1546                                 q->hgenerator = 1;
1547                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1548                                 break;
1549                 }
1550                 err = -ENOSR;
1551                 if (i >= 0x8000)
1552                         goto failure;
1553                 classid = classid|q->hgenerator;
1554         }
1555
1556         parent = &q->link;
1557         if (parentid) {
1558                 parent = cbq_class_lookup(q, parentid);
1559                 err = -EINVAL;
1560                 if (parent == NULL)
1561                         goto failure;
1562         }
1563
1564         err = -ENOBUFS;
1565         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1566         if (cl == NULL)
1567                 goto failure;
1568
1569         err = tcf_block_get(&cl->block, &cl->filter_list);
1570         if (err) {
1571                 kfree(cl);
1572                 return err;
1573         }
1574
1575         if (tca[TCA_RATE]) {
1576                 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1577                                         NULL,
1578                                         qdisc_root_sleeping_running(sch),
1579                                         tca[TCA_RATE]);
1580                 if (err) {
1581                         tcf_block_put(cl->block);
1582                         kfree(cl);
1583                         goto failure;
1584                 }
1585         }
1586
1587         cl->R_tab = rtab;
1588         rtab = NULL;
1589         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1590         if (!cl->q)
1591                 cl->q = &noop_qdisc;
1592         else
1593                 qdisc_hash_add(cl->q, true);
1594
1595         cl->common.classid = classid;
1596         cl->tparent = parent;
1597         cl->qdisc = sch;
1598         cl->allot = parent->allot;
1599         cl->quantum = cl->allot;
1600         cl->weight = cl->R_tab->rate.rate;
1601
1602         sch_tree_lock(sch);
1603         cbq_link_class(cl);
1604         cl->borrow = cl->tparent;
1605         if (cl->tparent != &q->link)
1606                 cl->share = cl->tparent;
1607         cbq_adjust_levels(parent);
1608         cl->minidle = -0x7FFFFFFF;
1609         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1610         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1611         if (cl->ewma_log == 0)
1612                 cl->ewma_log = q->link.ewma_log;
1613         if (cl->maxidle == 0)
1614                 cl->maxidle = q->link.maxidle;
1615         if (cl->avpkt == 0)
1616                 cl->avpkt = q->link.avpkt;
1617         if (tb[TCA_CBQ_FOPT])
1618                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1619         sch_tree_unlock(sch);
1620
1621         qdisc_class_hash_grow(sch, &q->clhash);
1622
1623         *arg = (unsigned long)cl;
1624         return 0;
1625
1626 failure:
1627         qdisc_put_rtab(rtab);
1628         return err;
1629 }
1630
1631 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1632 {
1633         struct cbq_sched_data *q = qdisc_priv(sch);
1634         struct cbq_class *cl = (struct cbq_class *)arg;
1635         unsigned int qlen, backlog;
1636
1637         if (cl->filters || cl->children || cl == &q->link)
1638                 return -EBUSY;
1639
1640         sch_tree_lock(sch);
1641
1642         qlen = cl->q->q.qlen;
1643         backlog = cl->q->qstats.backlog;
1644         qdisc_reset(cl->q);
1645         qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1646
1647         if (cl->next_alive)
1648                 cbq_deactivate_class(cl);
1649
1650         if (q->tx_borrowed == cl)
1651                 q->tx_borrowed = q->tx_class;
1652         if (q->tx_class == cl) {
1653                 q->tx_class = NULL;
1654                 q->tx_borrowed = NULL;
1655         }
1656 #ifdef CONFIG_NET_CLS_ACT
1657         if (q->rx_class == cl)
1658                 q->rx_class = NULL;
1659 #endif
1660
1661         cbq_unlink_class(cl);
1662         cbq_adjust_levels(cl->tparent);
1663         cl->defmap = 0;
1664         cbq_sync_defmap(cl);
1665
1666         cbq_rmprio(q, cl);
1667         sch_tree_unlock(sch);
1668
1669         cbq_destroy_class(sch, cl);
1670         return 0;
1671 }
1672
1673 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
1674 {
1675         struct cbq_sched_data *q = qdisc_priv(sch);
1676         struct cbq_class *cl = (struct cbq_class *)arg;
1677
1678         if (cl == NULL)
1679                 cl = &q->link;
1680
1681         return cl->block;
1682 }
1683
1684 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1685                                      u32 classid)
1686 {
1687         struct cbq_sched_data *q = qdisc_priv(sch);
1688         struct cbq_class *p = (struct cbq_class *)parent;
1689         struct cbq_class *cl = cbq_class_lookup(q, classid);
1690
1691         if (cl) {
1692                 if (p && p->level <= cl->level)
1693                         return 0;
1694                 cl->filters++;
1695                 return (unsigned long)cl;
1696         }
1697         return 0;
1698 }
1699
1700 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1701 {
1702         struct cbq_class *cl = (struct cbq_class *)arg;
1703
1704         cl->filters--;
1705 }
1706
1707 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1708 {
1709         struct cbq_sched_data *q = qdisc_priv(sch);
1710         struct cbq_class *cl;
1711         unsigned int h;
1712
1713         if (arg->stop)
1714                 return;
1715
1716         for (h = 0; h < q->clhash.hashsize; h++) {
1717                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1718                         if (arg->count < arg->skip) {
1719                                 arg->count++;
1720                                 continue;
1721                         }
1722                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1723                                 arg->stop = 1;
1724                                 return;
1725                         }
1726                         arg->count++;
1727                 }
1728         }
1729 }
1730
1731 static const struct Qdisc_class_ops cbq_class_ops = {
1732         .graft          =       cbq_graft,
1733         .leaf           =       cbq_leaf,
1734         .qlen_notify    =       cbq_qlen_notify,
1735         .find           =       cbq_find,
1736         .change         =       cbq_change_class,
1737         .delete         =       cbq_delete,
1738         .walk           =       cbq_walk,
1739         .tcf_block      =       cbq_tcf_block,
1740         .bind_tcf       =       cbq_bind_filter,
1741         .unbind_tcf     =       cbq_unbind_filter,
1742         .dump           =       cbq_dump_class,
1743         .dump_stats     =       cbq_dump_class_stats,
1744 };
1745
1746 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1747         .next           =       NULL,
1748         .cl_ops         =       &cbq_class_ops,
1749         .id             =       "cbq",
1750         .priv_size      =       sizeof(struct cbq_sched_data),
1751         .enqueue        =       cbq_enqueue,
1752         .dequeue        =       cbq_dequeue,
1753         .peek           =       qdisc_peek_dequeued,
1754         .init           =       cbq_init,
1755         .reset          =       cbq_reset,
1756         .destroy        =       cbq_destroy,
1757         .change         =       NULL,
1758         .dump           =       cbq_dump,
1759         .dump_stats     =       cbq_dump_stats,
1760         .owner          =       THIS_MODULE,
1761 };
1762
1763 static int __init cbq_module_init(void)
1764 {
1765         return register_qdisc(&cbq_qdisc_ops);
1766 }
1767 static void __exit cbq_module_exit(void)
1768 {
1769         unregister_qdisc(&cbq_qdisc_ops);
1770 }
1771 module_init(cbq_module_init)
1772 module_exit(cbq_module_exit)
1773 MODULE_LICENSE("GPL");