net/sched/sch_cbq.c

DEFINITIONS

This source file includes the following definitions.
  1. cbq_class_lookup
  2. cbq_reclassify
  3. cbq_classify
  4. cbq_activate_class
  5. cbq_deactivate_class
  6. cbq_mark_toplevel
  7. cbq_enqueue
  8. cbq_overlimit
  9. cbq_undelay_prio
  10. cbq_undelay
  11. cbq_update_toplevel
  12. cbq_update
  13. cbq_under_limit
  14. cbq_dequeue_prio
  15. cbq_dequeue_1
  16. cbq_dequeue
  17. cbq_adjust_levels
  18. cbq_normalize_quanta
  19. cbq_sync_defmap
  20. cbq_change_defmap
  21. cbq_unlink_class
  22. cbq_link_class
  23. cbq_reset
  24. cbq_set_lss
  25. cbq_rmprio
  26. cbq_addprio
  27. cbq_set_wrr
  28. cbq_set_fopt
  29. cbq_opt_parse
  30. cbq_init
  31. cbq_dump_rate
  32. cbq_dump_lss
  33. cbq_dump_wrr
  34. cbq_dump_fopt
  35. cbq_dump_attr
  36. cbq_dump
  37. cbq_dump_stats
  38. cbq_dump_class
  39. cbq_dump_class_stats
  40. cbq_graft
  41. cbq_leaf
  42. cbq_qlen_notify
  43. cbq_find
  44. cbq_destroy_class
  45. cbq_destroy
  46. cbq_change_class
  47. cbq_delete
  48. cbq_tcf_block
  49. cbq_bind_filter
  50. cbq_unbind_filter
  51. cbq_walk
  52. cbq_module_init
  53. cbq_module_exit

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * net/sched/sch_cbq.c  Class-Based Queueing discipline.
   4  *
   5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6  */
   7 
   8 #include <linux/module.h>
   9 #include <linux/slab.h>
  10 #include <linux/types.h>
  11 #include <linux/kernel.h>
  12 #include <linux/string.h>
  13 #include <linux/errno.h>
  14 #include <linux/skbuff.h>
  15 #include <net/netlink.h>
  16 #include <net/pkt_sched.h>
  17 #include <net/pkt_cls.h>
  18 
  19 
  20 /*      Class-Based Queueing (CBQ) algorithm.
  21         =======================================
  22 
  23         Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
  24                  Management Models for Packet Networks",
  25                  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
  26 
  27                  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
  28 
  29                  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
  30                  Parameters", 1996
  31 
  32                  [4] Sally Floyd and Michael Speer, "Experimental Results
  33                  for Class-Based Queueing", 1998, not published.
  34 
  35         -----------------------------------------------------------------------
  36 
   37         The algorithm skeleton was taken from the NS simulator's cbq.cc.
   38         If someone wants to check this code against the LBL version,
   39         they should take into account that ONLY the skeleton was borrowed;
   40         the implementation is different. In particular:
  41 
  42         --- The WRR algorithm is different. Our version looks more
  43         reasonable (I hope) and works when quanta are allowed to be
   44         less than the MTU, which is always the case when real-time classes
   45         have small rates. Note that the statement in [3] is
   46         incomplete: delay can actually be estimated even if the class
   47         per-round allotment is less than the MTU. Namely, if the per-round
   48         allotment is W*r_i, and r_1+...+r_k = r < 1, then
  49 
  50         delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
  51 
   52         In the worst case we have the IntServ estimate with D = W*r+k*MTU
   53         and C = MTU*r. The proof (if correct at all) is trivial. (A worked numeric example follows this comment block.)
  54 
  55 
  56         --- It seems that cbq-2.0 is not very accurate. At least, I cannot
   57         interpret some places that look like wrong translations
   58         from NS. Anyone is advised to find these differences
   59         and explain to me why I am wrong 8).
  60 
   61         --- Linux has no EOI event, so we cannot estimate the true class
   62         idle time. The workaround is to consider the next dequeue event
   63         as a sign that the previous packet has finished. This is wrong
   64         because of internal device queueing, but on a permanently loaded
   65         link it is true. Moreover, combined with the clock integrator,
   66         this scheme looks very close to an ideal solution.  */
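
/*      A rough numeric feel for the WRR delay bound above (our own worked
        example, not taken from [1]-[4]): take MTU = 1500 bytes, k = 2 classes
        with r_1 = 0.1 and r_2 = 0.4 (so r = 0.5), and choose W so that the
        per-round allotment W*r_1 = 500 bytes, i.e. W = 5000. Then

        delay_1 <= (ceil(1500/500)*2500 + 2500 + 2*1500)/B = 13000/B,

        i.e. about 10.4 ms on a 10 Mbit/s (1.25 Mbyte/s) link, even though
        the per-round allotment is only a third of the MTU.  */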
  67 
  68 struct cbq_sched_data;
  69 
  70 
  71 struct cbq_class {
  72         struct Qdisc_class_common common;
  73         struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
  74 
  75 /* Parameters */
  76         unsigned char           priority;       /* class priority */
  77         unsigned char           priority2;      /* priority to be used after overlimit */
  78         unsigned char           ewma_log;       /* time constant for idle time calculation */
  79 
  80         u32                     defmap;
  81 
  82         /* Link-sharing scheduler parameters */
  83         long                    maxidle;        /* Class parameters: see below. */
  84         long                    offtime;
  85         long                    minidle;
  86         u32                     avpkt;
  87         struct qdisc_rate_table *R_tab;
  88 
  89         /* General scheduler (WRR) parameters */
  90         long                    allot;
  91         long                    quantum;        /* Allotment per WRR round */
  92         long                    weight;         /* Relative allotment: see below */
  93 
  94         struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
  95         struct cbq_class        *split;         /* Ptr to split node */
  96         struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
  97         struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
  98         struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
  99                                                    parent otherwise */
 100         struct cbq_class        *sibling;       /* Sibling chain */
 101         struct cbq_class        *children;      /* Pointer to children chain */
 102 
 103         struct Qdisc            *q;             /* Elementary queueing discipline */
 104 
 105 
 106 /* Variables */
 107         unsigned char           cpriority;      /* Effective priority */
 108         unsigned char           delayed;
 109         unsigned char           level;          /* level of the class in hierarchy:
 110                                                    0 for leaf classes, and maximal
 111                                                    level of children + 1 for nodes.
 112                                                  */
 113 
 114         psched_time_t           last;           /* Last end of service */
 115         psched_time_t           undertime;
 116         long                    avgidle;
 117         long                    deficit;        /* Saved deficit for WRR */
 118         psched_time_t           penalized;
 119         struct gnet_stats_basic_packed bstats;
 120         struct gnet_stats_queue qstats;
 121         struct net_rate_estimator __rcu *rate_est;
 122         struct tc_cbq_xstats    xstats;
 123 
 124         struct tcf_proto __rcu  *filter_list;
 125         struct tcf_block        *block;
 126 
 127         int                     filters;
 128 
 129         struct cbq_class        *defaults[TC_PRIO_MAX + 1];
 130 };
 131 
 132 struct cbq_sched_data {
 133         struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
 134         int                     nclasses[TC_CBQ_MAXPRIO + 1];
 135         unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
 136 
 137         struct cbq_class        link;
 138 
 139         unsigned int            activemask;
 140         struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
 141                                                                    with backlog */
 142 
 143 #ifdef CONFIG_NET_CLS_ACT
 144         struct cbq_class        *rx_class;
 145 #endif
 146         struct cbq_class        *tx_class;
 147         struct cbq_class        *tx_borrowed;
 148         int                     tx_len;
 149         psched_time_t           now;            /* Cached timestamp */
 150         unsigned int            pmask;
 151 
 152         struct hrtimer          delay_timer;
 153         struct qdisc_watchdog   watchdog;       /* Watchdog timer,
 154                                                    started when CBQ has
 155                                                    backlog, but cannot
 156                                                    transmit just now */
 157         psched_tdiff_t          wd_expires;
 158         int                     toplevel;
 159         u32                     hgenerator;
 160 };
 161 
 162 
 163 #define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
 164 
 165 static inline struct cbq_class *
 166 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
 167 {
 168         struct Qdisc_class_common *clc;
 169 
 170         clc = qdisc_class_find(&q->clhash, classid);
 171         if (clc == NULL)
 172                 return NULL;
 173         return container_of(clc, struct cbq_class, common);
 174 }
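
/* container_of() above recovers the enclosing cbq_class from its embedded
 * Qdisc_class_common. A minimal model of the idiom (illustrative sketch,
 * struct names made up):
 *
 *	struct outer { int x; struct inner member; };
 *
 *	struct inner *p = &some_outer.member;
 *	struct outer *o = container_of(p, struct outer, member);
 *
 * which is just (struct outer *)((char *)p - offsetof(struct outer, member)).
 * The class hash stores only the common part, so qdisc_class_find() can be
 * shared by every qdisc's class type.
 */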
 175 
 176 #ifdef CONFIG_NET_CLS_ACT
 177 
 178 static struct cbq_class *
 179 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
 180 {
 181         struct cbq_class *cl;
 182 
 183         for (cl = this->tparent; cl; cl = cl->tparent) {
 184                 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
 185 
 186                 if (new != NULL && new != this)
 187                         return new;
 188         }
 189         return NULL;
 190 }
 191 
 192 #endif
 193 
  194 /* Classify a packet. The procedure is pretty complicated, but
  195  * it allows us to combine link sharing and priority scheduling
  196  * transparently.
  197  *
  198  * Namely, you can put link-sharing rules (e.g. route based) at the root
  199  * of CBQ, so that they resolve to split nodes. Then packets are
  200  * classified by logical priority, or a more specific classifier may be
  201  * attached to the split node.
  202  */
 203 
 204 static struct cbq_class *
 205 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 206 {
 207         struct cbq_sched_data *q = qdisc_priv(sch);
 208         struct cbq_class *head = &q->link;
 209         struct cbq_class **defmap;
 210         struct cbq_class *cl = NULL;
 211         u32 prio = skb->priority;
 212         struct tcf_proto *fl;
 213         struct tcf_result res;
 214 
 215         /*
 216          *  Step 1. If skb->priority points to one of our classes, use it.
 217          */
 218         if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
 219             (cl = cbq_class_lookup(q, prio)) != NULL)
 220                 return cl;
 221 
 222         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 223         for (;;) {
 224                 int result = 0;
 225                 defmap = head->defaults;
 226 
 227                 fl = rcu_dereference_bh(head->filter_list);
 228                 /*
 229                  * Step 2+n. Apply classifier.
 230                  */
 231                 result = tcf_classify(skb, fl, &res, true);
 232                 if (!fl || result < 0)
 233                         goto fallback;
 234 
 235                 cl = (void *)res.class;
 236                 if (!cl) {
 237                         if (TC_H_MAJ(res.classid))
 238                                 cl = cbq_class_lookup(q, res.classid);
 239                         else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 240                                 cl = defmap[TC_PRIO_BESTEFFORT];
 241 
 242                         if (cl == NULL)
 243                                 goto fallback;
 244                 }
 245                 if (cl->level >= head->level)
 246                         goto fallback;
 247 #ifdef CONFIG_NET_CLS_ACT
 248                 switch (result) {
 249                 case TC_ACT_QUEUED:
 250                 case TC_ACT_STOLEN:
 251                 case TC_ACT_TRAP:
 252                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 253                         /* fall through */
 254                 case TC_ACT_SHOT:
 255                         return NULL;
 256                 case TC_ACT_RECLASSIFY:
 257                         return cbq_reclassify(skb, cl);
 258                 }
 259 #endif
 260                 if (cl->level == 0)
 261                         return cl;
 262 
 263                 /*
  264                  * Step 3+n. If the classifier selected a link-sharing class,
  265                  *         apply the agency-specific classifier.
  266                  *         Repeat this procedure until we hit a leaf node.
 267                  */
 268                 head = cl;
 269         }
 270 
 271 fallback:
 272         cl = head;
 273 
 274         /*
 275          * Step 4. No success...
 276          */
 277         if (TC_H_MAJ(prio) == 0 &&
 278             !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
 279             !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 280                 return head;
 281 
 282         return cl;
 283 }
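
/* For context, a split/defmap setup looks like this from user space (an
 * illustrative sketch in the LARTC style; the device name and rates are
 * made up):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq \
 *		bandwidth 10Mbit avpkt 1000
 *	tc class add dev eth0 parent 1:0 classid 1:1 cbq \
 *		bandwidth 10Mbit rate 1Mbit allot 1514 prio 3 \
 *		avpkt 1000 split 1:0 defmap c0
 *
 * Here 1:0 acts as the split node, and "defmap c0" (bits 6 and 7) claims
 * skb priorities TC_PRIO_INTERACTIVE and TC_PRIO_CONTROL for class 1:1.
 * The defaults[] arrays consulted above are maintained by
 * cbq_change_defmap()/cbq_sync_defmap() further down in this file.
 */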
 284 
 285 /*
  286  * A packet has just been enqueued on an empty class.
  287  * cbq_activate_class adds the class to the tail of the active class
  288  * list of its priority band.
 289  */
 290 
 291 static inline void cbq_activate_class(struct cbq_class *cl)
 292 {
 293         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 294         int prio = cl->cpriority;
 295         struct cbq_class *cl_tail;
 296 
 297         cl_tail = q->active[prio];
 298         q->active[prio] = cl;
 299 
 300         if (cl_tail != NULL) {
 301                 cl->next_alive = cl_tail->next_alive;
 302                 cl_tail->next_alive = cl;
 303         } else {
 304                 cl->next_alive = cl;
 305                 q->activemask |= (1<<prio);
 306         }
 307 }
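
#if 0
/* The active list is a circular singly linked ring in which q->active[prio]
 * points at the TAIL, so tail->next_alive is the head. A minimal model of
 * the insertion above (illustrative sketch, kept out of the build):
 */
struct ring { struct ring *next; };

static void ring_insert_tail(struct ring **tail, struct ring *n)
{
	if (*tail) {
		n->next = (*tail)->next;	/* n points at the old head */
		(*tail)->next = n;		/* old tail points at n */
	} else {
		n->next = n;			/* single-element ring */
	}
	*tail = n;				/* n becomes the new tail */
}
#endif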
 308 
 309 /*
  310  * Unlink the class from the active chain.
  311  * Note that the same unlinking is also done directly in cbq_dequeue*
  312  * during the round-robin procedure.
 313  */
 314 
 315 static void cbq_deactivate_class(struct cbq_class *this)
 316 {
 317         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 318         int prio = this->cpriority;
 319         struct cbq_class *cl;
 320         struct cbq_class *cl_prev = q->active[prio];
 321 
 322         do {
 323                 cl = cl_prev->next_alive;
 324                 if (cl == this) {
 325                         cl_prev->next_alive = cl->next_alive;
 326                         cl->next_alive = NULL;
 327 
 328                         if (cl == q->active[prio]) {
 329                                 q->active[prio] = cl_prev;
 330                                 if (cl == q->active[prio]) {
 331                                         q->active[prio] = NULL;
 332                                         q->activemask &= ~(1<<prio);
 333                                         return;
 334                                 }
 335                         }
 336                         return;
 337                 }
 338         } while ((cl_prev = cl) != q->active[prio]);
 339 }
 340 
 341 static void
 342 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 343 {
 344         int toplevel = q->toplevel;
 345 
 346         if (toplevel > cl->level) {
 347                 psched_time_t now = psched_get_time();
 348 
 349                 do {
 350                         if (cl->undertime < now) {
 351                                 q->toplevel = cl->level;
 352                                 return;
 353                         }
 354                 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
 355         }
 356 }
 357 
 358 static int
 359 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 360             struct sk_buff **to_free)
 361 {
 362         struct cbq_sched_data *q = qdisc_priv(sch);
 363         int uninitialized_var(ret);
 364         struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 365 
 366 #ifdef CONFIG_NET_CLS_ACT
 367         q->rx_class = cl;
 368 #endif
 369         if (cl == NULL) {
 370                 if (ret & __NET_XMIT_BYPASS)
 371                         qdisc_qstats_drop(sch);
 372                 __qdisc_drop(skb, to_free);
 373                 return ret;
 374         }
 375 
 376         ret = qdisc_enqueue(skb, cl->q, to_free);
 377         if (ret == NET_XMIT_SUCCESS) {
 378                 sch->q.qlen++;
 379                 cbq_mark_toplevel(q, cl);
 380                 if (!cl->next_alive)
 381                         cbq_activate_class(cl);
 382                 return ret;
 383         }
 384 
 385         if (net_xmit_drop_count(ret)) {
 386                 qdisc_qstats_drop(sch);
 387                 cbq_mark_toplevel(q, cl);
 388                 cl->qstats.drops++;
 389         }
 390         return ret;
 391 }
 392 
 393 /* Overlimit action: penalize leaf class by adding offtime */
 394 static void cbq_overlimit(struct cbq_class *cl)
 395 {
 396         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 397         psched_tdiff_t delay = cl->undertime - q->now;
 398 
 399         if (!cl->delayed) {
 400                 delay += cl->offtime;
 401 
 402                 /*
  403                  * The class goes to sleep, so that it will have no
  404                  * chance to work off avgidle. Let's forgive it 8)
  405                  *
  406                  * BTW cbq-2.0 has a bug in this place; apparently
  407                  * they forgot to shift it by cl->ewma_log.
 408                  */
 409                 if (cl->avgidle < 0)
 410                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
 411                 if (cl->avgidle < cl->minidle)
 412                         cl->avgidle = cl->minidle;
 413                 if (delay <= 0)
 414                         delay = 1;
 415                 cl->undertime = q->now + delay;
 416 
 417                 cl->xstats.overactions++;
 418                 cl->delayed = 1;
 419         }
 420         if (q->wd_expires == 0 || q->wd_expires > delay)
 421                 q->wd_expires = delay;
 422 
 423         /* Dirty work! We must schedule wakeups based on
  424          * the real available rate, rather than the leaf rate,
 425          * which may be tiny (even zero).
 426          */
 427         if (q->toplevel == TC_CBQ_MAXLEVEL) {
 428                 struct cbq_class *b;
 429                 psched_tdiff_t base_delay = q->wd_expires;
 430 
 431                 for (b = cl->borrow; b; b = b->borrow) {
 432                         delay = b->undertime - q->now;
 433                         if (delay < base_delay) {
 434                                 if (delay <= 0)
 435                                         delay = 1;
 436                                 base_delay = delay;
 437                         }
 438                 }
 439 
 440                 q->wd_expires = base_delay;
 441         }
 442 }
 443 
 444 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
 445                                        psched_time_t now)
 446 {
 447         struct cbq_class *cl;
 448         struct cbq_class *cl_prev = q->active[prio];
 449         psched_time_t sched = now;
 450 
 451         if (cl_prev == NULL)
 452                 return 0;
 453 
 454         do {
 455                 cl = cl_prev->next_alive;
 456                 if (now - cl->penalized > 0) {
 457                         cl_prev->next_alive = cl->next_alive;
 458                         cl->next_alive = NULL;
 459                         cl->cpriority = cl->priority;
 460                         cl->delayed = 0;
 461                         cbq_activate_class(cl);
 462 
 463                         if (cl == q->active[prio]) {
 464                                 q->active[prio] = cl_prev;
 465                                 if (cl == q->active[prio]) {
 466                                         q->active[prio] = NULL;
 467                                         return 0;
 468                                 }
 469                         }
 470 
 471                         cl = cl_prev->next_alive;
 472                 } else if (sched - cl->penalized > 0)
 473                         sched = cl->penalized;
 474         } while ((cl_prev = cl) != q->active[prio]);
 475 
 476         return sched - now;
 477 }
 478 
 479 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 480 {
 481         struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
 482                                                 delay_timer);
 483         struct Qdisc *sch = q->watchdog.qdisc;
 484         psched_time_t now;
 485         psched_tdiff_t delay = 0;
 486         unsigned int pmask;
 487 
 488         now = psched_get_time();
 489 
 490         pmask = q->pmask;
 491         q->pmask = 0;
 492 
 493         while (pmask) {
 494                 int prio = ffz(~pmask);
 495                 psched_tdiff_t tmp;
 496 
 497                 pmask &= ~(1<<prio);
 498 
 499                 tmp = cbq_undelay_prio(q, prio, now);
 500                 if (tmp > 0) {
 501                         q->pmask |= 1<<prio;
 502                         if (tmp < delay || delay == 0)
 503                                 delay = tmp;
 504                 }
 505         }
 506 
 507         if (delay) {
 508                 ktime_t time;
 509 
 510                 time = 0;
 511                 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
 512                 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
 513         }
 514 
 515         __netif_schedule(qdisc_root(sch));
 516         return HRTIMER_NORESTART;
 517 }
 518 
 519 /*
  520  * This is a mission-critical procedure.
  521  *
  522  * We "regenerate" the toplevel cutoff if the transmitting class
  523  * has backlog and is not regulated. This is not part of the
  524  * original CBQ description, but looks more reasonable.
  525  * It is probably wrong; the question needs further investigation.
 526  */
 527 
 528 static inline void
 529 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 530                     struct cbq_class *borrowed)
 531 {
 532         if (cl && q->toplevel >= borrowed->level) {
 533                 if (cl->q->q.qlen > 1) {
 534                         do {
 535                                 if (borrowed->undertime == PSCHED_PASTPERFECT) {
 536                                         q->toplevel = borrowed->level;
 537                                         return;
 538                                 }
 539                         } while ((borrowed = borrowed->borrow) != NULL);
 540                 }
 541 #if 0
  542         /* This is not necessary now. Uncommenting it
  543            would save CPU cycles but decrease fairness.
 544          */
 545                 q->toplevel = TC_CBQ_MAXLEVEL;
 546 #endif
 547         }
 548 }
 549 
 550 static void
 551 cbq_update(struct cbq_sched_data *q)
 552 {
 553         struct cbq_class *this = q->tx_class;
 554         struct cbq_class *cl = this;
 555         int len = q->tx_len;
 556         psched_time_t now;
 557 
 558         q->tx_class = NULL;
  559         /* Time integrator. We calculate the EOS (end of service) time
  560          * by adding the expected packet transmission time.
 561          */
 562         now = q->now + L2T(&q->link, len);
 563 
 564         for ( ; cl; cl = cl->share) {
 565                 long avgidle = cl->avgidle;
 566                 long idle;
 567 
 568                 cl->bstats.packets++;
 569                 cl->bstats.bytes += len;
 570 
 571                 /*
 572                  * (now - last) is total time between packet right edges.
 573                  * (last_pktlen/rate) is "virtual" busy time, so that
 574                  *
 575                  *      idle = (now - last) - last_pktlen/rate
 576                  */
 577 
 578                 idle = now - cl->last;
 579                 if ((unsigned long)idle > 128*1024*1024) {
 580                         avgidle = cl->maxidle;
 581                 } else {
 582                         idle -= L2T(cl, len);
 583 
 584                 /* true_avgidle := (1-W)*true_avgidle + W*idle,
 585                  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
 586                  * cl->avgidle == true_avgidle/W,
 587                  * hence:
 588                  */
 589                         avgidle += idle - (avgidle>>cl->ewma_log);
 590                 }
 591 
 592                 if (avgidle <= 0) {
 593                         /* Overlimit or at-limit */
 594 
 595                         if (avgidle < cl->minidle)
 596                                 avgidle = cl->minidle;
 597 
 598                         cl->avgidle = avgidle;
 599 
 600                         /* Calculate expected time, when this class
 601                          * will be allowed to send.
 602                          * It will occur, when:
 603                          * (1-W)*true_avgidle + W*delay = 0, i.e.
 604                          * idle = (1/W - 1)*(-true_avgidle)
 605                          * or
 606                          * idle = (1 - W)*(-cl->avgidle);
 607                          */
 608                         idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 609 
 610                         /*
 611                          * That is not all.
 612                          * To maintain the rate allocated to the class,
 613                          * we add to undertime virtual clock,
 614                          * necessary to complete transmitted packet.
 615                          * (len/phys_bandwidth has been already passed
 616                          * to the moment of cbq_update)
 617                          */
 618 
 619                         idle -= L2T(&q->link, len);
 620                         idle += L2T(cl, len);
 621 
 622                         cl->undertime = now + idle;
 623                 } else {
 624                         /* Underlimit */
 625 
 626                         cl->undertime = PSCHED_PASTPERFECT;
 627                         if (avgidle > cl->maxidle)
 628                                 cl->avgidle = cl->maxidle;
 629                         else
 630                                 cl->avgidle = avgidle;
 631                 }
 632                 if ((s64)(now - cl->last) > 0)
 633                         cl->last = now;
 634         }
 635 
 636         cbq_update_toplevel(q, this, q->tx_borrowed);
 637 }
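
/* A sanity check of the scaling used above (our own algebra, not taken
 * from the papers): with W = 2^-ewma_log and the scaled value
 * S = true_avgidle/W, the update
 *
 *	S' = S + idle - (S >> ewma_log) = true/W + idle - true
 *	   = ((1 - W)*true + W*idle) / W
 *
 * is exactly the EWMA true' = (1-W)*true + W*idle, still scaled by 1/W.
 * The same identity explains "(-avgidle) - ((-avgidle) >> ewma_log)" in
 * cbq_overlimit() above: it equals (1 - W)*(-avgidle).
 */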
 638 
 639 static inline struct cbq_class *
 640 cbq_under_limit(struct cbq_class *cl)
 641 {
 642         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 643         struct cbq_class *this_cl = cl;
 644 
 645         if (cl->tparent == NULL)
 646                 return cl;
 647 
 648         if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
 649                 cl->delayed = 0;
 650                 return cl;
 651         }
 652 
 653         do {
  654                 /* This is a very suspicious place. Currently the overlimit
  655                  * action is generated for non-bounded classes
  656                  * only if the link is completely congested.
  657                  * Though this agrees with the ancestor-only paradigm,
  658                  * it looks very stupid. In particular,
  659                  * it means that this chunk of code will either
  660                  * never be called or result in strong amplification
  661                  * of burstiness. Dangerous and silly, but
  662                  * no other solution exists.
 663                  */
 664                 cl = cl->borrow;
 665                 if (!cl) {
 666                         this_cl->qstats.overlimits++;
 667                         cbq_overlimit(this_cl);
 668                         return NULL;
 669                 }
 670                 if (cl->level > q->toplevel)
 671                         return NULL;
 672         } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
 673 
 674         cl->delayed = 0;
 675         return cl;
 676 }
 677 
 678 static inline struct sk_buff *
 679 cbq_dequeue_prio(struct Qdisc *sch, int prio)
 680 {
 681         struct cbq_sched_data *q = qdisc_priv(sch);
 682         struct cbq_class *cl_tail, *cl_prev, *cl;
 683         struct sk_buff *skb;
 684         int deficit;
 685 
 686         cl_tail = cl_prev = q->active[prio];
 687         cl = cl_prev->next_alive;
 688 
 689         do {
 690                 deficit = 0;
 691 
 692                 /* Start round */
 693                 do {
 694                         struct cbq_class *borrow = cl;
 695 
 696                         if (cl->q->q.qlen &&
 697                             (borrow = cbq_under_limit(cl)) == NULL)
 698                                 goto skip_class;
 699 
 700                         if (cl->deficit <= 0) {
 701                                 /* Class exhausted its allotment per
 702                                  * this round. Switch to the next one.
 703                                  */
 704                                 deficit = 1;
 705                                 cl->deficit += cl->quantum;
 706                                 goto next_class;
 707                         }
 708 
 709                         skb = cl->q->dequeue(cl->q);
 710 
  711                         /* The class did not give us any skb :-(
  712                          * This can occur even if cl->q->q.qlen != 0,
  713                          * e.g. if cl->q is a "tbf" qdisc.
  714                          */
 715                         if (skb == NULL)
 716                                 goto skip_class;
 717 
 718                         cl->deficit -= qdisc_pkt_len(skb);
 719                         q->tx_class = cl;
 720                         q->tx_borrowed = borrow;
 721                         if (borrow != cl) {
 722 #ifndef CBQ_XSTATS_BORROWS_BYTES
 723                                 borrow->xstats.borrows++;
 724                                 cl->xstats.borrows++;
 725 #else
 726                                 borrow->xstats.borrows += qdisc_pkt_len(skb);
 727                                 cl->xstats.borrows += qdisc_pkt_len(skb);
 728 #endif
 729                         }
 730                         q->tx_len = qdisc_pkt_len(skb);
 731 
 732                         if (cl->deficit <= 0) {
 733                                 q->active[prio] = cl;
 734                                 cl = cl->next_alive;
 735                                 cl->deficit += cl->quantum;
 736                         }
 737                         return skb;
 738 
 739 skip_class:
 740                         if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
 741                                 /* Class is empty or penalized.
 742                                  * Unlink it from active chain.
 743                                  */
 744                                 cl_prev->next_alive = cl->next_alive;
 745                                 cl->next_alive = NULL;
 746 
 747                                 /* Did cl_tail point to it? */
 748                                 if (cl == cl_tail) {
 749                                         /* Repair it! */
 750                                         cl_tail = cl_prev;
 751 
 752                                         /* Was it the last class in this band? */
 753                                         if (cl == cl_tail) {
 754                                                 /* Kill the band! */
 755                                                 q->active[prio] = NULL;
 756                                                 q->activemask &= ~(1<<prio);
 757                                                 if (cl->q->q.qlen)
 758                                                         cbq_activate_class(cl);
 759                                                 return NULL;
 760                                         }
 761 
 762                                         q->active[prio] = cl_tail;
 763                                 }
 764                                 if (cl->q->q.qlen)
 765                                         cbq_activate_class(cl);
 766 
 767                                 cl = cl_prev;
 768                         }
 769 
 770 next_class:
 771                         cl_prev = cl;
 772                         cl = cl->next_alive;
 773                 } while (cl_prev != cl_tail);
 774         } while (deficit);
 775 
 776         q->active[prio] = cl_prev;
 777 
 778         return NULL;
 779 }
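
#if 0
/* The deficit bookkeeping above is weighted round robin with a byte
 * deficit. A minimal model (illustrative sketch with made-up types, kept
 * out of the build): a class may send while its deficit is positive and
 * is recharged by one quantum when the deficit runs out.
 */
struct wrr_class { long deficit, quantum; };

static bool wrr_may_send(struct wrr_class *cl, unsigned int pkt_len)
{
	if (cl->deficit <= 0) {
		cl->deficit += cl->quantum;	/* recharge, yield the turn */
		return false;
	}
	cl->deficit -= pkt_len;			/* spend allotment */
	return true;
}
#endif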
 780 
 781 static inline struct sk_buff *
 782 cbq_dequeue_1(struct Qdisc *sch)
 783 {
 784         struct cbq_sched_data *q = qdisc_priv(sch);
 785         struct sk_buff *skb;
 786         unsigned int activemask;
 787 
 788         activemask = q->activemask & 0xFF;
 789         while (activemask) {
 790                 int prio = ffz(~activemask);
 791                 activemask &= ~(1<<prio);
 792                 skb = cbq_dequeue_prio(sch, prio);
 793                 if (skb)
 794                         return skb;
 795         }
 796         return NULL;
 797 }
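
/* ffz(~mask) is the index of the lowest set bit in mask, i.e. the
 * numerically smallest (best) backlogged priority band: for
 * activemask = 0x06 it yields 1, so band 1 is served before band 2.
 */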
 798 
 799 static struct sk_buff *
 800 cbq_dequeue(struct Qdisc *sch)
 801 {
 802         struct sk_buff *skb;
 803         struct cbq_sched_data *q = qdisc_priv(sch);
 804         psched_time_t now;
 805 
 806         now = psched_get_time();
 807 
 808         if (q->tx_class)
 809                 cbq_update(q);
 810 
 811         q->now = now;
 812 
 813         for (;;) {
 814                 q->wd_expires = 0;
 815 
 816                 skb = cbq_dequeue_1(sch);
 817                 if (skb) {
 818                         qdisc_bstats_update(sch, skb);
 819                         sch->q.qlen--;
 820                         return skb;
 821                 }
 822 
 823                 /* All the classes are overlimit.
 824                  *
  825                  * This is possible if:
  826                  *
  827                  * 1. The scheduler is empty.
  828                  * 2. The toplevel cutoff inhibited borrowing.
  829                  * 3. The root class is overlimit.
  830                  *
  831                  * Reset conditions 2 and 3 and retry.
  832                  *
  833                  * Note that NS and cbq-2.0 are buggy: picking
  834                  * an arbitrary class is appropriate for ancestor-only
  835                  * sharing, but not for the toplevel algorithm.
  836                  *
  837                  * Our version is better but slower, because it requires
  838                  * two passes; that is unavoidable with top-level sharing.
 839                  */
 840 
 841                 if (q->toplevel == TC_CBQ_MAXLEVEL &&
 842                     q->link.undertime == PSCHED_PASTPERFECT)
 843                         break;
 844 
 845                 q->toplevel = TC_CBQ_MAXLEVEL;
 846                 q->link.undertime = PSCHED_PASTPERFECT;
 847         }
 848 
  849         /* No packets in the scheduler, or nobody wants to give them to us :-(
  850          * Sigh... start the watchdog timer in the latter case.
 851          */
 852 
 853         if (sch->q.qlen) {
 854                 qdisc_qstats_overlimit(sch);
 855                 if (q->wd_expires)
 856                         qdisc_watchdog_schedule(&q->watchdog,
 857                                                 now + q->wd_expires);
 858         }
 859         return NULL;
 860 }
 861 
  862 /* CBQ class maintenance routines */
 863 
 864 static void cbq_adjust_levels(struct cbq_class *this)
 865 {
 866         if (this == NULL)
 867                 return;
 868 
 869         do {
 870                 int level = 0;
 871                 struct cbq_class *cl;
 872 
 873                 cl = this->children;
 874                 if (cl) {
 875                         do {
 876                                 if (cl->level > level)
 877                                         level = cl->level;
 878                         } while ((cl = cl->sibling) != this->children);
 879                 }
 880                 this->level = level + 1;
 881         } while ((this = this->tparent) != NULL);
 882 }
 883 
 884 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 885 {
 886         struct cbq_class *cl;
 887         unsigned int h;
 888 
 889         if (q->quanta[prio] == 0)
 890                 return;
 891 
 892         for (h = 0; h < q->clhash.hashsize; h++) {
 893                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
  894                         /* BUGGGG... Beware! This expression suffers from
 895                          * arithmetic overflows!
 896                          */
 897                         if (cl->priority == prio) {
 898                                 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 899                                         q->quanta[prio];
 900                         }
 901                         if (cl->quantum <= 0 ||
 902                             cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
 903                                 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
 904                                         cl->common.classid, cl->quantum);
 905                                 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 906                         }
 907                 }
 908         }
 909 }
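
/* A worked example of the normalization above (our own numbers): two
 * classes at the same prio with weights 1 and 3 and allot = 1514, so
 * nclasses = 2 and quanta = 1 + 3 = 4. Then
 *
 *	quantum_1 = 1*1514*2/4 =  757 bytes
 *	quantum_2 = 3*1514*2/4 = 2271 bytes
 *
 * i.e. the quanta keep the 1:3 weight ratio while their mean stays at
 * allot; the repair clause then clamps anything outside (0, 32*MTU].
 */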
 910 
 911 static void cbq_sync_defmap(struct cbq_class *cl)
 912 {
 913         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
 914         struct cbq_class *split = cl->split;
 915         unsigned int h;
 916         int i;
 917 
 918         if (split == NULL)
 919                 return;
 920 
 921         for (i = 0; i <= TC_PRIO_MAX; i++) {
 922                 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
 923                         split->defaults[i] = NULL;
 924         }
 925 
 926         for (i = 0; i <= TC_PRIO_MAX; i++) {
 927                 int level = split->level;
 928 
 929                 if (split->defaults[i])
 930                         continue;
 931 
 932                 for (h = 0; h < q->clhash.hashsize; h++) {
 933                         struct cbq_class *c;
 934 
 935                         hlist_for_each_entry(c, &q->clhash.hash[h],
 936                                              common.hnode) {
 937                                 if (c->split == split && c->level < level &&
 938                                     c->defmap & (1<<i)) {
 939                                         split->defaults[i] = c;
 940                                         level = c->level;
 941                                 }
 942                         }
 943                 }
 944         }
 945 }
 946 
 947 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
 948 {
 949         struct cbq_class *split = NULL;
 950 
 951         if (splitid == 0) {
 952                 split = cl->split;
 953                 if (!split)
 954                         return;
 955                 splitid = split->common.classid;
 956         }
 957 
 958         if (split == NULL || split->common.classid != splitid) {
 959                 for (split = cl->tparent; split; split = split->tparent)
 960                         if (split->common.classid == splitid)
 961                                 break;
 962         }
 963 
 964         if (split == NULL)
 965                 return;
 966 
 967         if (cl->split != split) {
 968                 cl->defmap = 0;
 969                 cbq_sync_defmap(cl);
 970                 cl->split = split;
 971                 cl->defmap = def & mask;
 972         } else
 973                 cl->defmap = (cl->defmap & ~mask) | (def & mask);
 974 
 975         cbq_sync_defmap(cl);
 976 }
 977 
 978 static void cbq_unlink_class(struct cbq_class *this)
 979 {
 980         struct cbq_class *cl, **clp;
 981         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
 982 
 983         qdisc_class_hash_remove(&q->clhash, &this->common);
 984 
 985         if (this->tparent) {
 986                 clp = &this->sibling;
 987                 cl = *clp;
 988                 do {
 989                         if (cl == this) {
 990                                 *clp = cl->sibling;
 991                                 break;
 992                         }
 993                         clp = &cl->sibling;
 994                 } while ((cl = *clp) != this->sibling);
 995 
 996                 if (this->tparent->children == this) {
 997                         this->tparent->children = this->sibling;
 998                         if (this->sibling == this)
 999                                 this->tparent->children = NULL;
1000                 }
1001         } else {
1002                 WARN_ON(this->sibling != this);
1003         }
1004 }
1005 
1006 static void cbq_link_class(struct cbq_class *this)
1007 {
1008         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009         struct cbq_class *parent = this->tparent;
1010 
1011         this->sibling = this;
1012         qdisc_class_hash_insert(&q->clhash, &this->common);
1013 
1014         if (parent == NULL)
1015                 return;
1016 
1017         if (parent->children == NULL) {
1018                 parent->children = this;
1019         } else {
1020                 this->sibling = parent->children->sibling;
1021                 parent->children->sibling = this;
1022         }
1023 }
1024 
1025 static void
1026 cbq_reset(struct Qdisc *sch)
1027 {
1028         struct cbq_sched_data *q = qdisc_priv(sch);
1029         struct cbq_class *cl;
1030         int prio;
1031         unsigned int h;
1032 
1033         q->activemask = 0;
1034         q->pmask = 0;
1035         q->tx_class = NULL;
1036         q->tx_borrowed = NULL;
1037         qdisc_watchdog_cancel(&q->watchdog);
1038         hrtimer_cancel(&q->delay_timer);
1039         q->toplevel = TC_CBQ_MAXLEVEL;
1040         q->now = psched_get_time();
1041 
1042         for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043                 q->active[prio] = NULL;
1044 
1045         for (h = 0; h < q->clhash.hashsize; h++) {
1046                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047                         qdisc_reset(cl->q);
1048 
1049                         cl->next_alive = NULL;
1050                         cl->undertime = PSCHED_PASTPERFECT;
1051                         cl->avgidle = cl->maxidle;
1052                         cl->deficit = cl->quantum;
1053                         cl->cpriority = cl->priority;
1054                 }
1055         }
1056         sch->q.qlen = 0;
1057 }
1058 
1059 
1060 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061 {
1062         if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063                 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064                 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065         }
1066         if (lss->change & TCF_CBQ_LSS_EWMA)
1067                 cl->ewma_log = lss->ewma_log;
1068         if (lss->change & TCF_CBQ_LSS_AVPKT)
1069                 cl->avpkt = lss->avpkt;
1070         if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071                 cl->minidle = -(long)lss->minidle;
1072         if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073                 cl->maxidle = lss->maxidle;
1074                 cl->avgidle = lss->maxidle;
1075         }
1076         if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077                 cl->offtime = lss->offtime;
1078         return 0;
1079 }
1080 
1081 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082 {
1083         q->nclasses[cl->priority]--;
1084         q->quanta[cl->priority] -= cl->weight;
1085         cbq_normalize_quanta(q, cl->priority);
1086 }
1087 
1088 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089 {
1090         q->nclasses[cl->priority]++;
1091         q->quanta[cl->priority] += cl->weight;
1092         cbq_normalize_quanta(q, cl->priority);
1093 }
1094 
1095 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096 {
1097         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098 
1099         if (wrr->allot)
1100                 cl->allot = wrr->allot;
1101         if (wrr->weight)
1102                 cl->weight = wrr->weight;
1103         if (wrr->priority) {
1104                 cl->priority = wrr->priority - 1;
1105                 cl->cpriority = cl->priority;
1106                 if (cl->priority >= cl->priority2)
1107                         cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108         }
1109 
1110         cbq_addprio(q, cl);
1111         return 0;
1112 }
1113 
1114 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115 {
1116         cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117         return 0;
1118 }
1119 
1120 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121         [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1122         [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1123         [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1124         [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1125         [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1126         [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127         [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1128 };
1129 
1130 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1131                          struct nlattr *opt,
1132                          struct netlink_ext_ack *extack)
1133 {
1134         int err;
1135 
1136         if (!opt) {
1137                 NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1138                 return -EINVAL;
1139         }
1140 
1141         err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1142                                           cbq_policy, extack);
1143         if (err < 0)
1144                 return err;
1145 
1146         if (tb[TCA_CBQ_WRROPT]) {
1147                 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148 
1149                 if (wrr->priority > TC_CBQ_MAXPRIO) {
1150                         NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1151                         err = -EINVAL;
1152                 }
1153         }
1154         return err;
1155 }
1156 
1157 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1158                     struct netlink_ext_ack *extack)
1159 {
1160         struct cbq_sched_data *q = qdisc_priv(sch);
1161         struct nlattr *tb[TCA_CBQ_MAX + 1];
1162         struct tc_ratespec *r;
1163         int err;
1164 
1165         qdisc_watchdog_init(&q->watchdog, sch);
1166         hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1167         q->delay_timer.function = cbq_undelay;
1168 
1169         err = cbq_opt_parse(tb, opt, extack);
1170         if (err < 0)
1171                 return err;
1172 
1173         if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1174                 NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1175                 return -EINVAL;
1176         }
1177 
1178         r = nla_data(tb[TCA_CBQ_RATE]);
1179 
1180         q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1181         if (!q->link.R_tab)
1182                 return -EINVAL;
1183 
1184         err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1185         if (err)
1186                 goto put_rtab;
1187 
1188         err = qdisc_class_hash_init(&q->clhash);
1189         if (err < 0)
1190                 goto put_block;
1191 
1192         q->link.sibling = &q->link;
1193         q->link.common.classid = sch->handle;
1194         q->link.qdisc = sch;
1195         q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1196                                       sch->handle, NULL);
1197         if (!q->link.q)
1198                 q->link.q = &noop_qdisc;
1199         else
1200                 qdisc_hash_add(q->link.q, true);
1201 
1202         q->link.priority = TC_CBQ_MAXPRIO - 1;
1203         q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1204         q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1205         q->link.allot = psched_mtu(qdisc_dev(sch));
1206         q->link.quantum = q->link.allot;
1207         q->link.weight = q->link.R_tab->rate.rate;
1208 
1209         q->link.ewma_log = TC_CBQ_DEF_EWMA;
1210         q->link.avpkt = q->link.allot/2;
1211         q->link.minidle = -0x7FFFFFFF;
1212 
1213         q->toplevel = TC_CBQ_MAXLEVEL;
1214         q->now = psched_get_time();
1215 
1216         cbq_link_class(&q->link);
1217 
1218         if (tb[TCA_CBQ_LSSOPT])
1219                 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1220 
1221         cbq_addprio(q, &q->link);
1222         return 0;
1223 
1224 put_block:
1225         tcf_block_put(q->link.block);
1226 
1227 put_rtab:
1228         qdisc_put_rtab(q->link.R_tab);
1229         return err;
1230 }
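
/* A minimal user-space invocation that exercises this init path
 * (illustrative only; the device name and rate are made up):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq \
 *		bandwidth 10Mbit avpkt 1000
 *
 * iproute2 derives TCA_CBQ_RATE/TCA_CBQ_RTAB from "bandwidth" and
 * TCA_CBQ_LSSOPT from "avpkt", which is roughly why both rate attributes
 * are required above.
 */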
1231 
1232 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1233 {
1234         unsigned char *b = skb_tail_pointer(skb);
1235 
1236         if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1237                 goto nla_put_failure;
1238         return skb->len;
1239 
1240 nla_put_failure:
1241         nlmsg_trim(skb, b);
1242         return -1;
1243 }
1244 
1245 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1246 {
1247         unsigned char *b = skb_tail_pointer(skb);
1248         struct tc_cbq_lssopt opt;
1249 
1250         opt.flags = 0;
1251         if (cl->borrow == NULL)
1252                 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1253         if (cl->share == NULL)
1254                 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1255         opt.ewma_log = cl->ewma_log;
1256         opt.level = cl->level;
1257         opt.avpkt = cl->avpkt;
1258         opt.maxidle = cl->maxidle;
1259         opt.minidle = (u32)(-cl->minidle);
1260         opt.offtime = cl->offtime;
1261         opt.change = ~0;
1262         if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1263                 goto nla_put_failure;
1264         return skb->len;
1265 
1266 nla_put_failure:
1267         nlmsg_trim(skb, b);
1268         return -1;
1269 }
1270 
1271 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1272 {
1273         unsigned char *b = skb_tail_pointer(skb);
1274         struct tc_cbq_wrropt opt;
1275 
1276         memset(&opt, 0, sizeof(opt));
1277         opt.flags = 0;
1278         opt.allot = cl->allot;
1279         opt.priority = cl->priority + 1;
1280         opt.cpriority = cl->cpriority + 1;
1281         opt.weight = cl->weight;
1282         if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1283                 goto nla_put_failure;
1284         return skb->len;
1285 
1286 nla_put_failure:
1287         nlmsg_trim(skb, b);
1288         return -1;
1289 }
1290 
1291 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1292 {
1293         unsigned char *b = skb_tail_pointer(skb);
1294         struct tc_cbq_fopt opt;
1295 
1296         if (cl->split || cl->defmap) {
1297                 opt.split = cl->split ? cl->split->common.classid : 0;
1298                 opt.defmap = cl->defmap;
1299                 opt.defchange = ~0;
1300                 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1301                         goto nla_put_failure;
1302         }
1303         return skb->len;
1304 
1305 nla_put_failure:
1306         nlmsg_trim(skb, b);
1307         return -1;
1308 }
1309 
1310 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1311 {
1312         if (cbq_dump_lss(skb, cl) < 0 ||
1313             cbq_dump_rate(skb, cl) < 0 ||
1314             cbq_dump_wrr(skb, cl) < 0 ||
1315             cbq_dump_fopt(skb, cl) < 0)
1316                 return -1;
1317         return 0;
1318 }
1319 
1320 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1321 {
1322         struct cbq_sched_data *q = qdisc_priv(sch);
1323         struct nlattr *nest;
1324 
1325         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1326         if (nest == NULL)
1327                 goto nla_put_failure;
1328         if (cbq_dump_attr(skb, &q->link) < 0)
1329                 goto nla_put_failure;
1330         return nla_nest_end(skb, nest);
1331 
1332 nla_put_failure:
1333         nla_nest_cancel(skb, nest);
1334         return -1;
1335 }
1336 
1337 static int
1338 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1339 {
1340         struct cbq_sched_data *q = qdisc_priv(sch);
1341 
1342         q->link.xstats.avgidle = q->link.avgidle;
1343         return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1344 }
1345 
1346 static int
1347 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1348                struct sk_buff *skb, struct tcmsg *tcm)
1349 {
1350         struct cbq_class *cl = (struct cbq_class *)arg;
1351         struct nlattr *nest;
1352 
1353         if (cl->tparent)
1354                 tcm->tcm_parent = cl->tparent->common.classid;
1355         else
1356                 tcm->tcm_parent = TC_H_ROOT;
1357         tcm->tcm_handle = cl->common.classid;
1358         tcm->tcm_info = cl->q->handle;
1359 
1360         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1361         if (nest == NULL)
1362                 goto nla_put_failure;
1363         if (cbq_dump_attr(skb, cl) < 0)
1364                 goto nla_put_failure;
1365         return nla_nest_end(skb, nest);
1366 
1367 nla_put_failure:
1368         nla_nest_cancel(skb, nest);
1369         return -1;
1370 }
1371 
1372 static int
1373 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1374         struct gnet_dump *d)
1375 {
1376         struct cbq_sched_data *q = qdisc_priv(sch);
1377         struct cbq_class *cl = (struct cbq_class *)arg;
1378         __u32 qlen;
1379 
1380         cl->xstats.avgidle = cl->avgidle;
1381         cl->xstats.undertime = 0;
1382         qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1383 
1384         if (cl->undertime != PSCHED_PASTPERFECT)
1385                 cl->xstats.undertime = cl->undertime - q->now;
1386 
1387         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1388                                   d, NULL, &cl->bstats) < 0 ||
1389             gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1390             gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1391                 return -1;
1392 
1393         return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1394 }
1395 
1396 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1397                      struct Qdisc **old, struct netlink_ext_ack *extack)
1398 {
1399         struct cbq_class *cl = (struct cbq_class *)arg;
1400 
1401         if (new == NULL) {
1402                 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1403                                         cl->common.classid, extack);
1404                 if (new == NULL)
1405                         return -ENOBUFS;
1406         }
1407 
1408         *old = qdisc_replace(sch, new, &cl->q);
1409         return 0;
1410 }
1411 
1412 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1413 {
1414         struct cbq_class *cl = (struct cbq_class *)arg;
1415 
1416         return cl->q;
1417 }
1418 
1419 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1420 {
1421         struct cbq_class *cl = (struct cbq_class *)arg;
1422 
1423         cbq_deactivate_class(cl);
1424 }
1425 
1426 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1427 {
1428         struct cbq_sched_data *q = qdisc_priv(sch);
1429 
1430         return (unsigned long)cbq_class_lookup(q, classid);
1431 }
1432 
1433 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1434 {
1435         struct cbq_sched_data *q = qdisc_priv(sch);
1436 
1437         WARN_ON(cl->filters);
1438 
1439         tcf_block_put(cl->block);
1440         qdisc_put(cl->q);
1441         qdisc_put_rtab(cl->R_tab);
1442         gen_kill_estimator(&cl->rate_est);
1443         if (cl != &q->link)
1444                 kfree(cl);
1445 }
1446 
1447 static void cbq_destroy(struct Qdisc *sch)
1448 {
1449         struct cbq_sched_data *q = qdisc_priv(sch);
1450         struct hlist_node *next;
1451         struct cbq_class *cl;
1452         unsigned int h;
1453 
1454 #ifdef CONFIG_NET_CLS_ACT
1455         q->rx_class = NULL;
1456 #endif
1518› 1457         /*
1519› 1458          * Filters must be destroyed first because we don't destroy the
1520› 1459          * classes from root to leaves, which means that filters can still
1521› 1460          * be bound to classes which have been destroyed already. --TGR '04
1522› 1461          */
1462         for (h = 0; h < q->clhash.hashsize; h++) {
1463                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1464                         tcf_block_put(cl->block);
1465                         cl->block = NULL;
1466                 }
1467         }
1468         for (h = 0; h < q->clhash.hashsize; h++) {
1469                 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1470                                           common.hnode)
1471                         cbq_destroy_class(sch, cl);
1472         }
1473         qdisc_class_hash_destroy(&q->clhash);
1474 }
1475 
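     /*
      * Modify an existing class (*arg != 0) or create a new one.  For
      * an illustrative invocation of the create path (device name and
      * handles are hypothetical):
      *
      *   tc class add dev eth0 parent 1: classid 1:10 cbq \
      *      bandwidth 100mbit rate 6mbit allot 1514 prio 5 \
      *      avpkt 1000 bounded
      *
      * On creation the classid may be supplied by the user or
      * auto-generated below; the new class starts with a default
      * pfifo leaf.
      */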
1476 static int
1477 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1478                  unsigned long *arg, struct netlink_ext_ack *extack)
1479 {
1480         int err;
1481         struct cbq_sched_data *q = qdisc_priv(sch);
1482         struct cbq_class *cl = (struct cbq_class *)*arg;
1483         struct nlattr *opt = tca[TCA_OPTIONS];
1484         struct nlattr *tb[TCA_CBQ_MAX + 1];
1485         struct cbq_class *parent;
1486         struct qdisc_rate_table *rtab = NULL;
1487 
1488         err = cbq_opt_parse(tb, opt, extack);
1489         if (err < 0)
1490                 return err;
1491 
1492         if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1493                 NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1494                 return -EOPNOTSUPP;
1495         }
1496 
1497         if (cl) {
1498                 /* Check parent */
1499                 if (parentid) {
1500                         if (cl->tparent &&
1501                             cl->tparent->common.classid != parentid) {
1502                                 NL_SET_ERR_MSG(extack, "Invalid parent id");
1503                                 return -EINVAL;
1504                         }
1505                         if (!cl->tparent && parentid != TC_H_ROOT) {
1506                                 NL_SET_ERR_MSG(extack, "Parent must be root");
1507                                 return -EINVAL;
1508                         }
1509                 }
1510 
1511                 if (tb[TCA_CBQ_RATE]) {
1512                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1513                                               tb[TCA_CBQ_RTAB], extack);
1514                         if (rtab == NULL)
1515                                 return -EINVAL;
1516                 }
1517 
1518                 if (tca[TCA_RATE]) {
1519                         err = gen_replace_estimator(&cl->bstats, NULL,
1520                                                     &cl->rate_est,
1521                                                     NULL,
1522                                                     qdisc_root_sleeping_running(sch),
1523                                                     tca[TCA_RATE]);
1524                         if (err) {
1525                                 NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1526                                 qdisc_put_rtab(rtab);
1527                                 return err;
1528                         }
1529                 }
1530 
1531                 /* Change class parameters */
1532                 sch_tree_lock(sch);
1533 
1534                 if (cl->next_alive != NULL)
1535                         cbq_deactivate_class(cl);
1536 
1537                 if (rtab) {
1538                         qdisc_put_rtab(cl->R_tab);
1539                         cl->R_tab = rtab;
1540                 }
1541 
1542                 if (tb[TCA_CBQ_LSSOPT])
1543                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1544 
1545                 if (tb[TCA_CBQ_WRROPT]) {
1546                         cbq_rmprio(q, cl);
1547                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1548                 }
1549 
1550                 if (tb[TCA_CBQ_FOPT])
1551                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1552 
1553                 if (cl->q->q.qlen)
1554                         cbq_activate_class(cl);
1555 
1556                 sch_tree_unlock(sch);
1557 
1558                 return 0;
1559         }
1560 
1561         if (parentid == TC_H_ROOT)
1562                 return -EINVAL;
1563 
1625› 1564         if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1626› 1565                 NL_SET_ERR_MSG(extack, "The WRR, rate and link-sharing attributes MUST all be specified");
1627› 1566                 return -EINVAL;
1628› 1567         }
1568 
1569         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1570                               extack);
1571         if (rtab == NULL)
1572                 return -EINVAL;
1573 
1574         if (classid) {
1575                 err = -EINVAL;
1576                 if (TC_H_MAJ(classid ^ sch->handle) ||
1577                     cbq_class_lookup(q, classid)) {
1578                         NL_SET_ERR_MSG(extack, "Specified class not found");
1579                         goto failure;
1580                 }
1581         } else {
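                     /*
                      * No classid supplied: auto-generate one.  Minors
                      * with bit 15 set (0x8000..0xffff) are reserved for
                      * generated ids; scan at most 0x8000 candidates
                      * before giving up with -ENOSR.
                      */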
1582                 int i;
1583                 classid = TC_H_MAKE(sch->handle, 0x8000);
1584 
1585                 for (i = 0; i < 0x8000; i++) {
1586                         if (++q->hgenerator >= 0x8000)
1587                                 q->hgenerator = 1;
1588                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1589                                 break;
1590                 }
1591                 err = -ENOSR;
1592                 if (i >= 0x8000) {
1593                         NL_SET_ERR_MSG(extack, "Unable to generate classid");
1594                         goto failure;
1595                 }
1596                 classid = classid|q->hgenerator;
1597         }
1598 
1599         parent = &q->link;
1600         if (parentid) {
1601                 parent = cbq_class_lookup(q, parentid);
1602                 err = -EINVAL;
1603                 if (!parent) {
1604                         NL_SET_ERR_MSG(extack, "Failed to find parentid");
1605                         goto failure;
1606                 }
1607         }
1608 
1609         err = -ENOBUFS;
1610         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1611         if (cl == NULL)
1612                 goto failure;
1613 
1614         err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1615         if (err) {
1616                 kfree(cl);
1617                 return err;
1618         }
1619 
1620         if (tca[TCA_RATE]) {
1621                 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1622                                         NULL,
1623                                         qdisc_root_sleeping_running(sch),
1624                                         tca[TCA_RATE]);
1625                 if (err) {
1626                         NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1627                         tcf_block_put(cl->block);
1628                         kfree(cl);
1629                         goto failure;
1630                 }
1631         }
1632 
1633         cl->R_tab = rtab;
1634         rtab = NULL;
1635         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1636                                   NULL);
1637         if (!cl->q)
1638                 cl->q = &noop_qdisc;
1639         else
1640                 qdisc_hash_add(cl->q, true);
1641 
1642         cl->common.classid = classid;
1643         cl->tparent = parent;
1644         cl->qdisc = sch;
1645         cl->allot = parent->allot;
1646         cl->quantum = cl->allot;
1647         cl->weight = cl->R_tab->rate.rate;
1648 
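             /*
              * Graft the class into the tree under the qdisc lock.  Every
              * class may borrow from its parent; a class that is not a
              * direct child of the link (root) also link-shares with its
              * parent.  LSS parameters left unset by the user are
              * inherited from the root class.
              */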
1649         sch_tree_lock(sch);
1650         cbq_link_class(cl);
1651         cl->borrow = cl->tparent;
1652         if (cl->tparent != &q->link)
1653                 cl->share = cl->tparent;
1654         cbq_adjust_levels(parent);
1655         cl->minidle = -0x7FFFFFFF;
1656         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1657         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1658         if (cl->ewma_log == 0)
1659                 cl->ewma_log = q->link.ewma_log;
1660         if (cl->maxidle == 0)
1661                 cl->maxidle = q->link.maxidle;
1662         if (cl->avpkt == 0)
1663                 cl->avpkt = q->link.avpkt;
1664         if (tb[TCA_CBQ_FOPT])
1665                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1666         sch_tree_unlock(sch);
1667 
1668         qdisc_class_hash_grow(sch, &q->clhash);
1669 
1670         *arg = (unsigned long)cl;
1671         return 0;
1672 
1673 failure:
1674         qdisc_put_rtab(rtab);
1675         return err;
1676 }
1677 
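     /*
      * Delete a class.  Refused with -EBUSY while filters are bound to
      * it, while it still has children, or when it is the root class.
      * The leaf queue is purged and the class unlinked under the tree
      * lock before its resources are released.
      */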
1678 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1679 {
1680         struct cbq_sched_data *q = qdisc_priv(sch);
1681         struct cbq_class *cl = (struct cbq_class *)arg;
1682 
1683         if (cl->filters || cl->children || cl == &q->link)
1684                 return -EBUSY;
1685 
1686         sch_tree_lock(sch);
1687 
1688         qdisc_purge_queue(cl->q);
1689 
1690         if (cl->next_alive)
1691                 cbq_deactivate_class(cl);
1692 
1693         if (q->tx_borrowed == cl)
1694                 q->tx_borrowed = q->tx_class;
1695         if (q->tx_class == cl) {
1696                 q->tx_class = NULL;
1697                 q->tx_borrowed = NULL;
1698         }
1699 #ifdef CONFIG_NET_CLS_ACT
1700         if (q->rx_class == cl)
1701                 q->rx_class = NULL;
1702 #endif
1703 
1704         cbq_unlink_class(cl);
1705         cbq_adjust_levels(cl->tparent);
1706         cl->defmap = 0;
1707         cbq_sync_defmap(cl);
1708 
1709         cbq_rmprio(q, cl);
1710         sch_tree_unlock(sch);
1711 
1712         cbq_destroy_class(sch, cl);
1713         return 0;
1714 }
1715 
1716 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1717                                        struct netlink_ext_ack *extack)
1718 {
1719         struct cbq_sched_data *q = qdisc_priv(sch);
1720         struct cbq_class *cl = (struct cbq_class *)arg;
1721 
1722         if (cl == NULL)
1723                 cl = &q->link;
1724 
1725         return cl->block;
1726 }
1727 
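     /*
      * Bind a filter result to a class.  Binding is refused when the
      * filter's parent class is not strictly above the target in the
      * hierarchy (p->level <= cl->level); otherwise a filter reference
      * is taken so the class cannot be deleted while in use.
      */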
1728 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1729                                      u32 classid)
1730 {
1731         struct cbq_sched_data *q = qdisc_priv(sch);
1732         struct cbq_class *p = (struct cbq_class *)parent;
1733         struct cbq_class *cl = cbq_class_lookup(q, classid);
1734 
1735         if (cl) {
1736                 if (p && p->level <= cl->level)
1737                         return 0;
1738                 cl->filters++;
1739                 return (unsigned long)cl;
1740         }
1741         return 0;
1742 }
1743 
1744 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1745 {
1746         struct cbq_class *cl = (struct cbq_class *)arg;
1747 
1748         cl->filters--;
1749 }
1750 
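     /*
      * Walk all classes in hash order, honouring the walker's skip
      * count and stopping as soon as the callback returns a negative
      * value.
      */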
1751 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1752 {
1753         struct cbq_sched_data *q = qdisc_priv(sch);
1754         struct cbq_class *cl;
1755         unsigned int h;
1756 
1757         if (arg->stop)
1758                 return;
1759 
1760         for (h = 0; h < q->clhash.hashsize; h++) {
1761                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1762                         if (arg->count < arg->skip) {
1763                                 arg->count++;
1764                                 continue;
1765                         }
1766                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1767                                 arg->stop = 1;
1768                                 return;
1769                         }
1770                         arg->count++;
1771                 }
1772         }
1773 }
1774 
1775 static const struct Qdisc_class_ops cbq_class_ops = {
1776         .graft          =       cbq_graft,
1777         .leaf           =       cbq_leaf,
1778         .qlen_notify    =       cbq_qlen_notify,
1779         .find           =       cbq_find,
1780         .change         =       cbq_change_class,
1781         .delete         =       cbq_delete,
1782         .walk           =       cbq_walk,
1783         .tcf_block      =       cbq_tcf_block,
1784         .bind_tcf       =       cbq_bind_filter,
1785         .unbind_tcf     =       cbq_unbind_filter,
1786         .dump           =       cbq_dump_class,
1787         .dump_stats     =       cbq_dump_class_stats,
1788 };
1789 
1790 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1791         .next           =       NULL,
1792         .cl_ops         =       &cbq_class_ops,
1793         .id             =       "cbq",
1794         .priv_size      =       sizeof(struct cbq_sched_data),
1795         .enqueue        =       cbq_enqueue,
1796         .dequeue        =       cbq_dequeue,
1797         .peek           =       qdisc_peek_dequeued,
1798         .init           =       cbq_init,
1799         .reset          =       cbq_reset,
1800         .destroy        =       cbq_destroy,
1801         .change         =       NULL,
1802         .dump           =       cbq_dump,
1803         .dump_stats     =       cbq_dump_stats,
1804         .owner          =       THIS_MODULE,
1805 };
1806 
1807 static int __init cbq_module_init(void)
1808 {
1809         return register_qdisc(&cbq_qdisc_ops);
1810 }
1811 static void __exit cbq_module_exit(void)
1812 {
1813         unregister_qdisc(&cbq_qdisc_ops);
1814 }
1815 module_init(cbq_module_init)
1816 module_exit(cbq_module_exit)
1817 MODULE_LICENSE("GPL");
