1/*
2 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
3 *
4 *		This program is free software; you can redistribute it and/or
5 *		modify it under the terms of the GNU General Public License
6 *		as published by the Free Software Foundation; either version
7 *		2 of the License, or (at your option) any later version.
8 *
9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/skbuff.h>
20#include <net/netlink.h>
21#include <net/pkt_sched.h>
22
23
24/*	Class-Based Queueing (CBQ) algorithm.
25	=======================================
26
27	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28		 Management Models for Packet Networks",
29		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30
31		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32
33		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34		 Parameters", 1996
35
36		 [4] Sally Floyd and Michael Speer, "Experimental Results
37		 for Class-Based Queueing", 1998, not published.
38
39	-----------------------------------------------------------------------
40
41	Algorithm skeleton was taken from NS simulator cbq.cc.
42	If someone wants to check this code against the LBL version,
43	he should take into account that ONLY the skeleton was borrowed,
44	the implementation is different. Particularly:
45
46	--- The WRR algorithm is different. Our version looks more
47	reasonable (I hope) and works when quanta are allowed to be
48	less than MTU, which is always the case when real time classes
49	have small rates. Note, that the statement of [3] is
50	incomplete, delay may actually be estimated even if class
51	per-round allotment is less than MTU. Namely, if per-round
52	allotment is W*r_i, and r_1+...+r_k = r < 1
53
54	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55
56	In the worst case we have IntServ estimate with D = W*r+k*MTU
57	and C = MTU*r. The proof (if correct at all) is trivial.
58
59
60	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
61	interpret some places, which look like wrong translations
62	from NS. Anyone is advised to find these differences
63	and explain to me, why I am wrong 8).
64
65	--- Linux has no EOI event, so that we cannot estimate true class
66	idle time. Workaround is to consider the next dequeue event
67	as sign that previous packet is finished. This is wrong because of
68	internal device queueing, but on a permanently loaded link it is true.
69	Moreover, combined with clock integrator, this scheme looks
70	very close to an ideal solution.  */
71
72struct cbq_sched_data;
73
74
75struct cbq_class {
76	struct Qdisc_class_common common;
77	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
78
79/* Parameters */
80	unsigned char		priority;	/* class priority */
81	unsigned char		priority2;	/* priority to be used after overlimit */
82	unsigned char		ewma_log;	/* time constant for idle time calculation */
83	unsigned char		ovl_strategy;
84#ifdef CONFIG_NET_CLS_ACT
85	unsigned char		police;
86#endif
87
88	u32			defmap;
89
90	/* Link-sharing scheduler parameters */
91	long			maxidle;	/* Class parameters: see below. */
92	long			offtime;
93	long			minidle;
94	u32			avpkt;
95	struct qdisc_rate_table	*R_tab;
96
97	/* Overlimit strategy parameters */
98	void			(*overlimit)(struct cbq_class *cl);
99	psched_tdiff_t		penalty;
100
101	/* General scheduler (WRR) parameters */
102	long			allot;
103	long			quantum;	/* Allotment per WRR round */
104	long			weight;		/* Relative allotment: see below */
105
106	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
107	struct cbq_class	*split;		/* Ptr to split node */
108	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
109	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
110	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
111						   parent otherwise */
112	struct cbq_class	*sibling;	/* Sibling chain */
113	struct cbq_class	*children;	/* Pointer to children chain */
114
115	struct Qdisc		*q;		/* Elementary queueing discipline */
116
117
118/* Variables */
119	unsigned char		cpriority;	/* Effective priority */
120	unsigned char		delayed;
121	unsigned char		level;		/* level of the class in hierarchy:
122						   0 for leaf classes, and maximal
123						   level of children + 1 for nodes.
124						 */
125
126	psched_time_t		last;		/* Last end of service */
127	psched_time_t		undertime;
128	long			avgidle;
129	long			deficit;	/* Saved deficit for WRR */
130	psched_time_t		penalized;
131	struct gnet_stats_basic_packed bstats;
132	struct gnet_stats_queue qstats;
133	struct gnet_stats_rate_est64 rate_est;
134	struct tc_cbq_xstats	xstats;
135
136	struct tcf_proto __rcu	*filter_list;
137
138	int			refcnt;
139	int			filters;
140
141	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
142};
143
144struct cbq_sched_data {
145	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
146	int			nclasses[TC_CBQ_MAXPRIO + 1];
147	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
148
149	struct cbq_class	link;
150
151	unsigned int		activemask;
152	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
153								   with backlog */
154
155#ifdef CONFIG_NET_CLS_ACT
156	struct cbq_class	*rx_class;
157#endif
158	struct cbq_class	*tx_class;
159	struct cbq_class	*tx_borrowed;
160	int			tx_len;
161	psched_time_t		now;		/* Cached timestamp */
162	unsigned int		pmask;
163
164	struct hrtimer		delay_timer;
165	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
166						   started when CBQ has
167						   backlog, but cannot
168						   transmit just now */
169	psched_tdiff_t		wd_expires;
170	int			toplevel;
171	u32			hgenerator;
172};
173
174
175#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
176
177static inline struct cbq_class *
178cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
179{
180	struct Qdisc_class_common *clc;
181
182	clc = qdisc_class_find(&q->clhash, classid);
183	if (clc == NULL)
184		return NULL;
185	return container_of(clc, struct cbq_class, common);
186}
187
188#ifdef CONFIG_NET_CLS_ACT
189
190static struct cbq_class *
191cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
192{
193	struct cbq_class *cl;
194
195	for (cl = this->tparent; cl; cl = cl->tparent) {
196		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
197
198		if (new != NULL && new != this)
199			return new;
200	}
201	return NULL;
202}
203
204#endif
205
206/* Classify packet. The procedure is pretty complicated, but
207 * it allows us to combine link sharing and priority scheduling
208 * transparently.
209 *
210 * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
211 * so that it resolves to split nodes. Then packets are classified
212 * by logical priority, or a more specific classifier may be attached
213 * to the split node.
214 */
215
216static struct cbq_class *
217cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
218{
219	struct cbq_sched_data *q = qdisc_priv(sch);
220	struct cbq_class *head = &q->link;
221	struct cbq_class **defmap;
222	struct cbq_class *cl = NULL;
223	u32 prio = skb->priority;
224	struct tcf_proto *fl;
225	struct tcf_result res;
226
227	/*
228	 *  Step 1. If skb->priority points to one of our classes, use it.
229	 */
230	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
231	    (cl = cbq_class_lookup(q, prio)) != NULL)
232		return cl;
233
234	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
235	for (;;) {
236		int result = 0;
237		defmap = head->defaults;
238
239		fl = rcu_dereference_bh(head->filter_list);
240		/*
241		 * Step 2+n. Apply classifier.
242		 */
243		result = tc_classify_compat(skb, fl, &res);
244		if (!fl || result < 0)
245			goto fallback;
246
247		cl = (void *)res.class;
248		if (!cl) {
249			if (TC_H_MAJ(res.classid))
250				cl = cbq_class_lookup(q, res.classid);
251			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
252				cl = defmap[TC_PRIO_BESTEFFORT];
253
254			if (cl == NULL)
255				goto fallback;
256		}
257		if (cl->level >= head->level)
258			goto fallback;
259#ifdef CONFIG_NET_CLS_ACT
260		switch (result) {
261		case TC_ACT_QUEUED:
262		case TC_ACT_STOLEN:
263			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
264		case TC_ACT_SHOT:
265			return NULL;
266		case TC_ACT_RECLASSIFY:
267			return cbq_reclassify(skb, cl);
268		}
269#endif
270		if (cl->level == 0)
271			return cl;
272
273		/*
274		 * Step 3+n. If classifier selected a link sharing class,
275		 *	   apply agency specific classifier.
276		 *	   Repeat this procdure until we hit a leaf node.
277		 */
278		head = cl;
279	}
280
281fallback:
282	cl = head;
283
284	/*
285	 * Step 4. No success...
286	 */
287	if (TC_H_MAJ(prio) == 0 &&
288	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
289	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
290		return head;
291
292	return cl;
293}
294
295/*
296 * A packet has just been enqueued on the empty class.
297 * cbq_activate_class adds it to the tail of active class list
298 * of its priority band.
299 */
300
301static inline void cbq_activate_class(struct cbq_class *cl)
302{
303	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
304	int prio = cl->cpriority;
305	struct cbq_class *cl_tail;
306
307	cl_tail = q->active[prio];
308	q->active[prio] = cl;
309
310	if (cl_tail != NULL) {
311		cl->next_alive = cl_tail->next_alive;
312		cl_tail->next_alive = cl;
313	} else {
314		cl->next_alive = cl;
315		q->activemask |= (1<<prio);
316	}
317}
318
319/*
320 * Unlink class from active chain.
321 * Note that this same procedure is done directly in cbq_dequeue*
322 * during round-robin procedure.
323 */
324
325static void cbq_deactivate_class(struct cbq_class *this)
326{
327	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
328	int prio = this->cpriority;
329	struct cbq_class *cl;
330	struct cbq_class *cl_prev = q->active[prio];
331
332	do {
333		cl = cl_prev->next_alive;
334		if (cl == this) {
335			cl_prev->next_alive = cl->next_alive;
336			cl->next_alive = NULL;
337
338			if (cl == q->active[prio]) {
339				q->active[prio] = cl_prev;
340				if (cl == q->active[prio]) {
341					q->active[prio] = NULL;
342					q->activemask &= ~(1<<prio);
343					return;
344				}
345			}
346			return;
347		}
348	} while ((cl_prev = cl) != q->active[prio]);
349}
350
351static void
352cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
353{
354	int toplevel = q->toplevel;
355
356	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
357		psched_time_t now = psched_get_time();
358
359		do {
360			if (cl->undertime < now) {
361				q->toplevel = cl->level;
362				return;
363			}
364		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
365	}
366}
367
368static int
369cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
370{
371	struct cbq_sched_data *q = qdisc_priv(sch);
372	int uninitialized_var(ret);
373	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
374
375#ifdef CONFIG_NET_CLS_ACT
376	q->rx_class = cl;
377#endif
378	if (cl == NULL) {
379		if (ret & __NET_XMIT_BYPASS)
380			qdisc_qstats_drop(sch);
381		kfree_skb(skb);
382		return ret;
383	}
384
385#ifdef CONFIG_NET_CLS_ACT
386	cl->q->__parent = sch;
387#endif
388	ret = qdisc_enqueue(skb, cl->q);
389	if (ret == NET_XMIT_SUCCESS) {
390		sch->q.qlen++;
391		cbq_mark_toplevel(q, cl);
392		if (!cl->next_alive)
393			cbq_activate_class(cl);
394		return ret;
395	}
396
397	if (net_xmit_drop_count(ret)) {
398		qdisc_qstats_drop(sch);
399		cbq_mark_toplevel(q, cl);
400		cl->qstats.drops++;
401	}
402	return ret;
403}
404
405/* Overlimit actions */
406
407/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
408
409static void cbq_ovl_classic(struct cbq_class *cl)
410{
411	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
412	psched_tdiff_t delay = cl->undertime - q->now;
413
414	if (!cl->delayed) {
415		delay += cl->offtime;
416
417		/*
418		 * Class goes to sleep, so that it will have no
419		 * chance to work avgidle. Let's forgive it 8)
420		 *
421		 * BTW cbq-2.0 has a crap in this
422		 * place, apparently they forgot to shift it by cl->ewma_log.
423		 */
424		if (cl->avgidle < 0)
425			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
426		if (cl->avgidle < cl->minidle)
427			cl->avgidle = cl->minidle;
428		if (delay <= 0)
429			delay = 1;
430		cl->undertime = q->now + delay;
431
432		cl->xstats.overactions++;
433		cl->delayed = 1;
434	}
435	if (q->wd_expires == 0 || q->wd_expires > delay)
436		q->wd_expires = delay;
437
438	/* Dirty work! We must schedule wakeups based on
439	 * real available rate, rather than leaf rate,
440	 * which may be tiny (even zero).
441	 */
442	if (q->toplevel == TC_CBQ_MAXLEVEL) {
443		struct cbq_class *b;
444		psched_tdiff_t base_delay = q->wd_expires;
445
446		for (b = cl->borrow; b; b = b->borrow) {
447			delay = b->undertime - q->now;
448			if (delay < base_delay) {
449				if (delay <= 0)
450					delay = 1;
451				base_delay = delay;
452			}
453		}
454
455		q->wd_expires = base_delay;
456	}
457}
458
459/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
460 * they go overlimit
461 */
462
463static void cbq_ovl_rclassic(struct cbq_class *cl)
464{
465	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
466	struct cbq_class *this = cl;
467
468	do {
469		if (cl->level > q->toplevel) {
470			cl = NULL;
471			break;
472		}
473	} while ((cl = cl->borrow) != NULL);
474
475	if (cl == NULL)
476		cl = this;
477	cbq_ovl_classic(cl);
478}
479
480/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
481
482static void cbq_ovl_delay(struct cbq_class *cl)
483{
484	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
485	psched_tdiff_t delay = cl->undertime - q->now;
486
487	if (test_bit(__QDISC_STATE_DEACTIVATED,
488		     &qdisc_root_sleeping(cl->qdisc)->state))
489		return;
490
491	if (!cl->delayed) {
492		psched_time_t sched = q->now;
493		ktime_t expires;
494
495		delay += cl->offtime;
496		if (cl->avgidle < 0)
497			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
498		if (cl->avgidle < cl->minidle)
499			cl->avgidle = cl->minidle;
500		cl->undertime = q->now + delay;
501
502		if (delay > 0) {
503			sched += delay + cl->penalty;
504			cl->penalized = sched;
505			cl->cpriority = TC_CBQ_MAXPRIO;
506			q->pmask |= (1<<TC_CBQ_MAXPRIO);
507
508			expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
509			if (hrtimer_try_to_cancel(&q->delay_timer) &&
510			    ktime_to_ns(ktime_sub(
511					hrtimer_get_expires(&q->delay_timer),
512					expires)) > 0)
513				hrtimer_set_expires(&q->delay_timer, expires);
514			hrtimer_restart(&q->delay_timer);
515			cl->delayed = 1;
516			cl->xstats.overactions++;
517			return;
518		}
519		delay = 1;
520	}
521	if (q->wd_expires == 0 || q->wd_expires > delay)
522		q->wd_expires = delay;
523}
524
525/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
526
527static void cbq_ovl_lowprio(struct cbq_class *cl)
528{
529	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
530
531	cl->penalized = q->now + cl->penalty;
532
533	if (cl->cpriority != cl->priority2) {
534		cl->cpriority = cl->priority2;
535		q->pmask |= (1<<cl->cpriority);
536		cl->xstats.overactions++;
537	}
538	cbq_ovl_classic(cl);
539}
540
541/* TC_CBQ_OVL_DROP: penalize class by dropping */
542
543static void cbq_ovl_drop(struct cbq_class *cl)
544{
545	if (cl->q->ops->drop)
546		if (cl->q->ops->drop(cl->q))
547			cl->qdisc->q.qlen--;
548	cl->xstats.overactions++;
549	cbq_ovl_classic(cl);
550}
551
552static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
553				       psched_time_t now)
554{
555	struct cbq_class *cl;
556	struct cbq_class *cl_prev = q->active[prio];
557	psched_time_t sched = now;
558
559	if (cl_prev == NULL)
560		return 0;
561
562	do {
563		cl = cl_prev->next_alive;
564		if (now - cl->penalized > 0) {
565			cl_prev->next_alive = cl->next_alive;
566			cl->next_alive = NULL;
567			cl->cpriority = cl->priority;
568			cl->delayed = 0;
569			cbq_activate_class(cl);
570
571			if (cl == q->active[prio]) {
572				q->active[prio] = cl_prev;
573				if (cl == q->active[prio]) {
574					q->active[prio] = NULL;
575					return 0;
576				}
577			}
578
579			cl = cl_prev->next_alive;
580		} else if (sched - cl->penalized > 0)
581			sched = cl->penalized;
582	} while ((cl_prev = cl) != q->active[prio]);
583
584	return sched - now;
585}
586
587static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
588{
589	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
590						delay_timer);
591	struct Qdisc *sch = q->watchdog.qdisc;
592	psched_time_t now;
593	psched_tdiff_t delay = 0;
594	unsigned int pmask;
595
596	now = psched_get_time();
597
598	pmask = q->pmask;
599	q->pmask = 0;
600
601	while (pmask) {
602		int prio = ffz(~pmask);
603		psched_tdiff_t tmp;
604
605		pmask &= ~(1<<prio);
606
607		tmp = cbq_undelay_prio(q, prio, now);
608		if (tmp > 0) {
609			q->pmask |= 1<<prio;
610			if (tmp < delay || delay == 0)
611				delay = tmp;
612		}
613	}
614
615	if (delay) {
616		ktime_t time;
617
618		time = ktime_set(0, 0);
619		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
620		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
621	}
622
623	qdisc_unthrottled(sch);
624	__netif_schedule(qdisc_root(sch));
625	return HRTIMER_NORESTART;
626}
627
628#ifdef CONFIG_NET_CLS_ACT
629static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
630{
631	struct Qdisc *sch = child->__parent;
632	struct cbq_sched_data *q = qdisc_priv(sch);
633	struct cbq_class *cl = q->rx_class;
634
635	q->rx_class = NULL;
636
637	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
638		int ret;
639
640		cbq_mark_toplevel(q, cl);
641
642		q->rx_class = cl;
643		cl->q->__parent = sch;
644
645		ret = qdisc_enqueue(skb, cl->q);
646		if (ret == NET_XMIT_SUCCESS) {
647			sch->q.qlen++;
648			if (!cl->next_alive)
649				cbq_activate_class(cl);
650			return 0;
651		}
652		if (net_xmit_drop_count(ret))
653			qdisc_qstats_drop(sch);
654		return 0;
655	}
656
657	qdisc_qstats_drop(sch);
658	return -1;
659}
660#endif
661
662/*
663 * It is mission critical procedure.
664 *
665 * We "regenerate" toplevel cutoff, if transmitting class
666 * has backlog and it is not regulated. It is not part of
667 * original CBQ description, but looks more reasonable.
668 * Probably, it is wrong. This question needs further investigation.
669 */
670
671static inline void
672cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
673		    struct cbq_class *borrowed)
674{
675	if (cl && q->toplevel >= borrowed->level) {
676		if (cl->q->q.qlen > 1) {
677			do {
678				if (borrowed->undertime == PSCHED_PASTPERFECT) {
679					q->toplevel = borrowed->level;
680					return;
681				}
682			} while ((borrowed = borrowed->borrow) != NULL);
683		}
684#if 0
685	/* It is not necessary now. Uncommenting it
686	   will save CPU cycles, but decrease fairness.
687	 */
688		q->toplevel = TC_CBQ_MAXLEVEL;
689#endif
690	}
691}
692
693static void
694cbq_update(struct cbq_sched_data *q)
695{
696	struct cbq_class *this = q->tx_class;
697	struct cbq_class *cl = this;
698	int len = q->tx_len;
699	psched_time_t now;
700
701	q->tx_class = NULL;
702	/* Time integrator. We calculate EOS time
703	 * by adding expected packet transmission time.
704	 */
705	now = q->now + L2T(&q->link, len);
706
707	for ( ; cl; cl = cl->share) {
708		long avgidle = cl->avgidle;
709		long idle;
710
711		cl->bstats.packets++;
712		cl->bstats.bytes += len;
713
714		/*
715		 * (now - last) is total time between packet right edges.
716		 * (last_pktlen/rate) is "virtual" busy time, so that
717		 *
718		 *	idle = (now - last) - last_pktlen/rate
719		 */
720
721		idle = now - cl->last;
722		if ((unsigned long)idle > 128*1024*1024) {
723			avgidle = cl->maxidle;
724		} else {
725			idle -= L2T(cl, len);
726
727		/* true_avgidle := (1-W)*true_avgidle + W*idle,
728		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
729		 * cl->avgidle == true_avgidle/W,
730		 * hence:
731		 */
732			avgidle += idle - (avgidle>>cl->ewma_log);
733		}
734
735		if (avgidle <= 0) {
736			/* Overlimit or at-limit */
737
738			if (avgidle < cl->minidle)
739				avgidle = cl->minidle;
740
741			cl->avgidle = avgidle;
742
743			/* Calculate expected time, when this class
744			 * will be allowed to send.
745			 * It will occur, when:
746			 * (1-W)*true_avgidle + W*delay = 0, i.e.
747			 * idle = (1/W - 1)*(-true_avgidle)
748			 * or
749			 * idle = (1 - W)*(-cl->avgidle);
750			 */
751			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
752
753			/*
754			 * That is not all.
755			 * To maintain the rate allocated to the class,
756			 * we add to undertime virtual clock,
757			 * necessary to complete transmitted packet.
758			 * (len/phys_bandwidth has been already passed
759			 * to the moment of cbq_update)
760			 */
761
762			idle -= L2T(&q->link, len);
763			idle += L2T(cl, len);
764
765			cl->undertime = now + idle;
766		} else {
767			/* Underlimit */
768
769			cl->undertime = PSCHED_PASTPERFECT;
770			if (avgidle > cl->maxidle)
771				cl->avgidle = cl->maxidle;
772			else
773				cl->avgidle = avgidle;
774		}
775		if ((s64)(now - cl->last) > 0)
776			cl->last = now;
777	}
778
779	cbq_update_toplevel(q, this, q->tx_borrowed);
780}
781
782static inline struct cbq_class *
783cbq_under_limit(struct cbq_class *cl)
784{
785	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
786	struct cbq_class *this_cl = cl;
787
788	if (cl->tparent == NULL)
789		return cl;
790
791	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
792		cl->delayed = 0;
793		return cl;
794	}
795
796	do {
797		/* It is very suspicious place. Now overlimit
798		 * action is generated for not bounded classes
799		 * only if link is completely congested.
800		 * Though it is in agree with ancestor-only paradigm,
801		 * it looks very stupid. Particularly,
802		 * it means that this chunk of code will either
803		 * never be called or result in strong amplification
804		 * of burstiness. Dangerous, silly, and, however,
805		 * no another solution exists.
806		 */
807		cl = cl->borrow;
808		if (!cl) {
809			this_cl->qstats.overlimits++;
810			this_cl->overlimit(this_cl);
811			return NULL;
812		}
813		if (cl->level > q->toplevel)
814			return NULL;
815	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
816
817	cl->delayed = 0;
818	return cl;
819}
820
821static inline struct sk_buff *
822cbq_dequeue_prio(struct Qdisc *sch, int prio)
823{
824	struct cbq_sched_data *q = qdisc_priv(sch);
825	struct cbq_class *cl_tail, *cl_prev, *cl;
826	struct sk_buff *skb;
827	int deficit;
828
829	cl_tail = cl_prev = q->active[prio];
830	cl = cl_prev->next_alive;
831
832	do {
833		deficit = 0;
834
835		/* Start round */
836		do {
837			struct cbq_class *borrow = cl;
838
839			if (cl->q->q.qlen &&
840			    (borrow = cbq_under_limit(cl)) == NULL)
841				goto skip_class;
842
843			if (cl->deficit <= 0) {
844				/* Class exhausted its allotment per
845				 * this round. Switch to the next one.
846				 */
847				deficit = 1;
848				cl->deficit += cl->quantum;
849				goto next_class;
850			}
851
852			skb = cl->q->dequeue(cl->q);
853
854			/* Class did not give us any skb :-(
855			 * It could occur even if cl->q->q.qlen != 0
856			 * f.e. if cl->q == "tbf"
857			 */
858			if (skb == NULL)
859				goto skip_class;
860
861			cl->deficit -= qdisc_pkt_len(skb);
862			q->tx_class = cl;
863			q->tx_borrowed = borrow;
864			if (borrow != cl) {
865#ifndef CBQ_XSTATS_BORROWS_BYTES
866				borrow->xstats.borrows++;
867				cl->xstats.borrows++;
868#else
869				borrow->xstats.borrows += qdisc_pkt_len(skb);
870				cl->xstats.borrows += qdisc_pkt_len(skb);
871#endif
872			}
873			q->tx_len = qdisc_pkt_len(skb);
874
875			if (cl->deficit <= 0) {
876				q->active[prio] = cl;
877				cl = cl->next_alive;
878				cl->deficit += cl->quantum;
879			}
880			return skb;
881
882skip_class:
883			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
884				/* Class is empty or penalized.
885				 * Unlink it from active chain.
886				 */
887				cl_prev->next_alive = cl->next_alive;
888				cl->next_alive = NULL;
889
890				/* Did cl_tail point to it? */
891				if (cl == cl_tail) {
892					/* Repair it! */
893					cl_tail = cl_prev;
894
895					/* Was it the last class in this band? */
896					if (cl == cl_tail) {
897						/* Kill the band! */
898						q->active[prio] = NULL;
899						q->activemask &= ~(1<<prio);
900						if (cl->q->q.qlen)
901							cbq_activate_class(cl);
902						return NULL;
903					}
904
905					q->active[prio] = cl_tail;
906				}
907				if (cl->q->q.qlen)
908					cbq_activate_class(cl);
909
910				cl = cl_prev;
911			}
912
913next_class:
914			cl_prev = cl;
915			cl = cl->next_alive;
916		} while (cl_prev != cl_tail);
917	} while (deficit);
918
919	q->active[prio] = cl_prev;
920
921	return NULL;
922}
923
924static inline struct sk_buff *
925cbq_dequeue_1(struct Qdisc *sch)
926{
927	struct cbq_sched_data *q = qdisc_priv(sch);
928	struct sk_buff *skb;
929	unsigned int activemask;
930
931	activemask = q->activemask & 0xFF;
932	while (activemask) {
933		int prio = ffz(~activemask);
934		activemask &= ~(1<<prio);
935		skb = cbq_dequeue_prio(sch, prio);
936		if (skb)
937			return skb;
938	}
939	return NULL;
940}
941
942static struct sk_buff *
943cbq_dequeue(struct Qdisc *sch)
944{
945	struct sk_buff *skb;
946	struct cbq_sched_data *q = qdisc_priv(sch);
947	psched_time_t now;
948
949	now = psched_get_time();
950
951	if (q->tx_class)
952		cbq_update(q);
953
954	q->now = now;
955
956	for (;;) {
957		q->wd_expires = 0;
958
959		skb = cbq_dequeue_1(sch);
960		if (skb) {
961			qdisc_bstats_update(sch, skb);
962			sch->q.qlen--;
963			qdisc_unthrottled(sch);
964			return skb;
965		}
966
967		/* All the classes are overlimit.
968		 *
969		 * It is possible, if:
970		 *
971		 * 1. Scheduler is empty.
972		 * 2. Toplevel cutoff inhibited borrowing.
973		 * 3. Root class is overlimit.
974		 *
975		 * Reset 2d and 3d conditions and retry.
976		 *
977		 * Note, that NS and cbq-2.0 are buggy, peeking
978		 * an arbitrary class is appropriate for ancestor-only
979		 * sharing, but not for toplevel algorithm.
980		 *
981		 * Our version is better, but slower, because it requires
982		 * two passes, but it is unavoidable with top-level sharing.
983		 */
984
985		if (q->toplevel == TC_CBQ_MAXLEVEL &&
986		    q->link.undertime == PSCHED_PASTPERFECT)
987			break;
988
989		q->toplevel = TC_CBQ_MAXLEVEL;
990		q->link.undertime = PSCHED_PASTPERFECT;
991	}
992
993	/* No packets in scheduler or nobody wants to give them to us :-(
994	 * Sigh... start watchdog timer in the last case.
995	 */
996
997	if (sch->q.qlen) {
998		qdisc_qstats_overlimit(sch);
999		if (q->wd_expires)
1000			qdisc_watchdog_schedule(&q->watchdog,
1001						now + q->wd_expires);
1002	}
1003	return NULL;
1004}
1005
1006/* CBQ class maintanance routines */
1007
1008static void cbq_adjust_levels(struct cbq_class *this)
1009{
1010	if (this == NULL)
1011		return;
1012
1013	do {
1014		int level = 0;
1015		struct cbq_class *cl;
1016
1017		cl = this->children;
1018		if (cl) {
1019			do {
1020				if (cl->level > level)
1021					level = cl->level;
1022			} while ((cl = cl->sibling) != this->children);
1023		}
1024		this->level = level + 1;
1025	} while ((this = this->tparent) != NULL);
1026}
1027
1028static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1029{
1030	struct cbq_class *cl;
1031	unsigned int h;
1032
1033	if (q->quanta[prio] == 0)
1034		return;
1035
1036	for (h = 0; h < q->clhash.hashsize; h++) {
1037		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1038			/* BUGGGG... Beware! This expression suffer of
1039			 * arithmetic overflows!
1040			 */
1041			if (cl->priority == prio) {
1042				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1043					q->quanta[prio];
1044			}
1045			if (cl->quantum <= 0 ||
1046			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
1047				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
1048					cl->common.classid, cl->quantum);
1049				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1050			}
1051		}
1052	}
1053}
1054
1055static void cbq_sync_defmap(struct cbq_class *cl)
1056{
1057	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1058	struct cbq_class *split = cl->split;
1059	unsigned int h;
1060	int i;
1061
1062	if (split == NULL)
1063		return;
1064
1065	for (i = 0; i <= TC_PRIO_MAX; i++) {
1066		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1067			split->defaults[i] = NULL;
1068	}
1069
1070	for (i = 0; i <= TC_PRIO_MAX; i++) {
1071		int level = split->level;
1072
1073		if (split->defaults[i])
1074			continue;
1075
1076		for (h = 0; h < q->clhash.hashsize; h++) {
1077			struct cbq_class *c;
1078
1079			hlist_for_each_entry(c, &q->clhash.hash[h],
1080					     common.hnode) {
1081				if (c->split == split && c->level < level &&
1082				    c->defmap & (1<<i)) {
1083					split->defaults[i] = c;
1084					level = c->level;
1085				}
1086			}
1087		}
1088	}
1089}
1090
1091static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1092{
1093	struct cbq_class *split = NULL;
1094
1095	if (splitid == 0) {
1096		split = cl->split;
1097		if (!split)
1098			return;
1099		splitid = split->common.classid;
1100	}
1101
1102	if (split == NULL || split->common.classid != splitid) {
1103		for (split = cl->tparent; split; split = split->tparent)
1104			if (split->common.classid == splitid)
1105				break;
1106	}
1107
1108	if (split == NULL)
1109		return;
1110
1111	if (cl->split != split) {
1112		cl->defmap = 0;
1113		cbq_sync_defmap(cl);
1114		cl->split = split;
1115		cl->defmap = def & mask;
1116	} else
1117		cl->defmap = (cl->defmap & ~mask) | (def & mask);
1118
1119	cbq_sync_defmap(cl);
1120}
1121
1122static void cbq_unlink_class(struct cbq_class *this)
1123{
1124	struct cbq_class *cl, **clp;
1125	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1126
1127	qdisc_class_hash_remove(&q->clhash, &this->common);
1128
1129	if (this->tparent) {
1130		clp = &this->sibling;
1131		cl = *clp;
1132		do {
1133			if (cl == this) {
1134				*clp = cl->sibling;
1135				break;
1136			}
1137			clp = &cl->sibling;
1138		} while ((cl = *clp) != this->sibling);
1139
1140		if (this->tparent->children == this) {
1141			this->tparent->children = this->sibling;
1142			if (this->sibling == this)
1143				this->tparent->children = NULL;
1144		}
1145	} else {
1146		WARN_ON(this->sibling != this);
1147	}
1148}
1149
1150static void cbq_link_class(struct cbq_class *this)
1151{
1152	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1153	struct cbq_class *parent = this->tparent;
1154
1155	this->sibling = this;
1156	qdisc_class_hash_insert(&q->clhash, &this->common);
1157
1158	if (parent == NULL)
1159		return;
1160
1161	if (parent->children == NULL) {
1162		parent->children = this;
1163	} else {
1164		this->sibling = parent->children->sibling;
1165		parent->children->sibling = this;
1166	}
1167}
1168
1169static unsigned int cbq_drop(struct Qdisc *sch)
1170{
1171	struct cbq_sched_data *q = qdisc_priv(sch);
1172	struct cbq_class *cl, *cl_head;
1173	int prio;
1174	unsigned int len;
1175
1176	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1177		cl_head = q->active[prio];
1178		if (!cl_head)
1179			continue;
1180
1181		cl = cl_head;
1182		do {
1183			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1184				sch->q.qlen--;
1185				if (!cl->q->q.qlen)
1186					cbq_deactivate_class(cl);
1187				return len;
1188			}
1189		} while ((cl = cl->next_alive) != cl_head);
1190	}
1191	return 0;
1192}
1193
1194static void
1195cbq_reset(struct Qdisc *sch)
1196{
1197	struct cbq_sched_data *q = qdisc_priv(sch);
1198	struct cbq_class *cl;
1199	int prio;
1200	unsigned int h;
1201
1202	q->activemask = 0;
1203	q->pmask = 0;
1204	q->tx_class = NULL;
1205	q->tx_borrowed = NULL;
1206	qdisc_watchdog_cancel(&q->watchdog);
1207	hrtimer_cancel(&q->delay_timer);
1208	q->toplevel = TC_CBQ_MAXLEVEL;
1209	q->now = psched_get_time();
1210
1211	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1212		q->active[prio] = NULL;
1213
1214	for (h = 0; h < q->clhash.hashsize; h++) {
1215		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1216			qdisc_reset(cl->q);
1217
1218			cl->next_alive = NULL;
1219			cl->undertime = PSCHED_PASTPERFECT;
1220			cl->avgidle = cl->maxidle;
1221			cl->deficit = cl->quantum;
1222			cl->cpriority = cl->priority;
1223		}
1224	}
1225	sch->q.qlen = 0;
1226}
1227
1228
1229static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1230{
1231	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1232		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1233		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1234	}
1235	if (lss->change & TCF_CBQ_LSS_EWMA)
1236		cl->ewma_log = lss->ewma_log;
1237	if (lss->change & TCF_CBQ_LSS_AVPKT)
1238		cl->avpkt = lss->avpkt;
1239	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1240		cl->minidle = -(long)lss->minidle;
1241	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1242		cl->maxidle = lss->maxidle;
1243		cl->avgidle = lss->maxidle;
1244	}
1245	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1246		cl->offtime = lss->offtime;
1247	return 0;
1248}
1249
1250static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1251{
1252	q->nclasses[cl->priority]--;
1253	q->quanta[cl->priority] -= cl->weight;
1254	cbq_normalize_quanta(q, cl->priority);
1255}
1256
1257static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1258{
1259	q->nclasses[cl->priority]++;
1260	q->quanta[cl->priority] += cl->weight;
1261	cbq_normalize_quanta(q, cl->priority);
1262}
1263
1264static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1265{
1266	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1267
1268	if (wrr->allot)
1269		cl->allot = wrr->allot;
1270	if (wrr->weight)
1271		cl->weight = wrr->weight;
1272	if (wrr->priority) {
1273		cl->priority = wrr->priority - 1;
1274		cl->cpriority = cl->priority;
1275		if (cl->priority >= cl->priority2)
1276			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1277	}
1278
1279	cbq_addprio(q, cl);
1280	return 0;
1281}
1282
1283static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1284{
1285	switch (ovl->strategy) {
1286	case TC_CBQ_OVL_CLASSIC:
1287		cl->overlimit = cbq_ovl_classic;
1288		break;
1289	case TC_CBQ_OVL_DELAY:
1290		cl->overlimit = cbq_ovl_delay;
1291		break;
1292	case TC_CBQ_OVL_LOWPRIO:
1293		if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
1294		    ovl->priority2 - 1 <= cl->priority)
1295			return -EINVAL;
1296		cl->priority2 = ovl->priority2 - 1;
1297		cl->overlimit = cbq_ovl_lowprio;
1298		break;
1299	case TC_CBQ_OVL_DROP:
1300		cl->overlimit = cbq_ovl_drop;
1301		break;
1302	case TC_CBQ_OVL_RCLASSIC:
1303		cl->overlimit = cbq_ovl_rclassic;
1304		break;
1305	default:
1306		return -EINVAL;
1307	}
1308	cl->penalty = ovl->penalty;
1309	return 0;
1310}
1311
1312#ifdef CONFIG_NET_CLS_ACT
1313static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1314{
1315	cl->police = p->police;
1316
1317	if (cl->q->handle) {
1318		if (p->police == TC_POLICE_RECLASSIFY)
1319			cl->q->reshape_fail = cbq_reshape_fail;
1320		else
1321			cl->q->reshape_fail = NULL;
1322	}
1323	return 0;
1324}
1325#endif
1326
1327static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1328{
1329	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1330	return 0;
1331}
1332
1333static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1334	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1335	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1336	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1337	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1338	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1339	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1340	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1341};
1342
1343static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1344{
1345	struct cbq_sched_data *q = qdisc_priv(sch);
1346	struct nlattr *tb[TCA_CBQ_MAX + 1];
1347	struct tc_ratespec *r;
1348	int err;
1349
1350	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1351	if (err < 0)
1352		return err;
1353
1354	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1355		return -EINVAL;
1356
1357	r = nla_data(tb[TCA_CBQ_RATE]);
1358
1359	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1360		return -EINVAL;
1361
1362	err = qdisc_class_hash_init(&q->clhash);
1363	if (err < 0)
1364		goto put_rtab;
1365
1366	q->link.refcnt = 1;
1367	q->link.sibling = &q->link;
1368	q->link.common.classid = sch->handle;
1369	q->link.qdisc = sch;
1370	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1371				      sch->handle);
1372	if (!q->link.q)
1373		q->link.q = &noop_qdisc;
1374
1375	q->link.priority = TC_CBQ_MAXPRIO - 1;
1376	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1377	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1378	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1379	q->link.overlimit = cbq_ovl_classic;
1380	q->link.allot = psched_mtu(qdisc_dev(sch));
1381	q->link.quantum = q->link.allot;
1382	q->link.weight = q->link.R_tab->rate.rate;
1383
1384	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1385	q->link.avpkt = q->link.allot/2;
1386	q->link.minidle = -0x7FFFFFFF;
1387
1388	qdisc_watchdog_init(&q->watchdog, sch);
1389	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1390	q->delay_timer.function = cbq_undelay;
1391	q->toplevel = TC_CBQ_MAXLEVEL;
1392	q->now = psched_get_time();
1393
1394	cbq_link_class(&q->link);
1395
1396	if (tb[TCA_CBQ_LSSOPT])
1397		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1398
1399	cbq_addprio(q, &q->link);
1400	return 0;
1401
1402put_rtab:
1403	qdisc_put_rtab(q->link.R_tab);
1404	return err;
1405}
1406
1407static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1408{
1409	unsigned char *b = skb_tail_pointer(skb);
1410
1411	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1412		goto nla_put_failure;
1413	return skb->len;
1414
1415nla_put_failure:
1416	nlmsg_trim(skb, b);
1417	return -1;
1418}
1419
1420static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1421{
1422	unsigned char *b = skb_tail_pointer(skb);
1423	struct tc_cbq_lssopt opt;
1424
1425	opt.flags = 0;
1426	if (cl->borrow == NULL)
1427		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1428	if (cl->share == NULL)
1429		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1430	opt.ewma_log = cl->ewma_log;
1431	opt.level = cl->level;
1432	opt.avpkt = cl->avpkt;
1433	opt.maxidle = cl->maxidle;
1434	opt.minidle = (u32)(-cl->minidle);
1435	opt.offtime = cl->offtime;
1436	opt.change = ~0;
1437	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1438		goto nla_put_failure;
1439	return skb->len;
1440
1441nla_put_failure:
1442	nlmsg_trim(skb, b);
1443	return -1;
1444}
1445
1446static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1447{
1448	unsigned char *b = skb_tail_pointer(skb);
1449	struct tc_cbq_wrropt opt;
1450
1451	memset(&opt, 0, sizeof(opt));
1452	opt.flags = 0;
1453	opt.allot = cl->allot;
1454	opt.priority = cl->priority + 1;
1455	opt.cpriority = cl->cpriority + 1;
1456	opt.weight = cl->weight;
1457	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1458		goto nla_put_failure;
1459	return skb->len;
1460
1461nla_put_failure:
1462	nlmsg_trim(skb, b);
1463	return -1;
1464}
1465
1466static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1467{
1468	unsigned char *b = skb_tail_pointer(skb);
1469	struct tc_cbq_ovl opt;
1470
1471	opt.strategy = cl->ovl_strategy;
1472	opt.priority2 = cl->priority2 + 1;
1473	opt.pad = 0;
1474	opt.penalty = cl->penalty;
1475	if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
1476		goto nla_put_failure;
1477	return skb->len;
1478
1479nla_put_failure:
1480	nlmsg_trim(skb, b);
1481	return -1;
1482}
1483
1484static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1485{
1486	unsigned char *b = skb_tail_pointer(skb);
1487	struct tc_cbq_fopt opt;
1488
1489	if (cl->split || cl->defmap) {
1490		opt.split = cl->split ? cl->split->common.classid : 0;
1491		opt.defmap = cl->defmap;
1492		opt.defchange = ~0;
1493		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1494			goto nla_put_failure;
1495	}
1496	return skb->len;
1497
1498nla_put_failure:
1499	nlmsg_trim(skb, b);
1500	return -1;
1501}
1502
1503#ifdef CONFIG_NET_CLS_ACT
1504static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1505{
1506	unsigned char *b = skb_tail_pointer(skb);
1507	struct tc_cbq_police opt;
1508
1509	if (cl->police) {
1510		opt.police = cl->police;
1511		opt.__res1 = 0;
1512		opt.__res2 = 0;
1513		if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
1514			goto nla_put_failure;
1515	}
1516	return skb->len;
1517
1518nla_put_failure:
1519	nlmsg_trim(skb, b);
1520	return -1;
1521}
1522#endif
1523
1524static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1525{
1526	if (cbq_dump_lss(skb, cl) < 0 ||
1527	    cbq_dump_rate(skb, cl) < 0 ||
1528	    cbq_dump_wrr(skb, cl) < 0 ||
1529	    cbq_dump_ovl(skb, cl) < 0 ||
1530#ifdef CONFIG_NET_CLS_ACT
1531	    cbq_dump_police(skb, cl) < 0 ||
1532#endif
1533	    cbq_dump_fopt(skb, cl) < 0)
1534		return -1;
1535	return 0;
1536}
1537
1538static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1539{
1540	struct cbq_sched_data *q = qdisc_priv(sch);
1541	struct nlattr *nest;
1542
1543	nest = nla_nest_start(skb, TCA_OPTIONS);
1544	if (nest == NULL)
1545		goto nla_put_failure;
1546	if (cbq_dump_attr(skb, &q->link) < 0)
1547		goto nla_put_failure;
1548	return nla_nest_end(skb, nest);
1549
1550nla_put_failure:
1551	nla_nest_cancel(skb, nest);
1552	return -1;
1553}
1554
1555static int
1556cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1557{
1558	struct cbq_sched_data *q = qdisc_priv(sch);
1559
1560	q->link.xstats.avgidle = q->link.avgidle;
1561	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1562}
1563
1564static int
1565cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1566	       struct sk_buff *skb, struct tcmsg *tcm)
1567{
1568	struct cbq_class *cl = (struct cbq_class *)arg;
1569	struct nlattr *nest;
1570
1571	if (cl->tparent)
1572		tcm->tcm_parent = cl->tparent->common.classid;
1573	else
1574		tcm->tcm_parent = TC_H_ROOT;
1575	tcm->tcm_handle = cl->common.classid;
1576	tcm->tcm_info = cl->q->handle;
1577
1578	nest = nla_nest_start(skb, TCA_OPTIONS);
1579	if (nest == NULL)
1580		goto nla_put_failure;
1581	if (cbq_dump_attr(skb, cl) < 0)
1582		goto nla_put_failure;
1583	return nla_nest_end(skb, nest);
1584
1585nla_put_failure:
1586	nla_nest_cancel(skb, nest);
1587	return -1;
1588}
1589
1590static int
1591cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1592	struct gnet_dump *d)
1593{
1594	struct cbq_sched_data *q = qdisc_priv(sch);
1595	struct cbq_class *cl = (struct cbq_class *)arg;
1596
1597	cl->xstats.avgidle = cl->avgidle;
1598	cl->xstats.undertime = 0;
1599
1600	if (cl->undertime != PSCHED_PASTPERFECT)
1601		cl->xstats.undertime = cl->undertime - q->now;
1602
1603	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
1604	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1605	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1606		return -1;
1607
1608	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1609}
1610
1611static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1612		     struct Qdisc **old)
1613{
1614	struct cbq_class *cl = (struct cbq_class *)arg;
1615
1616	if (new == NULL) {
1617		new = qdisc_create_dflt(sch->dev_queue,
1618					&pfifo_qdisc_ops, cl->common.classid);
1619		if (new == NULL)
1620			return -ENOBUFS;
1621	} else {
1622#ifdef CONFIG_NET_CLS_ACT
1623		if (cl->police == TC_POLICE_RECLASSIFY)
1624			new->reshape_fail = cbq_reshape_fail;
1625#endif
1626	}
1627	sch_tree_lock(sch);
1628	*old = cl->q;
1629	cl->q = new;
1630	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1631	qdisc_reset(*old);
1632	sch_tree_unlock(sch);
1633
1634	return 0;
1635}
1636
1637static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1638{
1639	struct cbq_class *cl = (struct cbq_class *)arg;
1640
1641	return cl->q;
1642}
1643
1644static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1645{
1646	struct cbq_class *cl = (struct cbq_class *)arg;
1647
1648	if (cl->q->q.qlen == 0)
1649		cbq_deactivate_class(cl);
1650}
1651
1652static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1653{
1654	struct cbq_sched_data *q = qdisc_priv(sch);
1655	struct cbq_class *cl = cbq_class_lookup(q, classid);
1656
1657	if (cl) {
1658		cl->refcnt++;
1659		return (unsigned long)cl;
1660	}
1661	return 0;
1662}
1663
1664static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1665{
1666	struct cbq_sched_data *q = qdisc_priv(sch);
1667
1668	WARN_ON(cl->filters);
1669
1670	tcf_destroy_chain(&cl->filter_list);
1671	qdisc_destroy(cl->q);
1672	qdisc_put_rtab(cl->R_tab);
1673	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1674	if (cl != &q->link)
1675		kfree(cl);
1676}
1677
1678static void cbq_destroy(struct Qdisc *sch)
1679{
1680	struct cbq_sched_data *q = qdisc_priv(sch);
1681	struct hlist_node *next;
1682	struct cbq_class *cl;
1683	unsigned int h;
1684
1685#ifdef CONFIG_NET_CLS_ACT
1686	q->rx_class = NULL;
1687#endif
1688	/*
1689	 * Filters must be destroyed first because we don't destroy the
1690	 * classes from root to leafs which means that filters can still
1691	 * be bound to classes which have been destroyed already. --TGR '04
1692	 */
1693	for (h = 0; h < q->clhash.hashsize; h++) {
1694		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1695			tcf_destroy_chain(&cl->filter_list);
1696	}
1697	for (h = 0; h < q->clhash.hashsize; h++) {
1698		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1699					  common.hnode)
1700			cbq_destroy_class(sch, cl);
1701	}
1702	qdisc_class_hash_destroy(&q->clhash);
1703}
1704
1705static void cbq_put(struct Qdisc *sch, unsigned long arg)
1706{
1707	struct cbq_class *cl = (struct cbq_class *)arg;
1708
1709	if (--cl->refcnt == 0) {
1710#ifdef CONFIG_NET_CLS_ACT
1711		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1712		struct cbq_sched_data *q = qdisc_priv(sch);
1713
1714		spin_lock_bh(root_lock);
1715		if (q->rx_class == cl)
1716			q->rx_class = NULL;
1717		spin_unlock_bh(root_lock);
1718#endif
1719
1720		cbq_destroy_class(sch, cl);
1721	}
1722}
1723
1724static int
1725cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1726		 unsigned long *arg)
1727{
1728	int err;
1729	struct cbq_sched_data *q = qdisc_priv(sch);
1730	struct cbq_class *cl = (struct cbq_class *)*arg;
1731	struct nlattr *opt = tca[TCA_OPTIONS];
1732	struct nlattr *tb[TCA_CBQ_MAX + 1];
1733	struct cbq_class *parent;
1734	struct qdisc_rate_table *rtab = NULL;
1735
1736	if (opt == NULL)
1737		return -EINVAL;
1738
1739	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1740	if (err < 0)
1741		return err;
1742
1743	if (cl) {
1744		/* Check parent */
1745		if (parentid) {
1746			if (cl->tparent &&
1747			    cl->tparent->common.classid != parentid)
1748				return -EINVAL;
1749			if (!cl->tparent && parentid != TC_H_ROOT)
1750				return -EINVAL;
1751		}
1752
1753		if (tb[TCA_CBQ_RATE]) {
1754			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1755					      tb[TCA_CBQ_RTAB]);
1756			if (rtab == NULL)
1757				return -EINVAL;
1758		}
1759
1760		if (tca[TCA_RATE]) {
1761			err = gen_replace_estimator(&cl->bstats, NULL,
1762						    &cl->rate_est,
1763						    qdisc_root_sleeping_lock(sch),
1764						    tca[TCA_RATE]);
1765			if (err) {
1766				qdisc_put_rtab(rtab);
1767				return err;
1768			}
1769		}
1770
1771		/* Change class parameters */
1772		sch_tree_lock(sch);
1773
1774		if (cl->next_alive != NULL)
1775			cbq_deactivate_class(cl);
1776
1777		if (rtab) {
1778			qdisc_put_rtab(cl->R_tab);
1779			cl->R_tab = rtab;
1780		}
1781
1782		if (tb[TCA_CBQ_LSSOPT])
1783			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1784
1785		if (tb[TCA_CBQ_WRROPT]) {
1786			cbq_rmprio(q, cl);
1787			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1788		}
1789
1790		if (tb[TCA_CBQ_OVL_STRATEGY])
1791			cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1792
1793#ifdef CONFIG_NET_CLS_ACT
1794		if (tb[TCA_CBQ_POLICE])
1795			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1796#endif
1797
1798		if (tb[TCA_CBQ_FOPT])
1799			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1800
1801		if (cl->q->q.qlen)
1802			cbq_activate_class(cl);
1803
1804		sch_tree_unlock(sch);
1805
1806		return 0;
1807	}
1808
1809	if (parentid == TC_H_ROOT)
1810		return -EINVAL;
1811
1812	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1813	    tb[TCA_CBQ_LSSOPT] == NULL)
1814		return -EINVAL;
1815
1816	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1817	if (rtab == NULL)
1818		return -EINVAL;
1819
1820	if (classid) {
1821		err = -EINVAL;
1822		if (TC_H_MAJ(classid ^ sch->handle) ||
1823		    cbq_class_lookup(q, classid))
1824			goto failure;
1825	} else {
1826		int i;
1827		classid = TC_H_MAKE(sch->handle, 0x8000);
1828
1829		for (i = 0; i < 0x8000; i++) {
1830			if (++q->hgenerator >= 0x8000)
1831				q->hgenerator = 1;
1832			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1833				break;
1834		}
1835		err = -ENOSR;
1836		if (i >= 0x8000)
1837			goto failure;
1838		classid = classid|q->hgenerator;
1839	}
1840
1841	parent = &q->link;
1842	if (parentid) {
1843		parent = cbq_class_lookup(q, parentid);
1844		err = -EINVAL;
1845		if (parent == NULL)
1846			goto failure;
1847	}
1848
1849	err = -ENOBUFS;
1850	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1851	if (cl == NULL)
1852		goto failure;
1853
1854	if (tca[TCA_RATE]) {
1855		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1856					qdisc_root_sleeping_lock(sch),
1857					tca[TCA_RATE]);
1858		if (err) {
1859			kfree(cl);
1860			goto failure;
1861		}
1862	}
1863
1864	cl->R_tab = rtab;
1865	rtab = NULL;
1866	cl->refcnt = 1;
1867	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1868	if (!cl->q)
1869		cl->q = &noop_qdisc;
1870	cl->common.classid = classid;
1871	cl->tparent = parent;
1872	cl->qdisc = sch;
1873	cl->allot = parent->allot;
1874	cl->quantum = cl->allot;
1875	cl->weight = cl->R_tab->rate.rate;
1876
1877	sch_tree_lock(sch);
1878	cbq_link_class(cl);
1879	cl->borrow = cl->tparent;
1880	if (cl->tparent != &q->link)
1881		cl->share = cl->tparent;
1882	cbq_adjust_levels(parent);
1883	cl->minidle = -0x7FFFFFFF;
1884	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1885	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1886	if (cl->ewma_log == 0)
1887		cl->ewma_log = q->link.ewma_log;
1888	if (cl->maxidle == 0)
1889		cl->maxidle = q->link.maxidle;
1890	if (cl->avpkt == 0)
1891		cl->avpkt = q->link.avpkt;
1892	cl->overlimit = cbq_ovl_classic;
1893	if (tb[TCA_CBQ_OVL_STRATEGY])
1894		cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1895#ifdef CONFIG_NET_CLS_ACT
1896	if (tb[TCA_CBQ_POLICE])
1897		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1898#endif
1899	if (tb[TCA_CBQ_FOPT])
1900		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1901	sch_tree_unlock(sch);
1902
1903	qdisc_class_hash_grow(sch, &q->clhash);
1904
1905	*arg = (unsigned long)cl;
1906	return 0;
1907
1908failure:
1909	qdisc_put_rtab(rtab);
1910	return err;
1911}
1912
1913static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1914{
1915	struct cbq_sched_data *q = qdisc_priv(sch);
1916	struct cbq_class *cl = (struct cbq_class *)arg;
1917	unsigned int qlen;
1918
1919	if (cl->filters || cl->children || cl == &q->link)
1920		return -EBUSY;
1921
1922	sch_tree_lock(sch);
1923
1924	qlen = cl->q->q.qlen;
1925	qdisc_reset(cl->q);
1926	qdisc_tree_decrease_qlen(cl->q, qlen);
1927
1928	if (cl->next_alive)
1929		cbq_deactivate_class(cl);
1930
1931	if (q->tx_borrowed == cl)
1932		q->tx_borrowed = q->tx_class;
1933	if (q->tx_class == cl) {
1934		q->tx_class = NULL;
1935		q->tx_borrowed = NULL;
1936	}
1937#ifdef CONFIG_NET_CLS_ACT
1938	if (q->rx_class == cl)
1939		q->rx_class = NULL;
1940#endif
1941
1942	cbq_unlink_class(cl);
1943	cbq_adjust_levels(cl->tparent);
1944	cl->defmap = 0;
1945	cbq_sync_defmap(cl);
1946
1947	cbq_rmprio(q, cl);
1948	sch_tree_unlock(sch);
1949
1950	BUG_ON(--cl->refcnt == 0);
1951	/*
1952	 * This shouldn't happen: we "hold" one cops->get() when called
1953	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1954	 */
1955
1956	return 0;
1957}
1958
1959static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1960					     unsigned long arg)
1961{
1962	struct cbq_sched_data *q = qdisc_priv(sch);
1963	struct cbq_class *cl = (struct cbq_class *)arg;
1964
1965	if (cl == NULL)
1966		cl = &q->link;
1967
1968	return &cl->filter_list;
1969}
1970
1971static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1972				     u32 classid)
1973{
1974	struct cbq_sched_data *q = qdisc_priv(sch);
1975	struct cbq_class *p = (struct cbq_class *)parent;
1976	struct cbq_class *cl = cbq_class_lookup(q, classid);
1977
1978	if (cl) {
1979		if (p && p->level <= cl->level)
1980			return 0;
1981		cl->filters++;
1982		return (unsigned long)cl;
1983	}
1984	return 0;
1985}
1986
1987static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1988{
1989	struct cbq_class *cl = (struct cbq_class *)arg;
1990
1991	cl->filters--;
1992}
1993
1994static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1995{
1996	struct cbq_sched_data *q = qdisc_priv(sch);
1997	struct cbq_class *cl;
1998	unsigned int h;
1999
2000	if (arg->stop)
2001		return;
2002
2003	for (h = 0; h < q->clhash.hashsize; h++) {
2004		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
2005			if (arg->count < arg->skip) {
2006				arg->count++;
2007				continue;
2008			}
2009			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2010				arg->stop = 1;
2011				return;
2012			}
2013			arg->count++;
2014		}
2015	}
2016}
2017
2018static const struct Qdisc_class_ops cbq_class_ops = {
2019	.graft		=	cbq_graft,
2020	.leaf		=	cbq_leaf,
2021	.qlen_notify	=	cbq_qlen_notify,
2022	.get		=	cbq_get,
2023	.put		=	cbq_put,
2024	.change		=	cbq_change_class,
2025	.delete		=	cbq_delete,
2026	.walk		=	cbq_walk,
2027	.tcf_chain	=	cbq_find_tcf,
2028	.bind_tcf	=	cbq_bind_filter,
2029	.unbind_tcf	=	cbq_unbind_filter,
2030	.dump		=	cbq_dump_class,
2031	.dump_stats	=	cbq_dump_class_stats,
2032};
2033
2034static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2035	.next		=	NULL,
2036	.cl_ops		=	&cbq_class_ops,
2037	.id		=	"cbq",
2038	.priv_size	=	sizeof(struct cbq_sched_data),
2039	.enqueue	=	cbq_enqueue,
2040	.dequeue	=	cbq_dequeue,
2041	.peek		=	qdisc_peek_dequeued,
2042	.drop		=	cbq_drop,
2043	.init		=	cbq_init,
2044	.reset		=	cbq_reset,
2045	.destroy	=	cbq_destroy,
2046	.change		=	NULL,
2047	.dump		=	cbq_dump,
2048	.dump_stats	=	cbq_dump_stats,
2049	.owner		=	THIS_MODULE,
2050};
2051
2052static int __init cbq_module_init(void)
2053{
2054	return register_qdisc(&cbq_qdisc_ops);
2055}
2056static void __exit cbq_module_exit(void)
2057{
2058	unregister_qdisc(&cbq_qdisc_ops);
2059}
2060module_init(cbq_module_init)
2061module_exit(cbq_module_exit)
2062MODULE_LICENSE("GPL");
2063