/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which puts interactive traffic
	into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best-effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- the maximal queue length per flow to 127 packets;
	- the max mtu to 2^18-1;
	- the number of flows to 65408;
	- the number of hash buckets to 65536.

	It is easy to increase these values, but not while the qdisc is running.  */

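/* Typical configuration from user space (illustrative; the interface name
 * is just an example):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * attaches SFQ as the root qdisc of eth0 and rehashes flows every
 * 10 seconds (see perturb_period below).
 */
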
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
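
/* Worked example (illustrative): with a typical Ethernet quantum of
 * 1514 bytes, SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190 scaled
 * units per round, and even a 64KB packet costs at most
 * DIV_ROUND_UP(65535, 8) = 8192 units, well within a 16 bit allot.
 */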

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
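
/* Example of the index encoding above: with SFQ_MAX_FLOWS == 65408,
 * val 10 refers to q->slots[10].dep, while val 65408 refers to q->dep[0]
 * (the list of unused flows) and val 65409 to q->dep[1].
 */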

/*
 * In order to be able to quickly rehash our queue when timer changes
 * q->perturbation, we store flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
	struct flow_keys	keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
	unsigned int hash;

	hash = jhash_3words((__force u32)keys->dst,
			    (__force u32)keys->src ^ keys->ip_proto,
			    (__force u32)keys->ports, q->perturbation);
	return hash & (q->divisor - 1);
}
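
/* Note: masking with (q->divisor - 1) only selects a uniform bucket because
 * the divisor is enforced to be a power of two in sfq_change().
 */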

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl) {
		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
		return sfq_hash(q, skb) + 1;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, fl, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
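			/* fall through: stolen/queued packets also return 0 */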
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

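/* Unlink slot x from the depth list it currently sits on (the dep[] chain
 * matching its old queue length), leaving n/p set to its old neighbours.
 */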
#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might be changed when/if skb uses a standard list_head */

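/* These helpers treat the slot itself as the list head: struct sfq_slot
 * begins with skblist_next/skblist_prev at the same offsets as
 * skb->next/skb->prev, so casting the slot to (struct sk_buff *) yields
 * a valid sentinel node for the circular skb list.
 */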
/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
		qdisc_qstats_backlog_dec(sch, skb);
		/* free the skb only after the stats no longer need it */
		kfree_skb(skb);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair, making a new flow wait behind old
		 * ones, but otherwise we could end up servicing only new flows
		 * and freezing old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}


static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
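	/* A non-positive allot means this slot exhausted its quantum in the
	 * current round: replenish it and move on to the next slot.
	 */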
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
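	/* Allocate the RED parameter block before taking the qdisc tree lock:
	 * GFP_KERNEL allocations may sleep and must not run under that lock.
	 */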
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

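	/* Make every depth list initially point to itself, i.e. empty:
	 * indexes >= SFQ_MAX_FLOWS refer back into dep[].
	 */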
	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		sfq_destroy(sch);
		return -ENOMEM;
	}
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
					     unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");