• Home
  • History
  • Annotate
  • only in this directory
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		ROUTE - implementation of the IP router.
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 *		Alan Cox	:	Verify area fixes.
16 *		Alan Cox	:	cli() protects routing changes
17 *		Rui Oliveira	:	ICMP routing table updates
18 *		(rco@di.uminho.pt)	Routing table insertion and update
19 *		Linus Torvalds	:	Rewrote bits to be sensible
20 *		Alan Cox	:	Added BSD route gw semantics
21 *		Alan Cox	:	Super /proc >4K
22 *		Alan Cox	:	MTU in route table
23 *		Alan Cox	: 	MSS actually. Also added the window
24 *					clamper.
25 *		Sam Lantinga	:	Fixed route matching in rt_del()
26 *		Alan Cox	:	Routing cache support.
27 *		Alan Cox	:	Removed compatibility cruft.
28 *		Alan Cox	:	RTF_REJECT support.
29 *		Alan Cox	:	TCP irtt support.
30 *		Jonathan Naylor	:	Added Metric support.
31 *	Miquel van Smoorenburg	:	BSD API fixes.
32 *	Miquel van Smoorenburg	:	Metrics.
33 *		Alan Cox	:	Use __u32 properly
34 *		Alan Cox	:	Aligned routing errors more closely with BSD
35 *					our system is still very different.
36 *		Alan Cox	:	Faster /proc handling
37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38 *					routing caches and better behaviour.
39 *
40 *		Olaf Erb	:	irtt wasn't being copied right.
41 *		Bjorn Ekwall	:	Kerneld route support.
42 *		Alan Cox	:	Multicast fixed (I hope)
43 * 		Pavel Krauz	:	Limited broadcast fixed
44 *		Mike McLagan	:	Routing by source
45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46 *					route.c and rewritten from scratch.
47 *		Andi Kleen	:	Load-limit warning messages.
48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52 *		Marc Boucher	:	routing by fwmark
53 *	Robert Olsson		:	Added rt_cache statistics
54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58 *
59 *		This program is free software; you can redistribute it and/or
60 *		modify it under the terms of the GNU General Public License
61 *		as published by the Free Software Foundation; either version
62 *		2 of the License, or (at your option) any later version.
63 */
64
65#define pr_fmt(fmt) "IPv4: " fmt
66
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
89#include <linux/rcupdate.h>
90#include <linux/times.h>
91#include <linux/slab.h>
92#include <linux/jhash.h>
93#include <net/dst.h>
94#include <net/net_namespace.h>
95#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#include <linux/kmemleak.h>
110#endif
111#include <net/secure_seq.h>
112
113#define RT_FL_TOS(oldflp4) \
114	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
115
116#define RT_GC_TIMEOUT (300*HZ)
117
118static int ip_rt_max_size;
119static int ip_rt_redirect_number __read_mostly	= 9;
120static int ip_rt_redirect_load __read_mostly	= HZ / 50;
121static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
122static int ip_rt_error_cost __read_mostly	= HZ;
123static int ip_rt_error_burst __read_mostly	= 5 * HZ;
124static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
125static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
126static int ip_rt_min_advmss __read_mostly	= 256;
127
128static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
129/*
130 *	Interface to generic destination cache.
131 */
132
133static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
134static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
135static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
136static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137static void		 ipv4_link_failure(struct sk_buff *skb);
138static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
139					   struct sk_buff *skb, u32 mtu);
140static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141					struct sk_buff *skb);
142static void		ipv4_dst_destroy(struct dst_entry *dst);
143
144static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
145{
146	WARN_ON(1);
147	return NULL;
148}
149
150static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
151					   struct sk_buff *skb,
152					   const void *daddr);
153
154static struct dst_ops ipv4_dst_ops = {
155	.family =		AF_INET,
156	.check =		ipv4_dst_check,
157	.default_advmss =	ipv4_default_advmss,
158	.mtu =			ipv4_mtu,
159	.cow_metrics =		ipv4_cow_metrics,
160	.destroy =		ipv4_dst_destroy,
161	.negative_advice =	ipv4_negative_advice,
162	.link_failure =		ipv4_link_failure,
163	.update_pmtu =		ip_rt_update_pmtu,
164	.redirect =		ip_do_redirect,
165	.local_out =		__ip_local_out,
166	.neigh_lookup =		ipv4_neigh_lookup,
167};
168
169#define ECN_OR_COST(class)	TC_PRIO_##class
170
171const __u8 ip_tos2prio[16] = {
172	TC_PRIO_BESTEFFORT,
173	ECN_OR_COST(BESTEFFORT),
174	TC_PRIO_BESTEFFORT,
175	ECN_OR_COST(BESTEFFORT),
176	TC_PRIO_BULK,
177	ECN_OR_COST(BULK),
178	TC_PRIO_BULK,
179	ECN_OR_COST(BULK),
180	TC_PRIO_INTERACTIVE,
181	ECN_OR_COST(INTERACTIVE),
182	TC_PRIO_INTERACTIVE,
183	ECN_OR_COST(INTERACTIVE),
184	TC_PRIO_INTERACTIVE_BULK,
185	ECN_OR_COST(INTERACTIVE_BULK),
186	TC_PRIO_INTERACTIVE_BULK,
187	ECN_OR_COST(INTERACTIVE_BULK)
188};
189EXPORT_SYMBOL(ip_tos2prio);
190
191static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
192#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
193
194#ifdef CONFIG_PROC_FS
195static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
196{
197	if (*pos)
198		return NULL;
199	return SEQ_START_TOKEN;
200}
201
202static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
203{
204	++*pos;
205	return NULL;
206}
207
/* seq_file stop callback: nothing was acquired in start, nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
211
212static int rt_cache_seq_show(struct seq_file *seq, void *v)
213{
214	if (v == SEQ_START_TOKEN)
215		seq_printf(seq, "%-127s\n",
216			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
217			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
218			   "HHUptod\tSpecDst");
219	return 0;
220}
221
222static const struct seq_operations rt_cache_seq_ops = {
223	.start  = rt_cache_seq_start,
224	.next   = rt_cache_seq_next,
225	.stop   = rt_cache_seq_stop,
226	.show   = rt_cache_seq_show,
227};
228
229static int rt_cache_seq_open(struct inode *inode, struct file *file)
230{
231	return seq_open(file, &rt_cache_seq_ops);
232}
233
234static const struct file_operations rt_cache_seq_fops = {
235	.owner	 = THIS_MODULE,
236	.open	 = rt_cache_seq_open,
237	.read	 = seq_read,
238	.llseek	 = seq_lseek,
239	.release = seq_release,
240};
241
242
243static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
244{
245	int cpu;
246
247	if (*pos == 0)
248		return SEQ_START_TOKEN;
249
250	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
251		if (!cpu_possible(cpu))
252			continue;
253		*pos = cpu+1;
254		return &per_cpu(rt_cache_stat, cpu);
255	}
256	return NULL;
257}
258
259static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
260{
261	int cpu;
262
263	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
264		if (!cpu_possible(cpu))
265			continue;
266		*pos = cpu+1;
267		return &per_cpu(rt_cache_stat, cpu);
268	}
269	return NULL;
270
271}
272
/* seq_file stop callback: start/next take no locks, so nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
277
278static int rt_cpu_seq_show(struct seq_file *seq, void *v)
279{
280	struct rt_cache_stat *st = v;
281
282	if (v == SEQ_START_TOKEN) {
283		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
284		return 0;
285	}
286
287	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
288		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
289		   dst_entries_get_slow(&ipv4_dst_ops),
290		   0, /* st->in_hit */
291		   st->in_slow_tot,
292		   st->in_slow_mc,
293		   st->in_no_route,
294		   st->in_brd,
295		   st->in_martian_dst,
296		   st->in_martian_src,
297
298		   0, /* st->out_hit */
299		   st->out_slow_tot,
300		   st->out_slow_mc,
301
302		   0, /* st->gc_total */
303		   0, /* st->gc_ignored */
304		   0, /* st->gc_goal_miss */
305		   0, /* st->gc_dst_overflow */
306		   0, /* st->in_hlist_search */
307		   0  /* st->out_hlist_search */
308		);
309	return 0;
310}
311
312static const struct seq_operations rt_cpu_seq_ops = {
313	.start  = rt_cpu_seq_start,
314	.next   = rt_cpu_seq_next,
315	.stop   = rt_cpu_seq_stop,
316	.show   = rt_cpu_seq_show,
317};
318
319
320static int rt_cpu_seq_open(struct inode *inode, struct file *file)
321{
322	return seq_open(file, &rt_cpu_seq_ops);
323}
324
325static const struct file_operations rt_cpu_seq_fops = {
326	.owner	 = THIS_MODULE,
327	.open	 = rt_cpu_seq_open,
328	.read	 = seq_read,
329	.llseek	 = seq_lseek,
330	.release = seq_release,
331};
332
333#ifdef CONFIG_IP_ROUTE_CLASSID
334static int rt_acct_proc_show(struct seq_file *m, void *v)
335{
336	struct ip_rt_acct *dst, *src;
337	unsigned int i, j;
338
339	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
340	if (!dst)
341		return -ENOMEM;
342
343	for_each_possible_cpu(i) {
344		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
345		for (j = 0; j < 256; j++) {
346			dst[j].o_bytes   += src[j].o_bytes;
347			dst[j].o_packets += src[j].o_packets;
348			dst[j].i_bytes   += src[j].i_bytes;
349			dst[j].i_packets += src[j].i_packets;
350		}
351	}
352
353	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
354	kfree(dst);
355	return 0;
356}
357
358static int rt_acct_proc_open(struct inode *inode, struct file *file)
359{
360	return single_open(file, rt_acct_proc_show, NULL);
361}
362
363static const struct file_operations rt_acct_proc_fops = {
364	.owner		= THIS_MODULE,
365	.open		= rt_acct_proc_open,
366	.read		= seq_read,
367	.llseek		= seq_lseek,
368	.release	= single_release,
369};
370#endif
371
372static int __net_init ip_rt_do_proc_init(struct net *net)
373{
374	struct proc_dir_entry *pde;
375
376	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
377			  &rt_cache_seq_fops);
378	if (!pde)
379		goto err1;
380
381	pde = proc_create("rt_cache", S_IRUGO,
382			  net->proc_net_stat, &rt_cpu_seq_fops);
383	if (!pde)
384		goto err2;
385
386#ifdef CONFIG_IP_ROUTE_CLASSID
387	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
388	if (!pde)
389		goto err3;
390#endif
391	return 0;
392
393#ifdef CONFIG_IP_ROUTE_CLASSID
394err3:
395	remove_proc_entry("rt_cache", net->proc_net_stat);
396#endif
397err2:
398	remove_proc_entry("rt_cache", net->proc_net);
399err1:
400	return -ENOMEM;
401}
402
403static void __net_exit ip_rt_do_proc_exit(struct net *net)
404{
405	remove_proc_entry("rt_cache", net->proc_net_stat);
406	remove_proc_entry("rt_cache", net->proc_net);
407#ifdef CONFIG_IP_ROUTE_CLASSID
408	remove_proc_entry("rt_acct", net->proc_net);
409#endif
410}
411
412static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
413	.init = ip_rt_do_proc_init,
414	.exit = ip_rt_do_proc_exit,
415};
416
417static int __init ip_rt_proc_init(void)
418{
419	return register_pernet_subsys(&ip_rt_proc_ops);
420}
421
422#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
427#endif /* CONFIG_PROC_FS */
428
429static inline bool rt_is_expired(const struct rtable *rth)
430{
431	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
432}
433
/*
 * Bump the IPv4 route generation id of @net.  Routes stamped with the old
 * id then fail rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
438
439static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
440					   struct sk_buff *skb,
441					   const void *daddr)
442{
443	struct net_device *dev = dst->dev;
444	const __be32 *pkey = daddr;
445	const struct rtable *rt;
446	struct neighbour *n;
447
448	rt = (const struct rtable *) dst;
449	if (rt->rt_gateway)
450		pkey = (const __be32 *) &rt->rt_gateway;
451	else if (skb)
452		pkey = &ip_hdr(skb)->daddr;
453
454	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
455	if (n)
456		return n;
457	return neigh_create(&arp_tbl, pkey, dev);
458}
459
460#define IP_IDENTS_SZ 2048u
461struct ip_ident_bucket {
462	atomic_t	id;
463	u32		stamp32;
464};
465
466static struct ip_ident_bucket *ip_idents __read_mostly;
467
468/* In order to protect privacy, we add a perturbation to identifiers
469 * if one generator is seldom used. This makes hard for an attacker
470 * to infer how many packets were sent between two points in time.
471 */
472u32 ip_idents_reserve(u32 hash, int segs)
473{
474	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
475	u32 old = ACCESS_ONCE(bucket->stamp32);
476	u32 now = (u32)jiffies;
477	u32 delta = 0;
478
479	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
480		delta = prandom_u32_max(now - old);
481
482	return atomic_add_return(segs + delta, &bucket->id) - segs;
483}
484EXPORT_SYMBOL(ip_idents_reserve);
485
486void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
487{
488	static u32 ip_idents_hashrnd __read_mostly;
489	u32 hash, id;
490
491	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
492
493	hash = jhash_3words((__force u32)iph->daddr,
494			    (__force u32)iph->saddr,
495			    iph->protocol ^ net_hash_mix(net),
496			    ip_idents_hashrnd);
497	id = ip_idents_reserve(hash, segs);
498	iph->id = htons(id);
499}
500EXPORT_SYMBOL(__ip_select_ident);
501
502static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
503			     const struct iphdr *iph,
504			     int oif, u8 tos,
505			     u8 prot, u32 mark, int flow_flags)
506{
507	if (sk) {
508		const struct inet_sock *inet = inet_sk(sk);
509
510		oif = sk->sk_bound_dev_if;
511		mark = sk->sk_mark;
512		tos = RT_CONN_FLAGS(sk);
513		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
514	}
515	flowi4_init_output(fl4, oif, mark, tos,
516			   RT_SCOPE_UNIVERSE, prot,
517			   flow_flags,
518			   iph->daddr, iph->saddr, 0, 0);
519}
520
521static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
522			       const struct sock *sk)
523{
524	const struct iphdr *iph = ip_hdr(skb);
525	int oif = skb->dev->ifindex;
526	u8 tos = RT_TOS(iph->tos);
527	u8 prot = iph->protocol;
528	u32 mark = skb->mark;
529
530	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
531}
532
533static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
534{
535	const struct inet_sock *inet = inet_sk(sk);
536	const struct ip_options_rcu *inet_opt;
537	__be32 daddr = inet->inet_daddr;
538
539	rcu_read_lock();
540	inet_opt = rcu_dereference(inet->inet_opt);
541	if (inet_opt && inet_opt->opt.srr)
542		daddr = inet_opt->opt.faddr;
543	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
544			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
545			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
546			   inet_sk_flowi_flags(sk),
547			   daddr, inet->inet_saddr, 0, 0);
548	rcu_read_unlock();
549}
550
/* Build a flow key from @skb when there is one, otherwise from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
559
560static inline void rt_free(struct rtable *rt)
561{
562	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
563}
564
/* Taken by update_or_create_fnhe() around nexthop exception updates. */
static DEFINE_SPINLOCK(fnhe_lock);
566
567static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
568{
569	struct rtable *rt;
570
571	rt = rcu_dereference(fnhe->fnhe_rth_input);
572	if (rt) {
573		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
574		rt_free(rt);
575	}
576	rt = rcu_dereference(fnhe->fnhe_rth_output);
577	if (rt) {
578		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
579		rt_free(rt);
580	}
581}
582
583static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
584{
585	struct fib_nh_exception *fnhe, *oldest;
586
587	oldest = rcu_dereference(hash->chain);
588	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
589	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
590		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
591			oldest = fnhe;
592	}
593	fnhe_flush_routes(oldest);
594	return oldest;
595}
596
597static inline u32 fnhe_hashfun(__be32 daddr)
598{
599	static u32 fnhe_hashrnd __read_mostly;
600	u32 hval;
601
602	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
603	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
604	return hash_32(hval, FNHE_HASH_SHIFT);
605}
606
607static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
608{
609	rt->rt_pmtu = fnhe->fnhe_pmtu;
610	rt->dst.expires = fnhe->fnhe_expires;
611
612	if (fnhe->fnhe_gw) {
613		rt->rt_flags |= RTCF_REDIRECTED;
614		rt->rt_gateway = fnhe->fnhe_gw;
615		rt->rt_uses_gateway = 1;
616	}
617}
618
619static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
620				  u32 pmtu, unsigned long expires)
621{
622	struct fnhe_hash_bucket *hash;
623	struct fib_nh_exception *fnhe;
624	struct rtable *rt;
625	unsigned int i;
626	int depth;
627	u32 hval = fnhe_hashfun(daddr);
628
629	spin_lock_bh(&fnhe_lock);
630
631	hash = rcu_dereference(nh->nh_exceptions);
632	if (!hash) {
633		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
634		if (!hash)
635			goto out_unlock;
636		rcu_assign_pointer(nh->nh_exceptions, hash);
637	}
638
639	hash += hval;
640
641	depth = 0;
642	for (fnhe = rcu_dereference(hash->chain); fnhe;
643	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
644		if (fnhe->fnhe_daddr == daddr)
645			break;
646		depth++;
647	}
648
649	if (fnhe) {
650		if (gw)
651			fnhe->fnhe_gw = gw;
652		if (pmtu) {
653			fnhe->fnhe_pmtu = pmtu;
654			fnhe->fnhe_expires = max(1UL, expires);
655		}
656		/* Update all cached dsts too */
657		rt = rcu_dereference(fnhe->fnhe_rth_input);
658		if (rt)
659			fill_route_from_fnhe(rt, fnhe);
660		rt = rcu_dereference(fnhe->fnhe_rth_output);
661		if (rt)
662			fill_route_from_fnhe(rt, fnhe);
663	} else {
664		if (depth > FNHE_RECLAIM_DEPTH)
665			fnhe = fnhe_oldest(hash);
666		else {
667			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
668			if (!fnhe)
669				goto out_unlock;
670
671			fnhe->fnhe_next = hash->chain;
672			rcu_assign_pointer(hash->chain, fnhe);
673		}
674		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
675		fnhe->fnhe_daddr = daddr;
676		fnhe->fnhe_gw = gw;
677		fnhe->fnhe_pmtu = pmtu;
678		fnhe->fnhe_expires = expires;
679
680		/* Exception created; mark the cached routes for the nexthop
681		 * stale, so anyone caching it rechecks if this exception
682		 * applies to them.
683		 */
684		rt = rcu_dereference(nh->nh_rth_input);
685		if (rt)
686			rt->dst.obsolete = DST_OBSOLETE_KILL;
687
688		for_each_possible_cpu(i) {
689			struct rtable __rcu **prt;
690			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
691			rt = rcu_dereference(*prt);
692			if (rt)
693				rt->dst.obsolete = DST_OBSOLETE_KILL;
694		}
695	}
696
697	fnhe->fnhe_stamp = jiffies;
698
699out_unlock:
700	spin_unlock_bh(&fnhe_lock);
701}
702
703static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
704			     bool kill_route)
705{
706	__be32 new_gw = icmp_hdr(skb)->un.gateway;
707	__be32 old_gw = ip_hdr(skb)->saddr;
708	struct net_device *dev = skb->dev;
709	struct in_device *in_dev;
710	struct fib_result res;
711	struct neighbour *n;
712	struct net *net;
713
714	switch (icmp_hdr(skb)->code & 7) {
715	case ICMP_REDIR_NET:
716	case ICMP_REDIR_NETTOS:
717	case ICMP_REDIR_HOST:
718	case ICMP_REDIR_HOSTTOS:
719		break;
720
721	default:
722		return;
723	}
724
725	if (rt->rt_gateway != old_gw)
726		return;
727
728	in_dev = __in_dev_get_rcu(dev);
729	if (!in_dev)
730		return;
731
732	net = dev_net(dev);
733	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
734	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
735	    ipv4_is_zeronet(new_gw))
736		goto reject_redirect;
737
738	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
739		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
740			goto reject_redirect;
741		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
742			goto reject_redirect;
743	} else {
744		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
745			goto reject_redirect;
746	}
747
748	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
749	if (!IS_ERR(n)) {
750		if (!(n->nud_state & NUD_VALID)) {
751			neigh_event_send(n, NULL);
752		} else {
753			if (fib_lookup(net, fl4, &res) == 0) {
754				struct fib_nh *nh = &FIB_RES_NH(res);
755
756				update_or_create_fnhe(nh, fl4->daddr, new_gw,
757						0, jiffies + ip_rt_gc_timeout);
758			}
759			if (kill_route)
760				rt->dst.obsolete = DST_OBSOLETE_KILL;
761			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
762		}
763		neigh_release(n);
764	}
765	return;
766
767reject_redirect:
768#ifdef CONFIG_IP_ROUTE_VERBOSE
769	if (IN_DEV_LOG_MARTIANS(in_dev)) {
770		const struct iphdr *iph = (const struct iphdr *) skb->data;
771		__be32 daddr = iph->daddr;
772		__be32 saddr = iph->saddr;
773
774		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
775				     "  Advised path = %pI4 -> %pI4\n",
776				     &old_gw, dev->name, &new_gw,
777				     &saddr, &daddr);
778	}
779#endif
780	;
781}
782
783static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
784{
785	struct rtable *rt;
786	struct flowi4 fl4;
787	const struct iphdr *iph = (const struct iphdr *) skb->data;
788	int oif = skb->dev->ifindex;
789	u8 tos = RT_TOS(iph->tos);
790	u8 prot = iph->protocol;
791	u32 mark = skb->mark;
792
793	rt = (struct rtable *) dst;
794
795	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
796	__ip_do_redirect(rt, skb, &fl4, true);
797}
798
799static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
800{
801	struct rtable *rt = (struct rtable *)dst;
802	struct dst_entry *ret = dst;
803
804	if (rt) {
805		if (dst->obsolete > 0) {
806			ip_rt_put(rt);
807			ret = NULL;
808		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
809			   rt->dst.expires) {
810			ip_rt_put(rt);
811			ret = NULL;
812		}
813	}
814	return ret;
815}
816
817/*
818 * Algorithm:
819 *	1. The first ip_rt_redirect_number redirects are sent
820 *	   with exponential backoff, then we stop sending them at all,
821 *	   assuming that the host ignores our redirects.
822 *	2. If we did not see packets requiring redirects
823 *	   during ip_rt_redirect_silence, we assume that the host
824 *	   forgot redirected route and start to send redirects again.
825 *
826 * This algorithm is much cheaper and more intelligent than dumb load limiting
827 * in icmp.c.
828 *
829 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
830 * and "frag. need" (breaks PMTU discovery) in icmp.c.
831 */
832
833void ip_rt_send_redirect(struct sk_buff *skb)
834{
835	struct rtable *rt = skb_rtable(skb);
836	struct in_device *in_dev;
837	struct inet_peer *peer;
838	struct net *net;
839	int log_martians;
840
841	rcu_read_lock();
842	in_dev = __in_dev_get_rcu(rt->dst.dev);
843	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
844		rcu_read_unlock();
845		return;
846	}
847	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
848	rcu_read_unlock();
849
850	net = dev_net(rt->dst.dev);
851	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
852	if (!peer) {
853		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
854			  rt_nexthop(rt, ip_hdr(skb)->daddr));
855		return;
856	}
857
858	/* No redirected packets during ip_rt_redirect_silence;
859	 * reset the algorithm.
860	 */
861	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
862		peer->rate_tokens = 0;
863
864	/* Too many ignored redirects; do not send anything
865	 * set dst.rate_last to the last seen redirected packet.
866	 */
867	if (peer->rate_tokens >= ip_rt_redirect_number) {
868		peer->rate_last = jiffies;
869		goto out_put_peer;
870	}
871
872	/* Check for load limit; set rate_last to the latest sent
873	 * redirect.
874	 */
875	if (peer->rate_tokens == 0 ||
876	    time_after(jiffies,
877		       (peer->rate_last +
878			(ip_rt_redirect_load << peer->rate_tokens)))) {
879		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
880
881		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
882		peer->rate_last = jiffies;
883		++peer->rate_tokens;
884#ifdef CONFIG_IP_ROUTE_VERBOSE
885		if (log_martians &&
886		    peer->rate_tokens == ip_rt_redirect_number)
887			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
888					     &ip_hdr(skb)->saddr, inet_iif(skb),
889					     &ip_hdr(skb)->daddr, &gw);
890#endif
891	}
892out_put_peer:
893	inet_putpeer(peer);
894}
895
896static int ip_error(struct sk_buff *skb)
897{
898	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
899	struct rtable *rt = skb_rtable(skb);
900	struct inet_peer *peer;
901	unsigned long now;
902	struct net *net;
903	bool send;
904	int code;
905
906	/* IP on this device is disabled. */
907	if (!in_dev)
908		goto out;
909
910	net = dev_net(rt->dst.dev);
911	if (!IN_DEV_FORWARD(in_dev)) {
912		switch (rt->dst.error) {
913		case EHOSTUNREACH:
914			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
915			break;
916
917		case ENETUNREACH:
918			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
919			break;
920		}
921		goto out;
922	}
923
924	switch (rt->dst.error) {
925	case EINVAL:
926	default:
927		goto out;
928	case EHOSTUNREACH:
929		code = ICMP_HOST_UNREACH;
930		break;
931	case ENETUNREACH:
932		code = ICMP_NET_UNREACH;
933		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
934		break;
935	case EACCES:
936		code = ICMP_PKT_FILTERED;
937		break;
938	}
939
940	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
941
942	send = true;
943	if (peer) {
944		now = jiffies;
945		peer->rate_tokens += now - peer->rate_last;
946		if (peer->rate_tokens > ip_rt_error_burst)
947			peer->rate_tokens = ip_rt_error_burst;
948		peer->rate_last = now;
949		if (peer->rate_tokens >= ip_rt_error_cost)
950			peer->rate_tokens -= ip_rt_error_cost;
951		else
952			send = false;
953		inet_putpeer(peer);
954	}
955	if (send)
956		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
957
958out:	kfree_skb(skb);
959	return 0;
960}
961
962static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
963{
964	struct dst_entry *dst = &rt->dst;
965	struct fib_result res;
966
967	if (dst_metric_locked(dst, RTAX_MTU))
968		return;
969
970	if (ipv4_mtu(dst) < mtu)
971		return;
972
973	if (mtu < ip_rt_min_pmtu)
974		mtu = ip_rt_min_pmtu;
975
976	if (rt->rt_pmtu == mtu &&
977	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
978		return;
979
980	rcu_read_lock();
981	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
982		struct fib_nh *nh = &FIB_RES_NH(res);
983
984		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
985				      jiffies + ip_rt_mtu_expires);
986	}
987	rcu_read_unlock();
988}
989
990static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
991			      struct sk_buff *skb, u32 mtu)
992{
993	struct rtable *rt = (struct rtable *) dst;
994	struct flowi4 fl4;
995
996	ip_rt_build_flow_key(&fl4, sk, skb);
997	__ip_rt_update_pmtu(rt, &fl4, mtu);
998}
999
1000void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1001		      int oif, u32 mark, u8 protocol, int flow_flags)
1002{
1003	const struct iphdr *iph = (const struct iphdr *) skb->data;
1004	struct flowi4 fl4;
1005	struct rtable *rt;
1006
1007	if (!mark)
1008		mark = IP4_REPLY_MARK(net, skb->mark);
1009
1010	__build_flow_key(&fl4, NULL, iph, oif,
1011			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1012	rt = __ip_route_output_key(net, &fl4);
1013	if (!IS_ERR(rt)) {
1014		__ip_rt_update_pmtu(rt, &fl4, mtu);
1015		ip_rt_put(rt);
1016	}
1017}
1018EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1019
1020static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1021{
1022	const struct iphdr *iph = (const struct iphdr *) skb->data;
1023	struct flowi4 fl4;
1024	struct rtable *rt;
1025
1026	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1027
1028	if (!fl4.flowi4_mark)
1029		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1030
1031	rt = __ip_route_output_key(sock_net(sk), &fl4);
1032	if (!IS_ERR(rt)) {
1033		__ip_rt_update_pmtu(rt, &fl4, mtu);
1034		ip_rt_put(rt);
1035	}
1036}
1037
1038void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1039{
1040	const struct iphdr *iph = (const struct iphdr *) skb->data;
1041	struct flowi4 fl4;
1042	struct rtable *rt;
1043	struct dst_entry *odst = NULL;
1044	bool new = false;
1045
1046	bh_lock_sock(sk);
1047
1048	if (!ip_sk_accept_pmtu(sk))
1049		goto out;
1050
1051	odst = sk_dst_get(sk);
1052
1053	if (sock_owned_by_user(sk) || !odst) {
1054		__ipv4_sk_update_pmtu(skb, sk, mtu);
1055		goto out;
1056	}
1057
1058	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1059
1060	rt = (struct rtable *)odst;
1061	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1062		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1063		if (IS_ERR(rt))
1064			goto out;
1065
1066		new = true;
1067	}
1068
1069	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1070
1071	if (!dst_check(&rt->dst, 0)) {
1072		if (new)
1073			dst_release(&rt->dst);
1074
1075		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1076		if (IS_ERR(rt))
1077			goto out;
1078
1079		new = true;
1080	}
1081
1082	if (new)
1083		sk_dst_set(sk, &rt->dst);
1084
1085out:
1086	bh_unlock_sock(sk);
1087	dst_release(odst);
1088}
1089EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1090
1091void ipv4_redirect(struct sk_buff *skb, struct net *net,
1092		   int oif, u32 mark, u8 protocol, int flow_flags)
1093{
1094	const struct iphdr *iph = (const struct iphdr *) skb->data;
1095	struct flowi4 fl4;
1096	struct rtable *rt;
1097
1098	__build_flow_key(&fl4, NULL, iph, oif,
1099			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1100	rt = __ip_route_output_key(net, &fl4);
1101	if (!IS_ERR(rt)) {
1102		__ip_do_redirect(rt, skb, &fl4, false);
1103		ip_rt_put(rt);
1104	}
1105}
1106EXPORT_SYMBOL_GPL(ipv4_redirect);
1107
1108void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1109{
1110	const struct iphdr *iph = (const struct iphdr *) skb->data;
1111	struct flowi4 fl4;
1112	struct rtable *rt;
1113
1114	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1115	rt = __ip_route_output_key(sock_net(sk), &fl4);
1116	if (!IS_ERR(rt)) {
1117		__ip_do_redirect(rt, skb, &fl4, false);
1118		ip_rt_put(rt);
1119	}
1120}
1121EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1122
1123static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1124{
1125	struct rtable *rt = (struct rtable *) dst;
1126
1127	/* All IPV4 dsts are created with ->obsolete set to the value
1128	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1129	 * into this function always.
1130	 *
1131	 * When a PMTU/redirect information update invalidates a route,
1132	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1133	 * DST_OBSOLETE_DEAD by dst_free().
1134	 */
1135	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1136		return NULL;
1137	return dst;
1138}
1139
1140static void ipv4_link_failure(struct sk_buff *skb)
1141{
1142	struct rtable *rt;
1143
1144	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1145
1146	rt = skb_rtable(skb);
1147	if (rt)
1148		dst_set_expires(&rt->dst, 0);
1149}
1150
1151static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1152{
1153	pr_debug("%s: %pI4 -> %pI4, %s\n",
1154		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1155		 skb->dev ? skb->dev->name : "?");
1156	kfree_skb(skb);
1157	WARN_ON(1);
1158	return 0;
1159}
1160
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1169
1170void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1171{
1172	__be32 src;
1173
1174	if (rt_is_output_route(rt))
1175		src = ip_hdr(skb)->saddr;
1176	else {
1177		struct fib_result res;
1178		struct flowi4 fl4;
1179		struct iphdr *iph;
1180
1181		iph = ip_hdr(skb);
1182
1183		memset(&fl4, 0, sizeof(fl4));
1184		fl4.daddr = iph->daddr;
1185		fl4.saddr = iph->saddr;
1186		fl4.flowi4_tos = RT_TOS(iph->tos);
1187		fl4.flowi4_oif = rt->dst.dev->ifindex;
1188		fl4.flowi4_iif = skb->dev->ifindex;
1189		fl4.flowi4_mark = skb->mark;
1190
1191		rcu_read_lock();
1192		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1193			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1194		else
1195			src = inet_select_addr(rt->dst.dev,
1196					       rt_nexthop(rt, iph->daddr),
1197					       RT_SCOPE_UNIVERSE);
1198		rcu_read_unlock();
1199	}
1200	memcpy(addr, &src, 4);
1201}
1202
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently: a half is taken from @tag only if
 * that half of the route's tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1212
1213static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1214{
1215	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1216
1217	if (advmss == 0) {
1218		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1219			       ip_rt_min_advmss);
1220		if (advmss > 65535 - 40)
1221			advmss = 65535 - 40;
1222	}
1223	return advmss;
1224}
1225
1226static unsigned int ipv4_mtu(const struct dst_entry *dst)
1227{
1228	const struct rtable *rt = (const struct rtable *) dst;
1229	unsigned int mtu = rt->rt_pmtu;
1230
1231	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1232		mtu = dst_metric_raw(dst, RTAX_MTU);
1233
1234	if (mtu)
1235		return mtu;
1236
1237	mtu = dst->dev->mtu;
1238
1239	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1240		if (rt->rt_uses_gateway && mtu > 576)
1241			mtu = 576;
1242	}
1243
1244	return min_t(unsigned int, mtu, IP_MAX_MTU);
1245}
1246
1247static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1248{
1249	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1250	struct fib_nh_exception *fnhe;
1251	u32 hval;
1252
1253	if (!hash)
1254		return NULL;
1255
1256	hval = fnhe_hashfun(daddr);
1257
1258	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1259	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1260		if (fnhe->fnhe_daddr == daddr)
1261			return fnhe;
1262	}
1263	return NULL;
1264}
1265
1266static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1267			      __be32 daddr)
1268{
1269	bool ret = false;
1270
1271	spin_lock_bh(&fnhe_lock);
1272
1273	if (daddr == fnhe->fnhe_daddr) {
1274		struct rtable __rcu **porig;
1275		struct rtable *orig;
1276		int genid = fnhe_genid(dev_net(rt->dst.dev));
1277
1278		if (rt_is_input_route(rt))
1279			porig = &fnhe->fnhe_rth_input;
1280		else
1281			porig = &fnhe->fnhe_rth_output;
1282		orig = rcu_dereference(*porig);
1283
1284		if (fnhe->fnhe_genid != genid) {
1285			fnhe->fnhe_genid = genid;
1286			fnhe->fnhe_gw = 0;
1287			fnhe->fnhe_pmtu = 0;
1288			fnhe->fnhe_expires = 0;
1289			fnhe_flush_routes(fnhe);
1290			orig = NULL;
1291		}
1292		fill_route_from_fnhe(rt, fnhe);
1293		if (!rt->rt_gateway)
1294			rt->rt_gateway = daddr;
1295
1296		if (!(rt->dst.flags & DST_NOCACHE)) {
1297			rcu_assign_pointer(*porig, rt);
1298			if (orig)
1299				rt_free(orig);
1300			ret = true;
1301		}
1302
1303		fnhe->fnhe_stamp = jiffies;
1304	}
1305	spin_unlock_bh(&fnhe_lock);
1306
1307	return ret;
1308}
1309
1310static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1311{
1312	struct rtable *orig, *prev, **p;
1313	bool ret = true;
1314
1315	if (rt_is_input_route(rt)) {
1316		p = (struct rtable **)&nh->nh_rth_input;
1317	} else {
1318		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1319	}
1320	orig = *p;
1321
1322	prev = cmpxchg(p, orig, rt);
1323	if (prev == orig) {
1324		if (orig)
1325			rt_free(orig);
1326	} else
1327		ret = false;
1328
1329	return ret;
1330}
1331
1332struct uncached_list {
1333	spinlock_t		lock;
1334	struct list_head	head;
1335};
1336
1337static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1338
1339static void rt_add_uncached_list(struct rtable *rt)
1340{
1341	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1342
1343	rt->rt_uncached_list = ul;
1344
1345	spin_lock_bh(&ul->lock);
1346	list_add_tail(&rt->rt_uncached, &ul->head);
1347	spin_unlock_bh(&ul->lock);
1348}
1349
1350static void ipv4_dst_destroy(struct dst_entry *dst)
1351{
1352	struct rtable *rt = (struct rtable *) dst;
1353
1354	if (!list_empty(&rt->rt_uncached)) {
1355		struct uncached_list *ul = rt->rt_uncached_list;
1356
1357		spin_lock_bh(&ul->lock);
1358		list_del(&rt->rt_uncached);
1359		spin_unlock_bh(&ul->lock);
1360	}
1361}
1362
1363void rt_flush_dev(struct net_device *dev)
1364{
1365	struct net *net = dev_net(dev);
1366	struct rtable *rt;
1367	int cpu;
1368
1369	for_each_possible_cpu(cpu) {
1370		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1371
1372		spin_lock_bh(&ul->lock);
1373		list_for_each_entry(rt, &ul->head, rt_uncached) {
1374			if (rt->dst.dev != dev)
1375				continue;
1376			rt->dst.dev = net->loopback_dev;
1377			dev_hold(rt->dst.dev);
1378			dev_put(dev);
1379		}
1380		spin_unlock_bh(&ul->lock);
1381	}
1382}
1383
1384static bool rt_cache_valid(const struct rtable *rt)
1385{
1386	return	rt &&
1387		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1388		!rt_is_expired(rt);
1389}
1390
1391static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1392			   const struct fib_result *res,
1393			   struct fib_nh_exception *fnhe,
1394			   struct fib_info *fi, u16 type, u32 itag)
1395{
1396	bool cached = false;
1397
1398	if (fi) {
1399		struct fib_nh *nh = &FIB_RES_NH(*res);
1400
1401		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1402			rt->rt_gateway = nh->nh_gw;
1403			rt->rt_uses_gateway = 1;
1404		}
1405		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1406#ifdef CONFIG_IP_ROUTE_CLASSID
1407		rt->dst.tclassid = nh->nh_tclassid;
1408#endif
1409		if (unlikely(fnhe))
1410			cached = rt_bind_exception(rt, fnhe, daddr);
1411		else if (!(rt->dst.flags & DST_NOCACHE))
1412			cached = rt_cache_route(nh, rt);
1413		if (unlikely(!cached)) {
1414			/* Routes we intend to cache in nexthop exception or
1415			 * FIB nexthop have the DST_NOCACHE bit clear.
1416			 * However, if we are unsuccessful at storing this
1417			 * route into the cache we really need to set it.
1418			 */
1419			rt->dst.flags |= DST_NOCACHE;
1420			if (!rt->rt_gateway)
1421				rt->rt_gateway = daddr;
1422			rt_add_uncached_list(rt);
1423		}
1424	} else
1425		rt_add_uncached_list(rt);
1426
1427#ifdef CONFIG_IP_ROUTE_CLASSID
1428#ifdef CONFIG_IP_MULTIPLE_TABLES
1429	set_class_tag(rt, res->tclassid);
1430#endif
1431	set_class_tag(rt, itag);
1432#endif
1433}
1434
1435static struct rtable *rt_dst_alloc(struct net_device *dev,
1436				   bool nopolicy, bool noxfrm, bool will_cache)
1437{
1438	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1439			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1440			 (nopolicy ? DST_NOPOLICY : 0) |
1441			 (noxfrm ? DST_NOXFRM : 0));
1442}
1443
1444/* called in rcu_read_lock() section */
1445static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1446				u8 tos, struct net_device *dev, int our)
1447{
1448	struct rtable *rth;
1449	struct in_device *in_dev = __in_dev_get_rcu(dev);
1450	u32 itag = 0;
1451	int err;
1452
1453	/* Primary sanity checks. */
1454
1455	if (!in_dev)
1456		return -EINVAL;
1457
1458	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1459	    skb->protocol != htons(ETH_P_IP))
1460		goto e_inval;
1461
1462	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1463		if (ipv4_is_loopback(saddr))
1464			goto e_inval;
1465
1466	if (ipv4_is_zeronet(saddr)) {
1467		if (!ipv4_is_local_multicast(daddr))
1468			goto e_inval;
1469	} else {
1470		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1471					  in_dev, &itag);
1472		if (err < 0)
1473			goto e_err;
1474	}
1475	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1476			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1477	if (!rth)
1478		goto e_nobufs;
1479
1480#ifdef CONFIG_IP_ROUTE_CLASSID
1481	rth->dst.tclassid = itag;
1482#endif
1483	rth->dst.output = ip_rt_bug;
1484
1485	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
1486	rth->rt_flags	= RTCF_MULTICAST;
1487	rth->rt_type	= RTN_MULTICAST;
1488	rth->rt_is_input= 1;
1489	rth->rt_iif	= 0;
1490	rth->rt_pmtu	= 0;
1491	rth->rt_gateway	= 0;
1492	rth->rt_uses_gateway = 0;
1493	INIT_LIST_HEAD(&rth->rt_uncached);
1494	if (our) {
1495		rth->dst.input= ip_local_deliver;
1496		rth->rt_flags |= RTCF_LOCAL;
1497	}
1498
1499#ifdef CONFIG_IP_MROUTE
1500	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1501		rth->dst.input = ip_mr_input;
1502#endif
1503	RT_CACHE_STAT_INC(in_slow_mc);
1504
1505	skb_dst_set(skb, &rth->dst);
1506	return 0;
1507
1508e_nobufs:
1509	return -ENOBUFS;
1510e_inval:
1511	return -EINVAL;
1512e_err:
1513	return err;
1514}
1515
1516
1517static void ip_handle_martian_source(struct net_device *dev,
1518				     struct in_device *in_dev,
1519				     struct sk_buff *skb,
1520				     __be32 daddr,
1521				     __be32 saddr)
1522{
1523	RT_CACHE_STAT_INC(in_martian_src);
1524#ifdef CONFIG_IP_ROUTE_VERBOSE
1525	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1526		/*
1527		 *	RFC1812 recommendation, if source is martian,
1528		 *	the only hint is MAC header.
1529		 */
1530		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1531			&daddr, &saddr, dev->name);
1532		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1533			print_hex_dump(KERN_WARNING, "ll header: ",
1534				       DUMP_PREFIX_OFFSET, 16, 1,
1535				       skb_mac_header(skb),
1536				       dev->hard_header_len, true);
1537		}
1538	}
1539#endif
1540}
1541
1542static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1543{
1544	struct fnhe_hash_bucket *hash;
1545	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1546	u32 hval = fnhe_hashfun(daddr);
1547
1548	spin_lock_bh(&fnhe_lock);
1549
1550	hash = rcu_dereference_protected(nh->nh_exceptions,
1551					 lockdep_is_held(&fnhe_lock));
1552	hash += hval;
1553
1554	fnhe_p = &hash->chain;
1555	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1556	while (fnhe) {
1557		if (fnhe->fnhe_daddr == daddr) {
1558			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1559				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1560			fnhe_flush_routes(fnhe);
1561			kfree_rcu(fnhe, rcu);
1562			break;
1563		}
1564		fnhe_p = &fnhe->fnhe_next;
1565		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1566						 lockdep_is_held(&fnhe_lock));
1567	}
1568
1569	spin_unlock_bh(&fnhe_lock);
1570}
1571
1572/* called in rcu_read_lock() section */
1573static int __mkroute_input(struct sk_buff *skb,
1574			   const struct fib_result *res,
1575			   struct in_device *in_dev,
1576			   __be32 daddr, __be32 saddr, u32 tos)
1577{
1578	struct fib_nh_exception *fnhe;
1579	struct rtable *rth;
1580	int err;
1581	struct in_device *out_dev;
1582	unsigned int flags = 0;
1583	bool do_cache;
1584	u32 itag = 0;
1585
1586	/* get a working reference to the output device */
1587	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1588	if (!out_dev) {
1589		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1590		return -EINVAL;
1591	}
1592
1593	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1594				  in_dev->dev, in_dev, &itag);
1595	if (err < 0) {
1596		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1597					 saddr);
1598
1599		goto cleanup;
1600	}
1601
1602	do_cache = res->fi && !itag;
1603	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1604	    skb->protocol == htons(ETH_P_IP) &&
1605	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1606	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1607		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1608
1609	if (skb->protocol != htons(ETH_P_IP)) {
1610		/* Not IP (i.e. ARP). Do not create route, if it is
1611		 * invalid for proxy arp. DNAT routes are always valid.
1612		 *
1613		 * Proxy arp feature have been extended to allow, ARP
1614		 * replies back to the same interface, to support
1615		 * Private VLAN switch technologies. See arp.c.
1616		 */
1617		if (out_dev == in_dev &&
1618		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1619			err = -EINVAL;
1620			goto cleanup;
1621		}
1622	}
1623
1624	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1625	if (do_cache) {
1626		if (fnhe) {
1627			rth = rcu_dereference(fnhe->fnhe_rth_input);
1628			if (rth && rth->dst.expires &&
1629			    time_after(jiffies, rth->dst.expires)) {
1630				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1631				fnhe = NULL;
1632			} else {
1633				goto rt_cache;
1634			}
1635		}
1636
1637		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1638
1639rt_cache:
1640		if (rt_cache_valid(rth)) {
1641			skb_dst_set_noref(skb, &rth->dst);
1642			goto out;
1643		}
1644	}
1645
1646	rth = rt_dst_alloc(out_dev->dev,
1647			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1648			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1649	if (!rth) {
1650		err = -ENOBUFS;
1651		goto cleanup;
1652	}
1653
1654	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1655	rth->rt_flags = flags;
1656	rth->rt_type = res->type;
1657	rth->rt_is_input = 1;
1658	rth->rt_iif 	= 0;
1659	rth->rt_pmtu	= 0;
1660	rth->rt_gateway	= 0;
1661	rth->rt_uses_gateway = 0;
1662	INIT_LIST_HEAD(&rth->rt_uncached);
1663	RT_CACHE_STAT_INC(in_slow_tot);
1664
1665	rth->dst.input = ip_forward;
1666	rth->dst.output = ip_output;
1667
1668	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1669	skb_dst_set(skb, &rth->dst);
1670out:
1671	err = 0;
1672 cleanup:
1673	return err;
1674}
1675
1676static int ip_mkroute_input(struct sk_buff *skb,
1677			    struct fib_result *res,
1678			    const struct flowi4 *fl4,
1679			    struct in_device *in_dev,
1680			    __be32 daddr, __be32 saddr, u32 tos)
1681{
1682#ifdef CONFIG_IP_ROUTE_MULTIPATH
1683	if (res->fi && res->fi->fib_nhs > 1)
1684		fib_select_multipath(res);
1685#endif
1686
1687	/* create a routing cache entry */
1688	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1689}
1690
1691/*
1692 *	NOTE. We drop all the packets that has local source
1693 *	addresses, because every properly looped back packet
1694 *	must have correct destination already attached by output routine.
1695 *
1696 *	Such approach solves two big problems:
1697 *	1. Not simplex devices are handled properly.
1698 *	2. IP spoofing attempts are filtered with 100% of guarantee.
1699 *	called with rcu_read_lock()
1700 */
1701
1702static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1703			       u8 tos, struct net_device *dev)
1704{
1705	struct fib_result res;
1706	struct in_device *in_dev = __in_dev_get_rcu(dev);
1707	struct flowi4	fl4;
1708	unsigned int	flags = 0;
1709	u32		itag = 0;
1710	struct rtable	*rth;
1711	int		err = -EINVAL;
1712	struct net    *net = dev_net(dev);
1713	bool do_cache;
1714
1715	/* IP on this device is disabled. */
1716
1717	if (!in_dev)
1718		goto out;
1719
1720	/* Check for the most weird martians, which can be not detected
1721	   by fib_lookup.
1722	 */
1723
1724	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1725		goto martian_source;
1726
1727	res.fi = NULL;
1728	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1729		goto brd_input;
1730
1731	/* Accept zero addresses only to limited broadcast;
1732	 * I even do not know to fix it or not. Waiting for complains :-)
1733	 */
1734	if (ipv4_is_zeronet(saddr))
1735		goto martian_source;
1736
1737	if (ipv4_is_zeronet(daddr))
1738		goto martian_destination;
1739
1740	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1741	 * and call it once if daddr or/and saddr are loopback addresses
1742	 */
1743	if (ipv4_is_loopback(daddr)) {
1744		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1745			goto martian_destination;
1746	} else if (ipv4_is_loopback(saddr)) {
1747		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1748			goto martian_source;
1749	}
1750
1751	/*
1752	 *	Now we are ready to route packet.
1753	 */
1754	fl4.flowi4_oif = 0;
1755	fl4.flowi4_iif = dev->ifindex;
1756	fl4.flowi4_mark = skb->mark;
1757	fl4.flowi4_tos = tos;
1758	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1759	fl4.daddr = daddr;
1760	fl4.saddr = saddr;
1761	err = fib_lookup(net, &fl4, &res);
1762	if (err != 0) {
1763		if (!IN_DEV_FORWARD(in_dev))
1764			err = -EHOSTUNREACH;
1765		goto no_route;
1766	}
1767
1768	if (res.type == RTN_BROADCAST)
1769		goto brd_input;
1770
1771	if (res.type == RTN_LOCAL) {
1772		err = fib_validate_source(skb, saddr, daddr, tos,
1773					  0, dev, in_dev, &itag);
1774		if (err < 0)
1775			goto martian_source_keep_err;
1776		goto local_input;
1777	}
1778
1779	if (!IN_DEV_FORWARD(in_dev)) {
1780		err = -EHOSTUNREACH;
1781		goto no_route;
1782	}
1783	if (res.type != RTN_UNICAST)
1784		goto martian_destination;
1785
1786	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1787out:	return err;
1788
1789brd_input:
1790	if (skb->protocol != htons(ETH_P_IP))
1791		goto e_inval;
1792
1793	if (!ipv4_is_zeronet(saddr)) {
1794		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1795					  in_dev, &itag);
1796		if (err < 0)
1797			goto martian_source_keep_err;
1798	}
1799	flags |= RTCF_BROADCAST;
1800	res.type = RTN_BROADCAST;
1801	RT_CACHE_STAT_INC(in_brd);
1802
1803local_input:
1804	do_cache = false;
1805	if (res.fi) {
1806		if (!itag) {
1807			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1808			if (rt_cache_valid(rth)) {
1809				skb_dst_set_noref(skb, &rth->dst);
1810				err = 0;
1811				goto out;
1812			}
1813			do_cache = true;
1814		}
1815	}
1816
1817	rth = rt_dst_alloc(net->loopback_dev,
1818			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1819	if (!rth)
1820		goto e_nobufs;
1821
1822	rth->dst.input= ip_local_deliver;
1823	rth->dst.output= ip_rt_bug;
1824#ifdef CONFIG_IP_ROUTE_CLASSID
1825	rth->dst.tclassid = itag;
1826#endif
1827
1828	rth->rt_genid = rt_genid_ipv4(net);
1829	rth->rt_flags 	= flags|RTCF_LOCAL;
1830	rth->rt_type	= res.type;
1831	rth->rt_is_input = 1;
1832	rth->rt_iif	= 0;
1833	rth->rt_pmtu	= 0;
1834	rth->rt_gateway	= 0;
1835	rth->rt_uses_gateway = 0;
1836	INIT_LIST_HEAD(&rth->rt_uncached);
1837	RT_CACHE_STAT_INC(in_slow_tot);
1838	if (res.type == RTN_UNREACHABLE) {
1839		rth->dst.input= ip_error;
1840		rth->dst.error= -err;
1841		rth->rt_flags 	&= ~RTCF_LOCAL;
1842	}
1843	if (do_cache) {
1844		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1845			rth->dst.flags |= DST_NOCACHE;
1846			rt_add_uncached_list(rth);
1847		}
1848	}
1849	skb_dst_set(skb, &rth->dst);
1850	err = 0;
1851	goto out;
1852
1853no_route:
1854	RT_CACHE_STAT_INC(in_no_route);
1855	res.type = RTN_UNREACHABLE;
1856	res.fi = NULL;
1857	goto local_input;
1858
1859	/*
1860	 *	Do not cache martian addresses: they should be logged (RFC1812)
1861	 */
1862martian_destination:
1863	RT_CACHE_STAT_INC(in_martian_dst);
1864#ifdef CONFIG_IP_ROUTE_VERBOSE
1865	if (IN_DEV_LOG_MARTIANS(in_dev))
1866		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1867				     &daddr, &saddr, dev->name);
1868#endif
1869
1870e_inval:
1871	err = -EINVAL;
1872	goto out;
1873
1874e_nobufs:
1875	err = -ENOBUFS;
1876	goto out;
1877
1878martian_source:
1879	err = -EINVAL;
1880martian_source_keep_err:
1881	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1882	goto out;
1883}
1884
1885int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1886			 u8 tos, struct net_device *dev)
1887{
1888	int res;
1889
1890	rcu_read_lock();
1891
1892	/* Multicast recognition logic is moved from route cache to here.
1893	   The problem was that too many Ethernet cards have broken/missing
1894	   hardware multicast filters :-( As result the host on multicasting
1895	   network acquires a lot of useless route cache entries, sort of
1896	   SDR messages from all the world. Now we try to get rid of them.
1897	   Really, provided software IP multicast filter is organized
1898	   reasonably (at least, hashed), it does not result in a slowdown
1899	   comparing with route cache reject entries.
1900	   Note, that multicast routers are not affected, because
1901	   route cache entry is created eventually.
1902	 */
1903	if (ipv4_is_multicast(daddr)) {
1904		struct in_device *in_dev = __in_dev_get_rcu(dev);
1905
1906		if (in_dev) {
1907			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1908						  ip_hdr(skb)->protocol);
1909			if (our
1910#ifdef CONFIG_IP_MROUTE
1911				||
1912			    (!ipv4_is_local_multicast(daddr) &&
1913			     IN_DEV_MFORWARD(in_dev))
1914#endif
1915			   ) {
1916				int res = ip_route_input_mc(skb, daddr, saddr,
1917							    tos, dev, our);
1918				rcu_read_unlock();
1919				return res;
1920			}
1921		}
1922		rcu_read_unlock();
1923		return -EINVAL;
1924	}
1925	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1926	rcu_read_unlock();
1927	return res;
1928}
1929EXPORT_SYMBOL(ip_route_input_noref);
1930
1931/* called with rcu_read_lock() */
1932static struct rtable *__mkroute_output(const struct fib_result *res,
1933				       const struct flowi4 *fl4, int orig_oif,
1934				       struct net_device *dev_out,
1935				       unsigned int flags)
1936{
1937	struct fib_info *fi = res->fi;
1938	struct fib_nh_exception *fnhe;
1939	struct in_device *in_dev;
1940	u16 type = res->type;
1941	struct rtable *rth;
1942	bool do_cache;
1943
1944	in_dev = __in_dev_get_rcu(dev_out);
1945	if (!in_dev)
1946		return ERR_PTR(-EINVAL);
1947
1948	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1949		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1950			return ERR_PTR(-EINVAL);
1951
1952	if (ipv4_is_lbcast(fl4->daddr))
1953		type = RTN_BROADCAST;
1954	else if (ipv4_is_multicast(fl4->daddr))
1955		type = RTN_MULTICAST;
1956	else if (ipv4_is_zeronet(fl4->daddr))
1957		return ERR_PTR(-EINVAL);
1958
1959	if (dev_out->flags & IFF_LOOPBACK)
1960		flags |= RTCF_LOCAL;
1961
1962	do_cache = true;
1963	if (type == RTN_BROADCAST) {
1964		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1965		fi = NULL;
1966	} else if (type == RTN_MULTICAST) {
1967		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1968		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1969				     fl4->flowi4_proto))
1970			flags &= ~RTCF_LOCAL;
1971		else
1972			do_cache = false;
1973		/* If multicast route do not exist use
1974		 * default one, but do not gateway in this case.
1975		 * Yes, it is hack.
1976		 */
1977		if (fi && res->prefixlen < 4)
1978			fi = NULL;
1979	}
1980
1981	fnhe = NULL;
1982	do_cache &= fi != NULL;
1983	if (do_cache) {
1984		struct rtable __rcu **prth;
1985		struct fib_nh *nh = &FIB_RES_NH(*res);
1986
1987		fnhe = find_exception(nh, fl4->daddr);
1988		if (fnhe) {
1989			prth = &fnhe->fnhe_rth_output;
1990			rth = rcu_dereference(*prth);
1991			if (rth && rth->dst.expires &&
1992			    time_after(jiffies, rth->dst.expires)) {
1993				ip_del_fnhe(nh, fl4->daddr);
1994				fnhe = NULL;
1995			} else {
1996				goto rt_cache;
1997			}
1998		}
1999
2000		if (unlikely(fl4->flowi4_flags &
2001			     FLOWI_FLAG_KNOWN_NH &&
2002			     !(nh->nh_gw &&
2003			       nh->nh_scope == RT_SCOPE_LINK))) {
2004			do_cache = false;
2005			goto add;
2006		}
2007		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2008		rth = rcu_dereference(*prth);
2009
2010rt_cache:
2011		if (rt_cache_valid(rth)) {
2012			dst_hold(&rth->dst);
2013			return rth;
2014		}
2015	}
2016
2017add:
2018	rth = rt_dst_alloc(dev_out,
2019			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2020			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2021			   do_cache);
2022	if (!rth)
2023		return ERR_PTR(-ENOBUFS);
2024
2025	rth->dst.output = ip_output;
2026
2027	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
2028	rth->rt_flags	= flags;
2029	rth->rt_type	= type;
2030	rth->rt_is_input = 0;
2031	rth->rt_iif	= orig_oif ? : 0;
2032	rth->rt_pmtu	= 0;
2033	rth->rt_gateway = 0;
2034	rth->rt_uses_gateway = 0;
2035	INIT_LIST_HEAD(&rth->rt_uncached);
2036
2037	RT_CACHE_STAT_INC(out_slow_tot);
2038
2039	if (flags & RTCF_LOCAL)
2040		rth->dst.input = ip_local_deliver;
2041	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2042		if (flags & RTCF_LOCAL &&
2043		    !(dev_out->flags & IFF_LOOPBACK)) {
2044			rth->dst.output = ip_mc_output;
2045			RT_CACHE_STAT_INC(out_slow_mc);
2046		}
2047#ifdef CONFIG_IP_MROUTE
2048		if (type == RTN_MULTICAST) {
2049			if (IN_DEV_MFORWARD(in_dev) &&
2050			    !ipv4_is_local_multicast(fl4->daddr)) {
2051				rth->dst.input = ip_mr_input;
2052				rth->dst.output = ip_mc_output;
2053			}
2054		}
2055#endif
2056	}
2057
2058	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2059
2060	return rth;
2061}
2062
2063/*
2064 * Major route resolver routine.
2065 */
2066
2067struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2068{
2069	struct net_device *dev_out = NULL;
2070	__u8 tos = RT_FL_TOS(fl4);
2071	unsigned int flags = 0;
2072	struct fib_result res;
2073	struct rtable *rth;
2074	int orig_oif;
2075
2076	res.tclassid	= 0;
2077	res.fi		= NULL;
2078	res.table	= NULL;
2079
2080	orig_oif = fl4->flowi4_oif;
2081
2082	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2083	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2084	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2085			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2086
2087	rcu_read_lock();
2088	if (fl4->saddr) {
2089		rth = ERR_PTR(-EINVAL);
2090		if (ipv4_is_multicast(fl4->saddr) ||
2091		    ipv4_is_lbcast(fl4->saddr) ||
2092		    ipv4_is_zeronet(fl4->saddr))
2093			goto out;
2094
2095		/* I removed check for oif == dev_out->oif here.
2096		   It was wrong for two reasons:
2097		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2098		      is assigned to multiple interfaces.
2099		   2. Moreover, we are allowed to send packets with saddr
2100		      of another iface. --ANK
2101		 */
2102
2103		if (fl4->flowi4_oif == 0 &&
2104		    (ipv4_is_multicast(fl4->daddr) ||
2105		     ipv4_is_lbcast(fl4->daddr))) {
2106			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2107			dev_out = __ip_dev_find(net, fl4->saddr, false);
2108			if (!dev_out)
2109				goto out;
2110
2111			/* Special hack: user can direct multicasts
2112			   and limited broadcast via necessary interface
2113			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2114			   This hack is not just for fun, it allows
2115			   vic,vat and friends to work.
2116			   They bind socket to loopback, set ttl to zero
2117			   and expect that it will work.
2118			   From the viewpoint of routing cache they are broken,
2119			   because we are not allowed to build multicast path
2120			   with loopback source addr (look, routing cache
2121			   cannot know, that ttl is zero, so that packet
2122			   will not leave this host and route is valid).
2123			   Luckily, this hack is good workaround.
2124			 */
2125
2126			fl4->flowi4_oif = dev_out->ifindex;
2127			goto make_route;
2128		}
2129
2130		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2131			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2132			if (!__ip_dev_find(net, fl4->saddr, false))
2133				goto out;
2134		}
2135	}
2136
2137
2138	if (fl4->flowi4_oif) {
2139		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2140		rth = ERR_PTR(-ENODEV);
2141		if (!dev_out)
2142			goto out;
2143
2144		/* RACE: Check return value of inet_select_addr instead. */
2145		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2146			rth = ERR_PTR(-ENETUNREACH);
2147			goto out;
2148		}
2149		if (ipv4_is_local_multicast(fl4->daddr) ||
2150		    ipv4_is_lbcast(fl4->daddr)) {
2151			if (!fl4->saddr)
2152				fl4->saddr = inet_select_addr(dev_out, 0,
2153							      RT_SCOPE_LINK);
2154			goto make_route;
2155		}
2156		if (!fl4->saddr) {
2157			if (ipv4_is_multicast(fl4->daddr))
2158				fl4->saddr = inet_select_addr(dev_out, 0,
2159							      fl4->flowi4_scope);
2160			else if (!fl4->daddr)
2161				fl4->saddr = inet_select_addr(dev_out, 0,
2162							      RT_SCOPE_HOST);
2163		}
2164	}
2165
2166	if (!fl4->daddr) {
2167		fl4->daddr = fl4->saddr;
2168		if (!fl4->daddr)
2169			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2170		dev_out = net->loopback_dev;
2171		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2172		res.type = RTN_LOCAL;
2173		flags |= RTCF_LOCAL;
2174		goto make_route;
2175	}
2176
2177	if (fib_lookup(net, fl4, &res)) {
2178		res.fi = NULL;
2179		res.table = NULL;
2180		if (fl4->flowi4_oif) {
2181			/* Apparently, routing tables are wrong. Assume,
2182			   that the destination is on link.
2183
2184			   WHY? DW.
2185			   Because we are allowed to send to iface
2186			   even if it has NO routes and NO assigned
2187			   addresses. When oif is specified, routing
2188			   tables are looked up with only one purpose:
2189			   to catch if destination is gatewayed, rather than
2190			   direct. Moreover, if MSG_DONTROUTE is set,
2191			   we send packet, ignoring both routing tables
2192			   and ifaddr state. --ANK
2193
2194
2195			   We could make it even if oif is unknown,
2196			   likely IPv6, but we do not.
2197			 */
2198
2199			if (fl4->saddr == 0)
2200				fl4->saddr = inet_select_addr(dev_out, 0,
2201							      RT_SCOPE_LINK);
2202			res.type = RTN_UNICAST;
2203			goto make_route;
2204		}
2205		rth = ERR_PTR(-ENETUNREACH);
2206		goto out;
2207	}
2208
2209	if (res.type == RTN_LOCAL) {
2210		if (!fl4->saddr) {
2211			if (res.fi->fib_prefsrc)
2212				fl4->saddr = res.fi->fib_prefsrc;
2213			else
2214				fl4->saddr = fl4->daddr;
2215		}
2216		dev_out = net->loopback_dev;
2217		fl4->flowi4_oif = dev_out->ifindex;
2218		flags |= RTCF_LOCAL;
2219		goto make_route;
2220	}
2221
2222#ifdef CONFIG_IP_ROUTE_MULTIPATH
2223	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2224		fib_select_multipath(&res);
2225	else
2226#endif
2227	if (!res.prefixlen &&
2228	    res.table->tb_num_default > 1 &&
2229	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2230		fib_select_default(&res);
2231
2232	if (!fl4->saddr)
2233		fl4->saddr = FIB_RES_PREFSRC(net, res);
2234
2235	dev_out = FIB_RES_DEV(res);
2236	fl4->flowi4_oif = dev_out->ifindex;
2237
2238
2239make_route:
2240	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2241
2242out:
2243	rcu_read_unlock();
2244	return rth;
2245}
2246EXPORT_SYMBOL_GPL(__ip_route_output_key);
2247
2248static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2249{
2250	return NULL;
2251}
2252
2253static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2254{
2255	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2256
2257	return mtu ? : dst->dev->mtu;
2258}
2259
2260static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2261					  struct sk_buff *skb, u32 mtu)
2262{
2263}
2264
/*
 * ->redirect handler of ipv4_dst_blackhole_ops: deliberately a no-op,
 * none of the arguments are used.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
	/* nothing to do */
}
2269
2270static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2271					  unsigned long old)
2272{
2273	return NULL;
2274}
2275
2276static struct dst_ops ipv4_dst_blackhole_ops = {
2277	.family			=	AF_INET,
2278	.check			=	ipv4_blackhole_dst_check,
2279	.mtu			=	ipv4_blackhole_mtu,
2280	.default_advmss		=	ipv4_default_advmss,
2281	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2282	.redirect		=	ipv4_rt_blackhole_redirect,
2283	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2284	.neigh_lookup		=	ipv4_neigh_lookup,
2285};
2286
2287struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2288{
2289	struct rtable *ort = (struct rtable *) dst_orig;
2290	struct rtable *rt;
2291
2292	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2293	if (rt) {
2294		struct dst_entry *new = &rt->dst;
2295
2296		new->__use = 1;
2297		new->input = dst_discard;
2298		new->output = dst_discard_sk;
2299
2300		new->dev = ort->dst.dev;
2301		if (new->dev)
2302			dev_hold(new->dev);
2303
2304		rt->rt_is_input = ort->rt_is_input;
2305		rt->rt_iif = ort->rt_iif;
2306		rt->rt_pmtu = ort->rt_pmtu;
2307
2308		rt->rt_genid = rt_genid_ipv4(net);
2309		rt->rt_flags = ort->rt_flags;
2310		rt->rt_type = ort->rt_type;
2311		rt->rt_gateway = ort->rt_gateway;
2312		rt->rt_uses_gateway = ort->rt_uses_gateway;
2313
2314		INIT_LIST_HEAD(&rt->rt_uncached);
2315
2316		dst_free(new);
2317	}
2318
2319	dst_release(dst_orig);
2320
2321	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2322}
2323
2324struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2325				    struct sock *sk)
2326{
2327	struct rtable *rt = __ip_route_output_key(net, flp4);
2328
2329	if (IS_ERR(rt))
2330		return rt;
2331
2332	if (flp4->flowi4_proto)
2333		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2334							flowi4_to_flowi(flp4),
2335							sk, 0);
2336
2337	return rt;
2338}
2339EXPORT_SYMBOL_GPL(ip_route_output_flow);
2340
2341static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2342			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2343			u32 seq, int event, int nowait, unsigned int flags)
2344{
2345	struct rtable *rt = skb_rtable(skb);
2346	struct rtmsg *r;
2347	struct nlmsghdr *nlh;
2348	unsigned long expires = 0;
2349	u32 error;
2350	u32 metrics[RTAX_MAX];
2351
2352	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2353	if (!nlh)
2354		return -EMSGSIZE;
2355
2356	r = nlmsg_data(nlh);
2357	r->rtm_family	 = AF_INET;
2358	r->rtm_dst_len	= 32;
2359	r->rtm_src_len	= 0;
2360	r->rtm_tos	= fl4->flowi4_tos;
2361	r->rtm_table	= RT_TABLE_MAIN;
2362	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2363		goto nla_put_failure;
2364	r->rtm_type	= rt->rt_type;
2365	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2366	r->rtm_protocol = RTPROT_UNSPEC;
2367	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2368	if (rt->rt_flags & RTCF_NOTIFY)
2369		r->rtm_flags |= RTM_F_NOTIFY;
2370	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2371		r->rtm_flags |= RTCF_DOREDIRECT;
2372
2373	if (nla_put_in_addr(skb, RTA_DST, dst))
2374		goto nla_put_failure;
2375	if (src) {
2376		r->rtm_src_len = 32;
2377		if (nla_put_in_addr(skb, RTA_SRC, src))
2378			goto nla_put_failure;
2379	}
2380	if (rt->dst.dev &&
2381	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2382		goto nla_put_failure;
2383#ifdef CONFIG_IP_ROUTE_CLASSID
2384	if (rt->dst.tclassid &&
2385	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2386		goto nla_put_failure;
2387#endif
2388	if (!rt_is_input_route(rt) &&
2389	    fl4->saddr != src) {
2390		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2391			goto nla_put_failure;
2392	}
2393	if (rt->rt_uses_gateway &&
2394	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2395		goto nla_put_failure;
2396
2397	expires = rt->dst.expires;
2398	if (expires) {
2399		unsigned long now = jiffies;
2400
2401		if (time_before(now, expires))
2402			expires -= now;
2403		else
2404			expires = 0;
2405	}
2406
2407	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2408	if (rt->rt_pmtu && expires)
2409		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2410	if (rtnetlink_put_metrics(skb, metrics) < 0)
2411		goto nla_put_failure;
2412
2413	if (fl4->flowi4_mark &&
2414	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2415		goto nla_put_failure;
2416
2417	error = rt->dst.error;
2418
2419	if (rt_is_input_route(rt)) {
2420#ifdef CONFIG_IP_MROUTE
2421		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2422		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2423			int err = ipmr_get_route(net, skb,
2424						 fl4->saddr, fl4->daddr,
2425						 r, nowait);
2426			if (err <= 0) {
2427				if (!nowait) {
2428					if (err == 0)
2429						return 0;
2430					goto nla_put_failure;
2431				} else {
2432					if (err == -EMSGSIZE)
2433						goto nla_put_failure;
2434					error = err;
2435				}
2436			}
2437		} else
2438#endif
2439			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2440				goto nla_put_failure;
2441	}
2442
2443	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2444		goto nla_put_failure;
2445
2446	nlmsg_end(skb, nlh);
2447	return 0;
2448
2449nla_put_failure:
2450	nlmsg_cancel(skb, nlh);
2451	return -EMSGSIZE;
2452}
2453
2454static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2455{
2456	struct net *net = sock_net(in_skb->sk);
2457	struct rtmsg *rtm;
2458	struct nlattr *tb[RTA_MAX+1];
2459	struct rtable *rt = NULL;
2460	struct flowi4 fl4;
2461	__be32 dst = 0;
2462	__be32 src = 0;
2463	u32 iif;
2464	int err;
2465	int mark;
2466	struct sk_buff *skb;
2467
2468	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2469	if (err < 0)
2470		goto errout;
2471
2472	rtm = nlmsg_data(nlh);
2473
2474	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2475	if (!skb) {
2476		err = -ENOBUFS;
2477		goto errout;
2478	}
2479
2480	/* Reserve room for dummy headers, this skb can pass
2481	   through good chunk of routing engine.
2482	 */
2483	skb_reset_mac_header(skb);
2484	skb_reset_network_header(skb);
2485
2486	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2487	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2488	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2489
2490	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2491	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2492	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2493	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2494
2495	memset(&fl4, 0, sizeof(fl4));
2496	fl4.daddr = dst;
2497	fl4.saddr = src;
2498	fl4.flowi4_tos = rtm->rtm_tos;
2499	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2500	fl4.flowi4_mark = mark;
2501
2502	if (iif) {
2503		struct net_device *dev;
2504
2505		dev = __dev_get_by_index(net, iif);
2506		if (!dev) {
2507			err = -ENODEV;
2508			goto errout_free;
2509		}
2510
2511		skb->protocol	= htons(ETH_P_IP);
2512		skb->dev	= dev;
2513		skb->mark	= mark;
2514		local_bh_disable();
2515		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2516		local_bh_enable();
2517
2518		rt = skb_rtable(skb);
2519		if (err == 0 && rt->dst.error)
2520			err = -rt->dst.error;
2521	} else {
2522		rt = ip_route_output_key(net, &fl4);
2523
2524		err = 0;
2525		if (IS_ERR(rt))
2526			err = PTR_ERR(rt);
2527	}
2528
2529	if (err)
2530		goto errout_free;
2531
2532	skb_dst_set(skb, &rt->dst);
2533	if (rtm->rtm_flags & RTM_F_NOTIFY)
2534		rt->rt_flags |= RTCF_NOTIFY;
2535
2536	err = rt_fill_info(net, dst, src, &fl4, skb,
2537			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2538			   RTM_NEWROUTE, 0, 0);
2539	if (err < 0)
2540		goto errout_free;
2541
2542	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2543errout:
2544	return err;
2545
2546errout_free:
2547	kfree_skb(skb);
2548	goto errout;
2549}
2550
2551void ip_rt_multicast_event(struct in_device *in_dev)
2552{
2553	rt_cache_flush(dev_net(in_dev->dev));
2554}
2555
2556#ifdef CONFIG_SYSCTL
2557static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2558static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2559static int ip_rt_gc_elasticity __read_mostly	= 8;
2560
2561static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2562					void __user *buffer,
2563					size_t *lenp, loff_t *ppos)
2564{
2565	struct net *net = (struct net *)__ctl->extra1;
2566
2567	if (write) {
2568		rt_cache_flush(net);
2569		fnhe_genid_bump(net);
2570		return 0;
2571	}
2572
2573	return -EINVAL;
2574}
2575
2576static struct ctl_table ipv4_route_table[] = {
2577	{
2578		.procname	= "gc_thresh",
2579		.data		= &ipv4_dst_ops.gc_thresh,
2580		.maxlen		= sizeof(int),
2581		.mode		= 0644,
2582		.proc_handler	= proc_dointvec,
2583	},
2584	{
2585		.procname	= "max_size",
2586		.data		= &ip_rt_max_size,
2587		.maxlen		= sizeof(int),
2588		.mode		= 0644,
2589		.proc_handler	= proc_dointvec,
2590	},
2591	{
2592		/*  Deprecated. Use gc_min_interval_ms */
2593
2594		.procname	= "gc_min_interval",
2595		.data		= &ip_rt_gc_min_interval,
2596		.maxlen		= sizeof(int),
2597		.mode		= 0644,
2598		.proc_handler	= proc_dointvec_jiffies,
2599	},
2600	{
2601		.procname	= "gc_min_interval_ms",
2602		.data		= &ip_rt_gc_min_interval,
2603		.maxlen		= sizeof(int),
2604		.mode		= 0644,
2605		.proc_handler	= proc_dointvec_ms_jiffies,
2606	},
2607	{
2608		.procname	= "gc_timeout",
2609		.data		= &ip_rt_gc_timeout,
2610		.maxlen		= sizeof(int),
2611		.mode		= 0644,
2612		.proc_handler	= proc_dointvec_jiffies,
2613	},
2614	{
2615		.procname	= "gc_interval",
2616		.data		= &ip_rt_gc_interval,
2617		.maxlen		= sizeof(int),
2618		.mode		= 0644,
2619		.proc_handler	= proc_dointvec_jiffies,
2620	},
2621	{
2622		.procname	= "redirect_load",
2623		.data		= &ip_rt_redirect_load,
2624		.maxlen		= sizeof(int),
2625		.mode		= 0644,
2626		.proc_handler	= proc_dointvec,
2627	},
2628	{
2629		.procname	= "redirect_number",
2630		.data		= &ip_rt_redirect_number,
2631		.maxlen		= sizeof(int),
2632		.mode		= 0644,
2633		.proc_handler	= proc_dointvec,
2634	},
2635	{
2636		.procname	= "redirect_silence",
2637		.data		= &ip_rt_redirect_silence,
2638		.maxlen		= sizeof(int),
2639		.mode		= 0644,
2640		.proc_handler	= proc_dointvec,
2641	},
2642	{
2643		.procname	= "error_cost",
2644		.data		= &ip_rt_error_cost,
2645		.maxlen		= sizeof(int),
2646		.mode		= 0644,
2647		.proc_handler	= proc_dointvec,
2648	},
2649	{
2650		.procname	= "error_burst",
2651		.data		= &ip_rt_error_burst,
2652		.maxlen		= sizeof(int),
2653		.mode		= 0644,
2654		.proc_handler	= proc_dointvec,
2655	},
2656	{
2657		.procname	= "gc_elasticity",
2658		.data		= &ip_rt_gc_elasticity,
2659		.maxlen		= sizeof(int),
2660		.mode		= 0644,
2661		.proc_handler	= proc_dointvec,
2662	},
2663	{
2664		.procname	= "mtu_expires",
2665		.data		= &ip_rt_mtu_expires,
2666		.maxlen		= sizeof(int),
2667		.mode		= 0644,
2668		.proc_handler	= proc_dointvec_jiffies,
2669	},
2670	{
2671		.procname	= "min_pmtu",
2672		.data		= &ip_rt_min_pmtu,
2673		.maxlen		= sizeof(int),
2674		.mode		= 0644,
2675		.proc_handler	= proc_dointvec,
2676	},
2677	{
2678		.procname	= "min_adv_mss",
2679		.data		= &ip_rt_min_advmss,
2680		.maxlen		= sizeof(int),
2681		.mode		= 0644,
2682		.proc_handler	= proc_dointvec,
2683	},
2684	{ }
2685};
2686
2687static struct ctl_table ipv4_route_flush_table[] = {
2688	{
2689		.procname	= "flush",
2690		.maxlen		= sizeof(int),
2691		.mode		= 0200,
2692		.proc_handler	= ipv4_sysctl_rtcache_flush,
2693	},
2694	{ },
2695};
2696
2697static __net_init int sysctl_route_net_init(struct net *net)
2698{
2699	struct ctl_table *tbl;
2700
2701	tbl = ipv4_route_flush_table;
2702	if (!net_eq(net, &init_net)) {
2703		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2704		if (!tbl)
2705			goto err_dup;
2706
2707		/* Don't export sysctls to unprivileged users */
2708		if (net->user_ns != &init_user_ns)
2709			tbl[0].procname = NULL;
2710	}
2711	tbl[0].extra1 = net;
2712
2713	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2714	if (!net->ipv4.route_hdr)
2715		goto err_reg;
2716	return 0;
2717
2718err_reg:
2719	if (tbl != ipv4_route_flush_table)
2720		kfree(tbl);
2721err_dup:
2722	return -ENOMEM;
2723}
2724
2725static __net_exit void sysctl_route_net_exit(struct net *net)
2726{
2727	struct ctl_table *tbl;
2728
2729	tbl = net->ipv4.route_hdr->ctl_table_arg;
2730	unregister_net_sysctl_table(net->ipv4.route_hdr);
2731	BUG_ON(tbl == ipv4_route_flush_table);
2732	kfree(tbl);
2733}
2734
2735static __net_initdata struct pernet_operations sysctl_route_ops = {
2736	.init = sysctl_route_net_init,
2737	.exit = sysctl_route_net_exit,
2738};
2739#endif
2740
2741static __net_init int rt_genid_init(struct net *net)
2742{
2743	atomic_set(&net->ipv4.rt_genid, 0);
2744	atomic_set(&net->fnhe_genid, 0);
2745	get_random_bytes(&net->ipv4.dev_addr_genid,
2746			 sizeof(net->ipv4.dev_addr_genid));
2747	return 0;
2748}
2749
2750static __net_initdata struct pernet_operations rt_genid_ops = {
2751	.init = rt_genid_init,
2752};
2753
2754static int __net_init ipv4_inetpeer_init(struct net *net)
2755{
2756	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2757
2758	if (!bp)
2759		return -ENOMEM;
2760	inet_peer_base_init(bp);
2761	net->ipv4.peers = bp;
2762	return 0;
2763}
2764
2765static void __net_exit ipv4_inetpeer_exit(struct net *net)
2766{
2767	struct inet_peer_base *bp = net->ipv4.peers;
2768
2769	net->ipv4.peers = NULL;
2770	inetpeer_invalidate_tree(bp);
2771	kfree(bp);
2772}
2773
2774static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2775	.init	=	ipv4_inetpeer_init,
2776	.exit	=	ipv4_inetpeer_exit,
2777};
2778
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU route accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2782
2783int __init ip_rt_init(void)
2784{
2785	int rc = 0;
2786	int cpu;
2787
2788	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2789	if (!ip_idents)
2790		panic("IP: failed to allocate ip_idents\n");
2791
2792	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2793
2794	for_each_possible_cpu(cpu) {
2795		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2796
2797		INIT_LIST_HEAD(&ul->head);
2798		spin_lock_init(&ul->lock);
2799	}
2800#ifdef CONFIG_IP_ROUTE_CLASSID
2801	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2802	if (!ip_rt_acct)
2803		panic("IP: failed to allocate ip_rt_acct\n");
2804#endif
2805
2806	ipv4_dst_ops.kmem_cachep =
2807		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2808				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2809
2810	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2811
2812	if (dst_entries_init(&ipv4_dst_ops) < 0)
2813		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2814
2815	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2816		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2817
2818	ipv4_dst_ops.gc_thresh = ~0;
2819	ip_rt_max_size = INT_MAX;
2820
2821	devinet_init();
2822	ip_fib_init();
2823
2824	if (ip_rt_proc_init())
2825		pr_err("Unable to create route proc files\n");
2826#ifdef CONFIG_XFRM
2827	xfrm_init();
2828	xfrm4_init();
2829#endif
2830	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2831
2832#ifdef CONFIG_SYSCTL
2833	register_pernet_subsys(&sysctl_route_ops);
2834#endif
2835	register_pernet_subsys(&rt_genid_ops);
2836	register_pernet_subsys(&ipv4_inetpeer_ops);
2837	return rc;
2838}
2839
#ifdef CONFIG_SYSCTL
/*
 * Register the static "net/ipv4/route" sysctl table (ipv4_route_table)
 * for init_net.
 *
 * This lives in its own init hook only because of the current ipv4
 * initialisation order; once that order is sanitized this function is
 * expected to go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
2850