/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

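/*
 * Hand a routed packet to the link layer.  Multicast packets may first
 * be looped back to local listeners (or to the multicast router
 * socket), and node-local scoped packets are never transmitted on a
 * non-loopback device.  The next-hop neighbour entry is then looked up
 * (or created) and the packet is queued to it for transmission.
 */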
static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

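/*
 * Fragment the packet when it exceeds the path MTU and is not GSO,
 * when the route requires fragmentation on every packet (allfrag), or
 * when conntrack recorded a smaller maximum fragment size; otherwise
 * pass it straight to ip6_finish_output2().
 */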
static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(sk, skb);
}

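/*
 * Output entry point for packets that already carry a route.  Packets
 * are discarded when IPv6 is administratively disabled on the outgoing
 * device; otherwise they traverse the netfilter POST_ROUTING hook
 * (skipped for rerouted packets) on the way to ip6_finish_output().
 */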
int ip6_output(struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
			    NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     np->autoflowlabel));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
			       NULL, dst->dev, dst_output_sk);
	}

	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

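/*
 * Deliver a packet carrying a Router Alert option to all raw sockets
 * registered for that alert value, cloning the skb for every match but
 * the last.  Returns 1 if the packet was consumed by at least one
 * socket, 0 otherwise.
 */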
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

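/*
 * Classify a packet destined to an address we proxy: returns 1 for
 * neighbour discovery messages that must be handed to the local input
 * path, -1 when the packet must be dropped (link-local destination),
 * and 0 when it may be forwarded normally.
 */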
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined to
			 * the proxied address must be passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output_sk(sk, skb);
}

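/*
 * MTU to use on the forwarding path: a locked RTAX_MTU metric wins;
 * otherwise use the outgoing device's IPv6 MTU, falling back to
 * IPV6_MIN_MTU when the device has no inet6 state.
 */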
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

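/*
 * Return true if the packet cannot be forwarded as-is: either
 * conntrack defragmentation recorded a maximum fragment size larger
 * than the MTU, or the packet exceeds the MTU and is neither allowed
 * to ignore DF nor a GSO packet whose segments would fit.
 */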
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
		return false;

	return true;
}

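/*
 * Forward a received packet: check that forwarding is enabled and the
 * packet is eligible (host-bound unicast, not LRO, passes XFRM
 * policy), deliver Router Alert packets to interested sockets, enforce
 * the hop limit, honour neighbour-discovery proxying, send a redirect
 * when the packet leaves through the interface it arrived on, verify
 * the path MTU, and finally decrement the hop limit before handing the
 * packet to the netfilter FORWARD hook.
 */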
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do no processing on RA packets, pushing them to
	 *	user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything either.
	 *	Defragmentation would also be a mistake: RA packets
	 *	must not be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force the output device to be used for the source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source-routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same;
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force the output device to be used for the
		   source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
		       skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

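/*
 * Copy per-packet metadata (packet type, priority, protocol, route,
 * device, mark, traffic-control index, netfilter and security state)
 * from the original skb to a freshly built fragment.
 */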
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

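/*
 * Split an oversized packet into fragments no larger than the path MTU
 * (or the socket's frag_size, if smaller).  The fast path reuses an
 * existing well-formed frag list, prepending a fragment header to each
 * member; the slow path copies the payload into newly allocated
 * fragment skbs.  The output callback is invoked for every fragment.
 */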
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(net, fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight-byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(net, fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

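/*
 * Helper for ip6_sk_dst_check(): returns nonzero when neither the
 * route key (as a /128 host route) nor the socket's cached address
 * matches the flow address, i.e. the cached route may be stale for
 * this flow.
 */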
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

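/*
 * Validate a socket's cached dst against the flow it is about to be
 * used for; release it and return NULL when it is not an IPv6 route or
 * no longer matches the flow's addresses or outgoing interface.
 */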
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we can still
	 *    check its validity using a saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so this last trick works only on connected
	 *    sockets.
	 * 2. oif should also be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

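/*
 * Common tail of the dst lookup helpers below: pick a source address
 * when the flow has none, perform the route lookup and, with
 * optimistic DAD enabled, fall back to the default router's dst when
 * the next hop is not yet a valid neighbour and our chosen source
 * address is still optimistic.
 */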
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour
	 * entry that is in the INCOMPLETE state and the
	 * source address from the flow is marked as
	 * OPTIMISTIC, we release the found dst entry and
	 * replace it with the dst entry of the nexthop
	 * router instead.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

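/*
 * Build (or extend) one large skb for UDP fragmentation offload: the
 * whole datagram stays in a single GSO skb whose gso_size is the
 * largest 8-byte-aligned fragment payload that fits the MTU, leaving
 * the actual fragmentation to the device on transmit.
 */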
static inline int ip6_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)
{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* The network device supports UDP large send offload, so create
	 * a single skb containing the complete UDP datagram.
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;

		/* reserve space for hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(queue, skb);
	} else if (skb_is_gso(skb)) {
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	ipv6_select_ident(sock_net(sk), &fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

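/*
 * Recompute mtu/maxfraglen when starting a new fragment on a
 * non-XFRM-tunnel route: only the first fragment needs to reserve the
 * route's header_len, so later fragments may use the full original
 * MTU.
 */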
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

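/*
 * Initialize the cork for a corked send: duplicate the caller's
 * extension headers, take a reference on the route, record the hop
 * limit and traffic class, and derive the fragment size from the path
 * MTU (capped by the socket's frag_size, probing the device MTU under
 * IPV6_PMTUDISC_PROBE).
 */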
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork,
			  int hlimit, int tclass, struct ipv6_txoptions *opt,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = opt->tot_len;
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = hlimit;
	v6_cork->tclass = tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

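/*
 * Append user data to the queue of pending skbs of a corked socket.
 * Data is packed into packets of at most mtu/maxfraglen bytes: the UFO
 * path builds one large GSO skb; otherwise the tail skb is filled and
 * new skbs are allocated as needed, moving any 8-byte-alignment
 * remainder from the tail of the previous packet, or appending into
 * page fragments on scatter-gather devices.
 */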
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, int dontfrag)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/* If this is the first and only packet and the device
	 * supports checksum offloading, let's use it.
	 * Use transhdrlen, same as IPv4, because partial
	 * sums only work when transhdrlen is set.
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    length + fragheaderlen < mtu &&
	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;
	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment; the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

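/*
 * Queue payload on the socket's write queue.  The first call on an
 * empty queue sets up the cork from the supplied options, route and
 * flow; later calls reuse the corked state and may only append data.
 * The packet is built and sent by ip6_push_pending_frames().
 */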
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen, int hlimit,
		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
				     tclass, opt, rt, fl6);
		if (err)
			return err;

		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, dontfrag);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

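/*
 * Free the duplicated extension headers and drop the route reference
 * taken by ip6_setup_cork().
 */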
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

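/*
 * Collapse the queued skbs into one packet: chain the followers onto
 * the head's frag_list, push the extension headers and the IPv6 header
 * onto the head, fill in the header from the corked flow, and release
 * the cork.  Returns the finished skb, ready for ip6_send_skb().
 */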
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

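/*
 * Hand a finished packet to ip6_local_out(), converting a congestion
 * notification into an errno and counting a discard on failure.
 */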
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     int hlimit, int tclass,
			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     int dontfrag)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (opt ? opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (dontfrag < 0)
		dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, dontfrag);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}
