1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors:	Ross Biro
9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
11 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
15 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
16 *		Matthew Dillon, <dillon@apollo.west.oic.com>
17 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *		Jorge Cwik, <jorge@laser.satlink.net>
19 *
20 * Fixes:
21 *		Alan Cox	:	Numerous verify_area() calls
22 *		Alan Cox	:	Set the ACK bit on a reset
23 *		Alan Cox	:	Stopped it crashing if it closed while
24 *					sk->inuse=1 and was trying to connect
25 *					(tcp_err()).
 *		Alan Cox	:	All ICMP error handling was broken:
 *					pointers passed were wrong and the
 *					socket was looked up backwards. Nobody
 *					had tested any ICMP error code, obviously.
30 *		Alan Cox	:	tcp_err() now handled properly. It
31 *					wakes people on errors. poll
32 *					behaves and the icmp error race
33 *					has gone by moving it into sock.c
34 *		Alan Cox	:	tcp_send_reset() fixed to work for
35 *					everything not just packets for
36 *					unknown sockets.
37 *		Alan Cox	:	tcp option processing.
38 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
39 *					syn rule wrong]
40 *		Herp Rosmanith  :	More reset fixes
41 *		Alan Cox	:	No longer acks invalid rst frames.
42 *					Acking any kind of RST is right out.
43 *		Alan Cox	:	Sets an ignore me flag on an rst
44 *					receive otherwise odd bits of prattle
45 *					escape still
46 *		Alan Cox	:	Fixed another acking RST frame bug.
47 *					Should stop LAN workplace lockups.
48 *		Alan Cox	: 	Some tidyups using the new skb list
49 *					facilities
50 *		Alan Cox	:	sk->keepopen now seems to work
51 *		Alan Cox	:	Pulls options out correctly on accepts
52 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
53 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
54 *					bit to skb ops.
55 *		Alan Cox	:	Tidied tcp_data to avoid a potential
56 *					nasty.
57 *		Alan Cox	:	Added some better commenting, as the
58 *					tcp is hard to follow
59 *		Alan Cox	:	Removed incorrect check for 20 * psh
60 *	Michael O'Reilly	:	ack < copied bug fix.
61 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
62 *		Alan Cox	:	FIN with no memory -> CRASH
63 *		Alan Cox	:	Added socket option proto entries.
64 *					Also added awareness of them to accept.
65 *		Alan Cox	:	Added TCP options (SOL_TCP)
66 *		Alan Cox	:	Switched wakeup calls to callbacks,
67 *					so the kernel can layer network
68 *					sockets.
69 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
70 *		Alan Cox	:	Handle FIN (more) properly (we hope).
71 *		Alan Cox	:	RST frames sent on unsynchronised
72 *					state ack error.
73 *		Alan Cox	:	Put in missing check for SYN bit.
74 *		Alan Cox	:	Added tcp_select_window() aka NET2E
75 *					window non shrink trick.
76 *		Alan Cox	:	Added a couple of small NET2E timer
77 *					fixes
78 *		Charles Hedrick :	TCP fixes
79 *		Toomas Tamm	:	TCP window fixes
80 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
81 *		Charles Hedrick	:	Rewrote most of it to actually work
82 *		Linus		:	Rewrote tcp_read() and URG handling
83 *					completely
84 *		Gerhard Koerting:	Fixed some missing timer handling
85 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
86 *		Gerhard Koerting:	PC/TCP workarounds
87 *		Adam Caldwell	:	Assorted timer/timing errors
88 *		Matthew Dillon	:	Fixed another RST bug
89 *		Alan Cox	:	Move to kernel side addressing changes.
90 *		Alan Cox	:	Beginning work on TCP fastpathing
91 *					(not yet usable)
92 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
93 *		Alan Cox	:	TCP fast path debugging
94 *		Alan Cox	:	Window clamping
95 *		Michael Riepe	:	Bug in tcp_check()
96 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (be very nice to this man if
 *					TCP finally works 100%) 8)
100 *		Alan Cox	:	BSD accept semantics.
101 *		Alan Cox	:	Reset on closedown bug.
102 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
103 *		Michael Pall	:	Handle poll() after URG properly in
104 *					all cases.
105 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
106 *					(multi URG PUSH broke rlogin).
107 *		Michael Pall	:	Fix the multi URG PUSH problem in
108 *					tcp_readable(), poll() after URG
109 *					works now.
110 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
111 *					BSD api.
112 *		Alan Cox	:	Changed the semantics of sk->socket to
113 *					fix a race and a signal problem with
114 *					accept() and async I/O.
115 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
116 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
117 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
118 *					clients/servers which listen in on
119 *					fixed ports.
120 *		Alan Cox	:	Cleaned the above up and shrank it to
121 *					a sensible code size.
122 *		Alan Cox	:	Self connect lockup fix.
123 *		Alan Cox	:	No connect to multicast.
124 *		Ross Biro	:	Close unaccepted children on master
125 *					socket close.
126 *		Alan Cox	:	Reset tracing code.
127 *		Alan Cox	:	Spurious resets on shutdown.
128 *		Alan Cox	:	Giant 15 minute/60 second timer error
129 *		Alan Cox	:	Small whoops in polling before an
130 *					accept.
131 *		Alan Cox	:	Kept the state trace facility since
132 *					it's handy for debugging.
133 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFCs. For other useful protocol
 *					references see Comer and KA9Q NOS; for
 *					a reference on the difference between
 *					the specifications and how BSD actually
 *					behaves, see the 4.4-Lite source.
140 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
141 *					close.
142 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
143 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
144 *		Alan Cox	:	Reimplemented timers as per the RFC
145 *					and using multiple timers for sanity.
146 *		Alan Cox	:	Small bug fixes, and a lot of new
147 *					comments.
148 *		Alan Cox	:	Fixed dual reader crash by locking
149 *					the buffers (much like datagram.c)
150 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
151 *					now gets fed up of retrying without
152 *					(even a no space) answer.
153 *		Alan Cox	:	Extracted closing code better
154 *		Alan Cox	:	Fixed the closing state machine to
155 *					resemble the RFC.
156 *		Alan Cox	:	More 'per spec' fixes.
157 *		Jorge Cwik	:	Even faster checksumming.
158 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
159 *					only frames. At least one pc tcp stack
160 *					generates them.
161 *		Alan Cox	:	Cache last socket.
162 *		Alan Cox	:	Per route irtt.
163 *		Matt Day	:	poll()->select() match BSD precisely on error
164 *		Alan Cox	:	New buffers
165 *		Marc Tamsky	:	Various sk->prot->retransmits and
166 *					sk->retransmits misupdating fixed.
167 *					Fixed tcp_write_timeout: stuck close,
168 *					and TCP syn retries gets used now.
169 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
170 *					ack if state is TCP_CLOSED.
171 *		Alan Cox	:	Look up device on a retransmit - routes may
172 *					change. Doesn't yet cope with MSS shrink right
173 *					but it's a start!
174 *		Marc Tamsky	:	Closing in closing fixes.
175 *		Mike Shaver	:	RFC1122 verifications.
176 *		Alan Cox	:	rcv_saddr errors.
177 *		Alan Cox	:	Block double connect().
178 *		Alan Cox	:	Small hooks for enSKIP.
179 *		Alexey Kuznetsov:	Path MTU discovery.
180 *		Alan Cox	:	Support soft errors.
181 *		Alan Cox	:	Fix MTU discovery pathological case
182 *					when the remote claims no mtu!
183 *		Marc Tamsky	:	TCP_CLOSE fix.
184 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
185 *					window but wrong (fixes NT lpd problems)
186 *		Pedro Roque	:	Better TCP window handling, delayed ack.
187 *		Joerg Reuter	:	No modification of locked buffers in
188 *					tcp_do_retransmit()
189 *		Eric Schenk	:	Changed receiver side silly window
190 *					avoidance algorithm to BSD style
191 *					algorithm. This doubles throughput
192 *					against machines running Solaris,
193 *					and seems to result in general
194 *					improvement.
195 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
196 *	Willy Konynenberg	:	Transparent proxying support.
197 *	Mike McLagan		:	Routing by source
198 *		Keith Owens	:	Do proper merging with partial SKB's in
199 *					tcp_do_sendmsg to avoid burstiness.
200 *		Eric Schenk	:	Fix fast close down bug with
201 *					shutdown() followed by close().
202 *		Andi Kleen 	:	Make poll agree with SIGIO
203 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
204 *					lingertime == 0 (RFC 793 ABORT Call)
205 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
206 *					csum_and_copy_from_user() if possible.
207 *
208 *		This program is free software; you can redistribute it and/or
209 *		modify it under the terms of the GNU General Public License
210 *		as published by the Free Software Foundation; either version
211 *		2 of the License, or(at your option) any later version.
212 *
213 * Description of States:
214 *
215 *	TCP_SYN_SENT		sent a connection request, waiting for ack
216 *
217 *	TCP_SYN_RECV		received a connection request, sent ack,
218 *				waiting for final ack in three-way handshake.
219 *
220 *	TCP_ESTABLISHED		connection established
221 *
222 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
223 *				transmission of remaining buffered data
224 *
225 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
226 *				to shutdown
227 *
228 *	TCP_CLOSING		both sides have shutdown but we still have
229 *				data we have to finish sending
230 *
231 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
232 *				closed, can only be entered from FIN_WAIT2
233 *				or CLOSING.  Required because the other end
234 *				may not have gotten our last ACK causing it
235 *				to retransmit the data packet (which we ignore)
236 *
237 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
238 *				us to finish writing our data and to shutdown
239 *				(we have to close() to move on to LAST_ACK)
240 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
242 *				shutdown.  There may still be data in our
243 *				buffer that we have to finish sending
244 *
245 *	TCP_CLOSE		socket is finished
246 */
247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/inet_diag.h>
256#include <linux/init.h>
257#include <linux/fs.h>
258#include <linux/skbuff.h>
259#include <linux/scatterlist.h>
260#include <linux/splice.h>
261#include <linux/net.h>
262#include <linux/socket.h>
263#include <linux/random.h>
264#include <linux/bootmem.h>
265#include <linux/highmem.h>
266#include <linux/swap.h>
267#include <linux/cache.h>
268#include <linux/err.h>
269#include <linux/crypto.h>
270#include <linux/time.h>
271#include <linux/slab.h>
272
273#include <net/icmp.h>
274#include <net/inet_common.h>
275#include <net/tcp.h>
276#include <net/xfrm.h>
277#include <net/ip.h>
278#include <net/sock.h>
279
280#include <asm/uaccess.h>
281#include <asm/ioctls.h>
282#include <asm/unaligned.h>
283#include <net/busy_poll.h>
284
285int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
286
287int sysctl_tcp_min_tso_segs __read_mostly = 2;
288
289int sysctl_tcp_autocorking __read_mostly = 1;
290
291struct percpu_counter tcp_orphan_count;
292EXPORT_SYMBOL_GPL(tcp_orphan_count);
293
294long sysctl_tcp_mem[3] __read_mostly;
295int sysctl_tcp_wmem[3] __read_mostly;
296int sysctl_tcp_rmem[3] __read_mostly;
297
298EXPORT_SYMBOL(sysctl_tcp_mem);
299EXPORT_SYMBOL(sysctl_tcp_rmem);
300EXPORT_SYMBOL(sysctl_tcp_wmem);
301
302atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
303EXPORT_SYMBOL(tcp_memory_allocated);
304
305/*
306 * Current number of TCP sockets.
307 */
308struct percpu_counter tcp_sockets_allocated;
309EXPORT_SYMBOL(tcp_sockets_allocated);
310
311/*
312 * TCP splice context
313 */
314struct tcp_splice_state {
315	struct pipe_inode_info *pipe;
316	size_t len;
317	unsigned int flags;
318};
319
320/*
321 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non-atomically.
 * All of __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
325 */
326int tcp_memory_pressure __read_mostly;
327EXPORT_SYMBOL(tcp_memory_pressure);
328
329void tcp_enter_memory_pressure(struct sock *sk)
330{
331	if (!tcp_memory_pressure) {
332		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
333		tcp_memory_pressure = 1;
334	}
335}
336EXPORT_SYMBOL(tcp_enter_memory_pressure);
337
338/* Convert seconds to retransmits based on initial and max timeout */
339static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
340{
341	u8 res = 0;
342
343	if (seconds > 0) {
344		int period = timeout;
345
346		res = 1;
347		while (seconds > period && res < 255) {
348			res++;
349			timeout <<= 1;
350			if (timeout > rto_max)
351				timeout = rto_max;
352			period += timeout;
353		}
354	}
355	return res;
356}
357
358/* Convert retransmits to seconds based on initial and max timeout */
359static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
360{
361	int period = 0;
362
363	if (retrans > 0) {
364		period = timeout;
365		while (--retrans) {
366			timeout <<= 1;
367			if (timeout > rto_max)
368				timeout = rto_max;
369			period += timeout;
370		}
371	}
372	return period;
373}
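
/*
 * Illustrative example of the two helpers above (values chosen for
 * clarity, not taken from any particular configuration): with
 * timeout = 1 and rto_max = 8, the cumulative wait after N retransmits
 * is 1, 3, 7, 15, ... (doubling, then clamping at rto_max), so
 * secs_to_retrans(10, 1, 8) returns 4 and retrans_to_secs(4, 1, 8)
 * returns 15, the total time covered by those four retransmits.
 */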
374
375/* Address-family independent initialization for a tcp_sock.
376 *
 * NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
379 */
380void tcp_init_sock(struct sock *sk)
381{
382	struct inet_connection_sock *icsk = inet_csk(sk);
383	struct tcp_sock *tp = tcp_sk(sk);
384
385	__skb_queue_head_init(&tp->out_of_order_queue);
386	tcp_init_xmit_timers(sk);
387	tcp_prequeue_init(tp);
388	INIT_LIST_HEAD(&tp->tsq_node);
389
390	icsk->icsk_rto = TCP_TIMEOUT_INIT;
391	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
392
393	/* So many TCP implementations out there (incorrectly) count the
394	 * initial SYN frame in their delayed-ACK and congestion control
395	 * algorithms that we must have the following bandaid to talk
396	 * efficiently to them.  -DaveM
397	 */
398	tp->snd_cwnd = TCP_INIT_CWND;
399
400	/* See draft-stevens-tcpca-spec-01 for discussion of the
401	 * initialization of these values.
402	 */
403	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
404	tp->snd_cwnd_clamp = ~0;
405	tp->mss_cache = TCP_MSS_DEFAULT;
406	u64_stats_init(&tp->syncp);
407
408	tp->reordering = sysctl_tcp_reordering;
409	tcp_enable_early_retrans(tp);
410	tcp_assign_congestion_control(sk);
411
412	tp->tsoffset = 0;
413
414	sk->sk_state = TCP_CLOSE;
415
416	sk->sk_write_space = sk_stream_write_space;
417	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
418
419	icsk->icsk_sync_mss = tcp_sync_mss;
420
421	sk->sk_sndbuf = sysctl_tcp_wmem[1];
422	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
423
424	local_bh_disable();
425	sock_update_memcg(sk);
426	sk_sockets_allocated_inc(sk);
427	local_bh_enable();
428}
429EXPORT_SYMBOL(tcp_init_sock);
430
431static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
432{
433	if (sk->sk_tsflags) {
434		struct skb_shared_info *shinfo = skb_shinfo(skb);
435
436		sock_tx_timestamp(sk, &shinfo->tx_flags);
437		if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
438			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
439	}
440}
441
442/*
443 *	Wait for a TCP event.
444 *
445 *	Note that we don't need to lock the socket, as the upper poll layers
446 *	take care of normal races (between the test and the event) and we don't
447 *	go look at any of the socket buffers directly.
448 */
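/*
 * Illustrative examples of the mask computed below (not exhaustive):
 * on an ESTABLISHED socket with enough queued data and free send buffer
 * space the result is POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM; once
 * the peer's FIN has been received, RCV_SHUTDOWN adds POLLRDHUP; and when
 * both directions are shut down (or the socket is in TCP_CLOSE), POLLHUP
 * is reported as well.
 */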
449unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
450{
451	unsigned int mask;
452	struct sock *sk = sock->sk;
453	const struct tcp_sock *tp = tcp_sk(sk);
454
455	sock_rps_record_flow(sk);
456
457	sock_poll_wait(file, sk_sleep(sk), wait);
458	if (sk->sk_state == TCP_LISTEN)
459		return inet_csk_listen_poll(sk);
460
461	/* Socket is not locked. We are protected from async events
462	 * by poll logic and correct handling of state changes
463	 * made by other threads is impossible in any case.
464	 */
465
466	mask = 0;
467
468	/*
469	 * POLLHUP is certainly not done right. But poll() doesn't
470	 * have a notion of HUP in just one direction, and for a
471	 * socket the read side is more interesting.
472	 *
473	 * Some poll() documentation says that POLLHUP is incompatible
474	 * with the POLLOUT/POLLWR flags, so somebody should check this
475	 * all. But careful, it tends to be safer to return too many
476	 * bits than too few, and you can easily break real applications
477	 * if you don't tell them that something has hung up!
478	 *
479	 * Check-me.
480	 *
481	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
482	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making it impossible to poll()
	 * for write() in state CLOSE_WAIT. One solution is evident --- set
	 * POLLHUP if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look at how Solaris and DUX
	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
488	 * then we could set it on SND_SHUTDOWN. BTW examples given
489	 * in Stevens' books assume exactly this behaviour, it explains
490	 * why POLLHUP is incompatible with POLLOUT.	--ANK
491	 *
492	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
493	 * blocking on fresh not-connected or disconnected socket. --ANK
494	 */
495	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
496		mask |= POLLHUP;
497	if (sk->sk_shutdown & RCV_SHUTDOWN)
498		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
499
500	/* Connected or passive Fast Open socket? */
501	if (sk->sk_state != TCP_SYN_SENT &&
502	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
503		int target = sock_rcvlowat(sk, 0, INT_MAX);
504
505		if (tp->urg_seq == tp->copied_seq &&
506		    !sock_flag(sk, SOCK_URGINLINE) &&
507		    tp->urg_data)
508			target++;
509
		/* Potential race condition. If the read of tp below is
		 * reordered above the sk->sk_state check, we can be
		 * illegally awakened in SYN_* states. */
513		if (tp->rcv_nxt - tp->copied_seq >= target)
514			mask |= POLLIN | POLLRDNORM;
515
516		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
517			if (sk_stream_is_writeable(sk)) {
518				mask |= POLLOUT | POLLWRNORM;
519			} else {  /* send SIGIO later */
520				set_bit(SOCK_ASYNC_NOSPACE,
521					&sk->sk_socket->flags);
522				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
523
524				/* Race breaker. If space is freed after
525				 * wspace test but before the flags are set,
526				 * IO signal will be lost. Memory barrier
527				 * pairs with the input side.
528				 */
529				smp_mb__after_atomic();
530				if (sk_stream_is_writeable(sk))
531					mask |= POLLOUT | POLLWRNORM;
532			}
533		} else
534			mask |= POLLOUT | POLLWRNORM;
535
536		if (tp->urg_data & TCP_URG_VALID)
537			mask |= POLLPRI;
538	}
539	/* This barrier is coupled with smp_wmb() in tcp_reset() */
540	smp_rmb();
541	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
542		mask |= POLLERR;
543
544	return mask;
545}
546EXPORT_SYMBOL(tcp_poll);
547
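/*
 * Socket ioctl()s handled here, with an illustrative userspace call
 * (standard socket API, shown only as an example):
 *
 *	int unread;
 *	ioctl(fd, SIOCINQ, &unread);
 *
 * SIOCINQ reports the bytes available for reading, SIOCATMARK whether the
 * read pointer sits at the urgent mark, SIOCOUTQ the bytes not yet
 * acknowledged, and SIOCOUTQNSD only the bytes not yet sent.
 */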
548int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
549{
550	struct tcp_sock *tp = tcp_sk(sk);
551	int answ;
552	bool slow;
553
554	switch (cmd) {
555	case SIOCINQ:
556		if (sk->sk_state == TCP_LISTEN)
557			return -EINVAL;
558
559		slow = lock_sock_fast(sk);
560		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
561			answ = 0;
562		else if (sock_flag(sk, SOCK_URGINLINE) ||
563			 !tp->urg_data ||
564			 before(tp->urg_seq, tp->copied_seq) ||
565			 !before(tp->urg_seq, tp->rcv_nxt)) {
566
567			answ = tp->rcv_nxt - tp->copied_seq;
568
569			/* Subtract 1, if FIN was received */
570			if (answ && sock_flag(sk, SOCK_DONE))
571				answ--;
572		} else
573			answ = tp->urg_seq - tp->copied_seq;
574		unlock_sock_fast(sk, slow);
575		break;
576	case SIOCATMARK:
577		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
578		break;
579	case SIOCOUTQ:
580		if (sk->sk_state == TCP_LISTEN)
581			return -EINVAL;
582
583		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
584			answ = 0;
585		else
586			answ = tp->write_seq - tp->snd_una;
587		break;
588	case SIOCOUTQNSD:
589		if (sk->sk_state == TCP_LISTEN)
590			return -EINVAL;
591
592		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
593			answ = 0;
594		else
595			answ = tp->write_seq - tp->snd_nxt;
596		break;
597	default:
598		return -ENOIOCTLCMD;
599	}
600
601	return put_user(answ, (int __user *)arg);
602}
603EXPORT_SYMBOL(tcp_ioctl);
604
605static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
606{
607	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
608	tp->pushed_seq = tp->write_seq;
609}
610
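/* Force a PSH once more than half of the largest window the peer has ever
 * advertised has been queued since the last pushed byte, so that long
 * writes still get pushed out at a reasonable cadence.
 */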
611static inline bool forced_push(const struct tcp_sock *tp)
612{
613	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
614}
615
616static void skb_entail(struct sock *sk, struct sk_buff *skb)
617{
618	struct tcp_sock *tp = tcp_sk(sk);
619	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
620
621	skb->csum    = 0;
622	tcb->seq     = tcb->end_seq = tp->write_seq;
623	tcb->tcp_flags = TCPHDR_ACK;
624	tcb->sacked  = 0;
625	__skb_header_release(skb);
626	tcp_add_write_queue_tail(sk, skb);
627	sk->sk_wmem_queued += skb->truesize;
628	sk_mem_charge(sk, skb->truesize);
629	if (tp->nonagle & TCP_NAGLE_PUSH)
630		tp->nonagle &= ~TCP_NAGLE_PUSH;
631}
632
633static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
634{
635	if (flags & MSG_OOB)
636		tp->snd_up = tp->write_seq;
637}
638
/* If a not-yet-filled skb is pushed, do not send it if
 * we have data packets in the Qdisc or NIC queues:
 * because TX completion will happen shortly, this gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade-off.
 * As packets containing data payload have a bigger truesize
 * than pure ACK (dataless) packets, the last check prevents
 * autocorking if we only have an ACK in the Qdisc/NIC queues,
 * or if TX completion was delayed after we processed the ACK packet.
648 */
649static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
650				int size_goal)
651{
652	return skb->len < size_goal &&
653	       sysctl_tcp_autocorking &&
654	       skb != tcp_write_queue_head(sk) &&
655	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
656}
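
/* Note: autocorking can be disabled system-wide by writing 0 to
 * /proc/sys/net/ipv4/tcp_autocorking (sysctl_tcp_autocorking above).
 */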
657
658static void tcp_push(struct sock *sk, int flags, int mss_now,
659		     int nonagle, int size_goal)
660{
661	struct tcp_sock *tp = tcp_sk(sk);
662	struct sk_buff *skb;
663
664	if (!tcp_send_head(sk))
665		return;
666
667	skb = tcp_write_queue_tail(sk);
668	if (!(flags & MSG_MORE) || forced_push(tp))
669		tcp_mark_push(tp, skb);
670
671	tcp_mark_urg(tp, flags);
672
673	if (tcp_should_autocork(sk, skb, size_goal)) {
674
675		/* avoid atomic op if TSQ_THROTTLED bit is already set */
676		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
677			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
678			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
679		}
680		/* It is possible TX completion already happened
681		 * before we set TSQ_THROTTLED.
682		 */
683		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
684			return;
685	}
686
687	if (flags & MSG_MORE)
688		nonagle = TCP_NAGLE_CORK;
689
690	__tcp_push_pending_frames(sk, mss_now, nonagle);
691}
692
693static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
694				unsigned int offset, size_t len)
695{
696	struct tcp_splice_state *tss = rd_desc->arg.data;
697	int ret;
698
699	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
700			      tss->flags);
701	if (ret > 0)
702		rd_desc->count -= ret;
703	return ret;
704}
705
706static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
707{
708	/* Store TCP splice context information in read_descriptor_t. */
709	read_descriptor_t rd_desc = {
710		.arg.data = tss,
711		.count	  = tss->len,
712	};
713
714	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
715}
716
717/**
718 *  tcp_splice_read - splice data from TCP socket to a pipe
719 * @sock:	socket to splice from
720 * @ppos:	position (not valid)
721 * @pipe:	pipe to splice to
722 * @len:	number of bytes to splice
723 * @flags:	splice modifier flags
724 *
725 * Description:
726 *    Will read pages from given socket and fill them into a pipe.
727 *
728 **/
729ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
730			struct pipe_inode_info *pipe, size_t len,
731			unsigned int flags)
732{
733	struct sock *sk = sock->sk;
734	struct tcp_splice_state tss = {
735		.pipe = pipe,
736		.len = len,
737		.flags = flags,
738	};
739	long timeo;
740	ssize_t spliced;
741	int ret;
742
743	sock_rps_record_flow(sk);
744	/*
745	 * We can't seek on a socket input
746	 */
747	if (unlikely(*ppos))
748		return -ESPIPE;
749
750	ret = spliced = 0;
751
752	lock_sock(sk);
753
754	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
755	while (tss.len) {
756		ret = __tcp_splice_read(sk, &tss);
757		if (ret < 0)
758			break;
759		else if (!ret) {
760			if (spliced)
761				break;
762			if (sock_flag(sk, SOCK_DONE))
763				break;
764			if (sk->sk_err) {
765				ret = sock_error(sk);
766				break;
767			}
768			if (sk->sk_shutdown & RCV_SHUTDOWN)
769				break;
770			if (sk->sk_state == TCP_CLOSE) {
771				/*
				 * This occurs when the user tries to read
				 * from a never-connected socket.
774				 */
775				if (!sock_flag(sk, SOCK_DONE))
776					ret = -ENOTCONN;
777				break;
778			}
779			if (!timeo) {
780				ret = -EAGAIN;
781				break;
782			}
783			sk_wait_data(sk, &timeo);
784			if (signal_pending(current)) {
785				ret = sock_intr_errno(timeo);
786				break;
787			}
788			continue;
789		}
790		tss.len -= ret;
791		spliced += ret;
792
793		if (!timeo)
794			break;
795		release_sock(sk);
796		lock_sock(sk);
797
798		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
799		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
800		    signal_pending(current))
801			break;
802	}
803
804	release_sock(sk);
805
806	if (spliced)
807		return spliced;
808
809	return ret;
810}
811EXPORT_SYMBOL(tcp_splice_read);
812
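/* Allocate an skb for transmit: reserve room for the maximum protocol
 * header and leave exactly @size bytes of tailroom available to the
 * caller. Returns NULL if the memory cannot be charged or allocated,
 * signalling memory pressure and moderating the send buffer when the
 * allocation itself failed.
 */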
813struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
814{
815	struct sk_buff *skb;
816
817	/* The TCP header must be at least 32-bit aligned.  */
818	size = ALIGN(size, 4);
819
820	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
821	if (skb) {
822		if (sk_wmem_schedule(sk, skb->truesize)) {
823			skb_reserve(skb, sk->sk_prot->max_header);
824			/*
825			 * Make sure that we have exactly size bytes
826			 * available to the caller, no more, no less.
827			 */
828			skb->reserved_tailroom = skb->end - skb->tail - size;
829			return skb;
830		}
831		__kfree_skb(skb);
832	} else {
833		sk->sk_prot->enter_memory_pressure(sk);
834		sk_stream_moderate_sndbuf(sk);
835	}
836	return NULL;
837}
838
839static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
840				       int large_allowed)
841{
842	struct tcp_sock *tp = tcp_sk(sk);
843	u32 new_size_goal, size_goal;
844
845	if (!large_allowed || !sk_can_gso(sk))
846		return mss_now;
847
848	/* Note : tcp_tso_autosize() will eventually split this later */
849	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
850	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
851
852	/* We try hard to avoid divides here */
853	size_goal = tp->gso_segs * mss_now;
854	if (unlikely(new_size_goal < size_goal ||
855		     new_size_goal >= size_goal + mss_now)) {
856		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
857				     sk->sk_gso_max_segs);
858		size_goal = tp->gso_segs * mss_now;
859	}
860
861	return max(size_goal, mss_now);
862}
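
/* Note: the goal returned above is always a whole number of MSS-sized
 * segments (tp->gso_segs * mss_now) and is only recomputed when the
 * bound moves by at least one full MSS, which avoids a divide on every
 * send call.
 */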
863
864static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
865{
866	int mss_now;
867
868	mss_now = tcp_current_mss(sk);
869	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
870
871	return mss_now;
872}
873
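/* Core of sendpage(): attach references to the caller's pages to skbs on
 * the write queue (coalescing with the previous fragment when possible)
 * instead of copying the payload, then push according to the usual
 * Nagle/autocork rules.
 */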
874static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
875				size_t size, int flags)
876{
877	struct tcp_sock *tp = tcp_sk(sk);
878	int mss_now, size_goal;
879	int err;
880	ssize_t copied;
881	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
882
883	/* Wait for a connection to finish. One exception is TCP Fast Open
884	 * (passive side) where data is allowed to be sent before a connection
885	 * is fully established.
886	 */
887	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
888	    !tcp_passive_fastopen(sk)) {
889		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
890			goto out_err;
891	}
892
893	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
894
895	mss_now = tcp_send_mss(sk, &size_goal, flags);
896	copied = 0;
897
898	err = -EPIPE;
899	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
900		goto out_err;
901
902	while (size > 0) {
903		struct sk_buff *skb = tcp_write_queue_tail(sk);
904		int copy, i;
905		bool can_coalesce;
906
907		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
908new_segment:
909			if (!sk_stream_memory_free(sk))
910				goto wait_for_sndbuf;
911
912			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
913			if (!skb)
914				goto wait_for_memory;
915
916			skb_entail(sk, skb);
917			copy = size_goal;
918		}
919
920		if (copy > size)
921			copy = size;
922
923		i = skb_shinfo(skb)->nr_frags;
924		can_coalesce = skb_can_coalesce(skb, i, page, offset);
925		if (!can_coalesce && i >= sysctl_max_skb_frags) {
926			tcp_mark_push(tp, skb);
927			goto new_segment;
928		}
929		if (!sk_wmem_schedule(sk, copy))
930			goto wait_for_memory;
931
932		if (can_coalesce) {
933			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
934		} else {
935			get_page(page);
936			skb_fill_page_desc(skb, i, page, offset, copy);
937		}
938		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
939
940		skb->len += copy;
941		skb->data_len += copy;
942		skb->truesize += copy;
943		sk->sk_wmem_queued += copy;
944		sk_mem_charge(sk, copy);
945		skb->ip_summed = CHECKSUM_PARTIAL;
946		tp->write_seq += copy;
947		TCP_SKB_CB(skb)->end_seq += copy;
948		tcp_skb_pcount_set(skb, 0);
949
950		if (!copied)
951			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
952
953		copied += copy;
954		offset += copy;
955		if (!(size -= copy)) {
956			tcp_tx_timestamp(sk, skb);
957			goto out;
958		}
959
960		if (skb->len < size_goal || (flags & MSG_OOB))
961			continue;
962
963		if (forced_push(tp)) {
964			tcp_mark_push(tp, skb);
965			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
966		} else if (skb == tcp_send_head(sk))
967			tcp_push_one(sk, mss_now);
968		continue;
969
970wait_for_sndbuf:
971		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
972wait_for_memory:
973		tcp_push(sk, flags & ~MSG_MORE, mss_now,
974			 TCP_NAGLE_PUSH, size_goal);
975
976		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
977			goto do_error;
978
979		mss_now = tcp_send_mss(sk, &size_goal, flags);
980	}
981
982out:
983	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
984		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
985	return copied;
986
987do_error:
988	if (copied)
989		goto out;
990out_err:
991	return sk_stream_error(sk, flags, err);
992}
993
994int tcp_sendpage(struct sock *sk, struct page *page, int offset,
995		 size_t size, int flags)
996{
997	ssize_t res;
998
999	if (!(sk->sk_route_caps & NETIF_F_SG) ||
1000	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
1001		return sock_no_sendpage(sk->sk_socket, page, offset, size,
1002					flags);
1003
1004	lock_sock(sk);
1005	res = do_tcp_sendpages(sk, page, offset, size, flags);
1006	release_sock(sk);
1007	return res;
1008}
1009EXPORT_SYMBOL(tcp_sendpage);
1010
1011static inline int select_size(const struct sock *sk, bool sg)
1012{
1013	const struct tcp_sock *tp = tcp_sk(sk);
1014	int tmp = tp->mss_cache;
1015
1016	if (sg) {
1017		if (sk_can_gso(sk)) {
			/* Small frames won't use a full page:
			 * payload will immediately follow the TCP header.
1020			 */
1021			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
1022		} else {
1023			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1024
1025			if (tmp >= pgbreak &&
1026			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1027				tmp = pgbreak;
1028		}
1029	}
1030
1031	return tmp;
1032}
1033
1034void tcp_free_fastopen_req(struct tcp_sock *tp)
1035{
1036	if (tp->fastopen_req) {
1037		kfree(tp->fastopen_req);
1038		tp->fastopen_req = NULL;
1039	}
1040}
1041
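/* Client-side TCP Fast Open entry point, reached from tcp_sendmsg() when
 * MSG_FASTOPEN is set. Illustrative userspace usage (plain socket API,
 * shown only as an example):
 *
 *	sendto(fd, buf, len, MSG_FASTOPEN,
 *	       (struct sockaddr *)&daddr, sizeof(daddr));
 *
 * which performs the connect and queues the first data in one call,
 * provided the client bit (0x1) of net.ipv4.tcp_fastopen is set.
 */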
1042static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1043				int *copied, size_t size)
1044{
1045	struct tcp_sock *tp = tcp_sk(sk);
1046	int err, flags;
1047
1048	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1049		return -EOPNOTSUPP;
1050	if (tp->fastopen_req)
1051		return -EALREADY; /* Another Fast Open is in progress */
1052
1053	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1054				   sk->sk_allocation);
1055	if (unlikely(!tp->fastopen_req))
1056		return -ENOBUFS;
1057	tp->fastopen_req->data = msg;
1058	tp->fastopen_req->size = size;
1059
1060	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1061	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1062				    msg->msg_namelen, flags);
1063	*copied = tp->fastopen_req->copied;
1064	tcp_free_fastopen_req(tp);
1065	return err;
1066}
1067
1068int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1069{
1070	struct tcp_sock *tp = tcp_sk(sk);
1071	struct sk_buff *skb;
1072	int flags, err, copied = 0;
1073	int mss_now = 0, size_goal, copied_syn = 0;
1074	bool sg;
1075	long timeo;
1076
1077	lock_sock(sk);
1078
1079	flags = msg->msg_flags;
1080	if (flags & MSG_FASTOPEN) {
1081		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1082		if (err == -EINPROGRESS && copied_syn > 0)
1083			goto out;
1084		else if (err)
1085			goto out_err;
1086	}
1087
1088	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1089
1090	/* Wait for a connection to finish. One exception is TCP Fast Open
1091	 * (passive side) where data is allowed to be sent before a connection
1092	 * is fully established.
1093	 */
1094	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1095	    !tcp_passive_fastopen(sk)) {
1096		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1097			goto do_error;
1098	}
1099
1100	if (unlikely(tp->repair)) {
1101		if (tp->repair_queue == TCP_RECV_QUEUE) {
1102			copied = tcp_send_rcvq(sk, msg, size);
1103			goto out_nopush;
1104		}
1105
1106		err = -EINVAL;
1107		if (tp->repair_queue == TCP_NO_QUEUE)
1108			goto out_err;
1109
1110		/* 'common' sending to sendq */
1111	}
1112
1113	/* This should be in poll */
1114	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1115
1116	mss_now = tcp_send_mss(sk, &size_goal, flags);
1117
1118	/* Ok commence sending. */
1119	copied = 0;
1120
1121	err = -EPIPE;
1122	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1123		goto out_err;
1124
1125	sg = !!(sk->sk_route_caps & NETIF_F_SG);
1126
1127	while (msg_data_left(msg)) {
1128		int copy = 0;
1129		int max = size_goal;
1130
1131		skb = tcp_write_queue_tail(sk);
1132		if (tcp_send_head(sk)) {
1133			if (skb->ip_summed == CHECKSUM_NONE)
1134				max = mss_now;
1135			copy = max - skb->len;
1136		}
1137
1138		if (copy <= 0) {
1139new_segment:
			/* Allocate a new segment. If the interface is SG,
			 * allocate an skb that fits in a single page.
1142			 */
1143			if (!sk_stream_memory_free(sk))
1144				goto wait_for_sndbuf;
1145
1146			skb = sk_stream_alloc_skb(sk,
1147						  select_size(sk, sg),
1148						  sk->sk_allocation);
1149			if (!skb)
1150				goto wait_for_memory;
1151
1152			/*
1153			 * Check whether we can use HW checksum.
1154			 */
1155			if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1156				skb->ip_summed = CHECKSUM_PARTIAL;
1157
1158			skb_entail(sk, skb);
1159			copy = size_goal;
1160			max = size_goal;
1161
1162			/* All packets are restored as if they have
1163			 * already been sent. skb_mstamp isn't set to
1164			 * avoid wrong rtt estimation.
1165			 */
1166			if (tp->repair)
1167				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1168		}
1169
1170		/* Try to append data to the end of skb. */
1171		if (copy > msg_data_left(msg))
1172			copy = msg_data_left(msg);
1173
1174		/* Where to copy to? */
1175		if (skb_availroom(skb) > 0) {
1176			/* We have some space in skb head. Superb! */
1177			copy = min_t(int, copy, skb_availroom(skb));
1178			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1179			if (err)
1180				goto do_fault;
1181		} else {
1182			bool merge = true;
1183			int i = skb_shinfo(skb)->nr_frags;
1184			struct page_frag *pfrag = sk_page_frag(sk);
1185
1186			if (!sk_page_frag_refill(sk, pfrag))
1187				goto wait_for_memory;
1188
1189			if (!skb_can_coalesce(skb, i, pfrag->page,
1190					      pfrag->offset)) {
1191				if (i == sysctl_max_skb_frags || !sg) {
1192					tcp_mark_push(tp, skb);
1193					goto new_segment;
1194				}
1195				merge = false;
1196			}
1197
1198			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1199
1200			if (!sk_wmem_schedule(sk, copy))
1201				goto wait_for_memory;
1202
1203			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1204						       pfrag->page,
1205						       pfrag->offset,
1206						       copy);
1207			if (err)
1208				goto do_error;
1209
1210			/* Update the skb. */
1211			if (merge) {
1212				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1213			} else {
1214				skb_fill_page_desc(skb, i, pfrag->page,
1215						   pfrag->offset, copy);
1216				get_page(pfrag->page);
1217			}
1218			pfrag->offset += copy;
1219		}
1220
1221		if (!copied)
1222			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1223
1224		tp->write_seq += copy;
1225		TCP_SKB_CB(skb)->end_seq += copy;
1226		tcp_skb_pcount_set(skb, 0);
1227
1228		copied += copy;
1229		if (!msg_data_left(msg)) {
1230			tcp_tx_timestamp(sk, skb);
1231			goto out;
1232		}
1233
1234		if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1235			continue;
1236
1237		if (forced_push(tp)) {
1238			tcp_mark_push(tp, skb);
1239			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1240		} else if (skb == tcp_send_head(sk))
1241			tcp_push_one(sk, mss_now);
1242		continue;
1243
1244wait_for_sndbuf:
1245		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1246wait_for_memory:
1247		if (copied)
1248			tcp_push(sk, flags & ~MSG_MORE, mss_now,
1249				 TCP_NAGLE_PUSH, size_goal);
1250
1251		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1252			goto do_error;
1253
1254		mss_now = tcp_send_mss(sk, &size_goal, flags);
1255	}
1256
1257out:
1258	if (copied)
1259		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1260out_nopush:
1261	release_sock(sk);
1262	return copied + copied_syn;
1263
1264do_fault:
1265	if (!skb->len) {
1266		tcp_unlink_write_queue(skb, sk);
1267		/* It is the one place in all of TCP, except connection
1268		 * reset, where we can be unlinking the send_head.
1269		 */
1270		tcp_check_send_head(sk, skb);
1271		sk_wmem_free_skb(sk, skb);
1272	}
1273
1274do_error:
1275	if (copied + copied_syn)
1276		goto out;
1277out_err:
1278	err = sk_stream_error(sk, flags, err);
1279	release_sock(sk);
1280	return err;
1281}
1282EXPORT_SYMBOL(tcp_sendmsg);
1283
1284/*
1285 *	Handle reading urgent data. BSD has very simple semantics for
1286 *	this, no blocking and very strange errors 8)
1287 */
1288
1289static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1290{
1291	struct tcp_sock *tp = tcp_sk(sk);
1292
1293	/* No URG data to read. */
1294	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1295	    tp->urg_data == TCP_URG_READ)
1296		return -EINVAL;	/* Yes this is right ! */
1297
1298	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1299		return -ENOTCONN;
1300
1301	if (tp->urg_data & TCP_URG_VALID) {
1302		int err = 0;
1303		char c = tp->urg_data;
1304
1305		if (!(flags & MSG_PEEK))
1306			tp->urg_data = TCP_URG_READ;
1307
1308		/* Read urgent data. */
1309		msg->msg_flags |= MSG_OOB;
1310
1311		if (len > 0) {
1312			if (!(flags & MSG_TRUNC))
1313				err = memcpy_to_msg(msg, &c, 1);
1314			len = 1;
1315		} else
1316			msg->msg_flags |= MSG_TRUNC;
1317
1318		return err ? -EFAULT : len;
1319	}
1320
1321	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1322		return 0;
1323
1324	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1325	 * the available implementations agree in this case:
1326	 * this call should never block, independent of the
1327	 * blocking state of the socket.
1328	 * Mike <pall@rz.uni-karlsruhe.de>
1329	 */
1330	return -EAGAIN;
1331}
1332
1333static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1334{
1335	struct sk_buff *skb;
1336	int copied = 0, err = 0;
1337
1338	/* XXX -- need to support SO_PEEK_OFF */
1339
1340	skb_queue_walk(&sk->sk_write_queue, skb) {
1341		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1342		if (err)
1343			break;
1344
1345		copied += skb->len;
1346	}
1347
1348	return err ?: copied;
1349}
1350
1351/* Clean up the receive buffer for full frames taken by the user,
1352 * then send an ACK if necessary.  COPIED is the number of bytes
1353 * tcp_recvmsg has given to the user so far, it speeds up the
1354 * calculation of whether or not we must ACK for the sake of
1355 * a window update.
1356 */
1357static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1358{
1359	struct tcp_sock *tp = tcp_sk(sk);
1360	bool time_to_ack = false;
1361
1362	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1363
1364	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1365	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1366	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1367
1368	if (inet_csk_ack_scheduled(sk)) {
1369		const struct inet_connection_sock *icsk = inet_csk(sk);
1370		   /* Delayed ACKs frequently hit locked sockets during bulk
1371		    * receive. */
1372		if (icsk->icsk_ack.blocked ||
1373		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1374		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1375		    /*
		     * If this read emptied the receive buffer, we send an
		     * ACK when the connection is not bidirectional, the
		     * user has drained the receive buffer, and there was
		     * a small segment left in the queue.
1380		     */
1381		    (copied > 0 &&
1382		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1383		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1384		       !icsk->icsk_ack.pingpong)) &&
1385		      !atomic_read(&sk->sk_rmem_alloc)))
1386			time_to_ack = true;
1387	}
1388
1389	/* We send an ACK if we can now advertise a non-zero window
1390	 * which has been raised "significantly".
1391	 *
1392	 * Even if window raised up to infinity, do not send window open ACK
1393	 * in states, where we will not receive more. It is useless.
1394	 */
1395	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1396		__u32 rcv_window_now = tcp_receive_window(tp);
1397
1398		/* Optimize, __tcp_select_window() is not cheap. */
1399		if (2*rcv_window_now <= tp->window_clamp) {
1400			__u32 new_window = __tcp_select_window(sk);
1401
			/* Send an ACK now if this read freed lots of space
			 * in our buffer. We can advertise the new window now
			 * if it is not smaller than the current one.
1405			 * "Lots" means "at least twice" here.
1406			 */
1407			if (new_window && new_window >= 2 * rcv_window_now)
1408				time_to_ack = true;
1409		}
1410	}
1411	if (time_to_ack)
1412		tcp_send_ack(sk);
1413}
1414
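/* Drain the prequeue: feed every queued skb through the regular receive
 * path via sk_backlog_rcv() with BHs disabled, then reset the prequeue
 * memory accounting.
 */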
1415static void tcp_prequeue_process(struct sock *sk)
1416{
1417	struct sk_buff *skb;
1418	struct tcp_sock *tp = tcp_sk(sk);
1419
1420	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1421
1422	/* RX process wants to run with disabled BHs, though it is not
1423	 * necessary */
1424	local_bh_disable();
1425	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1426		sk_backlog_rcv(sk, skb);
1427	local_bh_enable();
1428
1429	/* Clear memory counter. */
1430	tp->ucopy.memory = 0;
1431}
1432
1433static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1434{
1435	struct sk_buff *skb;
1436	u32 offset;
1437
1438	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1439		offset = seq - TCP_SKB_CB(skb)->seq;
1440		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
1441			offset--;
1442		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1443			*off = offset;
1444			return skb;
1445		}
		/* This looks weird, but it can happen if TCP collapsing
		 * split a large GRO packet while we had released the socket
		 * lock in skb_splice_bits().
1449		 */
1450		sk_eat_skb(sk, skb);
1451	}
1452	return NULL;
1453}
1454
1455/*
1456 * This routine provides an alternative to tcp_recvmsg() for routines
1457 * that would like to handle copying from skbuffs directly in 'sendfile'
1458 * fashion.
1459 * Note:
1460 *	- It is assumed that the socket was locked by the caller.
1461 *	- The routine does not block.
1462 *	- At present, there is no support for reading OOB data
1463 *	  or for 'peeking' the socket using this routine
1464 *	  (although both would be easy to implement).
1465 */
1466int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1467		  sk_read_actor_t recv_actor)
1468{
1469	struct sk_buff *skb;
1470	struct tcp_sock *tp = tcp_sk(sk);
1471	u32 seq = tp->copied_seq;
1472	u32 offset;
1473	int copied = 0;
1474
1475	if (sk->sk_state == TCP_LISTEN)
1476		return -ENOTCONN;
1477	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1478		if (offset < skb->len) {
1479			int used;
1480			size_t len;
1481
1482			len = skb->len - offset;
1483			/* Stop reading if we hit a patch of urgent data */
1484			if (tp->urg_data) {
1485				u32 urg_offset = tp->urg_seq - seq;
1486				if (urg_offset < len)
1487					len = urg_offset;
1488				if (!len)
1489					break;
1490			}
1491			used = recv_actor(desc, skb, offset, len);
1492			if (used <= 0) {
1493				if (!copied)
1494					copied = used;
1495				break;
1496			} else if (used <= len) {
1497				seq += used;
1498				copied += used;
1499				offset += used;
1500			}
1501			/* If recv_actor drops the lock (e.g. TCP splice
1502			 * receive) the skb pointer might be invalid when
1503			 * getting here: tcp_collapse might have deleted it
1504			 * while aggregating skbs from the socket queue.
1505			 */
1506			skb = tcp_recv_skb(sk, seq - 1, &offset);
1507			if (!skb)
1508				break;
1509			/* TCP coalescing might have appended data to the skb.
1510			 * Try to splice more frags
1511			 */
1512			if (offset + 1 != skb->len)
1513				continue;
1514		}
1515		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1516			sk_eat_skb(sk, skb);
1517			++seq;
1518			break;
1519		}
1520		sk_eat_skb(sk, skb);
1521		if (!desc->count)
1522			break;
1523		tp->copied_seq = seq;
1524	}
1525	tp->copied_seq = seq;
1526
1527	tcp_rcv_space_adjust(sk);
1528
1529	/* Clean up data we have read: This will do ACK frames. */
1530	if (copied > 0) {
1531		tcp_recv_skb(sk, seq, &offset);
1532		tcp_cleanup_rbuf(sk, copied);
1533	}
1534	return copied;
1535}
1536EXPORT_SYMBOL(tcp_read_sock);
1537
1538/*
1539 *	This routine copies from a sock struct into the user buffer.
1540 *
 *	Technical note: in 2.3 we work on a _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, the code can be improved even further.
1544 */
1545
1546int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1547		int flags, int *addr_len)
1548{
1549	struct tcp_sock *tp = tcp_sk(sk);
1550	int copied = 0;
1551	u32 peek_seq;
1552	u32 *seq;
1553	unsigned long used;
1554	int err;
1555	int target;		/* Read at least this many bytes */
1556	long timeo;
1557	struct task_struct *user_recv = NULL;
1558	struct sk_buff *skb;
1559	u32 urg_hole = 0;
1560
1561	if (unlikely(flags & MSG_ERRQUEUE))
1562		return inet_recv_error(sk, msg, len, addr_len);
1563
1564	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
1565	    (sk->sk_state == TCP_ESTABLISHED))
1566		sk_busy_loop(sk, nonblock);
1567
1568	lock_sock(sk);
1569
1570	err = -ENOTCONN;
1571	if (sk->sk_state == TCP_LISTEN)
1572		goto out;
1573
1574	timeo = sock_rcvtimeo(sk, nonblock);
1575
1576	/* Urgent data needs to be handled specially. */
1577	if (flags & MSG_OOB)
1578		goto recv_urg;
1579
1580	if (unlikely(tp->repair)) {
1581		err = -EPERM;
1582		if (!(flags & MSG_PEEK))
1583			goto out;
1584
1585		if (tp->repair_queue == TCP_SEND_QUEUE)
1586			goto recv_sndq;
1587
1588		err = -EINVAL;
1589		if (tp->repair_queue == TCP_NO_QUEUE)
1590			goto out;
1591
1592		/* 'common' recv queue MSG_PEEK-ing */
1593	}
1594
1595	seq = &tp->copied_seq;
1596	if (flags & MSG_PEEK) {
1597		peek_seq = tp->copied_seq;
1598		seq = &peek_seq;
1599	}
1600
1601	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1602
1603	do {
1604		u32 offset;
1605
1606		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1607		if (tp->urg_data && tp->urg_seq == *seq) {
1608			if (copied)
1609				break;
1610			if (signal_pending(current)) {
1611				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1612				break;
1613			}
1614		}
1615
1616		/* Next get a buffer. */
1617
1618		skb_queue_walk(&sk->sk_receive_queue, skb) {
1619			/* Now that we have two receive queues this
1620			 * shouldn't happen.
1621			 */
1622			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1623				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1624				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1625				 flags))
1626				break;
1627
1628			offset = *seq - TCP_SKB_CB(skb)->seq;
1629			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
1630				offset--;
1631			if (offset < skb->len)
1632				goto found_ok_skb;
1633			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1634				goto found_fin_ok;
1635			WARN(!(flags & MSG_PEEK),
1636			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1637			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1638		}
1639
		/* Well, if we have backlog, try to process it now. */
1641
1642		if (copied >= target && !sk->sk_backlog.tail)
1643			break;
1644
1645		if (copied) {
1646			if (sk->sk_err ||
1647			    sk->sk_state == TCP_CLOSE ||
1648			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1649			    !timeo ||
1650			    signal_pending(current))
1651				break;
1652		} else {
1653			if (sock_flag(sk, SOCK_DONE))
1654				break;
1655
1656			if (sk->sk_err) {
1657				copied = sock_error(sk);
1658				break;
1659			}
1660
1661			if (sk->sk_shutdown & RCV_SHUTDOWN)
1662				break;
1663
1664			if (sk->sk_state == TCP_CLOSE) {
1665				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when the user tries to
					 * read from a never-connected socket.
1668					 */
1669					copied = -ENOTCONN;
1670					break;
1671				}
1672				break;
1673			}
1674
1675			if (!timeo) {
1676				copied = -EAGAIN;
1677				break;
1678			}
1679
1680			if (signal_pending(current)) {
1681				copied = sock_intr_errno(timeo);
1682				break;
1683			}
1684		}
1685
1686		tcp_cleanup_rbuf(sk, copied);
1687
1688		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1689			/* Install new reader */
1690			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1691				user_recv = current;
1692				tp->ucopy.task = user_recv;
1693				tp->ucopy.msg = msg;
1694			}
1695
1696			tp->ucopy.len = len;
1697
1698			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1699				!(flags & (MSG_PEEK | MSG_TRUNC)));
1700
			/* Ugly... If the prequeue is not empty, we have to
			 * process it before releasing the socket, otherwise
			 * ordering will be broken on the second iteration.
			 * A more elegant solution is required!!!
1705			 *
1706			 * Look: we have the following (pseudo)queues:
1707			 *
1708			 * 1. packets in flight
1709			 * 2. backlog
1710			 * 3. prequeue
1711			 * 4. receive_queue
1712			 *
1713			 * Each queue can be processed only if the next ones
1714			 * are empty. At this point we have empty receive_queue.
1715			 * But prequeue _can_ be not empty after 2nd iteration,
1716			 * when we jumped to start of loop because backlog
1717			 * processing added something to receive_queue.
1718			 * We cannot release_sock(), because backlog contains
1719			 * packets arrived _after_ prequeued ones.
1720			 *
			 * In short, the algorithm is clear --- process all
			 * the queues in order. We could do it more directly,
			 * requeueing packets from the backlog to the prequeue
			 * when the latter is not empty; that would be more
			 * elegant, but it eats cycles, unfortunately.
1726			 */
1727			if (!skb_queue_empty(&tp->ucopy.prequeue))
1728				goto do_prequeue;
1729
1730			/* __ Set realtime policy in scheduler __ */
1731		}
1732
1733		if (copied >= target) {
1734			/* Do not sleep, just process backlog. */
1735			release_sock(sk);
1736			lock_sock(sk);
1737		} else
1738			sk_wait_data(sk, &timeo);
1739
1740		if (user_recv) {
1741			int chunk;
1742
1743			/* __ Restore normal policy in scheduler __ */
1744
1745			if ((chunk = len - tp->ucopy.len) != 0) {
1746				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1747				len -= chunk;
1748				copied += chunk;
1749			}
1750
1751			if (tp->rcv_nxt == tp->copied_seq &&
1752			    !skb_queue_empty(&tp->ucopy.prequeue)) {
1753do_prequeue:
1754				tcp_prequeue_process(sk);
1755
1756				if ((chunk = len - tp->ucopy.len) != 0) {
1757					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1758					len -= chunk;
1759					copied += chunk;
1760				}
1761			}
1762		}
1763		if ((flags & MSG_PEEK) &&
1764		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
1765			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1766					    current->comm,
1767					    task_pid_nr(current));
1768			peek_seq = tp->copied_seq;
1769		}
1770		continue;
1771
1772	found_ok_skb:
1773		/* Ok so how much can we use? */
1774		used = skb->len - offset;
1775		if (len < used)
1776			used = len;
1777
1778		/* Do we have urgent data here? */
1779		if (tp->urg_data) {
1780			u32 urg_offset = tp->urg_seq - *seq;
1781			if (urg_offset < used) {
1782				if (!urg_offset) {
1783					if (!sock_flag(sk, SOCK_URGINLINE)) {
1784						++*seq;
1785						urg_hole++;
1786						offset++;
1787						used--;
1788						if (!used)
1789							goto skip_copy;
1790					}
1791				} else
1792					used = urg_offset;
1793			}
1794		}
1795
1796		if (!(flags & MSG_TRUNC)) {
1797			err = skb_copy_datagram_msg(skb, offset, msg, used);
1798			if (err) {
1799				/* Exception. Bailout! */
1800				if (!copied)
1801					copied = -EFAULT;
1802				break;
1803			}
1804		}
1805
1806		*seq += used;
1807		copied += used;
1808		len -= used;
1809
1810		tcp_rcv_space_adjust(sk);
1811
1812skip_copy:
1813		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1814			tp->urg_data = 0;
1815			tcp_fast_path_check(sk);
1816		}
1817		if (used + offset < skb->len)
1818			continue;
1819
1820		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1821			goto found_fin_ok;
1822		if (!(flags & MSG_PEEK))
1823			sk_eat_skb(sk, skb);
1824		continue;
1825
1826	found_fin_ok:
1827		/* Process the FIN. */
1828		++*seq;
1829		if (!(flags & MSG_PEEK))
1830			sk_eat_skb(sk, skb);
1831		break;
1832	} while (len > 0);
1833
1834	if (user_recv) {
1835		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1836			int chunk;
1837
1838			tp->ucopy.len = copied > 0 ? len : 0;
1839
1840			tcp_prequeue_process(sk);
1841
1842			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1843				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1844				len -= chunk;
1845				copied += chunk;
1846			}
1847		}
1848
1849		tp->ucopy.task = NULL;
1850		tp->ucopy.len = 0;
1851	}
1852
	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on a connected socket. I was just happy when I found this 8) --ANK
1855	 */
1856
1857	/* Clean up data we have read: This will do ACK frames. */
1858	tcp_cleanup_rbuf(sk, copied);
1859
1860	release_sock(sk);
1861	return copied;
1862
1863out:
1864	release_sock(sk);
1865	return err;
1866
1867recv_urg:
1868	err = tcp_recv_urg(sk, msg, len, flags);
1869	goto out;
1870
1871recv_sndq:
1872	err = tcp_peek_sndq(sk, msg, len);
1873	goto out;
1874}
1875EXPORT_SYMBOL(tcp_recvmsg);
1876
1877void tcp_set_state(struct sock *sk, int state)
1878{
1879	int oldstate = sk->sk_state;
1880
1881	switch (state) {
1882	case TCP_ESTABLISHED:
1883		if (oldstate != TCP_ESTABLISHED)
1884			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1885		break;
1886
1887	case TCP_CLOSE:
1888		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1889			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1890
1891		sk->sk_prot->unhash(sk);
1892		if (inet_csk(sk)->icsk_bind_hash &&
1893		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1894			inet_put_port(sk);
1895		/* fall through */
1896	default:
1897		if (oldstate == TCP_ESTABLISHED)
1898			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1899	}
1900
1901	/* Change state AFTER socket is unhashed to avoid closed
1902	 * socket sitting in hash tables.
1903	 */
1904	sk->sk_state = state;
1905
1906#ifdef STATE_TRACE
1907	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1908#endif
1909}
1910EXPORT_SYMBOL_GPL(tcp_set_state);
1911
1912/*
1913 *	State processing on a close. This implements the state shift for
1914 *	sending our FIN frame. Note that we only send a FIN for some
1915 *	states. A shutdown() may have already sent the FIN, or we may be
1916 *	closed.
1917 */
1918
1919static const unsigned char new_state[16] = {
1920  /* current state:        new state:      action:	*/
1921  [0 /* (Invalid) */]	= TCP_CLOSE,
1922  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1923  [TCP_SYN_SENT]	= TCP_CLOSE,
1924  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1925  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
1926  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
1927  [TCP_TIME_WAIT]	= TCP_CLOSE,
1928  [TCP_CLOSE]		= TCP_CLOSE,
1929  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
1930  [TCP_LAST_ACK]	= TCP_LAST_ACK,
1931  [TCP_LISTEN]		= TCP_CLOSE,
1932  [TCP_CLOSING]		= TCP_CLOSING,
1933  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
1934};
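/* Each entry encodes the next state in the low TCP_STATE_MASK bits, plus
 * TCP_ACTION_FIN when a FIN must be transmitted for the transition;
 * tcp_close_state() below decodes both parts.
 */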
1935
1936static int tcp_close_state(struct sock *sk)
1937{
1938	int next = (int)new_state[sk->sk_state];
1939	int ns = next & TCP_STATE_MASK;
1940
1941	tcp_set_state(sk, ns);
1942
1943	return next & TCP_ACTION_FIN;
1944}
1945
1946/*
1947 *	Shutdown the sending side of a connection. Much like close except
1948 *	that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1949 */
1950
1951void tcp_shutdown(struct sock *sk, int how)
1952{
1953	/*	We need to grab some memory, and put together a FIN,
1954	 *	and then put it into the queue to be sent.
1955	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1956	 */
1957	if (!(how & SEND_SHUTDOWN))
1958		return;
1959
1960	/* If we've already sent a FIN, or it's a closed state, skip this. */
1961	if ((1 << sk->sk_state) &
1962	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1963	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1964		/* Clear out any half completed packets.  FIN if needed. */
1965		if (tcp_close_state(sk))
1966			tcp_send_fin(sk);
1967	}
1968}
1969EXPORT_SYMBOL(tcp_shutdown);
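
/* Userspace reaches tcp_shutdown() via shutdown(2); e.g. (illustrative
 * sketch only, not kernel code):
 *
 *	shutdown(fd, SHUT_WR);
 *
 * which the socket layer turns into SEND_SHUTDOWN, queueing a FIN here if
 * the connection is in a state that allows one.
 */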
1970
1971bool tcp_check_oom(struct sock *sk, int shift)
1972{
1973	bool too_many_orphans, out_of_socket_memory;
1974
1975	too_many_orphans = tcp_too_many_orphans(sk, shift);
1976	out_of_socket_memory = tcp_out_of_memory(sk);
1977
1978	if (too_many_orphans)
1979		net_info_ratelimited("too many orphaned sockets\n");
1980	if (out_of_socket_memory)
1981		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
1982	return too_many_orphans || out_of_socket_memory;
1983}
1984
1985void tcp_close(struct sock *sk, long timeout)
1986{
1987	struct sk_buff *skb;
1988	int data_was_unread = 0;
1989	int state;
1990
1991	lock_sock(sk);
1992	sk->sk_shutdown = SHUTDOWN_MASK;
1993
1994	if (sk->sk_state == TCP_LISTEN) {
1995		tcp_set_state(sk, TCP_CLOSE);
1996
1997		/* Special case. */
1998		inet_csk_listen_stop(sk);
1999
2000		goto adjudge_to_death;
2001	}
2002
2003	/*  We need to flush the recv. buffs.  We do this only on the
2004	 *  descriptor close, not protocol-sourced closes, because the
2005	 *  reader process may not have drained the data yet!
2006	 */
2007	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2008		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2009
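		/* A FIN consumes one sequence number but carries no data,
		 * so do not count it as unread.
		 */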
2010		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2011			len--;
2012		data_was_unread += len;
2013		__kfree_skb(skb);
2014	}
2015
2016	sk_mem_reclaim(sk);
2017
2018	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2019	if (sk->sk_state == TCP_CLOSE)
2020		goto adjudge_to_death;
2021
2022	/* As outlined in RFC 2525, section 2.17, we send a RST here because
2023	 * data was lost. To witness the awful effects of the old behavior of
2024	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2025	 * GET in an FTP client, suspend the process, wait for the client to
2026	 * advertise a zero window, then kill -9 the FTP client, wheee...
2027	 * Note: timeout is always zero in such a case.
2028	 */
2029	if (unlikely(tcp_sk(sk)->repair)) {
2030		sk->sk_prot->disconnect(sk, 0);
2031	} else if (data_was_unread) {
2032		/* Unread data was tossed, zap the connection. */
2033		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2034		tcp_set_state(sk, TCP_CLOSE);
2035		tcp_send_active_reset(sk, sk->sk_allocation);
2036	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2037		/* Check zero linger _after_ checking for unread data. */
2038		sk->sk_prot->disconnect(sk, 0);
2039		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2040	} else if (tcp_close_state(sk)) {
2041		/* We FIN if the application ate all the data before
2042		 * zapping the connection.
2043		 */
2044
2045		/* RED-PEN. Formally speaking, we have broken TCP state
2046		 * machine. State transitions:
2047		 *
2048		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2049		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
2050		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2051		 *
2052		 * are legal only when the FIN has actually been sent (i.e. is
2053		 * in the window), rather than queued out of window. Purists
2054		 * may blame us.
2055		 *
2056		 * E.g. the "RFC state" is ESTABLISHED if the Linux state is
2057		 * FIN-WAIT-1 but the FIN has not been sent yet.
2058		 *
2059		 * The visible deviations are that we sometimes enter the
2060		 * time-wait state when it is not really required (harmless),
2061		 * and do not send active resets when the specs require them
2062		 * (TCP_ESTABLISHED and TCP_CLOSE_WAIT, which look like CLOSING
2063		 * or LAST_ACK to Linux). Probably I missed some more small holes.
2064		 * 						--ANK
2065		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2066		 * in a single packet! (May consider it later but will
2067		 * probably need API support or TCP_CORK SYN-ACK until
2068		 * data is written and socket is closed.)
2069		 */
2070		tcp_send_fin(sk);
2071	}
2072
2073	sk_stream_wait_close(sk, timeout);
2074
2075adjudge_to_death:
2076	state = sk->sk_state;
2077	sock_hold(sk);
2078	sock_orphan(sk);
2079
2080	/* It is the last release_sock in its life. It will remove backlog. */
2081	release_sock(sk);
2082
2083
2084	/* Now the socket is owned by the kernel and we acquire the BH lock
2085	 * to finish the close. No need to check for user refs.
2086	 */
2087	local_bh_disable();
2088	bh_lock_sock(sk);
2089	WARN_ON(sock_owned_by_user(sk));
2090
2091	percpu_counter_inc(sk->sk_prot->orphan_count);
2092
2093	/* Have we already been destroyed by a softirq or backlog? */
2094	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2095		goto out;
2096
2097	/*	This is a (useful) BSD violation of the RFC. There is a
2098	 *	problem with TCP as specified: the other end could keep a
2099	 *	socket open forever with no application left at this end.
2100	 *	We use a 1 minute timeout (about the same as BSD) and then
2101	 *	kill our end. If they send after that then tough - BUT it is
2102	 *	long enough that we do not repeat the old "4*rto = almost no
2103	 *	time" reset mistake.
2104	 *
2105	 *	Nope, it was not a mistake. It is really desired behaviour,
2106	 *	e.g. on HTTP servers, where such sockets are useless but
2107	 *	consume significant resources. Let's do it with the special
2108	 *	linger2 option.					--ANK
2109	 */
2110
2111	if (sk->sk_state == TCP_FIN_WAIT2) {
2112		struct tcp_sock *tp = tcp_sk(sk);
2113		if (tp->linger2 < 0) {
2114			tcp_set_state(sk, TCP_CLOSE);
2115			tcp_send_active_reset(sk, GFP_ATOMIC);
2116			NET_INC_STATS_BH(sock_net(sk),
2117					LINUX_MIB_TCPABORTONLINGER);
2118		} else {
2119			const int tmo = tcp_fin_time(sk);
2120
2121			if (tmo > TCP_TIMEWAIT_LEN) {
2122				inet_csk_reset_keepalive_timer(sk,
2123						tmo - TCP_TIMEWAIT_LEN);
2124			} else {
2125				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2126				goto out;
2127			}
2128		}
2129	}
2130	if (sk->sk_state != TCP_CLOSE) {
2131		sk_mem_reclaim(sk);
2132		if (tcp_check_oom(sk, 0)) {
2133			tcp_set_state(sk, TCP_CLOSE);
2134			tcp_send_active_reset(sk, GFP_ATOMIC);
2135			NET_INC_STATS_BH(sock_net(sk),
2136					LINUX_MIB_TCPABORTONMEMORY);
2137		}
2138	}
2139
2140	if (sk->sk_state == TCP_CLOSE) {
2141		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2142		/* We could get here with a non-NULL req if the socket is
2143		 * aborted (e.g., closed with unread data) before 3WHS
2144		 * finishes.
2145		 */
2146		if (req)
2147			reqsk_fastopen_remove(sk, req, false);
2148		inet_csk_destroy_sock(sk);
2149	}
2150	/* Otherwise, socket is reprieved until protocol close. */
2151
2152out:
2153	bh_unlock_sock(sk);
2154	local_bh_enable();
2155	sock_put(sk);
2156}
2157EXPORT_SYMBOL(tcp_close);
2158
2159/* These states need RST on ABORT according to RFC793 */
2160
2161static inline bool tcp_need_reset(int state)
2162{
2163	return (1 << state) &
2164	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2165		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2166}
2167
2168int tcp_disconnect(struct sock *sk, int flags)
2169{
2170	struct inet_sock *inet = inet_sk(sk);
2171	struct inet_connection_sock *icsk = inet_csk(sk);
2172	struct tcp_sock *tp = tcp_sk(sk);
2173	int err = 0;
2174	int old_state = sk->sk_state;
2175
2176	if (old_state != TCP_CLOSE)
2177		tcp_set_state(sk, TCP_CLOSE);
2178
2179	/* ABORT function of RFC793 */
2180	if (old_state == TCP_LISTEN) {
2181		inet_csk_listen_stop(sk);
2182	} else if (unlikely(tp->repair)) {
2183		sk->sk_err = ECONNABORTED;
2184	} else if (tcp_need_reset(old_state) ||
2185		   (tp->snd_nxt != tp->write_seq &&
2186		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2187		/* The last check adjusts for the discrepancy between Linux
2188		 * and RFC states.
2189		 */
2190		tcp_send_active_reset(sk, gfp_any());
2191		sk->sk_err = ECONNRESET;
2192	} else if (old_state == TCP_SYN_SENT)
2193		sk->sk_err = ECONNRESET;
2194
2195	tcp_clear_xmit_timers(sk);
2196	__skb_queue_purge(&sk->sk_receive_queue);
2197	tcp_write_queue_purge(sk);
2198	__skb_queue_purge(&tp->out_of_order_queue);
2199
2200	inet->inet_dport = 0;
2201
2202	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2203		inet_reset_saddr(sk);
2204
2205	sk->sk_shutdown = 0;
2206	sock_reset_flag(sk, SOCK_DONE);
2207	tp->srtt_us = 0;
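	/* Advance write_seq well past the old window so that segments from
	 * the previous incarnation of the connection cannot be mistaken for
	 * new data; 0 is avoided because an unset write_seq is treated as
	 * "pick a fresh ISN" at connect time.
	 */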
2208	if ((tp->write_seq += tp->max_window + 2) == 0)
2209		tp->write_seq = 1;
2210	icsk->icsk_backoff = 0;
2211	tp->snd_cwnd = 2;
2212	icsk->icsk_probes_out = 0;
2213	tp->packets_out = 0;
2214	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2215	tp->snd_cwnd_cnt = 0;
2216	tp->window_clamp = 0;
2217	tcp_set_ca_state(sk, TCP_CA_Open);
2218	tcp_clear_retrans(tp);
2219	inet_csk_delack_init(sk);
2220	tcp_init_send_head(sk);
2221	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2222	__sk_dst_reset(sk);
2223
2224	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2225
2226	sk->sk_error_report(sk);
2227	return err;
2228}
2229EXPORT_SYMBOL(tcp_disconnect);
2230
2231void tcp_sock_destruct(struct sock *sk)
2232{
2233	inet_sock_destruct(sk);
2234
2235	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2236}
2237
2238static inline bool tcp_can_repair_sock(const struct sock *sk)
2239{
2240	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2241		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2242}
2243
2244static int tcp_repair_options_est(struct tcp_sock *tp,
2245		struct tcp_repair_opt __user *optbuf, unsigned int len)
2246{
2247	struct tcp_repair_opt opt;
2248
2249	while (len >= sizeof(opt)) {
2250		if (copy_from_user(&opt, optbuf, sizeof(opt)))
2251			return -EFAULT;
2252
2253		optbuf++;
2254		len -= sizeof(opt);
2255
2256		switch (opt.opt_code) {
2257		case TCPOPT_MSS:
2258			tp->rx_opt.mss_clamp = opt.opt_val;
2259			break;
2260		case TCPOPT_WINDOW:
2261			{
2262				u16 snd_wscale = opt.opt_val & 0xFFFF;
2263				u16 rcv_wscale = opt.opt_val >> 16;
2264
2265				if (snd_wscale > 14 || rcv_wscale > 14)
2266					return -EFBIG;
2267
2268				tp->rx_opt.snd_wscale = snd_wscale;
2269				tp->rx_opt.rcv_wscale = rcv_wscale;
2270				tp->rx_opt.wscale_ok = 1;
2271			}
2272			break;
2273		case TCPOPT_SACK_PERM:
2274			if (opt.opt_val != 0)
2275				return -EINVAL;
2276
2277			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2278			if (sysctl_tcp_fack)
2279				tcp_enable_fack(tp);
2280			break;
2281		case TCPOPT_TIMESTAMP:
2282			if (opt.opt_val != 0)
2283				return -EINVAL;
2284
2285			tp->rx_opt.tstamp_ok = 1;
2286			break;
2287		}
2288	}
2289
2290	return 0;
2291}
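
/* Example of the layout consumed above (userspace sketch for a repair-mode
 * restore, not kernel code; snd_wscale/rcv_wscale and the MSS value are
 * placeholders):
 *
 *	struct tcp_repair_opt opts[] = {
 *		{ TCPOPT_MSS,		1460 },
 *		{ TCPOPT_WINDOW,	snd_wscale | (rcv_wscale << 16) },
 *		{ TCPOPT_SACK_PERM,	0 },
 *		{ TCPOPT_TIMESTAMP,	0 },
 *	};
 *
 *	setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sizeof(opts));
 */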
2292
2293/*
2294 *	Socket option code for TCP.
2295 */
2296static int do_tcp_setsockopt(struct sock *sk, int level,
2297		int optname, char __user *optval, unsigned int optlen)
2298{
2299	struct tcp_sock *tp = tcp_sk(sk);
2300	struct inet_connection_sock *icsk = inet_csk(sk);
2301	int val;
2302	int err = 0;
2303
2304	/* These are data/string values, all the others are ints */
2305	switch (optname) {
2306	case TCP_CONGESTION: {
2307		char name[TCP_CA_NAME_MAX];
2308
2309		if (optlen < 1)
2310			return -EINVAL;
2311
2312		val = strncpy_from_user(name, optval,
2313					min_t(long, TCP_CA_NAME_MAX-1, optlen));
2314		if (val < 0)
2315			return -EFAULT;
2316		name[val] = 0;
2317
2318		lock_sock(sk);
2319		err = tcp_set_congestion_control(sk, name);
2320		release_sock(sk);
2321		return err;
2322	}
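	/* Userspace selects a congestion control module by name, e.g.
	 * (illustrative sketch):
	 *
	 *	setsockopt(fd, SOL_TCP, TCP_CONGESTION, "cubic", strlen("cubic"));
	 */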
2323	default:
2324		/* all other options take an int and are handled below */
2325		break;
2326	}
2327
2328	if (optlen < sizeof(int))
2329		return -EINVAL;
2330
2331	if (get_user(val, (int __user *)optval))
2332		return -EFAULT;
2333
2334	lock_sock(sk);
2335
2336	switch (optname) {
2337	case TCP_MAXSEG:
2338		/* Values greater than the interface MTU won't take effect.
2339		 * However, at the point when this call is made we typically
2340		 * don't yet know which interface is going to be used. */
2341		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2342			err = -EINVAL;
2343			break;
2344		}
2345		tp->rx_opt.user_mss = val;
2346		break;
2347
2348	case TCP_NODELAY:
2349		if (val) {
2350			/* TCP_NODELAY is weaker than TCP_CORK, so that
2351			 * this option on corked socket is remembered, but
2352			 * it is not activated until cork is cleared.
2353			 *
2354			 * However, when TCP_NODELAY is set we make
2355			 * an explicit push, which overrides even TCP_CORK
2356			 * for currently queued segments.
2357			 */
2358			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2359			tcp_push_pending_frames(sk);
2360		} else {
2361			tp->nonagle &= ~TCP_NAGLE_OFF;
2362		}
2363		break;
2364
2365	case TCP_THIN_LINEAR_TIMEOUTS:
2366		if (val < 0 || val > 1)
2367			err = -EINVAL;
2368		else
2369			tp->thin_lto = val;
2370		break;
2371
2372	case TCP_THIN_DUPACK:
2373		if (val < 0 || val > 1)
2374			err = -EINVAL;
2375		else {
2376			tp->thin_dupack = val;
2377			if (tp->thin_dupack)
2378				tcp_disable_early_retrans(tp);
2379		}
2380		break;
2381
2382	case TCP_REPAIR:
2383		if (!tcp_can_repair_sock(sk))
2384			err = -EPERM;
2385		else if (val == 1) {
2386			tp->repair = 1;
2387			sk->sk_reuse = SK_FORCE_REUSE;
2388			tp->repair_queue = TCP_NO_QUEUE;
2389		} else if (val == 0) {
2390			tp->repair = 0;
2391			sk->sk_reuse = SK_NO_REUSE;
2392			tcp_send_window_probe(sk);
2393		} else
2394			err = -EINVAL;
2395
2396		break;
2397
2398	case TCP_REPAIR_QUEUE:
2399		if (!tp->repair)
2400			err = -EPERM;
2401		else if (val < TCP_QUEUES_NR)
2402			tp->repair_queue = val;
2403		else
2404			err = -EINVAL;
2405		break;
2406
2407	case TCP_QUEUE_SEQ:
2408		if (sk->sk_state != TCP_CLOSE)
2409			err = -EPERM;
2410		else if (tp->repair_queue == TCP_SEND_QUEUE)
2411			tp->write_seq = val;
2412		else if (tp->repair_queue == TCP_RECV_QUEUE)
2413			tp->rcv_nxt = val;
2414		else
2415			err = -EINVAL;
2416		break;
2417
2418	case TCP_REPAIR_OPTIONS:
2419		if (!tp->repair)
2420			err = -EINVAL;
2421		else if (sk->sk_state == TCP_ESTABLISHED)
2422			err = tcp_repair_options_est(tp,
2423					(struct tcp_repair_opt __user *)optval,
2424					optlen);
2425		else
2426			err = -EPERM;
2427		break;
2428
2429	case TCP_CORK:
2430		/* When set, indicates to always queue non-full frames.
2431		 * Later the user clears this option and we transmit
2432		 * any pending partial frames in the queue.  This is
2433		 * meant to be used alongside sendfile() to get properly
2434		 * filled frames when the user (for example) must write
2435		 * out headers with a write() call first and then use
2436		 * sendfile to send out the data parts.
2437		 *
2438		 * TCP_CORK can be set together with TCP_NODELAY and it is
2439		 * stronger than TCP_NODELAY.
2440		 */
2441		if (val) {
2442			tp->nonagle |= TCP_NAGLE_CORK;
2443		} else {
2444			tp->nonagle &= ~TCP_NAGLE_CORK;
2445			if (tp->nonagle&TCP_NAGLE_OFF)
2446				tp->nonagle |= TCP_NAGLE_PUSH;
2447			tcp_push_pending_frames(sk);
2448		}
2449		break;
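
		/* Example of the pattern described above (userspace sketch,
		 * not kernel code; error handling omitted):
		 *
		 *	int on = 1, off = 0;
		 *
		 *	setsockopt(fd, SOL_TCP, TCP_CORK, &on, sizeof(on));
		 *	write(fd, hdr, hdr_len);	(queued, not yet pushed)
		 *	sendfile(fd, filefd, NULL, file_len);
		 *	setsockopt(fd, SOL_TCP, TCP_CORK, &off, sizeof(off));
		 */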
2450
2451	case TCP_KEEPIDLE:
2452		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2453			err = -EINVAL;
2454		else {
2455			tp->keepalive_time = val * HZ;
2456			if (sock_flag(sk, SOCK_KEEPOPEN) &&
2457			    !((1 << sk->sk_state) &
2458			      (TCPF_CLOSE | TCPF_LISTEN))) {
2459				u32 elapsed = keepalive_time_elapsed(tp);
2460				if (tp->keepalive_time > elapsed)
2461					elapsed = tp->keepalive_time - elapsed;
2462				else
2463					elapsed = 0;
2464				inet_csk_reset_keepalive_timer(sk, elapsed);
2465			}
2466		}
2467		break;
2468	case TCP_KEEPINTVL:
2469		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2470			err = -EINVAL;
2471		else
2472			tp->keepalive_intvl = val * HZ;
2473		break;
2474	case TCP_KEEPCNT:
2475		if (val < 1 || val > MAX_TCP_KEEPCNT)
2476			err = -EINVAL;
2477		else
2478			tp->keepalive_probes = val;
2479		break;
2480	case TCP_SYNCNT:
2481		if (val < 1 || val > MAX_TCP_SYNCNT)
2482			err = -EINVAL;
2483		else
2484			icsk->icsk_syn_retries = val;
2485		break;
2486
2487	case TCP_LINGER2:
2488		if (val < 0)
2489			tp->linger2 = -1;
2490		else if (val > sysctl_tcp_fin_timeout / HZ)
2491			tp->linger2 = 0;
2492		else
2493			tp->linger2 = val * HZ;
2494		break;
2495
2496	case TCP_DEFER_ACCEPT:
2497		/* Translate value in seconds to number of retransmits */
2498		icsk->icsk_accept_queue.rskq_defer_accept =
2499			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2500					TCP_RTO_MAX / HZ);
2501		break;
2502
2503	case TCP_WINDOW_CLAMP:
2504		if (!val) {
2505			if (sk->sk_state != TCP_CLOSE) {
2506				err = -EINVAL;
2507				break;
2508			}
2509			tp->window_clamp = 0;
2510		} else
2511			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2512						SOCK_MIN_RCVBUF / 2 : val;
2513		break;
2514
2515	case TCP_QUICKACK:
2516		if (!val) {
2517			icsk->icsk_ack.pingpong = 1;
2518		} else {
2519			icsk->icsk_ack.pingpong = 0;
2520			if ((1 << sk->sk_state) &
2521			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2522			    inet_csk_ack_scheduled(sk)) {
2523				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2524				tcp_cleanup_rbuf(sk, 1);
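				/* If val has bit 0 clear, the quick ACK is
				 * one-shot: the pending ACK has just been
				 * pushed out, so return to delayed-ACK
				 * (pingpong) mode.
				 */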
2525				if (!(val & 1))
2526					icsk->icsk_ack.pingpong = 1;
2527			}
2528		}
2529		break;
2530
2531#ifdef CONFIG_TCP_MD5SIG
2532	case TCP_MD5SIG:
2533		/* Read the IP->Key mappings from userspace */
2534		err = tp->af_specific->md5_parse(sk, optval, optlen);
2535		break;
2536#endif
2537	case TCP_USER_TIMEOUT:
2538		/* Cap the max time in ms TCP will retry or probe the window
2539		 * before giving up and aborting (ETIMEDOUT) a connection.
2540		 */
2541		if (val < 0)
2542			err = -EINVAL;
2543		else
2544			icsk->icsk_user_timeout = msecs_to_jiffies(val);
2545		break;
2546
2547	case TCP_FASTOPEN:
2548		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2549		    TCPF_LISTEN))) {
2550			tcp_fastopen_init_key_once(true);
2551
2552			err = fastopen_init_queue(sk, val);
2553		} else {
2554			err = -EINVAL;
2555		}
2556		break;
2557	case TCP_TIMESTAMP:
2558		if (!tp->repair)
2559			err = -EPERM;
2560		else
2561			tp->tsoffset = val - tcp_time_stamp;
2562		break;
2563	case TCP_NOTSENT_LOWAT:
2564		tp->notsent_lowat = val;
2565		sk->sk_write_space(sk);
2566		break;
2567	default:
2568		err = -ENOPROTOOPT;
2569		break;
2570	}
2571
2572	release_sock(sk);
2573	return err;
2574}
2575
2576int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2577		   unsigned int optlen)
2578{
2579	const struct inet_connection_sock *icsk = inet_csk(sk);
2580
2581	if (level != SOL_TCP)
2582		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2583						     optval, optlen);
2584	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2585}
2586EXPORT_SYMBOL(tcp_setsockopt);
2587
2588#ifdef CONFIG_COMPAT
2589int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2590			  char __user *optval, unsigned int optlen)
2591{
2592	if (level != SOL_TCP)
2593		return inet_csk_compat_setsockopt(sk, level, optname,
2594						  optval, optlen);
2595	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2596}
2597EXPORT_SYMBOL(compat_tcp_setsockopt);
2598#endif
2599
2600/* Return information about the state of a TCP endpoint in API format. */
2601void tcp_get_info(struct sock *sk, struct tcp_info *info)
2602{
2603	const struct tcp_sock *tp = tcp_sk(sk);
2604	const struct inet_connection_sock *icsk = inet_csk(sk);
2605	u32 now = tcp_time_stamp;
2606	unsigned int start;
2607	u64 rate64;
2608	u32 rate;
2609
2610	memset(info, 0, sizeof(*info));
2611
2612	info->tcpi_state = sk->sk_state;
2613	info->tcpi_ca_state = icsk->icsk_ca_state;
2614	info->tcpi_retransmits = icsk->icsk_retransmits;
2615	info->tcpi_probes = icsk->icsk_probes_out;
2616	info->tcpi_backoff = icsk->icsk_backoff;
2617
2618	if (tp->rx_opt.tstamp_ok)
2619		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2620	if (tcp_is_sack(tp))
2621		info->tcpi_options |= TCPI_OPT_SACK;
2622	if (tp->rx_opt.wscale_ok) {
2623		info->tcpi_options |= TCPI_OPT_WSCALE;
2624		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2625		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2626	}
2627
2628	if (tp->ecn_flags & TCP_ECN_OK)
2629		info->tcpi_options |= TCPI_OPT_ECN;
2630	if (tp->ecn_flags & TCP_ECN_SEEN)
2631		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2632	if (tp->syn_data_acked)
2633		info->tcpi_options |= TCPI_OPT_SYN_DATA;
2634
2635	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2636	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2637	info->tcpi_snd_mss = tp->mss_cache;
2638	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2639
2640	if (sk->sk_state == TCP_LISTEN) {
2641		info->tcpi_unacked = sk->sk_ack_backlog;
2642		info->tcpi_sacked = sk->sk_max_ack_backlog;
2643	} else {
2644		info->tcpi_unacked = tp->packets_out;
2645		info->tcpi_sacked = tp->sacked_out;
2646	}
2647	info->tcpi_lost = tp->lost_out;
2648	info->tcpi_retrans = tp->retrans_out;
2649	info->tcpi_fackets = tp->fackets_out;
2650
2651	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2652	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2653	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2654
2655	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2656	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2657	info->tcpi_rtt = tp->srtt_us >> 3;
2658	info->tcpi_rttvar = tp->mdev_us >> 2;
2659	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2660	info->tcpi_snd_cwnd = tp->snd_cwnd;
2661	info->tcpi_advmss = tp->advmss;
2662	info->tcpi_reordering = tp->reordering;
2663
2664	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2665	info->tcpi_rcv_space = tp->rcvq_space.space;
2666
2667	info->tcpi_total_retrans = tp->total_retrans;
2668
2669	rate = READ_ONCE(sk->sk_pacing_rate);
2670	rate64 = rate != ~0U ? rate : ~0ULL;
2671	put_unaligned(rate64, &info->tcpi_pacing_rate);
2672
2673	rate = READ_ONCE(sk->sk_max_pacing_rate);
2674	rate64 = rate != ~0U ? rate : ~0ULL;
2675	put_unaligned(rate64, &info->tcpi_max_pacing_rate);
2676
2677	do {
2678		start = u64_stats_fetch_begin_irq(&tp->syncp);
2679		put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
2680		put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
2681	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
2682}
2683EXPORT_SYMBOL_GPL(tcp_get_info);
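
/* The structure filled in above is exported to userspace through the
 * TCP_INFO getsockopt() handled below, e.g. (illustrative sketch):
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	getsockopt(fd, SOL_TCP, TCP_INFO, &ti, &len);
 */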
2684
2685static int do_tcp_getsockopt(struct sock *sk, int level,
2686		int optname, char __user *optval, int __user *optlen)
2687{
2688	struct inet_connection_sock *icsk = inet_csk(sk);
2689	struct tcp_sock *tp = tcp_sk(sk);
2690	int val, len;
2691
2692	if (get_user(len, optlen))
2693		return -EFAULT;
2694
2695	len = min_t(unsigned int, len, sizeof(int));
2696
2697	if (len < 0)
2698		return -EINVAL;
2699
2700	switch (optname) {
2701	case TCP_MAXSEG:
2702		val = tp->mss_cache;
2703		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2704			val = tp->rx_opt.user_mss;
2705		if (tp->repair)
2706			val = tp->rx_opt.mss_clamp;
2707		break;
2708	case TCP_NODELAY:
2709		val = !!(tp->nonagle&TCP_NAGLE_OFF);
2710		break;
2711	case TCP_CORK:
2712		val = !!(tp->nonagle&TCP_NAGLE_CORK);
2713		break;
2714	case TCP_KEEPIDLE:
2715		val = keepalive_time_when(tp) / HZ;
2716		break;
2717	case TCP_KEEPINTVL:
2718		val = keepalive_intvl_when(tp) / HZ;
2719		break;
2720	case TCP_KEEPCNT:
2721		val = keepalive_probes(tp);
2722		break;
2723	case TCP_SYNCNT:
2724		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2725		break;
2726	case TCP_LINGER2:
2727		val = tp->linger2;
2728		if (val >= 0)
2729			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2730		break;
2731	case TCP_DEFER_ACCEPT:
2732		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2733				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2734		break;
2735	case TCP_WINDOW_CLAMP:
2736		val = tp->window_clamp;
2737		break;
2738	case TCP_INFO: {
2739		struct tcp_info info;
2740
2741		if (get_user(len, optlen))
2742			return -EFAULT;
2743
2744		tcp_get_info(sk, &info);
2745
2746		len = min_t(unsigned int, len, sizeof(info));
2747		if (put_user(len, optlen))
2748			return -EFAULT;
2749		if (copy_to_user(optval, &info, len))
2750			return -EFAULT;
2751		return 0;
2752	}
2753	case TCP_CC_INFO: {
2754		const struct tcp_congestion_ops *ca_ops;
2755		union tcp_cc_info info;
2756		size_t sz = 0;
2757		int attr;
2758
2759		if (get_user(len, optlen))
2760			return -EFAULT;
2761
2762		ca_ops = icsk->icsk_ca_ops;
2763		if (ca_ops && ca_ops->get_info)
2764			sz = ca_ops->get_info(sk, ~0U, &attr, &info);
2765
2766		len = min_t(unsigned int, len, sz);
2767		if (put_user(len, optlen))
2768			return -EFAULT;
2769		if (copy_to_user(optval, &info, len))
2770			return -EFAULT;
2771		return 0;
2772	}
2773	case TCP_QUICKACK:
2774		val = !icsk->icsk_ack.pingpong;
2775		break;
2776
2777	case TCP_CONGESTION:
2778		if (get_user(len, optlen))
2779			return -EFAULT;
2780		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2781		if (put_user(len, optlen))
2782			return -EFAULT;
2783		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2784			return -EFAULT;
2785		return 0;
2786
2787	case TCP_THIN_LINEAR_TIMEOUTS:
2788		val = tp->thin_lto;
2789		break;
2790	case TCP_THIN_DUPACK:
2791		val = tp->thin_dupack;
2792		break;
2793
2794	case TCP_REPAIR:
2795		val = tp->repair;
2796		break;
2797
2798	case TCP_REPAIR_QUEUE:
2799		if (tp->repair)
2800			val = tp->repair_queue;
2801		else
2802			return -EINVAL;
2803		break;
2804
2805	case TCP_QUEUE_SEQ:
2806		if (tp->repair_queue == TCP_SEND_QUEUE)
2807			val = tp->write_seq;
2808		else if (tp->repair_queue == TCP_RECV_QUEUE)
2809			val = tp->rcv_nxt;
2810		else
2811			return -EINVAL;
2812		break;
2813
2814	case TCP_USER_TIMEOUT:
2815		val = jiffies_to_msecs(icsk->icsk_user_timeout);
2816		break;
2817
2818	case TCP_FASTOPEN:
2819		if (icsk->icsk_accept_queue.fastopenq)
2820			val = icsk->icsk_accept_queue.fastopenq->max_qlen;
2821		else
2822			val = 0;
2823		break;
2824
2825	case TCP_TIMESTAMP:
2826		val = tcp_time_stamp + tp->tsoffset;
2827		break;
2828	case TCP_NOTSENT_LOWAT:
2829		val = tp->notsent_lowat;
2830		break;
2831	default:
2832		return -ENOPROTOOPT;
2833	}
2834
2835	if (put_user(len, optlen))
2836		return -EFAULT;
2837	if (copy_to_user(optval, &val, len))
2838		return -EFAULT;
2839	return 0;
2840}
2841
2842int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2843		   int __user *optlen)
2844{
2845	struct inet_connection_sock *icsk = inet_csk(sk);
2846
2847	if (level != SOL_TCP)
2848		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2849						     optval, optlen);
2850	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2851}
2852EXPORT_SYMBOL(tcp_getsockopt);
2853
2854#ifdef CONFIG_COMPAT
2855int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2856			  char __user *optval, int __user *optlen)
2857{
2858	if (level != SOL_TCP)
2859		return inet_csk_compat_getsockopt(sk, level, optname,
2860						  optval, optlen);
2861	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2862}
2863EXPORT_SYMBOL(compat_tcp_getsockopt);
2864#endif
2865
2866#ifdef CONFIG_TCP_MD5SIG
2867static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
2868static DEFINE_MUTEX(tcp_md5sig_mutex);
2869static bool tcp_md5sig_pool_populated = false;
2870
2871static void __tcp_alloc_md5sig_pool(void)
2872{
2873	int cpu;
2874
2875	for_each_possible_cpu(cpu) {
2876		if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
2877			struct crypto_hash *hash;
2878
2879			hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2880			if (IS_ERR_OR_NULL(hash))
2881				return;
2882			per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
2883		}
2884	}
2885	/* before setting tcp_md5sig_pool_populated, we must commit all writes
2886	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
2887	 */
2888	smp_wmb();
2889	tcp_md5sig_pool_populated = true;
2890}
2891
2892bool tcp_alloc_md5sig_pool(void)
2893{
2894	if (unlikely(!tcp_md5sig_pool_populated)) {
2895		mutex_lock(&tcp_md5sig_mutex);
2896
2897		if (!tcp_md5sig_pool_populated)
2898			__tcp_alloc_md5sig_pool();
2899
2900		mutex_unlock(&tcp_md5sig_mutex);
2901	}
2902	return tcp_md5sig_pool_populated;
2903}
2904EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2905
2906
2907/**
2908 *	tcp_get_md5sig_pool - get md5sig_pool for this user
2909 *
2910 *	We use a percpu structure, so on success we return with preemption
2911 *	and BH disabled, to make sure that another thread or a softirq
2912 *	handler won't try to get the same context.
2913 */
2914struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
2915{
2916	local_bh_disable();
2917
2918	if (tcp_md5sig_pool_populated) {
2919		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
2920		smp_rmb();
2921		return this_cpu_ptr(&tcp_md5sig_pool);
2922	}
2923	local_bh_enable();
2924	return NULL;
2925}
2926EXPORT_SYMBOL(tcp_get_md5sig_pool);
2927
2928int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2929			const struct tcphdr *th)
2930{
2931	struct scatterlist sg;
2932	struct tcphdr hdr;
2933	int err;
2934
2935	/* We are not allowed to change tcphdr, make a local copy */
2936	memcpy(&hdr, th, sizeof(hdr));
2937	hdr.check = 0;
2938
2939	/* options aren't included in the hash */
2940	sg_init_one(&sg, &hdr, sizeof(hdr));
2941	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
2942	return err;
2943}
2944EXPORT_SYMBOL(tcp_md5_hash_header);
2945
2946int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2947			  const struct sk_buff *skb, unsigned int header_len)
2948{
2949	struct scatterlist sg;
2950	const struct tcphdr *tp = tcp_hdr(skb);
2951	struct hash_desc *desc = &hp->md5_desc;
2952	unsigned int i;
2953	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
2954					   skb_headlen(skb) - header_len : 0;
2955	const struct skb_shared_info *shi = skb_shinfo(skb);
2956	struct sk_buff *frag_iter;
2957
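	/* Hash the linear head beyond the first header_len bytes, then each
	 * page fragment, then recurse into any frag_list skbs.
	 */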
2958	sg_init_table(&sg, 1);
2959
2960	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
2961	if (crypto_hash_update(desc, &sg, head_data_len))
2962		return 1;
2963
2964	for (i = 0; i < shi->nr_frags; ++i) {
2965		const struct skb_frag_struct *f = &shi->frags[i];
2966		unsigned int offset = f->page_offset;
2967		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
2968
2969		sg_set_page(&sg, page, skb_frag_size(f),
2970			    offset_in_page(offset));
2971		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
2972			return 1;
2973	}
2974
2975	skb_walk_frags(skb, frag_iter)
2976		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
2977			return 1;
2978
2979	return 0;
2980}
2981EXPORT_SYMBOL(tcp_md5_hash_skb_data);
2982
2983int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
2984{
2985	struct scatterlist sg;
2986
2987	sg_init_one(&sg, key->key, key->keylen);
2988	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
2989}
2990EXPORT_SYMBOL(tcp_md5_hash_key);
2991
2992#endif
2993
2994void tcp_done(struct sock *sk)
2995{
2996	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2997
2998	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2999		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3000
3001	tcp_set_state(sk, TCP_CLOSE);
3002	tcp_clear_xmit_timers(sk);
3003	if (req)
3004		reqsk_fastopen_remove(sk, req, false);
3005
3006	sk->sk_shutdown = SHUTDOWN_MASK;
3007
3008	if (!sock_flag(sk, SOCK_DEAD))
3009		sk->sk_state_change(sk);
3010	else
3011		inet_csk_destroy_sock(sk);
3012}
3013EXPORT_SYMBOL_GPL(tcp_done);
3014
3015extern struct tcp_congestion_ops tcp_reno;
3016
3017static __initdata unsigned long thash_entries;
3018static int __init set_thash_entries(char *str)
3019{
3020	ssize_t ret;
3021
3022	if (!str)
3023		return 0;
3024
3025	ret = kstrtoul(str, 0, &thash_entries);
3026	if (ret)
3027		return 0;
3028
3029	return 1;
3030}
3031__setup("thash_entries=", set_thash_entries);
3032
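/* sysctl_tcp_mem[] is in pages: [0] is the level below which TCP does not
 * worry about its memory use, [1] is the memory pressure threshold and
 * [2] is the hard limit.
 */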
3033static void __init tcp_init_mem(void)
3034{
3035	unsigned long limit = nr_free_buffer_pages() / 8;
3036	limit = max(limit, 128UL);
3037	sysctl_tcp_mem[0] = limit / 4 * 3;
3038	sysctl_tcp_mem[1] = limit;
3039	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3040}
3041
3042void __init tcp_init(void)
3043{
3044	unsigned long limit;
3045	int max_rshare, max_wshare, cnt;
3046	unsigned int i;
3047
3048	sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
3049
3050	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3051	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3052	tcp_hashinfo.bind_bucket_cachep =
3053		kmem_cache_create("tcp_bind_bucket",
3054				  sizeof(struct inet_bind_bucket), 0,
3055				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3056
3057	/* Size and allocate the main established and bind bucket
3058	 * hash tables.
3059	 *
3060	 * The methodology is similar to that of the buffer cache.
3061	 */
3062	tcp_hashinfo.ehash =
3063		alloc_large_system_hash("TCP established",
3064					sizeof(struct inet_ehash_bucket),
3065					thash_entries,
3066					17, /* one slot per 128 KB of memory */
3067					0,
3068					NULL,
3069					&tcp_hashinfo.ehash_mask,
3070					0,
3071					thash_entries ? 0 : 512 * 1024);
3072	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3073		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3074
3075	if (inet_ehash_locks_alloc(&tcp_hashinfo))
3076		panic("TCP: failed to alloc ehash_locks");
3077	tcp_hashinfo.bhash =
3078		alloc_large_system_hash("TCP bind",
3079					sizeof(struct inet_bind_hashbucket),
3080					tcp_hashinfo.ehash_mask + 1,
3081					17, /* one slot per 128 KB of memory */
3082					0,
3083					&tcp_hashinfo.bhash_size,
3084					NULL,
3085					0,
3086					64 * 1024);
3087	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3088	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3089		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3090		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3091	}
3092
3093
3094	cnt = tcp_hashinfo.ehash_mask + 1;
3095
3096	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3097	sysctl_tcp_max_orphans = cnt / 2;
3098	sysctl_max_syn_backlog = max(128, cnt / 256);
3099
3100	tcp_init_mem();
3101	/* Set per-socket limits to no more than 1/128 the pressure threshold */
3102	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3103	max_wshare = min(4UL*1024*1024, limit);
3104	max_rshare = min(6UL*1024*1024, limit);
3105
3106	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3107	sysctl_tcp_wmem[1] = 16*1024;
3108	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3109
3110	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3111	sysctl_tcp_rmem[1] = 87380;
3112	sysctl_tcp_rmem[2] = max(87380, max_rshare);
3113
3114	pr_info("Hash tables configured (established %u bind %u)\n",
3115		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3116
3117	tcp_metrics_init();
3118	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3119	tcp_tasklet_init();
3120}
3121