/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it avoids a huge number
 *					of hashed sockets (for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina   :	Hash function optimizations
 *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs introduced 8)
 *	      Malcolm Beattie   :	Set peercred for socketpair
 *	     Michal Ostrowski   :       Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *	     				the core infrastructure is doing that
 *	     				for all net proto families now (2.5.69+)
 *
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for getsockname/getpeername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with a zero byte, so this name space does not
 *		  intersect with BSD names.
 */
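
/* For illustration only (a userspace sketch, not part of this file):
 * binding an abstract-namespace socket.  The name starts with a zero
 * byte and the address length covers only the bytes actually used, so
 * "\0example" and "\0example\0" are distinct names:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	sun.sun_path[0] = '\0';
 *	memcpy(sun.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */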

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;


static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */
/*
 *  SMP locking strategy:
 *    the hash table is protected with the spinlock unix_table_lock
 *    each socket state is protected by a separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash>>8;
	return hash&(UNIX_HASH_SIZE-1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- it must not be zero length.
 *		- if it starts with a non-zero byte, it must be NUL terminated
 *		  (a filesystem object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off-by-one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
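
/* Worked example (illustrative): for a filesystem bind to "/tmp/x",
 * unix_mkname() NUL-terminates the path and returns
 * strlen("/tmp/x") + 1 + sizeof(short) = 6 + 1 + 2 = 9.  For an
 * abstract name such as "\0foo" passed with len = sizeof(short) + 4,
 * the length is returned unchanged and *hashp is filled from a
 * checksum over the whole address.
 */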

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (e.g., /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */
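
/* Illustrative userspace scenario (a sketch, not part of this file):
 * a client connected to a server socket which is not connected back
 * to it (e.g. a logger) may have POLLOUT withheld while the server's
 * receive queue is full; the relay below makes it writable again once
 * the server reads:
 *
 *	struct pollfd pfd = { .fd = client_fd, .events = POLLOUT };
 *
 *	poll(&pfd, 1, -1);	// wakes once the peer drains its queue
 */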

static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key);

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   POLLOUT |
				   POLLWRNORM |
				   POLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	if (unix_recvq_full(other))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

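/* A socket counts as writable while the data queued on it consumes
 * less than a quarter of the send buffer (wmem_alloc * 4 <= sndbuf).
 */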
static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, a sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What is the above comment talking about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->readlock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);

	return 0;
}
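
/* Userspace view (a sketch for illustration): with SO_PEEK_OFF set,
 * repeated MSG_PEEK reads walk forward through the queued data instead
 * of rereading from the start:
 *
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 */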


static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};

/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
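
/* For reference (illustrative): an autobound socket gets an abstract
 * five-hex-digit name, which shows up as something like "@00f3c" in
 * /proc/net/unix output.  Userspace can trigger autobind explicitly by
 * calling bind() with only sun_family:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	bind(fd, (struct sockaddr *)&sun, sizeof(short));
 */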

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		goto out;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
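
/* Userspace sketch (illustrative): binding to a filesystem path
 * creates the socket inode via mknod, so rebinding to an existing
 * path fails with EADDRINUSE until the old node is unlinked:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	strcpy(sun.sun_path, "/tmp/demo.sock");	// assumed path
 *	unlink(sun.sun_path);
 *	bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 */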

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
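
/* Userspace sketch (illustrative): per 1003.1g, a connected datagram
 * socket is disconnected by connecting it to AF_UNSPEC, which takes
 * the "other = NULL" branch above:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 */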

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we do it after the state is locked,
	   we will have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. That is dangerous because a deadlock is
	   possible. The connect-to-self case and a simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
	   check this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
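
/* Userspace sketch (illustrative): because init_peercred() runs for
 * both ends, each side of a socketpair can query the other's
 * credentials:
 *
 *	int sv[2];
 *	struct ucred uc;
 *	socklen_t len = sizeof(uc);
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &uc, &len);
 */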

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection.  Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return max_level;
}
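
/* Userspace sketch (illustrative): the in-flight accounting above backs
 * SCM_RIGHTS, the control message used to pass descriptors.  fd_to_send
 * and sock_fd are placeholder names:
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))] = { 0 };
 *	struct msghdr mh = { .msg_control = cbuf,
 *			     .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&mh);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_RIGHTS;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_send, sizeof(int));
 *	sendmsg(sock_fd, &mh, 0);
 */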

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int max_level;
	int data_len = 0;
	int sk_locked;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(&scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error return?
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}

static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	err = mutex_lock_interruptible(&u->readlock);
	if (unlikely(err)) {
		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
		 * sk_rcvtimeo is not honored by mutex_lock_interruptible().
		 */
		err = noblock ? -EAGAIN : -ERESTARTSYS;
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose this for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}

/*
 *	Sleep until more data has arrived. But check for races.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
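
/*
 * Worked example: UNIXCB(skb).consumed tracks how much of a queued
 * stream skb has already been delivered, so one buffer can be drained
 * across several reads. For a 100-byte skb, a 60-byte recv() leaves
 * consumed == 60 and unix_skb_len() == 40; the next recv() starts
 * copying at offset 60 and the skb is unlinked once unix_skb_len()
 * reaches 0.
 */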

static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	int copied = 0;
	int noblock = flags & MSG_DONTWAIT;
	int check_creds = 0;
	int target;
	int err = 0;
	long timeo;
	int skip;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	/* Lock the socket to prevent the receive queue from being
	 * reordered while we sleep copying data out to the msghdr.
	 */

	memset(&scm, 0, sizeof(scm));

	mutex_lock(&u->readlock);

	if (flags & MSG_PEEK)
		skip = sk_peek_offset(sk, flags);
	else
		skip = 0;

	do {
		int chunk;
		struct sk_buff *skb, *last;

		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
again:
		if (skb == NULL) {
			unix_sk(sk)->recursion_level = 0;
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			err = -EAGAIN;
			if (!timeo)
				break;
			mutex_unlock(&u->readlock);

			timeo = unix_stream_data_wait(sk, timeo, last);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->readlock);
			continue;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if ((UNIXCB(skb).pid  != scm.pid) ||
			    !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
			    !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr) {
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
					  msg, chunk)) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* This is questionable; see the note in
			 * unix_dgram_recvmsg().
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	scm_recv(sock, msg, &scm, flags);
out:
	return copied ? : err;
}
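
/*
 * Illustrative userspace sketch (not part of this file): the
 * sk_peek_offset() handling above is what makes SO_PEEK_OFF work on
 * AF_UNIX stream sockets: successive MSG_PEEK reads walk forward
 * through the queue instead of re-reading the head, and consuming
 * reads rewind the offset. "fd" is hypothetical; error handling
 * omitted.
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 64, MSG_PEEK);	peeks bytes 0..63
 *	recv(fd, buf, 64, MSG_PEEK);	peeks bytes 64..127
 *	recv(fd, buf, 128, 0);		consumes, offset rewound to 0
 */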

static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
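
/*
 * Illustrative userspace sketch (not part of this file): because the
 * shutdown mask is mirrored onto the peer above, SHUT_WR on one end of
 * a stream pair shows up as end-of-file on the other end. Error
 * handling omitted.
 *
 *	int sv[2];
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);	sets RCV_SHUTDOWN on sv[1]
 *	read(sv[1], buf, sizeof(buf));	returns 0: end of stream
 */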

long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
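
/*
 * Illustrative userspace sketch (not part of this file): SIOCINQ
 * reports bytes queued for reading (per unix_inq_len() above: unread
 * stream bytes, or the size of the next datagram), and SIOCOUTQ the
 * send-buffer bytes not yet freed by the receiver. "fd" is
 * hypothetical; error handling omitted.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq;
 *	ioctl(fd, SIOCINQ, &inq);
 *	ioctl(fd, SIOCOUTQ, &outq);
 */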

static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and
	 * startup.
	 */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * We also report the socket as writable when the other side has
	 * shut down the connection; this prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}

static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int mask, writable;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
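
/*
 * Illustrative userspace sketch (not part of this file): for a
 * connected datagram socket, POLLOUT is withheld above while the
 * peer's receive queue is full, so a writer can block in poll()
 * instead of spinning on -EAGAIN. "fd" is hypothetical; error
 * handling omitted.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);	wakes once the receiver drains its queue
 *	send(fd, buf, len, MSG_DONTWAIT);
 */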

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
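
/*
 * Worked example of the encoding above, assuming UNIX_HASH_BITS == 8
 * on a 64-bit build: BUCKET_SPACE = 64 - 9 - 1 = 54, so a seq_file
 * position packs as pos = (bucket << 54) | offset. E.g. bucket 3,
 * offset 5 gives pos = (3UL << 54) | 5, and get_bucket()/get_offset()
 * recover 3 and 5 respectively.
 */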

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}

static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);

		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			atomic_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i]);
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
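
/*
 * Illustrative /proc/net/unix output produced by the above (the
 * addresses, inodes and names are made up): a listening filesystem
 * socket carries the __SO_ACCEPTCON flag, an abstract name is shown
 * with a leading '@'.
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	ffff88003753a000: 00000002 00000000 00010000 0001 01 17098 /run/example.sock
 *	ffff88003753a800: 00000002 00000000 00000000 0002 01 17099 @example-abstract
 */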

static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};

static int unix_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &unix_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);