1/*
2 *  linux/fs/read_write.c
3 *
4 *  Copyright (C) 1991, 1992  Linus Torvalds
5 */
6
7#include <linux/slab.h>
8#include <linux/stat.h>
9#include <linux/fcntl.h>
10#include <linux/file.h>
11#include <linux/uio.h>
12#include <linux/fsnotify.h>
13#include <linux/security.h>
14#include <linux/export.h>
15#include <linux/syscalls.h>
16#include <linux/pagemap.h>
17#include <linux/splice.h>
18#include <linux/compat.h>
19#include "internal.h"
20
21#include <asm/uaccess.h>
22#include <asm/unistd.h>
23
/* Signature of the ->read file operation (->write is cast to it below). */
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
/* Signature of the iterator-based ->read_iter / ->write_iter operations. */
typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
26
27const struct file_operations generic_ro_fops = {
28	.llseek		= generic_file_llseek,
29	.read_iter	= generic_file_read_iter,
30	.mmap		= generic_file_readonly_mmap,
31	.splice_read	= generic_file_splice_read,
32};
33
34EXPORT_SYMBOL(generic_ro_fops);
35
36static inline int unsigned_offsets(struct file *file)
37{
38	return file->f_mode & FMODE_UNSIGNED_OFFSET;
39}
40
41/**
42 * vfs_setpos - update the file offset for lseek
43 * @file:	file structure in question
44 * @offset:	file offset to seek to
45 * @maxsize:	maximum file size
46 *
47 * This is a low-level filesystem helper for updating the file offset to
48 * the value specified by @offset if the given offset is valid and it is
49 * not equal to the current file offset.
50 *
51 * Return the specified offset on success and -EINVAL on invalid offset.
52 */
53loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
54{
55	if (offset < 0 && !unsigned_offsets(file))
56		return -EINVAL;
57	if (offset > maxsize)
58		return -EINVAL;
59
60	if (offset != file->f_pos) {
61		file->f_pos = offset;
62		file->f_version = 0;
63	}
64	return offset;
65}
66EXPORT_SYMBOL(vfs_setpos);
67
68/**
69 * generic_file_llseek_size - generic llseek implementation for regular files
70 * @file:	file structure to seek on
71 * @offset:	file offset to seek to
72 * @whence:	type of seek
73 * @size:	max size of this file in file system
74 * @eof:	offset used for SEEK_END position
75 *
76 * This is a variant of generic_file_llseek that allows passing in a custom
77 * maximum file size and a custom EOF position, for e.g. hashed directories
78 *
79 * Synchronization:
80 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
81 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
82 * read/writes behave like SEEK_SET against seeks.
83 */
84loff_t
85generic_file_llseek_size(struct file *file, loff_t offset, int whence,
86		loff_t maxsize, loff_t eof)
87{
88	switch (whence) {
89	case SEEK_END:
90		offset += eof;
91		break;
92	case SEEK_CUR:
93		/*
94		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
95		 * position-querying operation.  Avoid rewriting the "same"
96		 * f_pos value back to the file because a concurrent read(),
97		 * write() or lseek() might have altered it
98		 */
99		if (offset == 0)
100			return file->f_pos;
101		/*
102		 * f_lock protects against read/modify/write race with other
103		 * SEEK_CURs. Note that parallel writes and reads behave
104		 * like SEEK_SET.
105		 */
106		spin_lock(&file->f_lock);
107		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
108		spin_unlock(&file->f_lock);
109		return offset;
110	case SEEK_DATA:
111		/*
112		 * In the generic case the entire file is data, so as long as
113		 * offset isn't at the end of the file then the offset is data.
114		 */
115		if (offset >= eof)
116			return -ENXIO;
117		break;
118	case SEEK_HOLE:
119		/*
120		 * There is a virtual hole at the end of the file, so as long as
121		 * offset isn't i_size or larger, return i_size.
122		 */
123		if (offset >= eof)
124			return -ENXIO;
125		offset = eof;
126		break;
127	}
128
129	return vfs_setpos(file, offset, maxsize);
130}
131EXPORT_SYMBOL(generic_file_llseek_size);
132
133/**
134 * generic_file_llseek - generic llseek implementation for regular files
135 * @file:	file structure to seek on
136 * @offset:	file offset to seek to
137 * @whence:	type of seek
138 *
139 * This is a generic implemenation of ->llseek useable for all normal local
140 * filesystems.  It just updates the file offset to the value specified by
141 * @offset and @whence.
142 */
143loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
144{
145	struct inode *inode = file->f_mapping->host;
146
147	return generic_file_llseek_size(file, offset, whence,
148					inode->i_sb->s_maxbytes,
149					i_size_read(inode));
150}
151EXPORT_SYMBOL(generic_file_llseek);
152
153/**
154 * fixed_size_llseek - llseek implementation for fixed-sized devices
155 * @file:	file structure to seek on
156 * @offset:	file offset to seek to
157 * @whence:	type of seek
158 * @size:	size of the file
159 *
160 */
161loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
162{
163	switch (whence) {
164	case SEEK_SET: case SEEK_CUR: case SEEK_END:
165		return generic_file_llseek_size(file, offset, whence,
166						size, size);
167	default:
168		return -EINVAL;
169	}
170}
171EXPORT_SYMBOL(fixed_size_llseek);
172
173/**
174 * noop_llseek - No Operation Performed llseek implementation
175 * @file:	file structure to seek on
176 * @offset:	file offset to seek to
177 * @whence:	type of seek
178 *
179 * This is an implementation of ->llseek useable for the rare special case when
180 * userspace expects the seek to succeed but the (device) file is actually not
181 * able to perform the seek. In this case you use noop_llseek() instead of
182 * falling back to the default implementation of ->llseek.
183 */
184loff_t noop_llseek(struct file *file, loff_t offset, int whence)
185{
186	return file->f_pos;
187}
188EXPORT_SYMBOL(noop_llseek);
189
190loff_t no_llseek(struct file *file, loff_t offset, int whence)
191{
192	return -ESPIPE;
193}
194EXPORT_SYMBOL(no_llseek);
195
196loff_t default_llseek(struct file *file, loff_t offset, int whence)
197{
198	struct inode *inode = file_inode(file);
199	loff_t retval;
200
201	mutex_lock(&inode->i_mutex);
202	switch (whence) {
203		case SEEK_END:
204			offset += i_size_read(inode);
205			break;
206		case SEEK_CUR:
207			if (offset == 0) {
208				retval = file->f_pos;
209				goto out;
210			}
211			offset += file->f_pos;
212			break;
213		case SEEK_DATA:
214			/*
215			 * In the generic case the entire file is data, so as
216			 * long as offset isn't at the end of the file then the
217			 * offset is data.
218			 */
219			if (offset >= inode->i_size) {
220				retval = -ENXIO;
221				goto out;
222			}
223			break;
224		case SEEK_HOLE:
225			/*
226			 * There is a virtual hole at the end of the file, so
227			 * as long as offset isn't i_size or larger, return
228			 * i_size.
229			 */
230			if (offset >= inode->i_size) {
231				retval = -ENXIO;
232				goto out;
233			}
234			offset = inode->i_size;
235			break;
236	}
237	retval = -EINVAL;
238	if (offset >= 0 || unsigned_offsets(file)) {
239		if (offset != file->f_pos) {
240			file->f_pos = offset;
241			file->f_version = 0;
242		}
243		retval = offset;
244	}
245out:
246	mutex_unlock(&inode->i_mutex);
247	return retval;
248}
249EXPORT_SYMBOL(default_llseek);
250
251loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
252{
253	loff_t (*fn)(struct file *, loff_t, int);
254
255	fn = no_llseek;
256	if (file->f_mode & FMODE_LSEEK) {
257		if (file->f_op->llseek)
258			fn = file->f_op->llseek;
259	}
260	return fn(file, offset, whence);
261}
262EXPORT_SYMBOL(vfs_llseek);
263
264static inline struct fd fdget_pos(int fd)
265{
266	return __to_fd(__fdget_pos(fd));
267}
268
269static inline void fdput_pos(struct fd f)
270{
271	if (f.flags & FDPUT_POS_UNLOCK)
272		mutex_unlock(&f.file->f_pos_lock);
273	fdput(f);
274}
275
276SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
277{
278	off_t retval;
279	struct fd f = fdget_pos(fd);
280	if (!f.file)
281		return -EBADF;
282
283	retval = -EINVAL;
284	if (whence <= SEEK_MAX) {
285		loff_t res = vfs_llseek(f.file, offset, whence);
286		retval = res;
287		if (res != (loff_t)retval)
288			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
289	}
290	fdput_pos(f);
291	return retval;
292}
293
294#ifdef CONFIG_COMPAT
295COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
296{
297	return sys_lseek(fd, offset, whence);
298}
299#endif
300
301#ifdef __ARCH_WANT_SYS_LLSEEK
302SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
303		unsigned long, offset_low, loff_t __user *, result,
304		unsigned int, whence)
305{
306	int retval;
307	struct fd f = fdget_pos(fd);
308	loff_t offset;
309
310	if (!f.file)
311		return -EBADF;
312
313	retval = -EINVAL;
314	if (whence > SEEK_MAX)
315		goto out_putf;
316
317	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
318			whence);
319
320	retval = (int)offset;
321	if (offset >= 0) {
322		retval = -EFAULT;
323		if (!copy_to_user(result, &offset, sizeof(offset)))
324			retval = 0;
325	}
326out_putf:
327	fdput_pos(f);
328	return retval;
329}
330#endif
331
332ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
333{
334	struct kiocb kiocb;
335	ssize_t ret;
336
337	if (!file->f_op->read_iter)
338		return -EINVAL;
339
340	init_sync_kiocb(&kiocb, file);
341	kiocb.ki_pos = *ppos;
342
343	iter->type |= READ;
344	ret = file->f_op->read_iter(&kiocb, iter);
345	BUG_ON(ret == -EIOCBQUEUED);
346	if (ret > 0)
347		*ppos = kiocb.ki_pos;
348	return ret;
349}
350EXPORT_SYMBOL(vfs_iter_read);
351
352ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
353{
354	struct kiocb kiocb;
355	ssize_t ret;
356
357	if (!file->f_op->write_iter)
358		return -EINVAL;
359
360	init_sync_kiocb(&kiocb, file);
361	kiocb.ki_pos = *ppos;
362
363	iter->type |= WRITE;
364	ret = file->f_op->write_iter(&kiocb, iter);
365	BUG_ON(ret == -EIOCBQUEUED);
366	if (ret > 0)
367		*ppos = kiocb.ki_pos;
368	return ret;
369}
370EXPORT_SYMBOL(vfs_iter_write);
371
372/*
373 * rw_verify_area doesn't like huge counts. We limit
374 * them to something that fits in "int" so that others
375 * won't have to do range checks all the time.
376 */
377int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
378{
379	struct inode *inode;
380	loff_t pos;
381	int retval = -EINVAL;
382
383	inode = file_inode(file);
384	if (unlikely((ssize_t) count < 0))
385		return retval;
386	pos = *ppos;
387	if (unlikely(pos < 0)) {
388		if (!unsigned_offsets(file))
389			return retval;
390		if (count >= -pos) /* both values are in 0..LLONG_MAX */
391			return -EOVERFLOW;
392	} else if (unlikely((loff_t) (pos + count) < 0)) {
393		if (!unsigned_offsets(file))
394			return retval;
395	}
396
397	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
398		retval = locks_mandatory_area(
399			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
400			inode, file, pos, count);
401		if (retval < 0)
402			return retval;
403	}
404	retval = security_file_permission(file,
405				read_write == READ ? MAY_READ : MAY_WRITE);
406	if (retval)
407		return retval;
408	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
409}
410
411static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
412{
413	struct iovec iov = { .iov_base = buf, .iov_len = len };
414	struct kiocb kiocb;
415	struct iov_iter iter;
416	ssize_t ret;
417
418	init_sync_kiocb(&kiocb, filp);
419	kiocb.ki_pos = *ppos;
420	iov_iter_init(&iter, READ, &iov, 1, len);
421
422	ret = filp->f_op->read_iter(&kiocb, &iter);
423	BUG_ON(ret == -EIOCBQUEUED);
424	*ppos = kiocb.ki_pos;
425	return ret;
426}
427
428ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
429		   loff_t *pos)
430{
431	if (file->f_op->read)
432		return file->f_op->read(file, buf, count, pos);
433	else if (file->f_op->read_iter)
434		return new_sync_read(file, buf, count, pos);
435	else
436		return -EINVAL;
437}
438EXPORT_SYMBOL(__vfs_read);
439
440ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
441{
442	ssize_t ret;
443
444	if (!(file->f_mode & FMODE_READ))
445		return -EBADF;
446	if (!(file->f_mode & FMODE_CAN_READ))
447		return -EINVAL;
448	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
449		return -EFAULT;
450
451	ret = rw_verify_area(READ, file, pos, count);
452	if (ret >= 0) {
453		count = ret;
454		ret = __vfs_read(file, buf, count, pos);
455		if (ret > 0) {
456			fsnotify_access(file);
457			add_rchar(current, ret);
458		}
459		inc_syscr(current);
460	}
461
462	return ret;
463}
464
465EXPORT_SYMBOL(vfs_read);
466
467static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
468{
469	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
470	struct kiocb kiocb;
471	struct iov_iter iter;
472	ssize_t ret;
473
474	init_sync_kiocb(&kiocb, filp);
475	kiocb.ki_pos = *ppos;
476	iov_iter_init(&iter, WRITE, &iov, 1, len);
477
478	ret = filp->f_op->write_iter(&kiocb, &iter);
479	BUG_ON(ret == -EIOCBQUEUED);
480	if (ret > 0)
481		*ppos = kiocb.ki_pos;
482	return ret;
483}
484
485ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
486		    loff_t *pos)
487{
488	if (file->f_op->write)
489		return file->f_op->write(file, p, count, pos);
490	else if (file->f_op->write_iter)
491		return new_sync_write(file, p, count, pos);
492	else
493		return -EINVAL;
494}
495EXPORT_SYMBOL(__vfs_write);
496
497ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
498{
499	mm_segment_t old_fs;
500	const char __user *p;
501	ssize_t ret;
502
503	if (!(file->f_mode & FMODE_CAN_WRITE))
504		return -EINVAL;
505
506	old_fs = get_fs();
507	set_fs(get_ds());
508	p = (__force const char __user *)buf;
509	if (count > MAX_RW_COUNT)
510		count =  MAX_RW_COUNT;
511	ret = __vfs_write(file, p, count, pos);
512	set_fs(old_fs);
513	if (ret > 0) {
514		fsnotify_modify(file);
515		add_wchar(current, ret);
516	}
517	inc_syscw(current);
518	return ret;
519}
520
521EXPORT_SYMBOL(__kernel_write);
522
523ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
524{
525	ssize_t ret;
526
527	if (!(file->f_mode & FMODE_WRITE))
528		return -EBADF;
529	if (!(file->f_mode & FMODE_CAN_WRITE))
530		return -EINVAL;
531	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
532		return -EFAULT;
533
534	ret = rw_verify_area(WRITE, file, pos, count);
535	if (ret >= 0) {
536		count = ret;
537		file_start_write(file);
538		ret = __vfs_write(file, buf, count, pos);
539		if (ret > 0) {
540			fsnotify_modify(file);
541			add_wchar(current, ret);
542		}
543		inc_syscw(current);
544		file_end_write(file);
545	}
546
547	return ret;
548}
549
550EXPORT_SYMBOL(vfs_write);
551
552static inline loff_t file_pos_read(struct file *file)
553{
554	return file->f_pos;
555}
556
557static inline void file_pos_write(struct file *file, loff_t pos)
558{
559	file->f_pos = pos;
560}
561
562SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
563{
564	struct fd f = fdget_pos(fd);
565	ssize_t ret = -EBADF;
566
567	if (f.file) {
568		loff_t pos = file_pos_read(f.file);
569		ret = vfs_read(f.file, buf, count, &pos);
570		if (ret >= 0)
571			file_pos_write(f.file, pos);
572		fdput_pos(f);
573	}
574	return ret;
575}
576
577SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
578		size_t, count)
579{
580	struct fd f = fdget_pos(fd);
581	ssize_t ret = -EBADF;
582
583	if (f.file) {
584		loff_t pos = file_pos_read(f.file);
585		ret = vfs_write(f.file, buf, count, &pos);
586		if (ret >= 0)
587			file_pos_write(f.file, pos);
588		fdput_pos(f);
589	}
590
591	return ret;
592}
593
594SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
595			size_t, count, loff_t, pos)
596{
597	struct fd f;
598	ssize_t ret = -EBADF;
599
600	if (pos < 0)
601		return -EINVAL;
602
603	f = fdget(fd);
604	if (f.file) {
605		ret = -ESPIPE;
606		if (f.file->f_mode & FMODE_PREAD)
607			ret = vfs_read(f.file, buf, count, &pos);
608		fdput(f);
609	}
610
611	return ret;
612}
613
614SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
615			 size_t, count, loff_t, pos)
616{
617	struct fd f;
618	ssize_t ret = -EBADF;
619
620	if (pos < 0)
621		return -EINVAL;
622
623	f = fdget(fd);
624	if (f.file) {
625		ret = -ESPIPE;
626		if (f.file->f_mode & FMODE_PWRITE)
627			ret = vfs_write(f.file, buf, count, &pos);
628		fdput(f);
629	}
630
631	return ret;
632}
633
/*
 * Reduce an iovec's length in-place.  Return the resulting number of segments
 *
 * @iov:	array of segments to trim
 * @nr_segs:	number of entries in @iov
 * @to:		total number of bytes the array should describe afterwards
 *
 * Walks the array until the running total reaches @to, truncates that
 * segment so the total is exactly @to, and returns how many segments are
 * now in use.  If the segments hold fewer than @to bytes in total, nothing
 * is modified and @nr_segs is returned.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr_segs) {
		seg++;
		/*
		 * Compare against the remaining budget rather than forming
		 * "len + iov_len", which can wrap around size_t for a huge
		 * segment and skip the truncation.  len <= to always holds
		 * here, so "to - len" cannot underflow.
		 */
		if (iov->iov_len >= to - len) {
			iov->iov_len = to - len;
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}
653EXPORT_SYMBOL(iov_shorten);
654
655static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
656		loff_t *ppos, iter_fn_t fn)
657{
658	struct kiocb kiocb;
659	ssize_t ret;
660
661	init_sync_kiocb(&kiocb, filp);
662	kiocb.ki_pos = *ppos;
663
664	ret = fn(&kiocb, iter);
665	BUG_ON(ret == -EIOCBQUEUED);
666	*ppos = kiocb.ki_pos;
667	return ret;
668}
669
670/* Do it by hand, with file-ops */
671static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
672		loff_t *ppos, io_fn_t fn)
673{
674	ssize_t ret = 0;
675
676	while (iov_iter_count(iter)) {
677		struct iovec iovec = iov_iter_iovec(iter);
678		ssize_t nr;
679
680		nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
681
682		if (nr < 0) {
683			if (!ret)
684				ret = nr;
685			break;
686		}
687		ret += nr;
688		if (nr != iovec.iov_len)
689			break;
690		iov_iter_advance(iter, nr);
691	}
692
693	return ret;
694}
695
696/* A write operation does a read from user space and vice versa */
697#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
698
699ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
700			      unsigned long nr_segs, unsigned long fast_segs,
701			      struct iovec *fast_pointer,
702			      struct iovec **ret_pointer)
703{
704	unsigned long seg;
705	ssize_t ret;
706	struct iovec *iov = fast_pointer;
707
708	/*
709	 * SuS says "The readv() function *may* fail if the iovcnt argument
710	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
711	 * traditionally returned zero for zero segments, so...
712	 */
713	if (nr_segs == 0) {
714		ret = 0;
715		goto out;
716	}
717
718	/*
719	 * First get the "struct iovec" from user memory and
720	 * verify all the pointers
721	 */
722	if (nr_segs > UIO_MAXIOV) {
723		ret = -EINVAL;
724		goto out;
725	}
726	if (nr_segs > fast_segs) {
727		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
728		if (iov == NULL) {
729			ret = -ENOMEM;
730			goto out;
731		}
732	}
733	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
734		ret = -EFAULT;
735		goto out;
736	}
737
738	/*
739	 * According to the Single Unix Specification we should return EINVAL
740	 * if an element length is < 0 when cast to ssize_t or if the
741	 * total length would overflow the ssize_t return value of the
742	 * system call.
743	 *
744	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
745	 * overflow case.
746	 */
747	ret = 0;
748	for (seg = 0; seg < nr_segs; seg++) {
749		void __user *buf = iov[seg].iov_base;
750		ssize_t len = (ssize_t)iov[seg].iov_len;
751
752		/* see if we we're about to use an invalid len or if
753		 * it's about to overflow ssize_t */
754		if (len < 0) {
755			ret = -EINVAL;
756			goto out;
757		}
758		if (type >= 0
759		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
760			ret = -EFAULT;
761			goto out;
762		}
763		if (len > MAX_RW_COUNT - ret) {
764			len = MAX_RW_COUNT - ret;
765			iov[seg].iov_len = len;
766		}
767		ret += len;
768	}
769out:
770	*ret_pointer = iov;
771	return ret;
772}
773
774static ssize_t do_readv_writev(int type, struct file *file,
775			       const struct iovec __user * uvector,
776			       unsigned long nr_segs, loff_t *pos)
777{
778	size_t tot_len;
779	struct iovec iovstack[UIO_FASTIOV];
780	struct iovec *iov = iovstack;
781	struct iov_iter iter;
782	ssize_t ret;
783	io_fn_t fn;
784	iter_fn_t iter_fn;
785
786	ret = import_iovec(type, uvector, nr_segs,
787			   ARRAY_SIZE(iovstack), &iov, &iter);
788	if (ret < 0)
789		return ret;
790
791	tot_len = iov_iter_count(&iter);
792	if (!tot_len)
793		goto out;
794	ret = rw_verify_area(type, file, pos, tot_len);
795	if (ret < 0)
796		goto out;
797
798	if (type == READ) {
799		fn = file->f_op->read;
800		iter_fn = file->f_op->read_iter;
801	} else {
802		fn = (io_fn_t)file->f_op->write;
803		iter_fn = file->f_op->write_iter;
804		file_start_write(file);
805	}
806
807	if (iter_fn)
808		ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
809	else
810		ret = do_loop_readv_writev(file, &iter, pos, fn);
811
812	if (type != READ)
813		file_end_write(file);
814
815out:
816	kfree(iov);
817	if ((ret + (type == READ)) > 0) {
818		if (type == READ)
819			fsnotify_access(file);
820		else
821			fsnotify_modify(file);
822	}
823	return ret;
824}
825
826ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
827		  unsigned long vlen, loff_t *pos)
828{
829	if (!(file->f_mode & FMODE_READ))
830		return -EBADF;
831	if (!(file->f_mode & FMODE_CAN_READ))
832		return -EINVAL;
833
834	return do_readv_writev(READ, file, vec, vlen, pos);
835}
836
837EXPORT_SYMBOL(vfs_readv);
838
839ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
840		   unsigned long vlen, loff_t *pos)
841{
842	if (!(file->f_mode & FMODE_WRITE))
843		return -EBADF;
844	if (!(file->f_mode & FMODE_CAN_WRITE))
845		return -EINVAL;
846
847	return do_readv_writev(WRITE, file, vec, vlen, pos);
848}
849
850EXPORT_SYMBOL(vfs_writev);
851
852SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
853		unsigned long, vlen)
854{
855	struct fd f = fdget_pos(fd);
856	ssize_t ret = -EBADF;
857
858	if (f.file) {
859		loff_t pos = file_pos_read(f.file);
860		ret = vfs_readv(f.file, vec, vlen, &pos);
861		if (ret >= 0)
862			file_pos_write(f.file, pos);
863		fdput_pos(f);
864	}
865
866	if (ret > 0)
867		add_rchar(current, ret);
868	inc_syscr(current);
869	return ret;
870}
871
872SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
873		unsigned long, vlen)
874{
875	struct fd f = fdget_pos(fd);
876	ssize_t ret = -EBADF;
877
878	if (f.file) {
879		loff_t pos = file_pos_read(f.file);
880		ret = vfs_writev(f.file, vec, vlen, &pos);
881		if (ret >= 0)
882			file_pos_write(f.file, pos);
883		fdput_pos(f);
884	}
885
886	if (ret > 0)
887		add_wchar(current, ret);
888	inc_syscw(current);
889	return ret;
890}
891
892static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
893{
894#define HALF_LONG_BITS (BITS_PER_LONG / 2)
895	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
896}
897
898SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
899		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
900{
901	loff_t pos = pos_from_hilo(pos_h, pos_l);
902	struct fd f;
903	ssize_t ret = -EBADF;
904
905	if (pos < 0)
906		return -EINVAL;
907
908	f = fdget(fd);
909	if (f.file) {
910		ret = -ESPIPE;
911		if (f.file->f_mode & FMODE_PREAD)
912			ret = vfs_readv(f.file, vec, vlen, &pos);
913		fdput(f);
914	}
915
916	if (ret > 0)
917		add_rchar(current, ret);
918	inc_syscr(current);
919	return ret;
920}
921
922SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
923		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
924{
925	loff_t pos = pos_from_hilo(pos_h, pos_l);
926	struct fd f;
927	ssize_t ret = -EBADF;
928
929	if (pos < 0)
930		return -EINVAL;
931
932	f = fdget(fd);
933	if (f.file) {
934		ret = -ESPIPE;
935		if (f.file->f_mode & FMODE_PWRITE)
936			ret = vfs_writev(f.file, vec, vlen, &pos);
937		fdput(f);
938	}
939
940	if (ret > 0)
941		add_wchar(current, ret);
942	inc_syscw(current);
943	return ret;
944}
945
946#ifdef CONFIG_COMPAT
947
948static ssize_t compat_do_readv_writev(int type, struct file *file,
949			       const struct compat_iovec __user *uvector,
950			       unsigned long nr_segs, loff_t *pos)
951{
952	compat_ssize_t tot_len;
953	struct iovec iovstack[UIO_FASTIOV];
954	struct iovec *iov = iovstack;
955	struct iov_iter iter;
956	ssize_t ret;
957	io_fn_t fn;
958	iter_fn_t iter_fn;
959
960	ret = compat_import_iovec(type, uvector, nr_segs,
961				  UIO_FASTIOV, &iov, &iter);
962	if (ret < 0)
963		return ret;
964
965	tot_len = iov_iter_count(&iter);
966	if (!tot_len)
967		goto out;
968	ret = rw_verify_area(type, file, pos, tot_len);
969	if (ret < 0)
970		goto out;
971
972	if (type == READ) {
973		fn = file->f_op->read;
974		iter_fn = file->f_op->read_iter;
975	} else {
976		fn = (io_fn_t)file->f_op->write;
977		iter_fn = file->f_op->write_iter;
978		file_start_write(file);
979	}
980
981	if (iter_fn)
982		ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
983	else
984		ret = do_loop_readv_writev(file, &iter, pos, fn);
985
986	if (type != READ)
987		file_end_write(file);
988
989out:
990	kfree(iov);
991	if ((ret + (type == READ)) > 0) {
992		if (type == READ)
993			fsnotify_access(file);
994		else
995			fsnotify_modify(file);
996	}
997	return ret;
998}
999
1000static size_t compat_readv(struct file *file,
1001			   const struct compat_iovec __user *vec,
1002			   unsigned long vlen, loff_t *pos)
1003{
1004	ssize_t ret = -EBADF;
1005
1006	if (!(file->f_mode & FMODE_READ))
1007		goto out;
1008
1009	ret = -EINVAL;
1010	if (!(file->f_mode & FMODE_CAN_READ))
1011		goto out;
1012
1013	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1014
1015out:
1016	if (ret > 0)
1017		add_rchar(current, ret);
1018	inc_syscr(current);
1019	return ret;
1020}
1021
1022COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1023		const struct compat_iovec __user *,vec,
1024		compat_ulong_t, vlen)
1025{
1026	struct fd f = fdget_pos(fd);
1027	ssize_t ret;
1028	loff_t pos;
1029
1030	if (!f.file)
1031		return -EBADF;
1032	pos = f.file->f_pos;
1033	ret = compat_readv(f.file, vec, vlen, &pos);
1034	if (ret >= 0)
1035		f.file->f_pos = pos;
1036	fdput_pos(f);
1037	return ret;
1038}
1039
1040static long __compat_sys_preadv64(unsigned long fd,
1041				  const struct compat_iovec __user *vec,
1042				  unsigned long vlen, loff_t pos)
1043{
1044	struct fd f;
1045	ssize_t ret;
1046
1047	if (pos < 0)
1048		return -EINVAL;
1049	f = fdget(fd);
1050	if (!f.file)
1051		return -EBADF;
1052	ret = -ESPIPE;
1053	if (f.file->f_mode & FMODE_PREAD)
1054		ret = compat_readv(f.file, vec, vlen, &pos);
1055	fdput(f);
1056	return ret;
1057}
1058
1059#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1060COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1061		const struct compat_iovec __user *,vec,
1062		unsigned long, vlen, loff_t, pos)
1063{
1064	return __compat_sys_preadv64(fd, vec, vlen, pos);
1065}
1066#endif
1067
1068COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1069		const struct compat_iovec __user *,vec,
1070		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1071{
1072	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1073
1074	return __compat_sys_preadv64(fd, vec, vlen, pos);
1075}
1076
1077static size_t compat_writev(struct file *file,
1078			    const struct compat_iovec __user *vec,
1079			    unsigned long vlen, loff_t *pos)
1080{
1081	ssize_t ret = -EBADF;
1082
1083	if (!(file->f_mode & FMODE_WRITE))
1084		goto out;
1085
1086	ret = -EINVAL;
1087	if (!(file->f_mode & FMODE_CAN_WRITE))
1088		goto out;
1089
1090	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1091
1092out:
1093	if (ret > 0)
1094		add_wchar(current, ret);
1095	inc_syscw(current);
1096	return ret;
1097}
1098
1099COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1100		const struct compat_iovec __user *, vec,
1101		compat_ulong_t, vlen)
1102{
1103	struct fd f = fdget_pos(fd);
1104	ssize_t ret;
1105	loff_t pos;
1106
1107	if (!f.file)
1108		return -EBADF;
1109	pos = f.file->f_pos;
1110	ret = compat_writev(f.file, vec, vlen, &pos);
1111	if (ret >= 0)
1112		f.file->f_pos = pos;
1113	fdput_pos(f);
1114	return ret;
1115}
1116
1117static long __compat_sys_pwritev64(unsigned long fd,
1118				   const struct compat_iovec __user *vec,
1119				   unsigned long vlen, loff_t pos)
1120{
1121	struct fd f;
1122	ssize_t ret;
1123
1124	if (pos < 0)
1125		return -EINVAL;
1126	f = fdget(fd);
1127	if (!f.file)
1128		return -EBADF;
1129	ret = -ESPIPE;
1130	if (f.file->f_mode & FMODE_PWRITE)
1131		ret = compat_writev(f.file, vec, vlen, &pos);
1132	fdput(f);
1133	return ret;
1134}
1135
1136#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1137COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1138		const struct compat_iovec __user *,vec,
1139		unsigned long, vlen, loff_t, pos)
1140{
1141	return __compat_sys_pwritev64(fd, vec, vlen, pos);
1142}
1143#endif
1144
1145COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1146		const struct compat_iovec __user *,vec,
1147		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1148{
1149	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1150
1151	return __compat_sys_pwritev64(fd, vec, vlen, pos);
1152}
1153#endif
1154
1155static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1156		  	   size_t count, loff_t max)
1157{
1158	struct fd in, out;
1159	struct inode *in_inode, *out_inode;
1160	loff_t pos;
1161	loff_t out_pos;
1162	ssize_t retval;
1163	int fl;
1164
1165	/*
1166	 * Get input file, and verify that it is ok..
1167	 */
1168	retval = -EBADF;
1169	in = fdget(in_fd);
1170	if (!in.file)
1171		goto out;
1172	if (!(in.file->f_mode & FMODE_READ))
1173		goto fput_in;
1174	retval = -ESPIPE;
1175	if (!ppos) {
1176		pos = in.file->f_pos;
1177	} else {
1178		pos = *ppos;
1179		if (!(in.file->f_mode & FMODE_PREAD))
1180			goto fput_in;
1181	}
1182	retval = rw_verify_area(READ, in.file, &pos, count);
1183	if (retval < 0)
1184		goto fput_in;
1185	count = retval;
1186
1187	/*
1188	 * Get output file, and verify that it is ok..
1189	 */
1190	retval = -EBADF;
1191	out = fdget(out_fd);
1192	if (!out.file)
1193		goto fput_in;
1194	if (!(out.file->f_mode & FMODE_WRITE))
1195		goto fput_out;
1196	retval = -EINVAL;
1197	in_inode = file_inode(in.file);
1198	out_inode = file_inode(out.file);
1199	out_pos = out.file->f_pos;
1200	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1201	if (retval < 0)
1202		goto fput_out;
1203	count = retval;
1204
1205	if (!max)
1206		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1207
1208	if (unlikely(pos + count > max)) {
1209		retval = -EOVERFLOW;
1210		if (pos >= max)
1211			goto fput_out;
1212		count = max - pos;
1213	}
1214
1215	fl = 0;
1216#if 0
1217	/*
1218	 * We need to debate whether we can enable this or not. The
1219	 * man page documents EAGAIN return for the output at least,
1220	 * and the application is arguably buggy if it doesn't expect
1221	 * EAGAIN on a non-blocking file descriptor.
1222	 */
1223	if (in.file->f_flags & O_NONBLOCK)
1224		fl = SPLICE_F_NONBLOCK;
1225#endif
1226	file_start_write(out.file);
1227	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1228	file_end_write(out.file);
1229
1230	if (retval > 0) {
1231		add_rchar(current, retval);
1232		add_wchar(current, retval);
1233		fsnotify_access(in.file);
1234		fsnotify_modify(out.file);
1235		out.file->f_pos = out_pos;
1236		if (ppos)
1237			*ppos = pos;
1238		else
1239			in.file->f_pos = pos;
1240	}
1241
1242	inc_syscr(current);
1243	inc_syscw(current);
1244	if (pos > max)
1245		retval = -EOVERFLOW;
1246
1247fput_out:
1248	fdput(out);
1249fput_in:
1250	fdput(in);
1251out:
1252	return retval;
1253}
1254
1255SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1256{
1257	loff_t pos;
1258	off_t off;
1259	ssize_t ret;
1260
1261	if (offset) {
1262		if (unlikely(get_user(off, offset)))
1263			return -EFAULT;
1264		pos = off;
1265		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1266		if (unlikely(put_user(pos, offset)))
1267			return -EFAULT;
1268		return ret;
1269	}
1270
1271	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1272}
1273
1274SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1275{
1276	loff_t pos;
1277	ssize_t ret;
1278
1279	if (offset) {
1280		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1281			return -EFAULT;
1282		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1283		if (unlikely(put_user(pos, offset)))
1284			return -EFAULT;
1285		return ret;
1286	}
1287
1288	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1289}
1290
1291#ifdef CONFIG_COMPAT
1292COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1293		compat_off_t __user *, offset, compat_size_t, count)
1294{
1295	loff_t pos;
1296	off_t off;
1297	ssize_t ret;
1298
1299	if (offset) {
1300		if (unlikely(get_user(off, offset)))
1301			return -EFAULT;
1302		pos = off;
1303		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1304		if (unlikely(put_user(pos, offset)))
1305			return -EFAULT;
1306		return ret;
1307	}
1308
1309	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1310}
1311
1312COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1313		compat_loff_t __user *, offset, compat_size_t, count)
1314{
1315	loff_t pos;
1316	ssize_t ret;
1317
1318	if (offset) {
1319		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1320			return -EFAULT;
1321		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1322		if (unlikely(put_user(pos, offset)))
1323			return -EFAULT;
1324		return ret;
1325	}
1326
1327	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1328}
1329#endif
1330