1/*
2 *  linux/fs/buffer.c
3 *
4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15 *
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/mm.h>
25#include <linux/percpu.h>
26#include <linux/slab.h>
27#include <linux/capability.h>
28#include <linux/blkdev.h>
29#include <linux/file.h>
30#include <linux/quotaops.h>
31#include <linux/highmem.h>
32#include <linux/export.h>
33#include <linux/writeback.h>
34#include <linux/hash.h>
35#include <linux/suspend.h>
36#include <linux/buffer_head.h>
37#include <linux/task_io_accounting_ops.h>
38#include <linux/bio.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/bitops.h>
42#include <linux/mpage.h>
43#include <linux/bit_spinlock.h>
44#include <trace/events/block.h>
45
46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47
48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49
50void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{
52	bh->b_end_io = handler;
53	bh->b_private = private;
54}
55EXPORT_SYMBOL(init_buffer);
56
57inline void touch_buffer(struct buffer_head *bh)
58{
59	trace_block_touch_buffer(bh);
60	mark_page_accessed(bh->b_page);
61}
62EXPORT_SYMBOL(touch_buffer);
63
64void __lock_buffer(struct buffer_head *bh)
65{
66	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
67}
68EXPORT_SYMBOL(__lock_buffer);
69
70void unlock_buffer(struct buffer_head *bh)
71{
72	clear_bit_unlock(BH_Lock, &bh->b_state);
73	smp_mb__after_atomic();
74	wake_up_bit(&bh->b_state, BH_Lock);
75}
76EXPORT_SYMBOL(unlock_buffer);
77
78/*
79 * Reports, via @dirty and @writeback, whether the page has dirty or
80 * writeback buffers.  If all the buffers are unlocked and clean then the
81 * PageDirty information is stale.  Locked buffers are assumed locked for IO.
82 */
83void buffer_check_dirty_writeback(struct page *page,
84				     bool *dirty, bool *writeback)
85{
86	struct buffer_head *head, *bh;
87	*dirty = false;
88	*writeback = false;
89
90	BUG_ON(!PageLocked(page));
91
92	if (!page_has_buffers(page))
93		return;
94
95	if (PageWriteback(page))
96		*writeback = true;
97
98	head = page_buffers(page);
99	bh = head;
100	do {
101		if (buffer_locked(bh))
102			*writeback = true;
103
104		if (buffer_dirty(bh))
105			*dirty = true;
106
107		bh = bh->b_this_page;
108	} while (bh != head);
109}
110EXPORT_SYMBOL(buffer_check_dirty_writeback);
111
112/*
113 * Block until a buffer comes unlocked.  This doesn't stop it
114 * from becoming locked again - you have to lock it yourself
115 * if you want to preserve its state.
116 */
117void __wait_on_buffer(struct buffer_head * bh)
118{
119	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
120}
121EXPORT_SYMBOL(__wait_on_buffer);
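/*
 * Illustrative sketch (assumed caller, not code from this file): waiting is
 * enough when the caller only needs in-flight I/O to finish; to examine or
 * modify the buffer while keeping its state stable, take the lock instead:
 *
 *	lock_buffer(bh);
 *	if (buffer_uptodate(bh))
 *		memcpy(copy, bh->b_data, bh->b_size);	(copy: caller's buffer)
 *	unlock_buffer(bh);
 */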
122
123static void
124__clear_page_buffers(struct page *page)
125{
126	ClearPagePrivate(page);
127	set_page_private(page, 0);
128	page_cache_release(page);
129}
130
131static void buffer_io_error(struct buffer_head *bh, char *msg)
132{
133	char b[BDEVNAME_SIZE];
134
135	if (!test_bit(BH_Quiet, &bh->b_state))
136		printk_ratelimited(KERN_ERR
137			"Buffer I/O error on dev %s, logical block %llu%s\n",
138			bdevname(bh->b_bdev, b),
139			(unsigned long long)bh->b_blocknr, msg);
140}
141
142/*
143 * End-of-IO handler helper function which does not touch the bh after
144 * unlocking it.
145 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
146 * a race there is benign: unlock_buffer() only uses the bh's address for
147 * hashing after unlocking the buffer, so it doesn't actually touch the bh
148 * itself.
149 */
150static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
151{
152	if (uptodate) {
153		set_buffer_uptodate(bh);
154	} else {
155		/* This happens due to failed READA attempts. */
156		clear_buffer_uptodate(bh);
157	}
158	unlock_buffer(bh);
159}
160
161/*
162 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
163 * unlock the buffer. This is what ll_rw_block uses too.
164 */
165void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
166{
167	__end_buffer_read_notouch(bh, uptodate);
168	put_bh(bh);
169}
170EXPORT_SYMBOL(end_buffer_read_sync);
171
172void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
173{
174	if (uptodate) {
175		set_buffer_uptodate(bh);
176	} else {
177		buffer_io_error(bh, ", lost sync page write");
178		set_buffer_write_io_error(bh);
179		clear_buffer_uptodate(bh);
180	}
181	unlock_buffer(bh);
182	put_bh(bh);
183}
184EXPORT_SYMBOL(end_buffer_write_sync);
185
186/*
187 * Various filesystems appear to want __find_get_block to be non-blocking.
188 * But it's the page lock which protects the buffers.  To get around this,
189 * we get exclusion from try_to_free_buffers with the blockdev mapping's
190 * private_lock.
191 *
192 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
193 * may be quite high.  This code could TryLock the page, and if that
194 * succeeds, there is no need to take private_lock. (But if
195 * private_lock is contended then so is mapping->tree_lock).
196 */
197static struct buffer_head *
198__find_get_block_slow(struct block_device *bdev, sector_t block)
199{
200	struct inode *bd_inode = bdev->bd_inode;
201	struct address_space *bd_mapping = bd_inode->i_mapping;
202	struct buffer_head *ret = NULL;
203	pgoff_t index;
204	struct buffer_head *bh;
205	struct buffer_head *head;
206	struct page *page;
207	int all_mapped = 1;
208
209	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
210	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
211	if (!page)
212		goto out;
213
214	spin_lock(&bd_mapping->private_lock);
215	if (!page_has_buffers(page))
216		goto out_unlock;
217	head = page_buffers(page);
218	bh = head;
219	do {
220		if (!buffer_mapped(bh))
221			all_mapped = 0;
222		else if (bh->b_blocknr == block) {
223			ret = bh;
224			get_bh(bh);
225			goto out_unlock;
226		}
227		bh = bh->b_this_page;
228	} while (bh != head);
229
230	/* We might be here because some of the buffers on this page are
231	 * not mapped.  This is due to various races between
232	 * file io on the block device and getblk.  It gets dealt with
233	 * elsewhere; don't report an error if we had some unmapped buffers.
234	 */
235	if (all_mapped) {
236		char b[BDEVNAME_SIZE];
237
238		printk("__find_get_block_slow() failed. "
239			"block=%llu, b_blocknr=%llu\n",
240			(unsigned long long)block,
241			(unsigned long long)bh->b_blocknr);
242		printk("b_state=0x%08lx, b_size=%zu\n",
243			bh->b_state, bh->b_size);
244		printk("device %s blocksize: %d\n", bdevname(bdev, b),
245			1 << bd_inode->i_blkbits);
246	}
247out_unlock:
248	spin_unlock(&bd_mapping->private_lock);
249	page_cache_release(page);
250out:
251	return ret;
252}
253
254/*
255 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
256 */
257static void free_more_memory(void)
258{
259	struct zone *zone;
260	int nid;
261
262	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
263	yield();
264
265	for_each_online_node(nid) {
266		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
267						gfp_zone(GFP_NOFS), NULL,
268						&zone);
269		if (zone)
270			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
271						GFP_NOFS, NULL);
272	}
273}
274
275/*
276 * I/O completion handler for block_read_full_page() - pages
277 * which come unlocked at the end of I/O.
278 */
279static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
280{
281	unsigned long flags;
282	struct buffer_head *first;
283	struct buffer_head *tmp;
284	struct page *page;
285	int page_uptodate = 1;
286
287	BUG_ON(!buffer_async_read(bh));
288
289	page = bh->b_page;
290	if (uptodate) {
291		set_buffer_uptodate(bh);
292	} else {
293		clear_buffer_uptodate(bh);
294		buffer_io_error(bh, ", async page read");
295		SetPageError(page);
296	}
297
298	/*
299	 * Be _very_ careful from here on. Bad things can happen if
300	 * two buffer heads end IO at almost the same time and both
301	 * decide that the page is now completely done.
302	 */
303	first = page_buffers(page);
304	local_irq_save(flags);
305	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
306	clear_buffer_async_read(bh);
307	unlock_buffer(bh);
308	tmp = bh;
309	do {
310		if (!buffer_uptodate(tmp))
311			page_uptodate = 0;
312		if (buffer_async_read(tmp)) {
313			BUG_ON(!buffer_locked(tmp));
314			goto still_busy;
315		}
316		tmp = tmp->b_this_page;
317	} while (tmp != bh);
318	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
319	local_irq_restore(flags);
320
321	/*
322	 * If none of the buffers had errors and they are all
323	 * uptodate then we can set the page uptodate.
324	 */
325	if (page_uptodate && !PageError(page))
326		SetPageUptodate(page);
327	unlock_page(page);
328	return;
329
330still_busy:
331	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
332	local_irq_restore(flags);
333	return;
334}
335
336/*
337 * Completion handler for block_write_full_page() - pages which are unlocked
338 * during I/O, and which have PageWriteback cleared upon I/O completion.
339 */
340void end_buffer_async_write(struct buffer_head *bh, int uptodate)
341{
342	unsigned long flags;
343	struct buffer_head *first;
344	struct buffer_head *tmp;
345	struct page *page;
346
347	BUG_ON(!buffer_async_write(bh));
348
349	page = bh->b_page;
350	if (uptodate) {
351		set_buffer_uptodate(bh);
352	} else {
353		buffer_io_error(bh, ", lost async page write");
354		set_bit(AS_EIO, &page->mapping->flags);
355		set_buffer_write_io_error(bh);
356		clear_buffer_uptodate(bh);
357		SetPageError(page);
358	}
359
360	first = page_buffers(page);
361	local_irq_save(flags);
362	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
363
364	clear_buffer_async_write(bh);
365	unlock_buffer(bh);
366	tmp = bh->b_this_page;
367	while (tmp != bh) {
368		if (buffer_async_write(tmp)) {
369			BUG_ON(!buffer_locked(tmp));
370			goto still_busy;
371		}
372		tmp = tmp->b_this_page;
373	}
374	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
375	local_irq_restore(flags);
376	end_page_writeback(page);
377	return;
378
379still_busy:
380	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
381	local_irq_restore(flags);
382	return;
383}
384EXPORT_SYMBOL(end_buffer_async_write);
385
386/*
387 * If a page's buffers are under async readin (end_buffer_async_read
388 * completion) then there is a possibility that another thread of
389 * control could lock one of the buffers after it has completed
390 * but while some of the other buffers have not completed.  This
391 * locked buffer would confuse end_buffer_async_read() into not unlocking
392 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
393 * that this buffer is not under async I/O.
394 *
395 * The page comes unlocked when it has no locked buffer_async buffers
396 * left.
397 *
398 * PageLocked prevents anyone from starting new async I/O reads against
399 * any of the buffers.
400 *
401 * PageWriteback is used to prevent simultaneous writeout of the same
402 * page.
403 *
404 * PageLocked prevents anyone from starting writeback of a page which is
405 * under read I/O (PageWriteback is only ever set against a locked page).
406 */
407static void mark_buffer_async_read(struct buffer_head *bh)
408{
409	bh->b_end_io = end_buffer_async_read;
410	set_buffer_async_read(bh);
411}
412
413static void mark_buffer_async_write_endio(struct buffer_head *bh,
414					  bh_end_io_t *handler)
415{
416	bh->b_end_io = handler;
417	set_buffer_async_write(bh);
418}
419
420void mark_buffer_async_write(struct buffer_head *bh)
421{
422	mark_buffer_async_write_endio(bh, end_buffer_async_write);
423}
424EXPORT_SYMBOL(mark_buffer_async_write);
425
426
427/*
428 * fs/buffer.c contains helper functions for buffer-backed address space's
429 * fsync functions.  A common requirement for buffer-based filesystems is
430 * that certain data from the backing blockdev needs to be written out for
431 * a successful fsync().  For example, ext2 indirect blocks need to be
432 * written back and waited upon before fsync() returns.
433 *
434 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
435 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
436 * management of a list of dependent buffers at ->i_mapping->private_list.
437 *
438 * Locking is a little subtle: try_to_free_buffers() will remove buffers
439 * from their controlling inode's queue when they are being freed.  But
440 * try_to_free_buffers() will be operating against the *blockdev* mapping
441 * at the time, not against the S_ISREG file which depends on those buffers.
442 * So the locking for private_list is via the private_lock in the address_space
443 * which backs the buffers.  Which is different from the address_space
444 * against which the buffers are listed.  So for a particular address_space,
445 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
446 * mapping->private_list will always be protected by the backing blockdev's
447 * ->private_lock.
448 *
449 * Which introduces a requirement: all buffers on an address_space's
450 * ->private_list must be from the same address_space: the blockdev's.
451 *
452 * address_spaces which do not place buffers at ->private_list via these
453 * utility functions are free to use private_lock and private_list for
454 * whatever they want.  The only requirement is that list_empty(private_list)
455 * be true at clear_inode() time.
456 *
457 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
458 * filesystems should do that.  invalidate_inode_buffers() should just go
459 * BUG_ON(!list_empty).
460 *
461 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
462 * take an address_space, not an inode.  And it should be called
463 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
464 * queued up.
465 *
466 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
467 * list if it is already on a list.  Because if the buffer is on a list,
468 * it *must* already be on the right one.  If not, the filesystem is being
469 * silly.  This will save a ton of locking.  But first we have to ensure
470 * that buffers are taken *off* the old inode's list when they are freed
471 * (presumably in truncate).  That requires careful auditing of all
472 * filesystems (do it inside bforget()).  It could also be done by bringing
473 * b_inode back.
474 */
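/*
 * Illustrative sketch of the pattern described above (simplified, assumed
 * ext2-style usage; not code from this file).  While dirtying dependent
 * metadata:
 *
 *	mark_buffer_dirty_inode(bh, inode);
 *
 * and later, in the filesystem's ->fsync():
 *
 *	err = sync_mapping_buffers(inode->i_mapping);
 */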
475
476/*
477 * The buffer's backing address_space's private_lock must be held
478 */
479static void __remove_assoc_queue(struct buffer_head *bh)
480{
481	list_del_init(&bh->b_assoc_buffers);
482	WARN_ON(!bh->b_assoc_map);
483	if (buffer_write_io_error(bh))
484		set_bit(AS_EIO, &bh->b_assoc_map->flags);
485	bh->b_assoc_map = NULL;
486}
487
488int inode_has_buffers(struct inode *inode)
489{
490	return !list_empty(&inode->i_data.private_list);
491}
492
493/*
494 * osync is designed to support O_SYNC io.  It waits synchronously for
495 * all already-submitted IO to complete, but does not queue any new
496 * writes to the disk.
497 *
498 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
499 * you dirty the buffers, and then use osync_inode_buffers to wait for
500 * completion.  Any other dirty buffers which are not yet queued for
501 * write will not be flushed to disk by the osync.
502 */
503static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
504{
505	struct buffer_head *bh;
506	struct list_head *p;
507	int err = 0;
508
509	spin_lock(lock);
510repeat:
511	list_for_each_prev(p, list) {
512		bh = BH_ENTRY(p);
513		if (buffer_locked(bh)) {
514			get_bh(bh);
515			spin_unlock(lock);
516			wait_on_buffer(bh);
517			if (!buffer_uptodate(bh))
518				err = -EIO;
519			brelse(bh);
520			spin_lock(lock);
521			goto repeat;
522		}
523	}
524	spin_unlock(lock);
525	return err;
526}
527
528static void do_thaw_one(struct super_block *sb, void *unused)
529{
530	char b[BDEVNAME_SIZE];
531	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
532		printk(KERN_WARNING "Emergency Thaw on %s\n",
533		       bdevname(sb->s_bdev, b));
534}
535
536static void do_thaw_all(struct work_struct *work)
537{
538	iterate_supers(do_thaw_one, NULL);
539	kfree(work);
540	printk(KERN_WARNING "Emergency Thaw complete\n");
541}
542
543/**
544 * emergency_thaw_all -- forcibly thaw every frozen filesystem
545 *
546 * Used for emergency unfreeze of all filesystems via SysRq
547 */
548void emergency_thaw_all(void)
549{
550	struct work_struct *work;
551
552	work = kmalloc(sizeof(*work), GFP_ATOMIC);
553	if (work) {
554		INIT_WORK(work, do_thaw_all);
555		schedule_work(work);
556	}
557}
558
559/**
560 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
561 * @mapping: the mapping which wants those buffers written
562 *
563 * Starts I/O against the buffers at mapping->private_list, and waits upon
564 * that I/O.
565 *
566 * Basically, this is a convenience function for fsync().
567 * @mapping is a file or directory which needs those buffers to be written for
568 * a successful fsync().
569 */
570int sync_mapping_buffers(struct address_space *mapping)
571{
572	struct address_space *buffer_mapping = mapping->private_data;
573
574	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
575		return 0;
576
577	return fsync_buffers_list(&buffer_mapping->private_lock,
578					&mapping->private_list);
579}
580EXPORT_SYMBOL(sync_mapping_buffers);
581
582/*
583 * Called when we've recently written block `bblock', and it is known that
584 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
585 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
586 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
587 */
588void write_boundary_block(struct block_device *bdev,
589			sector_t bblock, unsigned blocksize)
590{
591	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
592	if (bh) {
593		if (buffer_dirty(bh))
594			ll_rw_block(WRITE, 1, &bh);
595		put_bh(bh);
596	}
597}
598
599void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
600{
601	struct address_space *mapping = inode->i_mapping;
602	struct address_space *buffer_mapping = bh->b_page->mapping;
603
604	mark_buffer_dirty(bh);
605	if (!mapping->private_data) {
606		mapping->private_data = buffer_mapping;
607	} else {
608		BUG_ON(mapping->private_data != buffer_mapping);
609	}
610	if (!bh->b_assoc_map) {
611		spin_lock(&buffer_mapping->private_lock);
612		list_move_tail(&bh->b_assoc_buffers,
613				&mapping->private_list);
614		bh->b_assoc_map = mapping;
615		spin_unlock(&buffer_mapping->private_lock);
616	}
617}
618EXPORT_SYMBOL(mark_buffer_dirty_inode);
619
620/*
621 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
622 * dirty.
623 *
624 * If warn is true, then emit a warning if the page is not uptodate and has
625 * not been truncated.
626 */
627static void __set_page_dirty(struct page *page,
628		struct address_space *mapping, int warn)
629{
630	unsigned long flags;
631
632	spin_lock_irqsave(&mapping->tree_lock, flags);
633	if (page->mapping) {	/* Race with truncate? */
634		WARN_ON_ONCE(warn && !PageUptodate(page));
635		account_page_dirtied(page, mapping);
636		radix_tree_tag_set(&mapping->page_tree,
637				page_index(page), PAGECACHE_TAG_DIRTY);
638	}
639	spin_unlock_irqrestore(&mapping->tree_lock, flags);
640	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
641}
642
643/*
644 * Add a page to the dirty page list.
645 *
646 * It is a sad fact of life that this function is called from several places
647 * deeply under spinlocking.  It may not sleep.
648 *
649 * If the page has buffers, the uptodate buffers are set dirty, to preserve
650 * dirty-state coherency between the page and the buffers.  If the page does
651 * not have buffers then, when they are later attached, they will all be set
652 * dirty.
653 *
654 * The buffers are dirtied before the page is dirtied.  There's a small race
655 * window in which a writepage caller may see the page cleanness but not the
656 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
657 * before the buffers, a concurrent writepage caller could clear the page dirty
658 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
659 * page on the dirty page list.
660 *
661 * We use private_lock to lock against try_to_free_buffers while using the
662 * page's buffer list.  Also use this to protect against clean buffers being
663 * added to the page after it was set dirty.
664 *
665 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
666 * address_space though.
667 */
668int __set_page_dirty_buffers(struct page *page)
669{
670	int newly_dirty;
671	struct address_space *mapping = page_mapping(page);
672
673	if (unlikely(!mapping))
674		return !TestSetPageDirty(page);
675
676	spin_lock(&mapping->private_lock);
677	if (page_has_buffers(page)) {
678		struct buffer_head *head = page_buffers(page);
679		struct buffer_head *bh = head;
680
681		do {
682			set_buffer_dirty(bh);
683			bh = bh->b_this_page;
684		} while (bh != head);
685	}
686	newly_dirty = !TestSetPageDirty(page);
687	spin_unlock(&mapping->private_lock);
688
689	if (newly_dirty)
690		__set_page_dirty(page, mapping, 1);
691	return newly_dirty;
692}
693EXPORT_SYMBOL(__set_page_dirty_buffers);
694
695/*
696 * Write out and wait upon a list of buffers.
697 *
698 * We have conflicting pressures: we want to make sure that all
699 * initially dirty buffers get waited on, but that any subsequently
700 * dirtied buffers don't.  After all, we don't want fsync to last
701 * forever if somebody is actively writing to the file.
702 *
703 * Do this in two main stages: first we copy dirty buffers to a
704 * temporary inode list, queueing the writes as we go.  Then we clean
705 * up, waiting for those writes to complete.
706 *
707 * During this second stage, any subsequent updates to the file may end
708 * up refiling the buffer on the original inode's dirty list again, so
709 * there is a chance we will end up with a buffer queued for write but
710 * not yet completed on that list.  So, as a final cleanup we go through
711 * the osync code to catch these locked, dirty buffers without requeuing
712 * any newly dirty buffers for write.
713 */
714static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
715{
716	struct buffer_head *bh;
717	struct list_head tmp;
718	struct address_space *mapping;
719	int err = 0, err2;
720	struct blk_plug plug;
721
722	INIT_LIST_HEAD(&tmp);
723	blk_start_plug(&plug);
724
725	spin_lock(lock);
726	while (!list_empty(list)) {
727		bh = BH_ENTRY(list->next);
728		mapping = bh->b_assoc_map;
729		__remove_assoc_queue(bh);
730		/* Avoid race with mark_buffer_dirty_inode() which does
731		 * a lockless check and we rely on seeing the dirty bit */
732		smp_mb();
733		if (buffer_dirty(bh) || buffer_locked(bh)) {
734			list_add(&bh->b_assoc_buffers, &tmp);
735			bh->b_assoc_map = mapping;
736			if (buffer_dirty(bh)) {
737				get_bh(bh);
738				spin_unlock(lock);
739				/*
740				 * Ensure any pending I/O completes so that
741				 * write_dirty_buffer() actually writes the
742				 * current contents - it is a noop if I/O is
743				 * still in flight on potentially older
744				 * contents.
745				 */
746				write_dirty_buffer(bh, WRITE_SYNC);
747
748				/*
749				 * Kick off IO for the previous mapping. Note
750				 * that we will not run the very last mapping,
751				 * wait_on_buffer() will do that for us
752				 * through sync_buffer().
753				 */
754				brelse(bh);
755				spin_lock(lock);
756			}
757		}
758	}
759
760	spin_unlock(lock);
761	blk_finish_plug(&plug);
762	spin_lock(lock);
763
764	while (!list_empty(&tmp)) {
765		bh = BH_ENTRY(tmp.prev);
766		get_bh(bh);
767		mapping = bh->b_assoc_map;
768		__remove_assoc_queue(bh);
769		/* Avoid race with mark_buffer_dirty_inode() which does
770		 * a lockless check and we rely on seeing the dirty bit */
771		smp_mb();
772		if (buffer_dirty(bh)) {
773			list_add(&bh->b_assoc_buffers,
774				 &mapping->private_list);
775			bh->b_assoc_map = mapping;
776		}
777		spin_unlock(lock);
778		wait_on_buffer(bh);
779		if (!buffer_uptodate(bh))
780			err = -EIO;
781		brelse(bh);
782		spin_lock(lock);
783	}
784
785	spin_unlock(lock);
786	err2 = osync_buffers_list(lock, list);
787	if (err)
788		return err;
789	else
790		return err2;
791}
792
793/*
794 * Invalidate any and all dirty buffers on a given inode.  We are
795 * probably unmounting the fs, but that doesn't mean we have already
796 * done a sync().  Just drop the buffers from the inode list.
797 *
798 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
799 * assumes that all the buffers are against the blockdev.  Not true
800 * for reiserfs.
801 */
802void invalidate_inode_buffers(struct inode *inode)
803{
804	if (inode_has_buffers(inode)) {
805		struct address_space *mapping = &inode->i_data;
806		struct list_head *list = &mapping->private_list;
807		struct address_space *buffer_mapping = mapping->private_data;
808
809		spin_lock(&buffer_mapping->private_lock);
810		while (!list_empty(list))
811			__remove_assoc_queue(BH_ENTRY(list->next));
812		spin_unlock(&buffer_mapping->private_lock);
813	}
814}
815EXPORT_SYMBOL(invalidate_inode_buffers);
816
817/*
818 * Remove any clean buffers from the inode's buffer list.  This is called
819 * when we're trying to free the inode itself.  Those buffers can pin it.
820 *
821 * Returns true if all buffers were removed.
822 */
823int remove_inode_buffers(struct inode *inode)
824{
825	int ret = 1;
826
827	if (inode_has_buffers(inode)) {
828		struct address_space *mapping = &inode->i_data;
829		struct list_head *list = &mapping->private_list;
830		struct address_space *buffer_mapping = mapping->private_data;
831
832		spin_lock(&buffer_mapping->private_lock);
833		while (!list_empty(list)) {
834			struct buffer_head *bh = BH_ENTRY(list->next);
835			if (buffer_dirty(bh)) {
836				ret = 0;
837				break;
838			}
839			__remove_assoc_queue(bh);
840		}
841		spin_unlock(&buffer_mapping->private_lock);
842	}
843	return ret;
844}
845
846/*
847 * Create the appropriate buffers for a data page, given the size of each
848 * buffer.  Use the bh->b_this_page linked list to
849 * follow the buffers created.  Return NULL if unable to create more
850 * buffers.
851 *
852 * The retry flag distinguishes async IO (paging, swapping), which may not
853 * fail, from ordinary buffer allocations.
854 */
855struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
856		int retry)
857{
858	struct buffer_head *bh, *head;
859	long offset;
860
861try_again:
862	head = NULL;
863	offset = PAGE_SIZE;
864	while ((offset -= size) >= 0) {
865		bh = alloc_buffer_head(GFP_NOFS);
866		if (!bh)
867			goto no_grow;
868
869		bh->b_this_page = head;
870		bh->b_blocknr = -1;
871		head = bh;
872
873		bh->b_size = size;
874
875		/* Link the buffer to its page */
876		set_bh_page(bh, page, offset);
877	}
878	return head;
879/*
880 * In case anything failed, we just free everything we got.
881 */
882no_grow:
883	if (head) {
884		do {
885			bh = head;
886			head = head->b_this_page;
887			free_buffer_head(bh);
888		} while (head);
889	}
890
891	/*
892	 * Return failure for non-async IO requests.  Async IO requests
893	 * are not allowed to fail, so we have to wait until buffer heads
894	 * become available.  But we don't want tasks sleeping with
895	 * partially complete buffers, so all were released above.
896	 */
897	if (!retry)
898		return NULL;
899
900	/* We're _really_ low on memory. Now we just
901	 * wait for old buffer heads to become free due to
902	 * finishing IO.  Since this is an async request and
903	 * the reserve list is empty, we're sure there are
904	 * async buffer heads in use.
905	 */
906	free_more_memory();
907	goto try_again;
908}
909EXPORT_SYMBOL_GPL(alloc_page_buffers);
910
911static inline void
912link_dev_buffers(struct page *page, struct buffer_head *head)
913{
914	struct buffer_head *bh, *tail;
915
916	bh = head;
917	do {
918		tail = bh;
919		bh = bh->b_this_page;
920	} while (bh);
921	tail->b_this_page = head;
922	attach_page_buffers(page, head);
923}
924
925static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
926{
927	sector_t retval = ~((sector_t)0);
928	loff_t sz = i_size_read(bdev->bd_inode);
929
930	if (sz) {
931		unsigned int sizebits = blksize_bits(size);
932		retval = (sz >> sizebits);
933	}
934	return retval;
935}
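/*
 * Worked example (illustrative): for a 1 GiB device and 4096-byte blocks,
 * blksize_bits(4096) == 12, so blkdev_max_block() returns
 * (1 << 30) >> 12 == 262144, i.e. blocks 0..262143 are addressable.
 */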
936
937/*
938 * Initialise the state of a blockdev page's buffers.
939 */
940static sector_t
941init_page_buffers(struct page *page, struct block_device *bdev,
942			sector_t block, int size)
943{
944	struct buffer_head *head = page_buffers(page);
945	struct buffer_head *bh = head;
946	int uptodate = PageUptodate(page);
947	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
948
949	do {
950		if (!buffer_mapped(bh)) {
951			init_buffer(bh, NULL, NULL);
952			bh->b_bdev = bdev;
953			bh->b_blocknr = block;
954			if (uptodate)
955				set_buffer_uptodate(bh);
956			if (block < end_block)
957				set_buffer_mapped(bh);
958		}
959		block++;
960		bh = bh->b_this_page;
961	} while (bh != head);
962
963	/*
964	 * Caller needs to validate requested block against end of device.
965	 */
966	return end_block;
967}
968
969/*
970 * Create the page-cache page that contains the requested block.
971 *
972 * This is used purely for blockdev mappings.
973 */
974static int
975grow_dev_page(struct block_device *bdev, sector_t block,
976	      pgoff_t index, int size, int sizebits, gfp_t gfp)
977{
978	struct inode *inode = bdev->bd_inode;
979	struct page *page;
980	struct buffer_head *bh;
981	sector_t end_block;
982	int ret = 0;		/* Will call free_more_memory() */
983	gfp_t gfp_mask;
984
985	gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
986
987	/*
988	 * XXX: __getblk_slow() can not really deal with failure and
989	 * will endlessly loop on improvised global reclaim.  Prefer
990	 * looping in the allocator rather than here, at least that
991	 * code knows what it's doing.
992	 */
993	gfp_mask |= __GFP_NOFAIL;
994
995	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
996	if (!page)
997		return ret;
998
999	BUG_ON(!PageLocked(page));
1000
1001	if (page_has_buffers(page)) {
1002		bh = page_buffers(page);
1003		if (bh->b_size == size) {
1004			end_block = init_page_buffers(page, bdev,
1005						(sector_t)index << sizebits,
1006						size);
1007			goto done;
1008		}
1009		if (!try_to_free_buffers(page))
1010			goto failed;
1011	}
1012
1013	/*
1014	 * Allocate some buffers for this page
1015	 */
1016	bh = alloc_page_buffers(page, size, 0);
1017	if (!bh)
1018		goto failed;
1019
1020	/*
1021	 * Link the page to the buffers and initialise them.  Take the
1022	 * lock to be atomic wrt __find_get_block(), which does not
1023	 * run under the page lock.
1024	 */
1025	spin_lock(&inode->i_mapping->private_lock);
1026	link_dev_buffers(page, bh);
1027	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1028			size);
1029	spin_unlock(&inode->i_mapping->private_lock);
1030done:
1031	ret = (block < end_block) ? 1 : -ENXIO;
1032failed:
1033	unlock_page(page);
1034	page_cache_release(page);
1035	return ret;
1036}
1037
1038/*
1039 * Create buffers for the specified block device block's page.  If
1040 * that page was dirty, the buffers are set dirty also.
1041 */
1042static int
1043grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1044{
1045	pgoff_t index;
1046	int sizebits;
1047
1048	sizebits = -1;
1049	do {
1050		sizebits++;
1051	} while ((size << sizebits) < PAGE_SIZE);
1052
1053	index = block >> sizebits;
1054
1055	/*
1056	 * Check for a block which wants to lie outside our maximum possible
1057	 * pagecache index.  (this comparison is done using sector_t types).
1058	 */
1059	if (unlikely(index != block >> sizebits)) {
1060		char b[BDEVNAME_SIZE];
1061
1062		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1063			"device %s\n",
1064			__func__, (unsigned long long)block,
1065			bdevname(bdev, b));
1066		return -EIO;
1067	}
1068
1069	/* Create a page with the proper size buffers.. */
1070	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1071}
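/*
 * Worked example (illustrative): with 4096-byte pages and size == 1024, the
 * loop above yields sizebits == 2 (four blocks per page), so block 103 lives
 * in the page at index 103 >> 2 == 25 and is the fourth buffer on that page.
 */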
1072
1073struct buffer_head *
1074__getblk_slow(struct block_device *bdev, sector_t block,
1075	     unsigned size, gfp_t gfp)
1076{
1077	/* Size must be multiple of hard sectorsize */
1078	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1079			(size < 512 || size > PAGE_SIZE))) {
1080		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1081					size);
1082		printk(KERN_ERR "logical block size: %d\n",
1083					bdev_logical_block_size(bdev));
1084
1085		dump_stack();
1086		return NULL;
1087	}
1088
1089	for (;;) {
1090		struct buffer_head *bh;
1091		int ret;
1092
1093		bh = __find_get_block(bdev, block, size);
1094		if (bh)
1095			return bh;
1096
1097		ret = grow_buffers(bdev, block, size, gfp);
1098		if (ret < 0)
1099			return NULL;
1100		if (ret == 0)
1101			free_more_memory();
1102	}
1103}
1104EXPORT_SYMBOL(__getblk_slow);
1105
1106/*
1107 * The relationship between dirty buffers and dirty pages:
1108 *
1109 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1110 * the page is tagged dirty in its radix tree.
1111 *
1112 * At all times, the dirtiness of the buffers represents the dirtiness of
1113 * subsections of the page.  If the page has buffers, the page dirty bit is
1114 * merely a hint about the true dirty state.
1115 *
1116 * When a page is set dirty in its entirety, all its buffers are marked dirty
1117 * (if the page has buffers).
1118 *
1119 * When a buffer is marked dirty, its page is dirtied, but the page's other
1120 * buffers are not.
1121 *
1122 * Also.  When blockdev buffers are explicitly read with bread(), they
1123 * individually become uptodate.  But their backing page remains not
1124 * uptodate - even if all of its buffers are uptodate.  A subsequent
1125 * block_read_full_page() against that page will discover all the uptodate
1126 * buffers, will set the page uptodate and will perform no I/O.
1127 */
1128
1129/**
1130 * mark_buffer_dirty - mark a buffer_head as needing writeout
1131 * @bh: the buffer_head to mark dirty
1132 *
1133 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1134 * backing page dirty, then tag the page as dirty in its address_space's radix
1135 * tree and then attach the address_space's inode to its superblock's dirty
1136 * inode list.
1137 *
1138 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1139 * mapping->tree_lock and mapping->host->i_lock.
1140 */
1141void mark_buffer_dirty(struct buffer_head *bh)
1142{
1143	WARN_ON_ONCE(!buffer_uptodate(bh));
1144
1145	trace_block_dirty_buffer(bh);
1146
1147	/*
1148	 * Very *carefully* optimize the it-is-already-dirty case.
1149	 *
1150	 * Don't let the final "is it dirty" escape to before we
1151	 * perhaps modified the buffer.
1152	 */
1153	if (buffer_dirty(bh)) {
1154		smp_mb();
1155		if (buffer_dirty(bh))
1156			return;
1157	}
1158
1159	if (!test_set_buffer_dirty(bh)) {
1160		struct page *page = bh->b_page;
1161		if (!TestSetPageDirty(page)) {
1162			struct address_space *mapping = page_mapping(page);
1163			if (mapping)
1164				__set_page_dirty(page, mapping, 0);
1165		}
1166	}
1167}
1168EXPORT_SYMBOL(mark_buffer_dirty);
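/*
 * Illustrative sketch (assumed caller, not code from this file; error
 * handling omitted): the usual pattern is to bring the block in, modify it
 * under the buffer lock, then mark it dirty so writeback picks it up:
 *
 *	struct buffer_head *bh = __getblk(bdev, block_nr, blocksize);
 *	lock_buffer(bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	mark_buffer_dirty(bh);
 *	unlock_buffer(bh);
 *	brelse(bh);
 */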
1169
1170/*
1171 * Decrement a buffer_head's reference count.  If all buffers against a page
1172 * have zero reference count, are clean and unlocked, and if the page is clean
1173 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1174 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1175 * a page but it ends up not being freed, and buffers may later be reattached).
1176 */
1177void __brelse(struct buffer_head * buf)
1178{
1179	if (atomic_read(&buf->b_count)) {
1180		put_bh(buf);
1181		return;
1182	}
1183	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1184}
1185EXPORT_SYMBOL(__brelse);
1186
1187/*
1188 * bforget() is like brelse(), except it discards any
1189 * potentially dirty data.
1190 */
1191void __bforget(struct buffer_head *bh)
1192{
1193	clear_buffer_dirty(bh);
1194	if (bh->b_assoc_map) {
1195		struct address_space *buffer_mapping = bh->b_page->mapping;
1196
1197		spin_lock(&buffer_mapping->private_lock);
1198		list_del_init(&bh->b_assoc_buffers);
1199		bh->b_assoc_map = NULL;
1200		spin_unlock(&buffer_mapping->private_lock);
1201	}
1202	__brelse(bh);
1203}
1204EXPORT_SYMBOL(__bforget);
1205
1206static struct buffer_head *__bread_slow(struct buffer_head *bh)
1207{
1208	lock_buffer(bh);
1209	if (buffer_uptodate(bh)) {
1210		unlock_buffer(bh);
1211		return bh;
1212	} else {
1213		get_bh(bh);
1214		bh->b_end_io = end_buffer_read_sync;
1215		submit_bh(READ, bh);
1216		wait_on_buffer(bh);
1217		if (buffer_uptodate(bh))
1218			return bh;
1219	}
1220	brelse(bh);
1221	return NULL;
1222}
1223
1224/*
1225 * Per-cpu buffer LRU implementation, used to reduce the cost of __find_get_block().
1226 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1227 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1228 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1229 * CPU's LRUs at the same time.
1230 *
1231 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1232 * sb_find_get_block().
1233 *
1234 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1235 * a local interrupt disable for that.
1236 */
1237
1238#define BH_LRU_SIZE	16
1239
1240struct bh_lru {
1241	struct buffer_head *bhs[BH_LRU_SIZE];
1242};
1243
1244static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1245
1246#ifdef CONFIG_SMP
1247#define bh_lru_lock()	local_irq_disable()
1248#define bh_lru_unlock()	local_irq_enable()
1249#else
1250#define bh_lru_lock()	preempt_disable()
1251#define bh_lru_unlock()	preempt_enable()
1252#endif
1253
1254static inline void check_irqs_on(void)
1255{
1256#ifdef irqs_disabled
1257	BUG_ON(irqs_disabled());
1258#endif
1259}
1260
1261/*
1262 * The LRU management algorithm is dopey-but-simple.  Sorry.
1263 */
1264static void bh_lru_install(struct buffer_head *bh)
1265{
1266	struct buffer_head *evictee = NULL;
1267
1268	check_irqs_on();
1269	bh_lru_lock();
1270	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1271		struct buffer_head *bhs[BH_LRU_SIZE];
1272		int in;
1273		int out = 0;
1274
1275		get_bh(bh);
1276		bhs[out++] = bh;
1277		for (in = 0; in < BH_LRU_SIZE; in++) {
1278			struct buffer_head *bh2 =
1279				__this_cpu_read(bh_lrus.bhs[in]);
1280
1281			if (bh2 == bh) {
1282				__brelse(bh2);
1283			} else {
1284				if (out >= BH_LRU_SIZE) {
1285					BUG_ON(evictee != NULL);
1286					evictee = bh2;
1287				} else {
1288					bhs[out++] = bh2;
1289				}
1290			}
1291		}
1292		while (out < BH_LRU_SIZE)
1293			bhs[out++] = NULL;
1294		memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1295	}
1296	bh_lru_unlock();
1297
1298	if (evictee)
1299		__brelse(evictee);
1300}
1301
1302/*
1303 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1304 */
1305static struct buffer_head *
1306lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1307{
1308	struct buffer_head *ret = NULL;
1309	unsigned int i;
1310
1311	check_irqs_on();
1312	bh_lru_lock();
1313	for (i = 0; i < BH_LRU_SIZE; i++) {
1314		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1315
1316		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1317		    bh->b_size == size) {
1318			if (i) {
1319				while (i) {
1320					__this_cpu_write(bh_lrus.bhs[i],
1321						__this_cpu_read(bh_lrus.bhs[i - 1]));
1322					i--;
1323				}
1324				__this_cpu_write(bh_lrus.bhs[0], bh);
1325			}
1326			get_bh(bh);
1327			ret = bh;
1328			break;
1329		}
1330	}
1331	bh_lru_unlock();
1332	return ret;
1333}
1334
1335/*
1336 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1337 * it in the LRU and mark it as accessed.  If it is not present then return
1338 * NULL.
1339 */
1340struct buffer_head *
1341__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1342{
1343	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1344
1345	if (bh == NULL) {
1346		/* __find_get_block_slow will mark the page accessed */
1347		bh = __find_get_block_slow(bdev, block);
1348		if (bh)
1349			bh_lru_install(bh);
1350	} else
1351		touch_buffer(bh);
1352
1353	return bh;
1354}
1355EXPORT_SYMBOL(__find_get_block);
1356
1357/*
1358 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1359 * which corresponds to the passed block_device, block and size. The
1360 * returned buffer has its reference count incremented.
1361 *
1362 * __getblk_gfp() will lock up the machine if grow_dev_page's
1363 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1364 */
1365struct buffer_head *
1366__getblk_gfp(struct block_device *bdev, sector_t block,
1367	     unsigned size, gfp_t gfp)
1368{
1369	struct buffer_head *bh = __find_get_block(bdev, block, size);
1370
1371	might_sleep();
1372	if (bh == NULL)
1373		bh = __getblk_slow(bdev, block, size, gfp);
1374	return bh;
1375}
1376EXPORT_SYMBOL(__getblk_gfp);
1377
1378/*
1379 * Do async read-ahead on a buffer..
1380 */
1381void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1382{
1383	struct buffer_head *bh = __getblk(bdev, block, size);
1384	if (likely(bh)) {
1385		ll_rw_block(READA, 1, &bh);
1386		brelse(bh);
1387	}
1388}
1389EXPORT_SYMBOL(__breadahead);
1390
1391/**
1392 *  __bread_gfp() - reads a specified block and returns the bh
1393 *  @bdev: the block_device to read from
1394 *  @block: number of block
1395 *  @size: size (in bytes) to read
1396 *  @gfp: page allocation flag
1397 *
1398 *  Reads a specified block and returns the buffer_head that contains it.
1399 *  If @gfp is zero, the page cache page can be allocated from the
1400 *  non-movable area so that it does not interfere with page migration.
1401 *  Returns NULL if the block was unreadable.
1402 */
1403struct buffer_head *
1404__bread_gfp(struct block_device *bdev, sector_t block,
1405		   unsigned size, gfp_t gfp)
1406{
1407	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1408
1409	if (likely(bh) && !buffer_uptodate(bh))
1410		bh = __bread_slow(bh);
1411	return bh;
1412}
1413EXPORT_SYMBOL(__bread_gfp);
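/*
 * Illustrative usage (assumed caller, not code from this file): read one
 * 1024-byte block synchronously via the __bread() wrapper and release it
 * when done:
 *
 *	struct buffer_head *bh = __bread(bdev, block_nr, 1024);
 *	if (bh) {
 *		memcpy(copy, bh->b_data, bh->b_size);	(copy: caller's buffer)
 *		brelse(bh);
 *	}
 */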
1414
1415/*
1416 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1417 * This doesn't race because it runs in each cpu either in irq
1418 * or with preempt disabled.
1419 */
1420static void invalidate_bh_lru(void *arg)
1421{
1422	struct bh_lru *b = &get_cpu_var(bh_lrus);
1423	int i;
1424
1425	for (i = 0; i < BH_LRU_SIZE; i++) {
1426		brelse(b->bhs[i]);
1427		b->bhs[i] = NULL;
1428	}
1429	put_cpu_var(bh_lrus);
1430}
1431
1432static bool has_bh_in_lru(int cpu, void *dummy)
1433{
1434	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1435	int i;
1436
1437	for (i = 0; i < BH_LRU_SIZE; i++) {
1438		if (b->bhs[i])
1439			return 1;
1440	}
1441
1442	return 0;
1443}
1444
1445void invalidate_bh_lrus(void)
1446{
1447	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1448}
1449EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1450
1451void set_bh_page(struct buffer_head *bh,
1452		struct page *page, unsigned long offset)
1453{
1454	bh->b_page = page;
1455	BUG_ON(offset >= PAGE_SIZE);
1456	if (PageHighMem(page))
1457		/*
1458		 * This catches illegal uses and preserves the offset:
1459		 */
1460		bh->b_data = (char *)(0 + offset);
1461	else
1462		bh->b_data = page_address(page) + offset;
1463}
1464EXPORT_SYMBOL(set_bh_page);
1465
1466/*
1467 * Called when truncating a buffer on a page completely.
1468 */
1469
1470/* Bits that are cleared during an invalidate */
1471#define BUFFER_FLAGS_DISCARD \
1472	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1473	 1 << BH_Delay | 1 << BH_Unwritten)
1474
1475static void discard_buffer(struct buffer_head * bh)
1476{
1477	unsigned long b_state, b_state_old;
1478
1479	lock_buffer(bh);
1480	clear_buffer_dirty(bh);
1481	bh->b_bdev = NULL;
1482	b_state = bh->b_state;
1483	for (;;) {
1484		b_state_old = cmpxchg(&bh->b_state, b_state,
1485				      (b_state & ~BUFFER_FLAGS_DISCARD));
1486		if (b_state_old == b_state)
1487			break;
1488		b_state = b_state_old;
1489	}
1490	unlock_buffer(bh);
1491}
1492
1493/**
1494 * block_invalidatepage - invalidate part or all of a buffer-backed page
1495 *
1496 * @page: the page which is affected
1497 * @offset: start of the range to invalidate
1498 * @length: length of the range to invalidate
1499 *
1500 * block_invalidatepage() is called when all or part of the page has become
1501 * invalidated by a truncate operation.
1502 *
1503 * block_invalidatepage() does not have to release all buffers, but it must
1504 * ensure that no dirty buffer is left outside @offset and that no I/O
1505 * is underway against any of the blocks which are outside the truncation
1506 * point, because the caller is about to free (and possibly reuse) those
1507 * blocks on-disk.
1508 */
1509void block_invalidatepage(struct page *page, unsigned int offset,
1510			  unsigned int length)
1511{
1512	struct buffer_head *head, *bh, *next;
1513	unsigned int curr_off = 0;
1514	unsigned int stop = length + offset;
1515
1516	BUG_ON(!PageLocked(page));
1517	if (!page_has_buffers(page))
1518		goto out;
1519
1520	/*
1521	 * Check for overflow
1522	 */
1523	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1524
1525	head = page_buffers(page);
1526	bh = head;
1527	do {
1528		unsigned int next_off = curr_off + bh->b_size;
1529		next = bh->b_this_page;
1530
1531		/*
1532		 * Are we still fully in range ?
1533		 */
1534		if (next_off > stop)
1535			goto out;
1536
1537		/*
1538		 * is this block fully invalidated?
1539		 */
1540		if (offset <= curr_off)
1541			discard_buffer(bh);
1542		curr_off = next_off;
1543		bh = next;
1544	} while (bh != head);
1545
1546	/*
1547	 * We release buffers only if the entire page is being invalidated.
1548	 * The get_block cached value has been unconditionally invalidated,
1549	 * so real IO is not possible anymore.
1550	 */
1551	if (offset == 0)
1552		try_to_release_page(page, 0);
1553out:
1554	return;
1555}
1556EXPORT_SYMBOL(block_invalidatepage);
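/*
 * Illustrative sketch (assumed filesystem glue, not code from this file):
 * an address_space_operations ->invalidatepage handler for a plain
 * buffer-backed filesystem can simply forward to this helper:
 *
 *	static void demo_invalidatepage(struct page *page, unsigned int offset,
 *					unsigned int length)
 *	{
 *		block_invalidatepage(page, offset, length);
 *	}
 */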
1557
1558
1559/*
1560 * We attach and possibly dirty the buffers atomically wrt
1561 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1562 * is already excluded via the page lock.
1563 */
1564void create_empty_buffers(struct page *page,
1565			unsigned long blocksize, unsigned long b_state)
1566{
1567	struct buffer_head *bh, *head, *tail;
1568
1569	head = alloc_page_buffers(page, blocksize, 1);
1570	bh = head;
1571	do {
1572		bh->b_state |= b_state;
1573		tail = bh;
1574		bh = bh->b_this_page;
1575	} while (bh);
1576	tail->b_this_page = head;
1577
1578	spin_lock(&page->mapping->private_lock);
1579	if (PageUptodate(page) || PageDirty(page)) {
1580		bh = head;
1581		do {
1582			if (PageDirty(page))
1583				set_buffer_dirty(bh);
1584			if (PageUptodate(page))
1585				set_buffer_uptodate(bh);
1586			bh = bh->b_this_page;
1587		} while (bh != head);
1588	}
1589	attach_page_buffers(page, head);
1590	spin_unlock(&page->mapping->private_lock);
1591}
1592EXPORT_SYMBOL(create_empty_buffers);
1593
1594/*
1595 * We are taking a block for data and we don't want any output from any
1596 * buffer-cache aliases from the moment this function returns until
1597 * something explicitly marks the buffer
1598 * dirty (hopefully that will not happen until we free that block ;-)
1599 * We don't even need to mark it not-uptodate - nobody can expect
1600 * anything from a newly allocated buffer anyway. We used to use
1601 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1602 * don't want to mark the alias unmapped, for example - it would confuse
1603 * anyone who might pick it with bread() afterwards...
1604 *
1605 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1606 * be writeout I/O going on against recently-freed buffers.  We don't
1607 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1608 * only if we really need to.  That happens here.
1609 */
1610void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1611{
1612	struct buffer_head *old_bh;
1613
1614	might_sleep();
1615
1616	old_bh = __find_get_block_slow(bdev, block);
1617	if (old_bh) {
1618		clear_buffer_dirty(old_bh);
1619		wait_on_buffer(old_bh);
1620		clear_buffer_req(old_bh);
1621		__brelse(old_bh);
1622	}
1623}
1624EXPORT_SYMBOL(unmap_underlying_metadata);
1625
1626/*
1627 * Size is a power-of-two in the range 512..PAGE_SIZE,
1628 * and the case we care about most is PAGE_SIZE.
1629 *
1630 * So this *could* possibly be written with those
1631 * constraints in mind (relevant mostly if some
1632 * architecture has a slow bit-scan instruction)
1633 */
1634static inline int block_size_bits(unsigned int blocksize)
1635{
1636	return ilog2(blocksize);
1637}
1638
1639static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1640{
1641	BUG_ON(!PageLocked(page));
1642
1643	if (!page_has_buffers(page))
1644		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1645	return page_buffers(page);
1646}
1647
1648/*
1649 * NOTE! All mapped/uptodate combinations are valid:
1650 *
1651 *	Mapped	Uptodate	Meaning
1652 *
1653 *	No	No		"unknown" - must do get_block()
1654 *	No	Yes		"hole" - zero-filled
1655 *	Yes	No		"allocated" - allocated on disk, not read in
1656 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1657 *
1658 * "Dirty" is valid only with the last case (mapped+uptodate).
1659 */
1660
1661/*
1662 * While block_write_full_page is writing back the dirty buffers under
1663 * the page lock, whoever dirtied the buffers may decide to clean them
1664 * again at any time.  We handle that by only looking at the buffer
1665 * state inside lock_buffer().
1666 *
1667 * If block_write_full_page() is called for regular writeback
1668 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1669 * locked buffer.  This can only happen if someone has written the buffer
1670 * directly, with submit_bh().  At the address_space level PageWriteback
1671 * prevents this contention from occurring.
1672 *
1673 * If block_write_full_page() is called with wbc->sync_mode ==
1674 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1675 * causes the writes to be flagged as synchronous writes.
1676 */
1677static int __block_write_full_page(struct inode *inode, struct page *page,
1678			get_block_t *get_block, struct writeback_control *wbc,
1679			bh_end_io_t *handler)
1680{
1681	int err;
1682	sector_t block;
1683	sector_t last_block;
1684	struct buffer_head *bh, *head;
1685	unsigned int blocksize, bbits;
1686	int nr_underway = 0;
1687	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1688			WRITE_SYNC : WRITE);
1689
1690	head = create_page_buffers(page, inode,
1691					(1 << BH_Dirty)|(1 << BH_Uptodate));
1692
1693	/*
1694	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1695	 * here, and the (potentially unmapped) buffers may become dirty at
1696	 * any time.  If a buffer becomes dirty here after we've inspected it
1697	 * then we just miss that fact, and the page stays dirty.
1698	 *
1699	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1700	 * handle that here by just cleaning them.
1701	 */
1702
1703	bh = head;
1704	blocksize = bh->b_size;
1705	bbits = block_size_bits(blocksize);
1706
1707	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1708	last_block = (i_size_read(inode) - 1) >> bbits;
1709
1710	/*
1711	 * Get all the dirty buffers mapped to disk addresses and
1712	 * handle any aliases from the underlying blockdev's mapping.
1713	 */
1714	do {
1715		if (block > last_block) {
1716			/*
1717			 * mapped buffers outside i_size will occur, because
1718			 * this page can be outside i_size when there is a
1719			 * truncate in progress.
1720			 */
1721			/*
1722			 * The buffer was zeroed by block_write_full_page()
1723			 */
1724			clear_buffer_dirty(bh);
1725			set_buffer_uptodate(bh);
1726		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1727			   buffer_dirty(bh)) {
1728			WARN_ON(bh->b_size != blocksize);
1729			err = get_block(inode, block, bh, 1);
1730			if (err)
1731				goto recover;
1732			clear_buffer_delay(bh);
1733			if (buffer_new(bh)) {
1734				/* blockdev mappings never come here */
1735				clear_buffer_new(bh);
1736				unmap_underlying_metadata(bh->b_bdev,
1737							bh->b_blocknr);
1738			}
1739		}
1740		bh = bh->b_this_page;
1741		block++;
1742	} while (bh != head);
1743
1744	do {
1745		if (!buffer_mapped(bh))
1746			continue;
1747		/*
1748		 * If it's a fully non-blocking write attempt and we cannot
1749		 * lock the buffer then redirty the page.  Note that this can
1750		 * potentially cause a busy-wait loop from writeback threads
1751		 * and kswapd activity, but those code paths have their own
1752		 * higher-level throttling.
1753		 */
1754		if (wbc->sync_mode != WB_SYNC_NONE) {
1755			lock_buffer(bh);
1756		} else if (!trylock_buffer(bh)) {
1757			redirty_page_for_writepage(wbc, page);
1758			continue;
1759		}
1760		if (test_clear_buffer_dirty(bh)) {
1761			mark_buffer_async_write_endio(bh, handler);
1762		} else {
1763			unlock_buffer(bh);
1764		}
1765	} while ((bh = bh->b_this_page) != head);
1766
1767	/*
1768	 * The page and its buffers are protected by PageWriteback(), so we can
1769	 * drop the bh refcounts early.
1770	 */
1771	BUG_ON(PageWriteback(page));
1772	set_page_writeback(page);
1773
1774	do {
1775		struct buffer_head *next = bh->b_this_page;
1776		if (buffer_async_write(bh)) {
1777			submit_bh(write_op, bh);
1778			nr_underway++;
1779		}
1780		bh = next;
1781	} while (bh != head);
1782	unlock_page(page);
1783
1784	err = 0;
1785done:
1786	if (nr_underway == 0) {
1787		/*
1788		 * The page was marked dirty, but the buffers were
1789		 * clean.  Someone wrote them back by hand with
1790		 * ll_rw_block/submit_bh.  A rare case.
1791		 */
1792		end_page_writeback(page);
1793
1794		/*
1795		 * The page and buffer_heads can be released at any time from
1796		 * here on.
1797		 */
1798	}
1799	return err;
1800
1801recover:
1802	/*
1803	 * ENOSPC, or some other error.  We may already have added some
1804	 * blocks to the file, so we need to write these out to avoid
1805	 * exposing stale data.
1806	 * The page is currently locked and not marked for writeback
1807	 */
1808	bh = head;
1809	/* Recovery: lock and submit the mapped buffers */
1810	do {
1811		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1812		    !buffer_delay(bh)) {
1813			lock_buffer(bh);
1814			mark_buffer_async_write_endio(bh, handler);
1815		} else {
1816			/*
1817			 * The buffer may have been set dirty during
1818			 * attachment to a dirty page.
1819			 */
1820			clear_buffer_dirty(bh);
1821		}
1822	} while ((bh = bh->b_this_page) != head);
1823	SetPageError(page);
1824	BUG_ON(PageWriteback(page));
1825	mapping_set_error(page->mapping, err);
1826	set_page_writeback(page);
1827	do {
1828		struct buffer_head *next = bh->b_this_page;
1829		if (buffer_async_write(bh)) {
1830			clear_buffer_dirty(bh);
1831			submit_bh(write_op, bh);
1832			nr_underway++;
1833		}
1834		bh = next;
1835	} while (bh != head);
1836	unlock_page(page);
1837	goto done;
1838}
1839
1840/*
1841 * If a page has any new buffers, zero them out here, and mark them uptodate
1842 * and dirty so they'll be written out (in order to prevent uninitialised
1843 * block data from leaking). And clear the new bit.
1844 */
1845void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1846{
1847	unsigned int block_start, block_end;
1848	struct buffer_head *head, *bh;
1849
1850	BUG_ON(!PageLocked(page));
1851	if (!page_has_buffers(page))
1852		return;
1853
1854	bh = head = page_buffers(page);
1855	block_start = 0;
1856	do {
1857		block_end = block_start + bh->b_size;
1858
1859		if (buffer_new(bh)) {
1860			if (block_end > from && block_start < to) {
1861				if (!PageUptodate(page)) {
1862					unsigned start, size;
1863
1864					start = max(from, block_start);
1865					size = min(to, block_end) - start;
1866
1867					zero_user(page, start, size);
1868					set_buffer_uptodate(bh);
1869				}
1870
1871				clear_buffer_new(bh);
1872				mark_buffer_dirty(bh);
1873			}
1874		}
1875
1876		block_start = block_end;
1877		bh = bh->b_this_page;
1878	} while (bh != head);
1879}
1880EXPORT_SYMBOL(page_zero_new_buffers);
1881
1882int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1883		get_block_t *get_block)
1884{
1885	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1886	unsigned to = from + len;
1887	struct inode *inode = page->mapping->host;
1888	unsigned block_start, block_end;
1889	sector_t block;
1890	int err = 0;
1891	unsigned blocksize, bbits;
1892	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1893
1894	BUG_ON(!PageLocked(page));
1895	BUG_ON(from > PAGE_CACHE_SIZE);
1896	BUG_ON(to > PAGE_CACHE_SIZE);
1897	BUG_ON(from > to);
1898
1899	head = create_page_buffers(page, inode, 0);
1900	blocksize = head->b_size;
1901	bbits = block_size_bits(blocksize);
1902
1903	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1904
1905	for(bh = head, block_start = 0; bh != head || !block_start;
1906	    block++, block_start=block_end, bh = bh->b_this_page) {
1907		block_end = block_start + blocksize;
1908		if (block_end <= from || block_start >= to) {
1909			if (PageUptodate(page)) {
1910				if (!buffer_uptodate(bh))
1911					set_buffer_uptodate(bh);
1912			}
1913			continue;
1914		}
1915		if (buffer_new(bh))
1916			clear_buffer_new(bh);
1917		if (!buffer_mapped(bh)) {
1918			WARN_ON(bh->b_size != blocksize);
1919			err = get_block(inode, block, bh, 1);
1920			if (err)
1921				break;
1922			if (buffer_new(bh)) {
1923				unmap_underlying_metadata(bh->b_bdev,
1924							bh->b_blocknr);
1925				if (PageUptodate(page)) {
1926					clear_buffer_new(bh);
1927					set_buffer_uptodate(bh);
1928					mark_buffer_dirty(bh);
1929					continue;
1930				}
1931				if (block_end > to || block_start < from)
1932					zero_user_segments(page,
1933						to, block_end,
1934						block_start, from);
1935				continue;
1936			}
1937		}
1938		if (PageUptodate(page)) {
1939			if (!buffer_uptodate(bh))
1940				set_buffer_uptodate(bh);
1941			continue;
1942		}
1943		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1944		    !buffer_unwritten(bh) &&
1945		     (block_start < from || block_end > to)) {
1946			ll_rw_block(READ, 1, &bh);
1947			*wait_bh++=bh;
1948		}
1949	}
1950	/*
1951	 * If we issued read requests - let them complete.
1952	 */
1953	while(wait_bh > wait) {
1954		wait_on_buffer(*--wait_bh);
1955		if (!buffer_uptodate(*wait_bh))
1956			err = -EIO;
1957	}
1958	if (unlikely(err))
1959		page_zero_new_buffers(page, from, to);
1960	return err;
1961}
1962EXPORT_SYMBOL(__block_write_begin);
1963
1964static int __block_commit_write(struct inode *inode, struct page *page,
1965		unsigned from, unsigned to)
1966{
1967	unsigned block_start, block_end;
1968	int partial = 0;
1969	unsigned blocksize;
1970	struct buffer_head *bh, *head;
1971
1972	bh = head = page_buffers(page);
1973	blocksize = bh->b_size;
1974
1975	block_start = 0;
1976	do {
1977		block_end = block_start + blocksize;
1978		if (block_end <= from || block_start >= to) {
1979			if (!buffer_uptodate(bh))
1980				partial = 1;
1981		} else {
1982			set_buffer_uptodate(bh);
1983			mark_buffer_dirty(bh);
1984		}
1985		clear_buffer_new(bh);
1986
1987		block_start = block_end;
1988		bh = bh->b_this_page;
1989	} while (bh != head);
1990
1991	/*
1992	 * If this is a partial write which happened to make all buffers
1993	 * uptodate then we can optimize away a bogus readpage() for
1994	 * the next read(). Here we 'discover' whether the page went
1995	 * uptodate as a result of this (potentially partial) write.
1996	 */
1997	if (!partial)
1998		SetPageUptodate(page);
1999	return 0;
2000}
2001
2002/*
2003 * block_write_begin takes care of the basic task of block allocation and
2004 * bringing partial write blocks uptodate first.
2005 *
2006 * The filesystem needs to handle block truncation upon failure.
2007 */
2008int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2009		unsigned flags, struct page **pagep, get_block_t *get_block)
2010{
2011	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2012	struct page *page;
2013	int status;
2014
2015	page = grab_cache_page_write_begin(mapping, index, flags);
2016	if (!page)
2017		return -ENOMEM;
2018
2019	status = __block_write_begin(page, pos, len, get_block);
2020	if (unlikely(status)) {
2021		unlock_page(page);
2022		page_cache_release(page);
2023		page = NULL;
2024	}
2025
2026	*pagep = page;
2027	return status;
2028}
2029EXPORT_SYMBOL(block_write_begin);
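
/*
 * A minimal usage sketch (assuming a hypothetical filesystem "myfs" whose
 * get_block_t is myfs_get_block): ->write_begin can simply delegate here.
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return block_write_begin(mapping, pos, len, flags, pagep,
 *					 myfs_get_block);
 *	}
 *
 * As noted above, on failure the filesystem must still truncate away any
 * blocks that were allocated beyond i_size.
 */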
2030
2031int block_write_end(struct file *file, struct address_space *mapping,
2032			loff_t pos, unsigned len, unsigned copied,
2033			struct page *page, void *fsdata)
2034{
2035	struct inode *inode = mapping->host;
2036	unsigned start;
2037
2038	start = pos & (PAGE_CACHE_SIZE - 1);
2039
2040	if (unlikely(copied < len)) {
2041		/*
2042		 * The buffers that were written will now be uptodate, so we
2043		 * don't have to worry about a readpage reading them and
2044		 * overwriting a partial write. However if we have encountered
2045		 * a short write and only partially written into a buffer, it
2046		 * will not be marked uptodate, so a readpage might come in and
2047		 * destroy our partial write.
2048		 *
2049		 * Do the simplest thing, and just treat any short write to a
2050		 * non uptodate page as a zero-length write, and force the
2051		 * caller to redo the whole thing.
2052		 */
2053		if (!PageUptodate(page))
2054			copied = 0;
2055
2056		page_zero_new_buffers(page, start+copied, start+len);
2057	}
2058	flush_dcache_page(page);
2059
2060	/* This could be a short (even 0-length) commit */
2061	__block_commit_write(inode, page, start, start+copied);
2062
2063	return copied;
2064}
2065EXPORT_SYMBOL(block_write_end);
2066
2067int generic_write_end(struct file *file, struct address_space *mapping,
2068			loff_t pos, unsigned len, unsigned copied,
2069			struct page *page, void *fsdata)
2070{
2071	struct inode *inode = mapping->host;
2072	loff_t old_size = inode->i_size;
2073	int i_size_changed = 0;
2074
2075	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2076
2077	/*
2078	 * No need to use i_size_read() here: the i_size
2079	 * cannot change under us because we hold i_mutex.
2080	 *
2081	 * But it's important to update i_size while still holding page lock:
2082	 * page writeout could otherwise come in and zero beyond i_size.
2083	 */
2084	if (pos+copied > inode->i_size) {
2085		i_size_write(inode, pos+copied);
2086		i_size_changed = 1;
2087	}
2088
2089	unlock_page(page);
2090	page_cache_release(page);
2091
2092	if (old_size < pos)
2093		pagecache_isize_extended(inode, old_size, pos);
2094	/*
2095	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2096	 * makes the holding time of page lock longer. Second, it forces lock
2097	 * ordering of page lock and transaction start for journaling
2098	 * filesystems.
2099	 */
2100	if (i_size_changed)
2101		mark_inode_dirty(inode);
2102
2103	return copied;
2104}
2105EXPORT_SYMBOL(generic_write_end);
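
/*
 * A sketch of how the write path is typically wired up (the myfs_* names are
 * hypothetical): when ->write_begin delegates to block_write_begin(),
 * generic_write_end() can be plugged in directly as the ->write_end method.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.write_begin	= myfs_write_begin,
 *		.write_end	= generic_write_end,
 *	};
 */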
2106
2107/*
2108 * block_is_partially_uptodate checks whether buffers within a page are
2109 * uptodate or not.
2110 *
2111 * Returns true if all buffers which correspond to the portion of the file
2112 * we want to read are uptodate.
2113 */
2114int block_is_partially_uptodate(struct page *page, unsigned long from,
2115					unsigned long count)
2116{
2117	unsigned block_start, block_end, blocksize;
2118	unsigned to;
2119	struct buffer_head *bh, *head;
2120	int ret = 1;
2121
2122	if (!page_has_buffers(page))
2123		return 0;
2124
2125	head = page_buffers(page);
2126	blocksize = head->b_size;
2127	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
2128	to = from + to;
2129	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2130		return 0;
2131
2132	bh = head;
2133	block_start = 0;
2134	do {
2135		block_end = block_start + blocksize;
2136		if (block_end > from && block_start < to) {
2137			if (!buffer_uptodate(bh)) {
2138				ret = 0;
2139				break;
2140			}
2141			if (block_end >= to)
2142				break;
2143		}
2144		block_start = block_end;
2145		bh = bh->b_this_page;
2146	} while (bh != head);
2147
2148	return ret;
2149}
2150EXPORT_SYMBOL(block_is_partially_uptodate);
2151
2152/*
2153 * Generic "read page" function for block devices that have the normal
2154 * get_block functionality. This is most of the block device filesystems.
2155 * Reads the page asynchronously --- the unlock_buffer() and
2156 * set/clear_buffer_uptodate() functions propagate buffer state into the
2157 * page struct once IO has completed.
2158 */
2159int block_read_full_page(struct page *page, get_block_t *get_block)
2160{
2161	struct inode *inode = page->mapping->host;
2162	sector_t iblock, lblock;
2163	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2164	unsigned int blocksize, bbits;
2165	int nr, i;
2166	int fully_mapped = 1;
2167
2168	head = create_page_buffers(page, inode, 0);
2169	blocksize = head->b_size;
2170	bbits = block_size_bits(blocksize);
2171
2172	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2173	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2174	bh = head;
2175	nr = 0;
2176	i = 0;
2177
2178	do {
2179		if (buffer_uptodate(bh))
2180			continue;
2181
2182		if (!buffer_mapped(bh)) {
2183			int err = 0;
2184
2185			fully_mapped = 0;
2186			if (iblock < lblock) {
2187				WARN_ON(bh->b_size != blocksize);
2188				err = get_block(inode, iblock, bh, 0);
2189				if (err)
2190					SetPageError(page);
2191			}
2192			if (!buffer_mapped(bh)) {
2193				zero_user(page, i * blocksize, blocksize);
2194				if (!err)
2195					set_buffer_uptodate(bh);
2196				continue;
2197			}
2198			/*
2199			 * get_block() might have updated the buffer
2200			 * synchronously
2201			 */
2202			if (buffer_uptodate(bh))
2203				continue;
2204		}
2205		arr[nr++] = bh;
2206	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2207
2208	if (fully_mapped)
2209		SetPageMappedToDisk(page);
2210
2211	if (!nr) {
2212		/*
2213		 * All buffers are uptodate - we can set the page uptodate
2214		 * as well. But not if get_block() returned an error.
2215		 */
2216		if (!PageError(page))
2217			SetPageUptodate(page);
2218		unlock_page(page);
2219		return 0;
2220	}
2221
2222	/* Stage two: lock the buffers */
2223	for (i = 0; i < nr; i++) {
2224		bh = arr[i];
2225		lock_buffer(bh);
2226		mark_buffer_async_read(bh);
2227	}
2228
2229	/*
2230	 * Stage 3: start the IO.  Check for uptodateness
2231	 * inside the buffer lock in case another process reading
2232	 * the underlying blockdev brought it uptodate (the sct fix).
2233	 */
2234	for (i = 0; i < nr; i++) {
2235		bh = arr[i];
2236		if (buffer_uptodate(bh))
2237			end_buffer_async_read(bh, 1);
2238		else
2239			submit_bh(READ, bh);
2240	}
2241	return 0;
2242}
2243EXPORT_SYMBOL(block_read_full_page);
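
/*
 * A minimal ->readpage sketch (myfs_get_block is a hypothetical get_block_t):
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, myfs_get_block);
 *	}
 */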
2244
2245/* utility function for filesystems that need to do work on expanding
2246 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2247 * deal with the hole.
2248 */
2249int generic_cont_expand_simple(struct inode *inode, loff_t size)
2250{
2251	struct address_space *mapping = inode->i_mapping;
2252	struct page *page;
2253	void *fsdata;
2254	int err;
2255
2256	err = inode_newsize_ok(inode, size);
2257	if (err)
2258		goto out;
2259
2260	err = pagecache_write_begin(NULL, mapping, size, 0,
2261				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2262				&page, &fsdata);
2263	if (err)
2264		goto out;
2265
2266	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2267	BUG_ON(err > 0);
2268
2269out:
2270	return err;
2271}
2272EXPORT_SYMBOL(generic_cont_expand_simple);
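
/*
 * A hedged usage sketch: a filesystem that cannot represent holes might call
 * this from its ->setattr when a file is being extended (the surrounding
 * myfs_setattr context is assumed, not shown).
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
 *		error = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (error)
 *			return error;
 *	}
 */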
2273
2274static int cont_expand_zero(struct file *file, struct address_space *mapping,
2275			    loff_t pos, loff_t *bytes)
2276{
2277	struct inode *inode = mapping->host;
2278	unsigned blocksize = 1 << inode->i_blkbits;
2279	struct page *page;
2280	void *fsdata;
2281	pgoff_t index, curidx;
2282	loff_t curpos;
2283	unsigned zerofrom, offset, len;
2284	int err = 0;
2285
2286	index = pos >> PAGE_CACHE_SHIFT;
2287	offset = pos & ~PAGE_CACHE_MASK;
2288
2289	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2290		zerofrom = curpos & ~PAGE_CACHE_MASK;
2291		if (zerofrom & (blocksize-1)) {
2292			*bytes |= (blocksize-1);
2293			(*bytes)++;
2294		}
2295		len = PAGE_CACHE_SIZE - zerofrom;
2296
2297		err = pagecache_write_begin(file, mapping, curpos, len,
2298						AOP_FLAG_UNINTERRUPTIBLE,
2299						&page, &fsdata);
2300		if (err)
2301			goto out;
2302		zero_user(page, zerofrom, len);
2303		err = pagecache_write_end(file, mapping, curpos, len, len,
2304						page, fsdata);
2305		if (err < 0)
2306			goto out;
2307		BUG_ON(err != len);
2308		err = 0;
2309
2310		balance_dirty_pages_ratelimited(mapping);
2311
2312		if (unlikely(fatal_signal_pending(current))) {
2313			err = -EINTR;
2314			goto out;
2315		}
2316	}
2317
2318	/* page covers the boundary, find the boundary offset */
2319	if (index == curidx) {
2320		zerofrom = curpos & ~PAGE_CACHE_MASK;
2321		/* if we are going to expand the file, the last block will be filled */
2322		if (offset <= zerofrom) {
2323			goto out;
2324		}
2325		if (zerofrom & (blocksize-1)) {
2326			*bytes |= (blocksize-1);
2327			(*bytes)++;
2328		}
2329		len = offset - zerofrom;
2330
2331		err = pagecache_write_begin(file, mapping, curpos, len,
2332						AOP_FLAG_UNINTERRUPTIBLE,
2333						&page, &fsdata);
2334		if (err)
2335			goto out;
2336		zero_user(page, zerofrom, len);
2337		err = pagecache_write_end(file, mapping, curpos, len, len,
2338						page, fsdata);
2339		if (err < 0)
2340			goto out;
2341		BUG_ON(err != len);
2342		err = 0;
2343	}
2344out:
2345	return err;
2346}
2347
2348/*
2349 * For moronic filesystems that do not allow holes in a file.
2350 * We may have to extend the file.
2351 */
2352int cont_write_begin(struct file *file, struct address_space *mapping,
2353			loff_t pos, unsigned len, unsigned flags,
2354			struct page **pagep, void **fsdata,
2355			get_block_t *get_block, loff_t *bytes)
2356{
2357	struct inode *inode = mapping->host;
2358	unsigned blocksize = 1 << inode->i_blkbits;
2359	unsigned zerofrom;
2360	int err;
2361
2362	err = cont_expand_zero(file, mapping, pos, bytes);
2363	if (err)
2364		return err;
2365
2366	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2367	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2368		*bytes |= (blocksize-1);
2369		(*bytes)++;
2370	}
2371
2372	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2373}
2374EXPORT_SYMBOL(cont_write_begin);
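
/*
 * A sketch of a ->write_begin built on the helper above (hypothetical myfs
 * names; the loff_t passed as the final argument is assumed to track how many
 * bytes of the file are initialised on disk, in the style of FAT's
 * mmu_private):
 *
 *	static int myfs_cont_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return cont_write_begin(file, mapping, pos, len, flags,
 *					pagep, fsdata, myfs_get_block,
 *					&MYFS_I(mapping->host)->i_disk_bytes);
 *	}
 */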
2375
2376int block_commit_write(struct page *page, unsigned from, unsigned to)
2377{
2378	struct inode *inode = page->mapping->host;
2379	__block_commit_write(inode,page,from,to);
2380	return 0;
2381}
2382EXPORT_SYMBOL(block_commit_write);
2383
2384/*
2385 * block_page_mkwrite() is not allowed to change the file size as it gets
2386 * called from a page fault handler when a page is first dirtied. Hence we must
2387 * be careful to check for EOF conditions here. We set the page up correctly
2388 * for a written page which means we get ENOSPC checking when writing into
2389 * holes and correct delalloc and unwritten extent mapping on filesystems that
2390 * support these features.
2391 *
2392 * We are not allowed to take the i_mutex here so we have to play games to
2393 * protect against truncate races as the page could now be beyond EOF.  Because
2394 * truncate writes the inode size before removing pages, once we have the
2395 * page lock we can determine safely if the page is beyond EOF. If it is not
2396 * beyond EOF, then the page is guaranteed safe against truncation until we
2397 * unlock the page.
2398 *
2399 * Direct callers of this function should protect against filesystem freezing
2400 * using sb_start_write() - sb_end_write() functions.
2401 */
2402int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2403			 get_block_t get_block)
2404{
2405	struct page *page = vmf->page;
2406	struct inode *inode = file_inode(vma->vm_file);
2407	unsigned long end;
2408	loff_t size;
2409	int ret;
2410
2411	lock_page(page);
2412	size = i_size_read(inode);
2413	if ((page->mapping != inode->i_mapping) ||
2414	    (page_offset(page) > size)) {
2415		/* We overload EFAULT to mean page got truncated */
2416		ret = -EFAULT;
2417		goto out_unlock;
2418	}
2419
2420	/* page is wholly or partially inside EOF */
2421	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422		end = size & ~PAGE_CACHE_MASK;
2423	else
2424		end = PAGE_CACHE_SIZE;
2425
2426	ret = __block_write_begin(page, 0, end, get_block);
2427	if (!ret)
2428		ret = block_commit_write(page, 0, end);
2429
2430	if (unlikely(ret < 0))
2431		goto out_unlock;
2432	set_page_dirty(page);
2433	wait_for_stable_page(page);
2434	return 0;
2435out_unlock:
2436	unlock_page(page);
2437	return ret;
2438}
2439EXPORT_SYMBOL(__block_page_mkwrite);
2440
2441int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2442		   get_block_t get_block)
2443{
2444	int ret;
2445	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
2446
2447	sb_start_pagefault(sb);
2448
2449	/*
2450	 * Update file times before taking page lock. We may end up failing the
2451	 * fault so this update may be superfluous but who really cares...
2452	 */
2453	file_update_time(vma->vm_file);
2454
2455	ret = __block_page_mkwrite(vma, vmf, get_block);
2456	sb_end_pagefault(sb);
2457	return block_page_mkwrite_return(ret);
2458}
2459EXPORT_SYMBOL(block_page_mkwrite);
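
/*
 * A usage sketch (hypothetical myfs names): this is normally installed as the
 * ->page_mkwrite handler of a file's vm_operations_struct.
 *
 *	static int myfs_page_mkwrite(struct vm_area_struct *vma,
 *				     struct vm_fault *vmf)
 *	{
 *		return block_page_mkwrite(vma, vmf, myfs_get_block);
 *	}
 *
 *	static const struct vm_operations_struct myfs_file_vm_ops = {
 *		.fault		= filemap_fault,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 */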
2460
2461/*
2462 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2463 * immediately, while under the page lock.  So it needs a special end_io
2464 * handler which does not touch the bh after unlocking it.
2465 */
2466static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2467{
2468	__end_buffer_read_notouch(bh, uptodate);
2469}
2470
2471/*
2472 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2473 * the page (converting it to circular linked list and taking care of page
2474 * dirty races).
2475 */
2476static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2477{
2478	struct buffer_head *bh;
2479
2480	BUG_ON(!PageLocked(page));
2481
2482	spin_lock(&page->mapping->private_lock);
2483	bh = head;
2484	do {
2485		if (PageDirty(page))
2486			set_buffer_dirty(bh);
2487		if (!bh->b_this_page)
2488			bh->b_this_page = head;
2489		bh = bh->b_this_page;
2490	} while (bh != head);
2491	attach_page_buffers(page, head);
2492	spin_unlock(&page->mapping->private_lock);
2493}
2494
2495/*
2496 * On entry, the page is entirely non-uptodate.
2497 * On exit, the page is fully uptodate in the areas outside (from, to).
2498 * The filesystem needs to handle block truncation upon failure.
2499 */
2500int nobh_write_begin(struct address_space *mapping,
2501			loff_t pos, unsigned len, unsigned flags,
2502			struct page **pagep, void **fsdata,
2503			get_block_t *get_block)
2504{
2505	struct inode *inode = mapping->host;
2506	const unsigned blkbits = inode->i_blkbits;
2507	const unsigned blocksize = 1 << blkbits;
2508	struct buffer_head *head, *bh;
2509	struct page *page;
2510	pgoff_t index;
2511	unsigned from, to;
2512	unsigned block_in_page;
2513	unsigned block_start, block_end;
2514	sector_t block_in_file;
2515	int nr_reads = 0;
2516	int ret = 0;
2517	int is_mapped_to_disk = 1;
2518
2519	index = pos >> PAGE_CACHE_SHIFT;
2520	from = pos & (PAGE_CACHE_SIZE - 1);
2521	to = from + len;
2522
2523	page = grab_cache_page_write_begin(mapping, index, flags);
2524	if (!page)
2525		return -ENOMEM;
2526	*pagep = page;
2527	*fsdata = NULL;
2528
2529	if (page_has_buffers(page)) {
2530		ret = __block_write_begin(page, pos, len, get_block);
2531		if (unlikely(ret))
2532			goto out_release;
2533		return ret;
2534	}
2535
2536	if (PageMappedToDisk(page))
2537		return 0;
2538
2539	/*
2540	 * Allocate buffers so that we can keep track of state, and potentially
2541	 * attach them to the page if an error occurs. In the common case of
2542	 * no error, they will just be freed again without ever being attached
2543	 * to the page (which is all OK, because we're under the page lock).
2544	 *
2545	 * Be careful: the buffer linked list is a NULL terminated one, rather
2546	 * than the circular one we're used to.
2547	 */
2548	head = alloc_page_buffers(page, blocksize, 0);
2549	if (!head) {
2550		ret = -ENOMEM;
2551		goto out_release;
2552	}
2553
2554	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2555
2556	/*
2557	 * We loop across all blocks in the page, whether or not they are
2558	 * part of the affected region.  This is so we can discover if the
2559	 * page is fully mapped-to-disk.
2560	 */
2561	for (block_start = 0, block_in_page = 0, bh = head;
2562		  block_start < PAGE_CACHE_SIZE;
2563		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2564		int create;
2565
2566		block_end = block_start + blocksize;
2567		bh->b_state = 0;
2568		create = 1;
2569		if (block_start >= to)
2570			create = 0;
2571		ret = get_block(inode, block_in_file + block_in_page,
2572					bh, create);
2573		if (ret)
2574			goto failed;
2575		if (!buffer_mapped(bh))
2576			is_mapped_to_disk = 0;
2577		if (buffer_new(bh))
2578			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2579		if (PageUptodate(page)) {
2580			set_buffer_uptodate(bh);
2581			continue;
2582		}
2583		if (buffer_new(bh) || !buffer_mapped(bh)) {
2584			zero_user_segments(page, block_start, from,
2585							to, block_end);
2586			continue;
2587		}
2588		if (buffer_uptodate(bh))
2589			continue;	/* reiserfs does this */
2590		if (block_start < from || block_end > to) {
2591			lock_buffer(bh);
2592			bh->b_end_io = end_buffer_read_nobh;
2593			submit_bh(READ, bh);
2594			nr_reads++;
2595		}
2596	}
2597
2598	if (nr_reads) {
2599		/*
2600		 * The page is locked, so these buffers are protected from
2601		 * any VM or truncate activity.  Hence we don't need to care
2602		 * for the buffer_head refcounts.
2603		 */
2604		for (bh = head; bh; bh = bh->b_this_page) {
2605			wait_on_buffer(bh);
2606			if (!buffer_uptodate(bh))
2607				ret = -EIO;
2608		}
2609		if (ret)
2610			goto failed;
2611	}
2612
2613	if (is_mapped_to_disk)
2614		SetPageMappedToDisk(page);
2615
2616	*fsdata = head; /* to be released by nobh_write_end */
2617
2618	return 0;
2619
2620failed:
2621	BUG_ON(!ret);
2622	/*
2623	 * Error recovery is a bit difficult. We need to zero out blocks that
2624	 * were newly allocated, and dirty them to ensure they get written out.
2625	 * Buffers need to be attached to the page at this point, otherwise
2626	 * the handling of potential IO errors during writeout would be hard
2627	 * (could try doing synchronous writeout, but what if that fails too?)
2628	 */
2629	attach_nobh_buffers(page, head);
2630	page_zero_new_buffers(page, from, to);
2631
2632out_release:
2633	unlock_page(page);
2634	page_cache_release(page);
2635	*pagep = NULL;
2636
2637	return ret;
2638}
2639EXPORT_SYMBOL(nobh_write_begin);
2640
2641int nobh_write_end(struct file *file, struct address_space *mapping,
2642			loff_t pos, unsigned len, unsigned copied,
2643			struct page *page, void *fsdata)
2644{
2645	struct inode *inode = page->mapping->host;
2646	struct buffer_head *head = fsdata;
2647	struct buffer_head *bh;
2648	BUG_ON(fsdata != NULL && page_has_buffers(page));
2649
2650	if (unlikely(copied < len) && head)
2651		attach_nobh_buffers(page, head);
2652	if (page_has_buffers(page))
2653		return generic_write_end(file, mapping, pos, len,
2654					copied, page, fsdata);
2655
2656	SetPageUptodate(page);
2657	set_page_dirty(page);
2658	if (pos+copied > inode->i_size) {
2659		i_size_write(inode, pos+copied);
2660		mark_inode_dirty(inode);
2661	}
2662
2663	unlock_page(page);
2664	page_cache_release(page);
2665
2666	while (head) {
2667		bh = head;
2668		head = head->b_this_page;
2669		free_buffer_head(bh);
2670	}
2671
2672	return copied;
2673}
2674EXPORT_SYMBOL(nobh_write_end);
2675
2676/*
2677 * nobh_writepage() - based on block_write_full_page() except
2678 * that it tries to operate without attaching bufferheads to
2679 * the page.
2680 */
2681int nobh_writepage(struct page *page, get_block_t *get_block,
2682			struct writeback_control *wbc)
2683{
2684	struct inode * const inode = page->mapping->host;
2685	loff_t i_size = i_size_read(inode);
2686	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2687	unsigned offset;
2688	int ret;
2689
2690	/* Is the page fully inside i_size? */
2691	if (page->index < end_index)
2692		goto out;
2693
2694	/* Is the page fully outside i_size? (truncate in progress) */
2695	offset = i_size & (PAGE_CACHE_SIZE-1);
2696	if (page->index >= end_index+1 || !offset) {
2697		/*
2698		 * The page may have dirty, unmapped buffers.  For example,
2699		 * they may have been added in ext3_writepage().  Make them
2700		 * freeable here, so the page does not leak.
2701		 */
2702#if 0
2703		/* Not really sure about this - do we need this? */
2704		if (page->mapping->a_ops->invalidatepage)
2705			page->mapping->a_ops->invalidatepage(page, offset);
2706#endif
2707		unlock_page(page);
2708		return 0; /* don't care */
2709	}
2710
2711	/*
2712	 * The page straddles i_size.  It must be zeroed out on each and every
2713	 * writepage invocation because it may be mmapped.  "A file is mapped
2714	 * in multiples of the page size.  For a file that is not a multiple of
2715	 * the  page size, the remaining memory is zeroed when mapped, and
2716	 * writes to that region are not written out to the file."
2717	 */
2718	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2719out:
2720	ret = mpage_writepage(page, get_block, wbc);
2721	if (ret == -EAGAIN)
2722		ret = __block_write_full_page(inode, page, get_block, wbc,
2723					      end_buffer_async_write);
2724	return ret;
2725}
2726EXPORT_SYMBOL(nobh_writepage);
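
/*
 * A sketch of how the nobh variants fit together (hypothetical myfs names,
 * roughly mirroring how ext2's nobh mode used these helpers):
 *
 *	static int myfs_nobh_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return nobh_write_begin(mapping, pos, len, flags, pagep,
 *					fsdata, myfs_get_block);
 *	}
 *
 *	static int myfs_nobh_writepage(struct page *page,
 *			struct writeback_control *wbc)
 *	{
 *		return nobh_writepage(page, myfs_get_block, wbc);
 *	}
 *
 * with .write_end set to nobh_write_end in the address_space_operations.
 */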
2727
2728int nobh_truncate_page(struct address_space *mapping,
2729			loff_t from, get_block_t *get_block)
2730{
2731	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2732	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2733	unsigned blocksize;
2734	sector_t iblock;
2735	unsigned length, pos;
2736	struct inode *inode = mapping->host;
2737	struct page *page;
2738	struct buffer_head map_bh;
2739	int err;
2740
2741	blocksize = 1 << inode->i_blkbits;
2742	length = offset & (blocksize - 1);
2743
2744	/* Block boundary? Nothing to do */
2745	if (!length)
2746		return 0;
2747
2748	length = blocksize - length;
2749	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2750
2751	page = grab_cache_page(mapping, index);
2752	err = -ENOMEM;
2753	if (!page)
2754		goto out;
2755
2756	if (page_has_buffers(page)) {
2757has_buffers:
2758		unlock_page(page);
2759		page_cache_release(page);
2760		return block_truncate_page(mapping, from, get_block);
2761	}
2762
2763	/* Find the buffer that contains "offset" */
2764	pos = blocksize;
2765	while (offset >= pos) {
2766		iblock++;
2767		pos += blocksize;
2768	}
2769
2770	map_bh.b_size = blocksize;
2771	map_bh.b_state = 0;
2772	err = get_block(inode, iblock, &map_bh, 0);
2773	if (err)
2774		goto unlock;
2775	/* unmapped? It's a hole - nothing to do */
2776	if (!buffer_mapped(&map_bh))
2777		goto unlock;
2778
2779	/* Ok, it's mapped. Make sure it's up-to-date */
2780	if (!PageUptodate(page)) {
2781		err = mapping->a_ops->readpage(NULL, page);
2782		if (err) {
2783			page_cache_release(page);
2784			goto out;
2785		}
2786		lock_page(page);
2787		if (!PageUptodate(page)) {
2788			err = -EIO;
2789			goto unlock;
2790		}
2791		if (page_has_buffers(page))
2792			goto has_buffers;
2793	}
2794	zero_user(page, offset, length);
2795	set_page_dirty(page);
2796	err = 0;
2797
2798unlock:
2799	unlock_page(page);
2800	page_cache_release(page);
2801out:
2802	return err;
2803}
2804EXPORT_SYMBOL(nobh_truncate_page);
2805
2806int block_truncate_page(struct address_space *mapping,
2807			loff_t from, get_block_t *get_block)
2808{
2809	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2810	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2811	unsigned blocksize;
2812	sector_t iblock;
2813	unsigned length, pos;
2814	struct inode *inode = mapping->host;
2815	struct page *page;
2816	struct buffer_head *bh;
2817	int err;
2818
2819	blocksize = 1 << inode->i_blkbits;
2820	length = offset & (blocksize - 1);
2821
2822	/* Block boundary? Nothing to do */
2823	if (!length)
2824		return 0;
2825
2826	length = blocksize - length;
2827	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2828
2829	page = grab_cache_page(mapping, index);
2830	err = -ENOMEM;
2831	if (!page)
2832		goto out;
2833
2834	if (!page_has_buffers(page))
2835		create_empty_buffers(page, blocksize, 0);
2836
2837	/* Find the buffer that contains "offset" */
2838	bh = page_buffers(page);
2839	pos = blocksize;
2840	while (offset >= pos) {
2841		bh = bh->b_this_page;
2842		iblock++;
2843		pos += blocksize;
2844	}
2845
2846	err = 0;
2847	if (!buffer_mapped(bh)) {
2848		WARN_ON(bh->b_size != blocksize);
2849		err = get_block(inode, iblock, bh, 0);
2850		if (err)
2851			goto unlock;
2852		/* unmapped? It's a hole - nothing to do */
2853		if (!buffer_mapped(bh))
2854			goto unlock;
2855	}
2856
2857	/* Ok, it's mapped. Make sure it's up-to-date */
2858	if (PageUptodate(page))
2859		set_buffer_uptodate(bh);
2860
2861	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2862		err = -EIO;
2863		ll_rw_block(READ, 1, &bh);
2864		wait_on_buffer(bh);
2865		/* Uhhuh. Read error. Complain and punt. */
2866		if (!buffer_uptodate(bh))
2867			goto unlock;
2868	}
2869
2870	zero_user(page, offset, length);
2871	mark_buffer_dirty(bh);
2872	err = 0;
2873
2874unlock:
2875	unlock_page(page);
2876	page_cache_release(page);
2877out:
2878	return err;
2879}
2880EXPORT_SYMBOL(block_truncate_page);
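
/*
 * A hedged usage sketch: a truncate path can use this to zero the tail of a
 * partial last block so that stale data is not exposed past the new EOF
 * (myfs_get_block is hypothetical).
 *
 *	err = block_truncate_page(inode->i_mapping, inode->i_size,
 *				  myfs_get_block);
 */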
2881
2882/*
2883 * The generic ->writepage function for buffer-backed address_spaces
2884 */
2885int block_write_full_page(struct page *page, get_block_t *get_block,
2886			struct writeback_control *wbc)
2887{
2888	struct inode * const inode = page->mapping->host;
2889	loff_t i_size = i_size_read(inode);
2890	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2891	unsigned offset;
2892
2893	/* Is the page fully inside i_size? */
2894	if (page->index < end_index)
2895		return __block_write_full_page(inode, page, get_block, wbc,
2896					       end_buffer_async_write);
2897
2898	/* Is the page fully outside i_size? (truncate in progress) */
2899	offset = i_size & (PAGE_CACHE_SIZE-1);
2900	if (page->index >= end_index+1 || !offset) {
2901		/*
2902		 * The page may have dirty, unmapped buffers.  For example,
2903		 * they may have been added in ext3_writepage().  Make them
2904		 * freeable here, so the page does not leak.
2905		 */
2906		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2907		unlock_page(page);
2908		return 0; /* don't care */
2909	}
2910
2911	/*
2912	 * The page straddles i_size.  It must be zeroed out on each and every
2913	 * writepage invocation because it may be mmapped.  "A file is mapped
2914	 * in multiples of the page size.  For a file that is not a multiple of
2915	 * the  page size, the remaining memory is zeroed when mapped, and
2916	 * writes to that region are not written out to the file."
2917	 */
2918	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2919	return __block_write_full_page(inode, page, get_block, wbc,
2920							end_buffer_async_write);
2921}
2922EXPORT_SYMBOL(block_write_full_page);
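
/*
 * A minimal ->writepage sketch (myfs_get_block is a hypothetical get_block_t):
 *
 *	static int myfs_writepage(struct page *page,
 *			struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, myfs_get_block, wbc);
 *	}
 */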
2923
2924sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2925			    get_block_t *get_block)
2926{
2927	struct buffer_head tmp;
2928	struct inode *inode = mapping->host;
2929	tmp.b_state = 0;
2930	tmp.b_blocknr = 0;
2931	tmp.b_size = 1 << inode->i_blkbits;
2932	get_block(inode, block, &tmp, 0);
2933	return tmp.b_blocknr;
2934}
2935EXPORT_SYMBOL(generic_block_bmap);
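
/*
 * A minimal ->bmap sketch (hypothetical myfs names).  Like generic_block_bmap()
 * itself, it reports 0 for a hole or unmapped block.
 *
 *	static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, myfs_get_block);
 *	}
 */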
2936
2937static void end_bio_bh_io_sync(struct bio *bio, int err)
2938{
2939	struct buffer_head *bh = bio->bi_private;
2940
2941	if (err == -EOPNOTSUPP) {
2942		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2943	}
2944
2945	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2946		set_bit(BH_Quiet, &bh->b_state);
2947
2948	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2949	bio_put(bio);
2950}
2951
2952/*
2953 * This allows us to do IO even on the odd last sectors
2954 * of a device, even if the block size is some multiple
2955 * of the physical sector size.
2956 *
2957 * We'll just truncate the bio to the size of the device,
2958 * and clear the end of the buffer head manually.
2959 *
2960 * Truly out-of-range accesses will turn into actual IO
2961 * errors; this only handles the "we need to be able to
2962 * do IO at the final sector" case.
2963 */
2964void guard_bio_eod(int rw, struct bio *bio)
2965{
2966	sector_t maxsector;
2967	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2968	unsigned truncated_bytes;
2969
2970	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2971	if (!maxsector)
2972		return;
2973
2974	/*
2975	 * If the *whole* IO is past the end of the device,
2976	 * let it through, and the IO layer will turn it into
2977	 * an EIO.
2978	 */
2979	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2980		return;
2981
2982	maxsector -= bio->bi_iter.bi_sector;
2983	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2984		return;
2985
2986	/* Uhhuh. We've got a bio that straddles the device size! */
2987	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2988
2989	/* Truncate the bio.. */
2990	bio->bi_iter.bi_size -= truncated_bytes;
2991	bvec->bv_len -= truncated_bytes;
2992
2993	/* ..and clear the end of the buffer for reads */
2994	if ((rw & RW_MASK) == READ) {
2995		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2996				truncated_bytes);
2997	}
2998}
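
/*
 * Worked example of the arithmetic above (illustrative numbers): take a device
 * of 10004 512-byte sectors and a 4KB buffer starting at sector 10000.  Then
 * maxsector - bi_sector = 4 while bi_size >> 9 = 8, so the bio straddles the
 * end of the device.  truncated_bytes = 4096 - (4 << 9) = 2048: the bio is
 * shrunk to 2048 bytes and, for a read, the final 2048 bytes of the buffer are
 * zeroed instead of being read from disk.
 */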
2999
3000int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3001{
3002	struct bio *bio;
3003	int ret = 0;
3004
3005	BUG_ON(!buffer_locked(bh));
3006	BUG_ON(!buffer_mapped(bh));
3007	BUG_ON(!bh->b_end_io);
3008	BUG_ON(buffer_delay(bh));
3009	BUG_ON(buffer_unwritten(bh));
3010
3011	/*
3012	 * Only clear out a write error when rewriting
3013	 */
3014	if (test_set_buffer_req(bh) && (rw & WRITE))
3015		clear_buffer_write_io_error(bh);
3016
3017	/*
3018	 * from here on down, it's all bio -- do the initial mapping,
3019	 * submit_bio -> generic_make_request may further map this bio around
3020	 */
3021	bio = bio_alloc(GFP_NOIO, 1);
3022
3023	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3024	bio->bi_bdev = bh->b_bdev;
3025	bio->bi_io_vec[0].bv_page = bh->b_page;
3026	bio->bi_io_vec[0].bv_len = bh->b_size;
3027	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3028
3029	bio->bi_vcnt = 1;
3030	bio->bi_iter.bi_size = bh->b_size;
3031
3032	bio->bi_end_io = end_bio_bh_io_sync;
3033	bio->bi_private = bh;
3034	bio->bi_flags |= bio_flags;
3035
3036	/* Take care of bh's that straddle the end of the device */
3037	guard_bio_eod(rw, bio);
3038
3039	if (buffer_meta(bh))
3040		rw |= REQ_META;
3041	if (buffer_prio(bh))
3042		rw |= REQ_PRIO;
3043
3044	bio_get(bio);
3045	submit_bio(rw, bio);
3046
3047	if (bio_flagged(bio, BIO_EOPNOTSUPP))
3048		ret = -EOPNOTSUPP;
3049
3050	bio_put(bio);
3051	return ret;
3052}
3053EXPORT_SYMBOL_GPL(_submit_bh);
3054
3055int submit_bh(int rw, struct buffer_head *bh)
3056{
3057	return _submit_bh(rw, bh, 0);
3058}
3059EXPORT_SYMBOL(submit_bh);
3060
3061/**
3062 * ll_rw_block: low-level access to block devices (DEPRECATED)
3063 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3064 * @nr: number of &struct buffer_heads in the array
3065 * @bhs: array of pointers to &struct buffer_head
3066 *
3067 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3068 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3069 * %READA option is described in the documentation for generic_make_request()
3070 * which ll_rw_block() calls.
3071 *
3072 * This function drops any buffer that it cannot get a lock on (with the
3073 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3074 * request, and any buffer that appears to be up-to-date when doing a read
3075 * request.  Further, it marks as clean any buffers that are processed for
3076 * writing (the buffer cache won't assume that they are actually clean
3077 * until the buffer gets unlocked).
3078 *
3079 * ll_rw_block sets b_end_io to a simple completion handler that marks
3080 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3081 * any waiters.
3082 *
3083 * All of the buffers must be for the same device, and their size must be
3084 * a multiple of the current approved block size for the device.
3085 */
3086void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3087{
3088	int i;
3089
3090	for (i = 0; i < nr; i++) {
3091		struct buffer_head *bh = bhs[i];
3092
3093		if (!trylock_buffer(bh))
3094			continue;
3095		if (rw == WRITE) {
3096			if (test_clear_buffer_dirty(bh)) {
3097				bh->b_end_io = end_buffer_write_sync;
3098				get_bh(bh);
3099				submit_bh(WRITE, bh);
3100				continue;
3101			}
3102		} else {
3103			if (!buffer_uptodate(bh)) {
3104				bh->b_end_io = end_buffer_read_sync;
3105				get_bh(bh);
3106				submit_bh(rw, bh);
3107				continue;
3108			}
3109		}
3110		unlock_buffer(bh);
3111	}
3112}
3113EXPORT_SYMBOL(ll_rw_block);
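
/*
 * A hedged usage sketch: start a read opportunistically and then wait for it
 * (sb and blocknr are hypothetical).  Because buffers that cannot be locked
 * are simply skipped, the buffer_uptodate() check after wait_on_buffer() is
 * what actually decides whether the data arrived.
 *
 *	struct buffer_head *bh = sb_getblk(sb, blocknr);
 *	int err = 0;
 *
 *	if (bh && !buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			err = -EIO;
 *	}
 *	brelse(bh);
 */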
3114
3115void write_dirty_buffer(struct buffer_head *bh, int rw)
3116{
3117	lock_buffer(bh);
3118	if (!test_clear_buffer_dirty(bh)) {
3119		unlock_buffer(bh);
3120		return;
3121	}
3122	bh->b_end_io = end_buffer_write_sync;
3123	get_bh(bh);
3124	submit_bh(rw, bh);
3125}
3126EXPORT_SYMBOL(write_dirty_buffer);
3127
3128/*
3129 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3130 * and then start new I/O and then wait upon it.  The caller must have a ref on
3131 * the buffer_head.
3132 */
3133int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3134{
3135	int ret = 0;
3136
3137	WARN_ON(atomic_read(&bh->b_count) < 1);
3138	lock_buffer(bh);
3139	if (test_clear_buffer_dirty(bh)) {
3140		get_bh(bh);
3141		bh->b_end_io = end_buffer_write_sync;
3142		ret = submit_bh(rw, bh);
3143		wait_on_buffer(bh);
3144		if (!ret && !buffer_uptodate(bh))
3145			ret = -EIO;
3146	} else {
3147		unlock_buffer(bh);
3148	}
3149	return ret;
3150}
3151EXPORT_SYMBOL(__sync_dirty_buffer);
3152
3153int sync_dirty_buffer(struct buffer_head *bh)
3154{
3155	return __sync_dirty_buffer(bh, WRITE_SYNC);
3156}
3157EXPORT_SYMBOL(sync_dirty_buffer);
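
/*
 * A hedged usage sketch: update an on-disk metadata block and wait for it to
 * hit the disk before continuing (sb, blocknr and the payload are
 * hypothetical).
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *	int err;
 *
 *	if (!bh)
 *		return -EIO;
 *	lock_buffer(bh);
 *	memcpy(bh->b_data, payload, payload_len);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 *	brelse(bh);
 *	return err;
 */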
3158
3159/*
3160 * try_to_free_buffers() checks if all the buffers on this particular page
3161 * are unused, and releases them if so.
3162 *
3163 * Exclusion against try_to_free_buffers may be obtained by either
3164 * locking the page or by holding its mapping's private_lock.
3165 *
3166 * If the page is dirty but all the buffers are clean then we need to
3167 * be sure to mark the page clean as well.  This is because the page
3168 * may be against a block device, and a later reattachment of buffers
3169 * to a dirty page will set *all* buffers dirty.  Which would corrupt
3170 * filesystem data on the same device.
3171 *
3172 * The same applies to regular filesystem pages: if all the buffers are
3173 * clean then we set the page clean and proceed.  To do that, we require
3174 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3175 * private_lock.
3176 *
3177 * try_to_free_buffers() is non-blocking.
3178 */
3179static inline int buffer_busy(struct buffer_head *bh)
3180{
3181	return atomic_read(&bh->b_count) |
3182		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3183}
3184
3185static int
3186drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3187{
3188	struct buffer_head *head = page_buffers(page);
3189	struct buffer_head *bh;
3190
3191	bh = head;
3192	do {
3193		if (buffer_write_io_error(bh) && page->mapping)
3194			set_bit(AS_EIO, &page->mapping->flags);
3195		if (buffer_busy(bh))
3196			goto failed;
3197		bh = bh->b_this_page;
3198	} while (bh != head);
3199
3200	do {
3201		struct buffer_head *next = bh->b_this_page;
3202
3203		if (bh->b_assoc_map)
3204			__remove_assoc_queue(bh);
3205		bh = next;
3206	} while (bh != head);
3207	*buffers_to_free = head;
3208	__clear_page_buffers(page);
3209	return 1;
3210failed:
3211	return 0;
3212}
3213
3214int try_to_free_buffers(struct page *page)
3215{
3216	struct address_space * const mapping = page->mapping;
3217	struct buffer_head *buffers_to_free = NULL;
3218	int ret = 0;
3219
3220	BUG_ON(!PageLocked(page));
3221	if (PageWriteback(page))
3222		return 0;
3223
3224	if (mapping == NULL) {		/* can this still happen? */
3225		ret = drop_buffers(page, &buffers_to_free);
3226		goto out;
3227	}
3228
3229	spin_lock(&mapping->private_lock);
3230	ret = drop_buffers(page, &buffers_to_free);
3231
3232	/*
3233	 * If the filesystem writes its buffers by hand (eg ext3)
3234	 * then we can have clean buffers against a dirty page.  We
3235	 * clean the page here; otherwise the VM will never notice
3236	 * that the filesystem did any IO at all.
3237	 *
3238	 * Also, during truncate, discard_buffer will have marked all
3239	 * the page's buffers clean.  We discover that here and clean
3240	 * the page also.
3241	 *
3242	 * private_lock must be held over this entire operation in order
3243	 * to synchronise against __set_page_dirty_buffers and prevent the
3244	 * dirty bit from being lost.
3245	 */
3246	if (ret && TestClearPageDirty(page))
3247		account_page_cleaned(page, mapping);
3248	spin_unlock(&mapping->private_lock);
3249out:
3250	if (buffers_to_free) {
3251		struct buffer_head *bh = buffers_to_free;
3252
3253		do {
3254			struct buffer_head *next = bh->b_this_page;
3255			free_buffer_head(bh);
3256			bh = next;
3257		} while (bh != buffers_to_free);
3258	}
3259	return ret;
3260}
3261EXPORT_SYMBOL(try_to_free_buffers);
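
/*
 * A minimal ->releasepage sketch (hypothetical myfs): a filesystem with no
 * private page state of its own can hand the decision straight to
 * try_to_free_buffers().
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
 *	{
 *		return try_to_free_buffers(page);
 *	}
 */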
3262
3263/*
3264 * There are no bdflush tunables left.  But distributions are
3265 * still running obsolete flush daemons, so we terminate them here.
3266 *
3267 * Use of bdflush() is deprecated and will be removed in a future kernel.
3268 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3269 */
3270SYSCALL_DEFINE2(bdflush, int, func, long, data)
3271{
3272	static int msg_count;
3273
3274	if (!capable(CAP_SYS_ADMIN))
3275		return -EPERM;
3276
3277	if (msg_count < 5) {
3278		msg_count++;
3279		printk(KERN_INFO
3280			"warning: process `%s' used the obsolete bdflush"
3281			" system call\n", current->comm);
3282		printk(KERN_INFO "Fix your initscripts?\n");
3283	}
3284
3285	if (func == 1)
3286		do_exit(0);
3287	return 0;
3288}
3289
3290/*
3291 * Buffer-head allocation
3292 */
3293static struct kmem_cache *bh_cachep __read_mostly;
3294
3295/*
3296 * Once the number of bh's in the machine exceeds this level, we start
3297 * stripping them in writeback.
3298 */
3299static unsigned long max_buffer_heads;
3300
3301int buffer_heads_over_limit;
3302
3303struct bh_accounting {
3304	int nr;			/* Number of live bh's */
3305	int ratelimit;		/* Limit cacheline bouncing */
3306};
3307
3308static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3309
3310static void recalc_bh_state(void)
3311{
3312	int i;
3313	int tot = 0;
3314
3315	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3316		return;
3317	__this_cpu_write(bh_accounting.ratelimit, 0);
3318	for_each_online_cpu(i)
3319		tot += per_cpu(bh_accounting, i).nr;
3320	buffer_heads_over_limit = (tot > max_buffer_heads);
3321}
3322
3323struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3324{
3325	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3326	if (ret) {
3327		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3328		preempt_disable();
3329		__this_cpu_inc(bh_accounting.nr);
3330		recalc_bh_state();
3331		preempt_enable();
3332	}
3333	return ret;
3334}
3335EXPORT_SYMBOL(alloc_buffer_head);
3336
3337void free_buffer_head(struct buffer_head *bh)
3338{
3339	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3340	kmem_cache_free(bh_cachep, bh);
3341	preempt_disable();
3342	__this_cpu_dec(bh_accounting.nr);
3343	recalc_bh_state();
3344	preempt_enable();
3345}
3346EXPORT_SYMBOL(free_buffer_head);
3347
3348static void buffer_exit_cpu(int cpu)
3349{
3350	int i;
3351	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3352
3353	for (i = 0; i < BH_LRU_SIZE; i++) {
3354		brelse(b->bhs[i]);
3355		b->bhs[i] = NULL;
3356	}
3357	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3358	per_cpu(bh_accounting, cpu).nr = 0;
3359}
3360
3361static int buffer_cpu_notify(struct notifier_block *self,
3362			      unsigned long action, void *hcpu)
3363{
3364	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3365		buffer_exit_cpu((unsigned long)hcpu);
3366	return NOTIFY_OK;
3367}
3368
3369/**
3370 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3371 * @bh: struct buffer_head
3372 *
3373 * Returns true if the buffer is up-to-date; otherwise returns false
3374 * with the buffer locked.
3375 */
3376int bh_uptodate_or_lock(struct buffer_head *bh)
3377{
3378	if (!buffer_uptodate(bh)) {
3379		lock_buffer(bh);
3380		if (!buffer_uptodate(bh))
3381			return 0;
3382		unlock_buffer(bh);
3383	}
3384	return 1;
3385}
3386EXPORT_SYMBOL(bh_uptodate_or_lock);
3387
3388/**
3389 * bh_submit_read - Submit a locked buffer for reading
3390 * @bh: struct buffer_head
3391 *
3392 * Returns zero on success and -EIO on error.
3393 */
3394int bh_submit_read(struct buffer_head *bh)
3395{
3396	BUG_ON(!buffer_locked(bh));
3397
3398	if (buffer_uptodate(bh)) {
3399		unlock_buffer(bh);
3400		return 0;
3401	}
3402
3403	get_bh(bh);
3404	bh->b_end_io = end_buffer_read_sync;
3405	submit_bh(READ, bh);
3406	wait_on_buffer(bh);
3407	if (buffer_uptodate(bh))
3408		return 0;
3409	return -EIO;
3410}
3411EXPORT_SYMBOL(bh_submit_read);
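
/*
 * A hedged sketch of how the two helpers above pair up: read a metadata block
 * only if it is not already cached uptodate (sb and blocknr are hypothetical).
 *
 *	struct buffer_head *bh = sb_getblk(sb, blocknr);
 *
 *	if (!bh)
 *		return -ENOMEM;
 *	if (!bh_uptodate_or_lock(bh)) {
 *		int err = bh_submit_read(bh);
 *
 *		if (err) {
 *			brelse(bh);
 *			return err;
 *		}
 *	}
 *
 * At this point bh->b_data holds valid data; the reference is dropped with
 * brelse(bh) when the caller is done with it.
 */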
3412
3413void __init buffer_init(void)
3414{
3415	unsigned long nrpages;
3416
3417	bh_cachep = kmem_cache_create("buffer_head",
3418			sizeof(struct buffer_head), 0,
3419				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3420				SLAB_MEM_SPREAD),
3421				NULL);
3422
3423	/*
3424	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3425	 */
3426	nrpages = (nr_free_buffer_pages() * 10) / 100;
3427	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3428	hotcpu_notifier(buffer_cpu_notify, 0);
3429}
3430