/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}
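
/*
 * Note: the clear_bit_unlock()/wake_up_bit() pair above is what releases
 * a task sleeping in do_get_write_access() (see transaction.c), which
 * waits on BH_Shadow before handing out write access to the original
 * buffer again.  The smp_mb__after_atomic() orders the bit clear against
 * the waiter's check of the bit.
 */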

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}
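
/*
 * Note that the checksum above is computed over the entire commit block
 * with the checksum fields zeroed first, so a reader can verify it by
 * zeroing the same fields and recomputing over the whole block.
 */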

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(journal);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}
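
/*
 * A note on ordering: in the synchronous-commit case above,
 * WRITE_FLUSH_FUA issues a cache flush ahead of the commit record and
 * forces the record itself to stable storage, so the commit block cannot
 * become durable before the descriptor and data blocks it commits.  With
 * async commit the record is protected by its checksum instead, and the
 * flush is issued later, in jbd2_journal_commit_transaction().
 */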

/*
 * This function, together with journal_submit_commit_record(),
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * Write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit the data buffers of all inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out its
 * pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to write
		 * only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_atomic();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
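
/*
 * The JI_COMMIT_RUNNING protocol used above (and in
 * journal_finish_inode_data_buffers() below) pairs with the wait in
 * jbd2_journal_release_jbd_inode(): an inode being torn down must wait
 * until the commit code has cleared the flag before the jbd2_inode can
 * go away, which is what makes it safe to drop j_list_lock around the
 * page writeout.
 */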

/*
 * Wait for data submitted for writeout, refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that the user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_atomic();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to their proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}
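
/*
 * kmap_atomic() is needed above (rather than dereferencing bh->b_data
 * directly) because the buffer may live in a highmem page with no
 * permanent kernel mapping; offset_in_page(bh->b_data) then locates the
 * buffer within the temporarily mapped page.
 */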

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
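
/*
 * Worked example: for block 0x1234567890 on a 64-bit journal, t_blocknr
 * holds the low 32 bits (0x34567890) and t_blocknr_high the remaining
 * high bits (0x12).  The shift is written as (block >> 31) >> 1 rather
 * than block >> 32 so the expression stays well-defined even where the
 * block number type is only 32 bits wide, as it historically was on
 * 32-bit builds.
 */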

static void jbd2_descr_block_csum_set(journal_t *j,
				      struct buffer_head *bh)
{
	struct jbd2_journal_block_tail *tail;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
			sizeof(struct jbd2_journal_block_tail));
	tail->t_checksum = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	tail->t_checksum = cpu_to_be32(csum);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}
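
/*
 * Note the difference between the two on-disk tag formats above: CSUM_V3
 * tags store the full 32-bit checksum, while V2 tags only have room for
 * 16 bits, so cpu_to_be16() truncates csum32 to its low half.  The
 * per-tag checksum covers the transaction sequence number as well as the
 * block contents, so a stale block left at the same journal location by
 * an earlier transaction fails verification on replay.
 */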

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flags to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  &log_bufs, WRITE_SYNC);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			header = (journal_header_t *)descriptor->b_data;
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full
		   (not enough room for another tag plus a UUID and the
		   block tail), let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

			jbd2_descr_block_csum_set(journal, descriptor);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get the current oldest transaction in the log before we issue a
	 * flush to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update the journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free a significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
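
	/*
	 * Example of the heuristic above: on a 32768-block journal the
	 * tail is only advanced once at least 8192 blocks would be
	 * reclaimed.  Advancing the tail means an extra superblock
	 * update, so it is not worth doing for a handful of blocks.
	 * The wraparound branch accounts for first_block having cycled
	 * past j_last back to j_first.
	 */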
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}
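
	/*
	 * With async commit the record above was submitted before the
	 * journal writes were known to be stable, so the explicit cache
	 * flush here is what finally makes the transaction durable;
	 * recovery relies on the commit block checksum to detect a commit
	 * record that landed ahead of its blocks.
	 */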

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now the disk caches for the filesystem device are flushed so we are
	 * safe to erase checkpointed transactions from the log by updating the
	 * journal superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from the checkpoint lists they
	   were on before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * the "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains the "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() cannot destroy the transaction
	 * under us because it is not yet marked as T_FINISHED */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
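
	/*
	 * In other words, j_average_commit_time is an exponentially
	 * weighted moving average with the new sample weighted 1/4:
	 * avg' = commit_time/4 + 3*avg/4.  For example, an average of
	 * 40ms and a one-off 120ms commit move the average only to
	 * (120 + 3*40)/4 = 60ms rather than jumping to the outlier.
	 */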

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}