/*
 *   Copyright (C) International Business Machines Corp., 2000-2005
 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 *	jfs_txnmgr.c: transaction manager
 *
 * notes:
 * transaction starts with txBegin() and ends with txCommit()
 * or txAbort().
 *
 * tlock is acquired at the time of update;
 * (obviate scan at commit time for xtree and dtree)
 * tlock and mp point to each other;
 * (no hashlist for mp -> tlock).
 *
 * special cases:
 * tlock on in-memory inode:
 * in-place tlock in the in-memory inode itself;
 * converted to page lock by iWrite() at commit time.
 *
 * tlock during write()/mmap() under anonymous transaction (tid = 0):
 * transferred (?) to transaction at commit time.
 *
 * use the page itself to update allocation maps
 * (obviate intermediate replication of allocation/deallocation data)
 * hold on to mp+lock thru update of maps
 */
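
/*
 * Illustrative sketch (not a verbatim call site; names simplified): the
 * typical life cycle of a synchronous transaction built from the
 * primitives below:
 *
 *	tid = txBegin(ip->i_sb, 0);
 *	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
 *	... update the page under the tlock ...
 *	rc = txCommit(tid, 1, &ip, 0);	(or txAbort(tid, 1) on error)
 *	txEnd(tid);
 */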

#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/freezer.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
#include "jfs_dinode.h"
#include "jfs_imap.h"
#include "jfs_dmap.h"
#include "jfs_superblock.h"
#include "jfs_debug.h"

/*
 *	transaction management structures
 */
static struct {
	int freetid;		/* index of a free tid structure */
	int freelock;		/* index of first free lock word */
	wait_queue_head_t freewait;	/* eventlist of free tblock */
	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
	int tlocksInUse;	/* Number of tlocks in use */
	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
/*	struct tblock *sync_queue; * Transactions waiting for data sync */
	struct list_head unlock_queue;	/* Txns waiting to be released */
	struct list_head anon_list;	/* inodes having anonymous txns */
	struct list_head anon_list2;	/* inodes having anonymous txns
					   that couldn't be sync'ed */
} TxAnchor;

int jfs_tlocks_low;		/* Indicates low number of available tlocks */

#ifdef CONFIG_JFS_STATISTICS
static struct {
	uint txBegin;
	uint txBegin_barrier;
	uint txBegin_lockslow;
	uint txBegin_freetid;
	uint txBeginAnon;
	uint txBeginAnon_barrier;
	uint txBeginAnon_lockslow;
	uint txLockAlloc;
	uint txLockAlloc_freelock;
} TxStat;
#endif

static int nTxBlock = -1;	/* number of transaction blocks */
module_param(nTxBlock, int, 0);
MODULE_PARM_DESC(nTxBlock,
		 "Number of transaction blocks (max:65536)");

static int nTxLock = -1;	/* number of transaction locks */
module_param(nTxLock, int, 0);
MODULE_PARM_DESC(nTxLock,
		 "Number of transaction locks (max:65536)");

struct tblock *TxBlock;	/* transaction block table */
static int TxLockLWM;	/* Low water mark for number of txLocks used */
static int TxLockHWM;	/* High water mark for number of txLocks used */
static int TxLockVHWM;	/* Very High water mark */
struct tlock *TxLock;	/* transaction lock table */

/*
 *	transaction management lock
 */
static DEFINE_SPINLOCK(jfsTxnLock);

#define TXN_LOCK()		spin_lock(&jfsTxnLock)
#define TXN_UNLOCK()		spin_unlock(&jfsTxnLock)

#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)

static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
static int jfs_commit_thread_waking;

/*
 * Retry logic exists outside these macros to protect against spurious wakeups.
 */
static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(event, &wait);
	set_current_state(TASK_UNINTERRUPTIBLE);
	TXN_UNLOCK();
	io_schedule();
	remove_wait_queue(event, &wait);
}

#define TXN_SLEEP(event)\
{\
	TXN_SLEEP_DROP_LOCK(event);\
	TXN_LOCK();\
}

#define TXN_WAKEUP(event) wake_up_all(event)
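
/*
 * A minimal usage sketch: callers re-test their wait condition in a loop
 * around TXN_SLEEP(), e.g. as txLockAlloc() does below:
 *
 *	while (!(lid = TxAnchor.freelock))
 *		TXN_SLEEP(&TxAnchor.freelockwait);
 */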

/*
 *	statistics
 */
static struct {
	tid_t maxtid;		/* 4: biggest tid ever used */
	lid_t maxlid;		/* 4: biggest lid ever used */
	int ntid;		/* 4: # of transactions performed */
	int nlid;		/* 4: # of tlocks acquired */
	int waitlock;		/* 4: # of tlock wait */
} stattx;

/*
 * forward references
 */
static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck, struct commit * cd);
static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck);
static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck);
static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck);
static void txAllocPMap(struct inode *ip, struct maplock * maplock,
		struct tblock * tblk);
static void txForce(struct tblock * tblk);
static int txLog(struct jfs_log * log, struct tblock * tblk,
		struct commit * cd);
static void txUpdateMap(struct tblock * tblk);
static void txRelease(struct tblock * tblk);
static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	   struct tlock * tlck);
static void LogSyncRelease(struct metapage * mp);

/*
 *		transaction block/lock management
 *		---------------------------------
 */

/*
 * Get a transaction lock from the free list.  If the number in use is
 * greater than the high water mark, wake up the sync daemon.  This should
 * free some anonymous transaction locks.  (TXN_LOCK must be held.)
 */
static lid_t txLockAlloc(void)
{
	lid_t lid;

	INCREMENT(TxStat.txLockAlloc);
	if (!TxAnchor.freelock) {
		INCREMENT(TxStat.txLockAlloc_freelock);
	}

	while (!(lid = TxAnchor.freelock))
		TXN_SLEEP(&TxAnchor.freelockwait);
	TxAnchor.freelock = TxLock[lid].next;
	HIGHWATERMARK(stattx.maxlid, lid);
	if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) {
		jfs_info("txLockAlloc tlocks low");
		jfs_tlocks_low = 1;
		wake_up_process(jfsSyncThread);
	}

	return lid;
}

static void txLockFree(lid_t lid)
{
	TxLock[lid].tid = 0;
	TxLock[lid].next = TxAnchor.freelock;
	TxAnchor.freelock = lid;
	TxAnchor.tlocksInUse--;
	if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) {
		jfs_info("txLockFree jfs_tlocks_low no more");
		jfs_tlocks_low = 0;
		TXN_WAKEUP(&TxAnchor.lowlockwait);
	}
	TXN_WAKEUP(&TxAnchor.freelockwait);
}

/*
 * NAME:	txInit()
 *
 * FUNCTION:	initialize transaction management structures
 *
 * RETURN:
 *
 * serialization: single thread at jfs_init()
 */
int txInit(void)
{
	int k, size;
	struct sysinfo si;

	/* Set defaults for nTxLock and nTxBlock if unset */

	if (nTxLock == -1) {
		if (nTxBlock == -1) {
			/* Base default on memory size */
			si_meminfo(&si);
			if (si.totalram > (256 * 1024)) /* 1 GB */
				nTxLock = 64 * 1024;
			else
				nTxLock = si.totalram >> 2;
		} else if (nTxBlock > (8 * 1024))
			nTxLock = 64 * 1024;
		else
			nTxLock = nTxBlock << 3;
	}
	if (nTxBlock == -1)
		nTxBlock = nTxLock >> 3;

	/* Verify tunable parameters */
	if (nTxBlock < 16)
		nTxBlock = 16;	/* No one should set it this low */
	if (nTxBlock > 65536)
		nTxBlock = 65536;
	if (nTxLock < 256)
		nTxLock = 256;	/* No one should set it this low */
	if (nTxLock > 65536)
		nTxLock = 65536;

	printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
	       nTxBlock, nTxLock);
	/*
	 * initialize transaction block (tblock) table
	 *
	 * transaction id (tid) = tblock index
	 * tid = 0 is reserved.
	 */
	TxLockLWM = (nTxLock * 4) / 10;
	TxLockHWM = (nTxLock * 7) / 10;
	TxLockVHWM = (nTxLock * 8) / 10;
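
	/*
	 * Worked example (illustrative, assuming 4K pages): a 512 MB
	 * machine has si.totalram = 131072, so nTxLock = 131072 >> 2 =
	 * 32768 and nTxBlock = 32768 >> 3 = 4096; the watermarks then
	 * come out to 13107 (LWM), 22937 (HWM) and 26214 (VHWM).
	 */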

	size = sizeof(struct tblock) * nTxBlock;
	TxBlock = vmalloc(size);
	if (TxBlock == NULL)
		return -ENOMEM;

	for (k = 1; k < nTxBlock - 1; k++) {
		TxBlock[k].next = k + 1;
		init_waitqueue_head(&TxBlock[k].gcwait);
		init_waitqueue_head(&TxBlock[k].waitor);
	}
	TxBlock[k].next = 0;
	init_waitqueue_head(&TxBlock[k].gcwait);
	init_waitqueue_head(&TxBlock[k].waitor);

	TxAnchor.freetid = 1;
	init_waitqueue_head(&TxAnchor.freewait);

	stattx.maxtid = 1;	/* statistics */

	/*
	 * initialize transaction lock (tlock) table
	 *
	 * transaction lock id = tlock index
	 * tlock id = 0 is reserved.
	 */
	size = sizeof(struct tlock) * nTxLock;
	TxLock = vmalloc(size);
	if (TxLock == NULL) {
		vfree(TxBlock);
		return -ENOMEM;
	}

	/* initialize tlock table */
	for (k = 1; k < nTxLock - 1; k++)
		TxLock[k].next = k + 1;
	TxLock[k].next = 0;
	init_waitqueue_head(&TxAnchor.freelockwait);
	init_waitqueue_head(&TxAnchor.lowlockwait);

	TxAnchor.freelock = 1;
	TxAnchor.tlocksInUse = 0;
	INIT_LIST_HEAD(&TxAnchor.anon_list);
	INIT_LIST_HEAD(&TxAnchor.anon_list2);

	LAZY_LOCK_INIT();
	INIT_LIST_HEAD(&TxAnchor.unlock_queue);

	stattx.maxlid = 1;	/* statistics */

	return 0;
}

/*
 * NAME:	txExit()
 *
 * FUNCTION:	clean up when module is unloaded
 */
void txExit(void)
{
	vfree(TxLock);
	TxLock = NULL;
	vfree(TxBlock);
	TxBlock = NULL;
}

/*
 * NAME:	txBegin()
 *
 * FUNCTION:	start a transaction.
 *
 * PARAMETER:	sb	- superblock
 *		flag	- force for nested tx;
 *
 * RETURN:	tid	- transaction id
 *
 * note: the force flag allows a tx to be started for a nested tx
 * to prevent deadlock on the logsync barrier;
 */
tid_t txBegin(struct super_block *sb, int flag)
{
	tid_t t;
	struct tblock *tblk;
	struct jfs_log *log;

	jfs_info("txBegin: flag = 0x%x", flag);
	log = JFS_SBI(sb)->log;

	TXN_LOCK();

	INCREMENT(TxStat.txBegin);

      retry:
	if (!(flag & COMMIT_FORCE)) {
		/*
		 * synchronize with logsync barrier
		 */
		if (test_bit(log_SYNCBARRIER, &log->flag) ||
		    test_bit(log_QUIESCE, &log->flag)) {
			INCREMENT(TxStat.txBegin_barrier);
			TXN_SLEEP(&log->syncwait);
			goto retry;
		}
	}
	if (flag == 0) {
		/*
		 * Don't begin transaction if we're getting starved for tlocks
		 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
		 * free tlocks)
		 */
		if (TxAnchor.tlocksInUse > TxLockVHWM) {
			INCREMENT(TxStat.txBegin_lockslow);
			TXN_SLEEP(&TxAnchor.lowlockwait);
			goto retry;
		}
	}

	/*
	 * allocate transaction id/block
	 */
	if ((t = TxAnchor.freetid) == 0) {
		jfs_info("txBegin: waiting for free tid");
		INCREMENT(TxStat.txBegin_freetid);
		TXN_SLEEP(&TxAnchor.freewait);
		goto retry;
	}

	tblk = tid_to_tblock(t);

	if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
		/* Don't let a non-forced transaction take the last tblk */
		jfs_info("txBegin: waiting for free tid");
		INCREMENT(TxStat.txBegin_freetid);
		TXN_SLEEP(&TxAnchor.freewait);
		goto retry;
	}

	TxAnchor.freetid = tblk->next;

	/*
	 * initialize transaction
	 */

	/*
	 * We can't zero the whole thing or we screw up another thread being
	 * awakened after sleeping on tblk->waitor
	 *
	 * memset(tblk, 0, sizeof(struct tblock));
	 */
	tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;

	tblk->sb = sb;
	++log->logtid;
	tblk->logtid = log->logtid;

	++log->active;

	HIGHWATERMARK(stattx.maxtid, t);	/* statistics */
	INCREMENT(stattx.ntid);	/* statistics */

	TXN_UNLOCK();

	jfs_info("txBegin: returning tid = %d", t);

	return t;
}

/*
 * NAME:	txBeginAnon()
 *
 * FUNCTION:	start an anonymous transaction.
 *		Blocks if logsync or available tlocks are low to prevent
 *		anonymous tlocks from depleting supply.
 *
 * PARAMETER:	sb	- superblock
 *
 * RETURN:	none
 */
void txBeginAnon(struct super_block *sb)
{
	struct jfs_log *log;

	log = JFS_SBI(sb)->log;

	TXN_LOCK();
	INCREMENT(TxStat.txBeginAnon);

      retry:
	/*
	 * synchronize with logsync barrier
	 */
	if (test_bit(log_SYNCBARRIER, &log->flag) ||
	    test_bit(log_QUIESCE, &log->flag)) {
		INCREMENT(TxStat.txBeginAnon_barrier);
		TXN_SLEEP(&log->syncwait);
		goto retry;
	}

	/*
	 * Don't begin transaction if we're getting starved for tlocks
	 */
	if (TxAnchor.tlocksInUse > TxLockVHWM) {
		INCREMENT(TxStat.txBeginAnon_lockslow);
		TXN_SLEEP(&TxAnchor.lowlockwait);
		goto retry;
	}
	TXN_UNLOCK();
}
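
/*
 * Illustrative sketch (an assumption based on the notes at the top of
 * this file, not a verbatim call site): an anonymous update throttles
 * itself first, then takes its tlocks under tid = 0 so they sit on the
 * inode's anonymous tlock list until a later txCommit() inherits them:
 *
 *	txBeginAnon(ip->i_sb);
 *	tlck = txLock(0, ip, mp, tlckXTREE | tlckGROW);
 */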

/*
 *	txEnd()
 *
 * function: free specified transaction block.
 *
 *	logsync barrier processing:
 *
 * serialization:
 */
void txEnd(tid_t tid)
{
	struct tblock *tblk = tid_to_tblock(tid);
	struct jfs_log *log;

	jfs_info("txEnd: tid = %d", tid);
	TXN_LOCK();

	/*
	 * wakeup transactions waiting on the page locked
	 * by the current transaction
	 */
	TXN_WAKEUP(&tblk->waitor);

	log = JFS_SBI(tblk->sb)->log;

	/*
	 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
	 * otherwise, we would be left with a transaction that may have been
	 * reused.
	 *
	 * Lazy commit thread will turn off tblkGC_LAZY before calling this
	 * routine.
	 */
	if (tblk->flag & tblkGC_LAZY) {
		jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
		TXN_UNLOCK();

		spin_lock_irq(&log->gclock);	// LOGGC_LOCK
		tblk->flag |= tblkGC_UNLOCKED;
		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
		return;
	}

	jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);

	assert(tblk->next == 0);

	/*
	 * insert tblock back on freelist
	 */
	tblk->next = TxAnchor.freetid;
	TxAnchor.freetid = tid;

	/*
	 * mark the tblock not active
	 */
	if (--log->active == 0) {
		clear_bit(log_FLUSH, &log->flag);

		/*
		 * synchronize with logsync barrier
		 */
		if (test_bit(log_SYNCBARRIER, &log->flag)) {
			TXN_UNLOCK();

			/* write dirty metadata & forward log syncpt */
			jfs_syncpt(log, 1);

			jfs_info("log barrier off: 0x%x", log->lsn);

			/* enable new transactions start */
			clear_bit(log_SYNCBARRIER, &log->flag);

			/* wakeup all waiters for logsync barrier */
			TXN_WAKEUP(&log->syncwait);

			goto wakeup;
		}
	}

	TXN_UNLOCK();
wakeup:
	/*
	 * wakeup all waiters for a free tblock
	 */
	TXN_WAKEUP(&TxAnchor.freewait);
}

/*
 *	txLock()
 *
 * function: acquire a transaction lock on the specified <mp>
 *
 * parameter:
 *
 * return:	transaction lock id
 *
 * serialization:
 */
struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
		     int type)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	int dir_xtree = 0;
	lid_t lid;
	tid_t xtid;
	struct tlock *tlck;
	struct xtlock *xtlck;
	struct linelock *linelock;
	xtpage_t *p;
	struct tblock *tblk;

	TXN_LOCK();

	if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
	    !(mp->xflag & COMMIT_PAGE)) {
		/*
		 * Directory inode is special.  It can have both an xtree tlock
		 * and a dtree tlock associated with it.
		 */
		dir_xtree = 1;
		lid = jfs_ip->xtlid;
	} else
		lid = mp->lid;

	/* is page not locked by a transaction ? */
	if (lid == 0)
		goto allocateLock;

	jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);

	/* is page locked by the requester transaction ? */
	tlck = lid_to_tlock(lid);
	if ((xtid = tlck->tid) == tid) {
		TXN_UNLOCK();
		goto grantLock;
	}

	/*
	 * is page locked by anonymous transaction/lock ?
	 *
	 * (page update without transaction (i.e., file write) is
	 * locked under anonymous transaction tid = 0:
	 * anonymous tlocks maintained on anonymous tlock list of
	 * the inode of the page and available to all anonymous
	 * transactions until txCommit() time at which point
	 * they are transferred to the transaction tlock list of
	 * the committing transaction of the inode)
	 */
	if (xtid == 0) {
		tlck->tid = tid;
		TXN_UNLOCK();
		tblk = tid_to_tblock(tid);
		/*
		 * The order of the tlocks in the transaction is important
		 * (during truncate, child xtree pages must be freed before
		 * parent's tlocks change the working map).
		 * Take tlock off anonymous list and add to tail of
		 * transaction list
		 *
		 * Note:  We really need to get rid of the tid & lid and
		 * use list_head's.  This code is getting UGLY!
		 */
		if (jfs_ip->atlhead == lid) {
			if (jfs_ip->atltail == lid) {
				/* only anonymous txn.
				 * Remove from anon_list
				 */
				TXN_LOCK();
				list_del_init(&jfs_ip->anon_inode_list);
				TXN_UNLOCK();
			}
			jfs_ip->atlhead = tlck->next;
		} else {
			lid_t last;
			for (last = jfs_ip->atlhead;
			     lid_to_tlock(last)->next != lid;
			     last = lid_to_tlock(last)->next) {
				assert(last);
			}
			lid_to_tlock(last)->next = tlck->next;
			if (jfs_ip->atltail == lid)
				jfs_ip->atltail = last;
		}

		/* insert the tlock at tail of transaction tlock list */

		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;

		goto grantLock;
	}

	goto waitLock;

	/*
	 * allocate a tlock
	 */
      allocateLock:
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	/*
	 * initialize tlock
	 */
	tlck->tid = tid;

	TXN_UNLOCK();

	/* mark tlock for meta-data page */
	if (mp->xflag & COMMIT_PAGE) {

		tlck->flag = tlckPAGELOCK;

		/* mark the page dirty and nohomeok */
		metapage_nohomeok(mp);

		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
			 mp, mp->nohomeok, tid, tlck);

		/* if anonymous transaction, and buffer is on the group
		 * commit synclist, mark inode to show this.  This will
		 * prevent the buffer from being marked nohomeok for too
		 * long a time.
		 */
		if ((tid == 0) && mp->lsn)
			set_cflag(COMMIT_Synclist, ip);
	}
	/* mark tlock for in-memory inode */
	else
		tlck->flag = tlckINODELOCK;

	if (S_ISDIR(ip->i_mode))
		tlck->flag |= tlckDIRECTORY;

	tlck->type = 0;

	/* bind the tlock and the page */
	tlck->ip = ip;
	tlck->mp = mp;
	if (dir_xtree)
		jfs_ip->xtlid = lid;
	else
		mp->lid = lid;

	/*
	 * enqueue transaction lock to transaction/inode
	 */
	/* insert the tlock at tail of transaction tlock list */
	if (tid) {
		tblk = tid_to_tblock(tid);
		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;
	}
	/* anonymous transaction:
	 * insert the tlock at head of inode anonymous tlock list
	 */
	else {
		tlck->next = jfs_ip->atlhead;
		jfs_ip->atlhead = lid;
		if (tlck->next == 0) {
			/* This inode's first anonymous transaction */
			jfs_ip->atltail = lid;
			TXN_LOCK();
			list_add_tail(&jfs_ip->anon_inode_list,
				      &TxAnchor.anon_list);
			TXN_UNLOCK();
		}
	}

	/* initialize type dependent area for linelock */
	linelock = (struct linelock *) & tlck->lock;
	linelock->next = 0;
	linelock->flag = tlckLINELOCK;
	linelock->maxcnt = TLOCKSHORT;
	linelock->index = 0;

	switch (type & tlckTYPE) {
	case tlckDTREE:
		linelock->l2linesize = L2DTSLOTSIZE;
		break;

	case tlckXTREE:
		linelock->l2linesize = L2XTSLOTSIZE;

		xtlck = (struct xtlock *) linelock;
		xtlck->header.offset = 0;
		xtlck->header.length = 2;

		if (type & tlckNEW) {
			xtlck->lwm.offset = XTENTRYSTART;
		} else {
			if (mp->xflag & COMMIT_PAGE)
				p = (xtpage_t *) mp->data;
			else
				p = &jfs_ip->i_xtroot;
			xtlck->lwm.offset =
			    le16_to_cpu(p->header.nextindex);
		}
		xtlck->lwm.length = 0;	/* ! */
		xtlck->twm.offset = 0;
		xtlck->hwm.offset = 0;

		xtlck->index = 2;
		break;

	case tlckINODE:
		linelock->l2linesize = L2INODESLOTSIZE;
		break;

	case tlckDATA:
		linelock->l2linesize = L2DATASLOTSIZE;
		break;

	default:
		jfs_err("UFO tlock:0x%p", tlck);
	}

	/*
	 * update tlock vector
	 */
      grantLock:
	tlck->type |= type;

	return tlck;

	/*
	 * page is being locked by another transaction:
	 */
      waitLock:
	/* Only locks on ipimap or ipaimap should reach here */
	/* assert(jfs_ip->fileset == AGGREGATE_I); */
	if (jfs_ip->fileset != AGGREGATE_I) {
		printk(KERN_ERR "txLock: trying to lock locked page!");
		print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
			       ip, sizeof(*ip), 0);
		print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
			       mp, sizeof(*mp), 0);
		print_hex_dump(KERN_ERR, "Locker's tblock: ",
			       DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
			       sizeof(struct tblock), 0);
		print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
			       tlck, sizeof(*tlck), 0);
		BUG();
	}
	INCREMENT(stattx.waitlock);	/* statistics */
	TXN_UNLOCK();
	release_metapage(mp);
	TXN_LOCK();
	xtid = tlck->tid;	/* reacquire after dropping TXN_LOCK */

	jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
		 tid, xtid, lid);

	/* Recheck everything since dropping TXN_LOCK */
	if (xtid && (tlck->mp == mp) && (mp->lid == lid))
		TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
	else
		TXN_UNLOCK();
	jfs_info("txLock: awakened     tid = %d, lid = %d", tid, lid);

	return NULL;
}

/*
 * NAME:	txRelease()
 *
 * FUNCTION:	Release buffers associated with transaction locks, but don't
 *		mark homeok yet.  This allows other transactions to modify
 *		buffers, but won't let them go to disk until commit record
 *		actually gets written.
 *
 * PARAMETER:
 *		tblk	-
 *
 * RETURN:	Errors from subroutines.
 */
static void txRelease(struct tblock * tblk)
{
	struct metapage *mp;
	lid_t lid;
	struct tlock *tlck;

	TXN_LOCK();

	for (lid = tblk->next; lid; lid = tlck->next) {
		tlck = lid_to_tlock(lid);
		if ((mp = tlck->mp) != NULL &&
		    (tlck->type & tlckBTROOT) == 0) {
			assert(mp->xflag & COMMIT_PAGE);
			mp->lid = 0;
		}
	}

	/*
	 * wakeup transactions waiting on a page locked
	 * by the current transaction
	 */
	TXN_WAKEUP(&tblk->waitor);

	TXN_UNLOCK();
}

/*
 * NAME:	txUnlock()
 *
 * FUNCTION:	Initiates pageout of pages modified by tid in journalled
 *		objects and frees their lockwords.
 */
static void txUnlock(struct tblock * tblk)
{
	struct tlock *tlck;
	struct linelock *linelock;
	lid_t lid, next, llid, k;
	struct metapage *mp;
	struct jfs_log *log;
	int difft, diffp;
	unsigned long flags;

	jfs_info("txUnlock: tblk = 0x%p", tblk);
	log = JFS_SBI(tblk->sb)->log;

	/*
	 * mark page under tlock homeok (its log has been written):
	 */
	for (lid = tblk->next; lid; lid = next) {
		tlck = lid_to_tlock(lid);
		next = tlck->next;

		jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);

		/* unbind page from tlock */
		if ((mp = tlck->mp) != NULL &&
		    (tlck->type & tlckBTROOT) == 0) {
			assert(mp->xflag & COMMIT_PAGE);

			/* hold buffer
			 */
			hold_metapage(mp);

			assert(mp->nohomeok > 0);
			_metapage_homeok(mp);

			/* inherit younger/larger clsn */
			LOGSYNC_LOCK(log, flags);
			if (mp->clsn) {
				logdiff(difft, tblk->clsn, log);
				logdiff(diffp, mp->clsn, log);
				if (difft > diffp)
					mp->clsn = tblk->clsn;
			} else
				mp->clsn = tblk->clsn;
			LOGSYNC_UNLOCK(log, flags);

			assert(!(tlck->flag & tlckFREEPAGE));

			put_metapage(mp);
		}

		/* insert tlock, and linelock(s) of the tlock if any,
		 * at head of freelist
		 */
		TXN_LOCK();

		llid = ((struct linelock *) & tlck->lock)->next;
		while (llid) {
			linelock = (struct linelock *) lid_to_tlock(llid);
			k = linelock->next;
			txLockFree(llid);
			llid = k;
		}
		txLockFree(lid);

		TXN_UNLOCK();
	}
	tblk->next = tblk->last = 0;

	/*
	 * remove tblock from logsynclist
	 * (allocation map pages inherited the lsn of tblk and
	 * have been inserted in the logsync list at txUpdateMap())
	 */
	if (tblk->lsn) {
		LOGSYNC_LOCK(log, flags);
		log->count--;
		list_del(&tblk->synclist);
		LOGSYNC_UNLOCK(log, flags);
	}
}

/*
 *	txMaplock()
 *
 * function: allocate a transaction lock for freed page/entry;
 *	for freed page, maplock is used as xtlock/dtlock type;
 */
struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	lid_t lid;
	struct tblock *tblk;
	struct tlock *tlck;
	struct maplock *maplock;

	TXN_LOCK();

	/*
	 * allocate a tlock
	 */
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	/*
	 * initialize tlock
	 */
	tlck->tid = tid;

	/* bind the tlock and the object */
	tlck->flag = tlckINODELOCK;
	if (S_ISDIR(ip->i_mode))
		tlck->flag |= tlckDIRECTORY;
	tlck->ip = ip;
	tlck->mp = NULL;

	tlck->type = type;

	/*
	 * enqueue transaction lock to transaction/inode
	 */
	/* insert the tlock at tail of transaction tlock list */
	if (tid) {
		tblk = tid_to_tblock(tid);
		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;
	}
	/* anonymous transaction:
	 * insert the tlock at head of inode anonymous tlock list
	 */
	else {
		tlck->next = jfs_ip->atlhead;
		jfs_ip->atlhead = lid;
		if (tlck->next == 0) {
			/* This inode's first anonymous transaction */
			jfs_ip->atltail = lid;
			list_add_tail(&jfs_ip->anon_inode_list,
				      &TxAnchor.anon_list);
		}
	}

	TXN_UNLOCK();

	/* initialize type dependent area for maplock */
	maplock = (struct maplock *) & tlck->lock;
	maplock->next = 0;
	maplock->maxcnt = 0;
	maplock->index = 0;

	return tlck;
}

/*
 *	txLinelock()
 *
 * function: allocate a transaction lock for log vector list
 */
struct linelock *txLinelock(struct linelock * tlock)
{
	lid_t lid;
	struct tlock *tlck;
	struct linelock *linelock;

	TXN_LOCK();

	/* allocate a TxLock structure */
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	TXN_UNLOCK();

	/* initialize linelock */
	linelock = (struct linelock *) tlck;
	linelock->next = 0;
	linelock->flag = tlckLINELOCK;
	linelock->maxcnt = TLOCKLONG;
	linelock->index = 0;
	if (tlck->flag & tlckDIRECTORY)
		linelock->flag |= tlckDIRECTORY;

	/* append linelock after tlock */
	linelock->next = tlock->next;
	tlock->next = lid;

	return linelock;
}
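
/*
 * Sketch of the intended calling pattern (an assumption drawn from the
 * structure above, not a verbatim call site): when a tlock's built-in
 * linelock fills up, callers chain an overflow linelock after it and
 * continue logging into the new one:
 *
 *	if (linelock->index >= linelock->maxcnt)
 *		linelock = txLinelock(linelock);
 */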

/*
 *		transaction commit management
 *		-----------------------------
 */

/*
 * NAME:	txCommit()
 *
 * FUNCTION:	commit the changes to the objects specified in
 *		clist.  For journalled segments only the
 *		changes of the caller are committed, ie by tid.
 *		for non-journalled segments the data are flushed to
 *		disk and then the change to the disk inode and indirect
 *		blocks committed (so blocks newly allocated to the
 *		segment will be made a part of the segment atomically).
 *
 *		all of the segments specified in clist must be in
 *		one file system. no more than 6 segments are needed
 *		to handle all unix svcs.
 *
 *		if the i_nlink field (i.e. disk inode link count)
 *		is zero, and the type of inode is a regular file or
 *		directory, or symbolic link, the inode is truncated
 *		to zero length. the truncation is committed but the
 *		VM resources are unaffected until it is closed (see
 *		iput and iclose).
 *
 * PARAMETER:
 *
 * RETURN:
 *
 * serialization:
 *		on entry the inode lock on each segment is assumed
 *		to be held.
 *
 * i/o error:
 */
int txCommit(tid_t tid,		/* transaction identifier */
	     int nip,		/* number of inodes to commit */
	     struct inode **iplist,	/* list of inode to commit */
	     int flag)
{
	int rc = 0;
	struct commit cd;
	struct jfs_log *log;
	struct tblock *tblk;
	struct lrd *lrd;
	struct inode *ip;
	struct jfs_inode_info *jfs_ip;
	int k, n;
	ino_t top;
	struct super_block *sb;

	jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
	/* is read-only file system ? */
	if (isReadOnly(iplist[0])) {
		rc = -EROFS;
		goto TheEnd;
	}

	sb = cd.sb = iplist[0]->i_sb;
	cd.tid = tid;

	if (tid == 0)
		tid = txBegin(sb, 0);
	tblk = tid_to_tblock(tid);

	/*
	 * initialize commit structure
	 */
	log = JFS_SBI(sb)->log;
	cd.log = log;

	/* initialize log record descriptor in commit */
	lrd = &cd.lrd;
	lrd->logtid = cpu_to_le32(tblk->logtid);
	lrd->backchain = 0;

	tblk->xflag |= flag;

	if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
		tblk->xflag |= COMMIT_LAZY;
	/*
	 *	prepare non-journaled objects for commit
	 *
	 * flush data pages of non-journaled file
	 * to prevent the file getting non-initialized disk blocks
	 * in case of crash.
	 * (new blocks - )
	 */
	cd.iplist = iplist;
	cd.nip = nip;

	/*
	 *	acquire transaction lock on (on-disk) inodes
	 *
	 * update on-disk inode from in-memory inode
	 * acquiring transaction locks for AFTER records
	 * on the on-disk inode of file object
	 *
	 * sort the inodes array by inode number in descending order
	 * to prevent deadlock when acquiring transaction lock
	 * of on-disk inodes on multiple on-disk inode pages by
	 * multiple concurrent transactions
	 */
	for (k = 0; k < cd.nip; k++) {
		top = (cd.iplist[k])->i_ino;
		for (n = k + 1; n < cd.nip; n++) {
			ip = cd.iplist[n];
			if (ip->i_ino > top) {
				top = ip->i_ino;
				cd.iplist[n] = cd.iplist[k];
				cd.iplist[k] = ip;
			}
		}

		ip = cd.iplist[k];
		jfs_ip = JFS_IP(ip);

		/*
		 * BUGBUG - This code has temporarily been removed.  The
		 * intent is to ensure that any file data is written before
		 * the metadata is committed to the journal.  This prevents
		 * uninitialized data from appearing in a file after the
		 * journal has been replayed.  (The uninitialized data
		 * could be sensitive data removed by another user.)
		 *
		 * The problem now is that we are holding the IWRITELOCK
		 * on the inode, and calling filemap_fdatawrite on an
		 * unmapped page will cause a deadlock in jfs_get_block.
		 *
		 * The long term solution is to pare down the use of
		 * IWRITELOCK.  We are currently holding it too long.
		 * We could also be smarter about which data pages need
		 * to be written before the transaction is committed and
		 * when we don't need to worry about it at all.
		 *
		 * if ((!S_ISDIR(ip->i_mode))
		 *    && (tblk->flag & COMMIT_DELETE) == 0)
		 *	filemap_write_and_wait(ip->i_mapping);
		 */

		/*
		 * Mark inode as not dirty.  It will still be on the dirty
		 * inode list, but we'll know not to commit it again unless
		 * it gets marked dirty again
		 */
		clear_cflag(COMMIT_Dirty, ip);

		/* inherit anonymous tlock(s) of inode */
		if (jfs_ip->atlhead) {
			lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
			tblk->next = jfs_ip->atlhead;
			if (!tblk->last)
				tblk->last = jfs_ip->atltail;
			jfs_ip->atlhead = jfs_ip->atltail = 0;
			TXN_LOCK();
			list_del_init(&jfs_ip->anon_inode_list);
			TXN_UNLOCK();
		}

		/*
		 * acquire transaction lock on on-disk inode page
		 * (become first tlock of the tblk's tlock list)
		 */
		if (((rc = diWrite(tid, ip))))
			goto out;
	}

	/*
	 *	write log records from transaction locks
	 *
	 * txUpdateMap() resets XAD_NEW in XAD.
	 */
	if ((rc = txLog(log, tblk, &cd)))
		goto TheEnd;

	/*
	 * Ensure that inode isn't reused before
	 * lazy commit thread finishes processing
	 */
	if (tblk->xflag & COMMIT_DELETE) {
		ihold(tblk->u.ip);
		/*
		 * Avoid a rare deadlock
		 *
		 * If the inode is locked, we may be blocked in
		 * jfs_commit_inode.  If so, we don't want the
		 * lazy_commit thread doing the last iput() on the inode
		 * since that may block on the locked inode.  Instead,
		 * commit the transaction synchronously, so the last iput
		 * will be done by the calling thread (or later)
		 */
		/*
		 * I believe this code is no longer needed.  Splitting I_LOCK
		 * into two bits, I_NEW and I_SYNC should prevent this
		 * deadlock as well.  But since I don't have a JFS testload
		 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
		 * Joern
		 */
		if (tblk->u.ip->i_state & I_SYNC)
			tblk->xflag &= ~COMMIT_LAZY;
	}

	ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
	       ((tblk->u.ip->i_nlink == 0) &&
		!test_cflag(COMMIT_Nolink, tblk->u.ip)));

	/*
	 *	write COMMIT log record
	 */
	lrd->type = cpu_to_le16(LOG_COMMIT);
	lrd->length = 0;
	lmLog(log, tblk, lrd, NULL);

	lmGroupCommit(log, tblk);

	/*
	 *	- transaction is now committed -
	 */

	/*
	 * force pages in careful update
	 * (imap addressing structure update)
	 */
	if (flag & COMMIT_FORCE)
		txForce(tblk);

	/*
	 *	update allocation map.
	 *
	 * update inode allocation map and inode:
	 * free pager lock on memory object of inode if any.
	 * update block allocation map.
	 *
	 * txUpdateMap() resets XAD_NEW in XAD.
	 */
	if (tblk->xflag & COMMIT_FORCE)
		txUpdateMap(tblk);

	/*
	 *	free transaction locks and pageout/free pages
	 */
	txRelease(tblk);

	if ((tblk->flag & tblkGC_LAZY) == 0)
		txUnlock(tblk);

	/*
	 *	reset in-memory object state
	 */
	for (k = 0; k < cd.nip; k++) {
		ip = cd.iplist[k];
		jfs_ip = JFS_IP(ip);

		/*
		 * reset in-memory inode state
		 */
		jfs_ip->bxflag = 0;
		jfs_ip->blid = 0;
	}

      out:
	if (rc != 0)
		txAbort(tid, 1);

      TheEnd:
	jfs_info("txCommit: tid = %d, returning %d", tid, rc);
	return rc;
}

/*
 * NAME:	txLog()
 *
 * FUNCTION:	Writes AFTER log records for all lines modified
 *		by tid for segments specified by inodes in comdata.
 *		Code assumes only WRITELOCKS are recorded in lockwords.
 *
 * PARAMETERS:
 *
 * RETURN :
 */
static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
{
	int rc = 0;
	struct inode *ip;
	lid_t lid;
	struct tlock *tlck;
	struct lrd *lrd = &cd->lrd;

	/*
	 * write log record(s) for each tlock of transaction,
	 */
	for (lid = tblk->next; lid; lid = tlck->next) {
		tlck = lid_to_tlock(lid);

		tlck->flag |= tlckLOG;

		/* initialize lrd common */
		ip = tlck->ip;
		lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate);
		lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
		lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);

		/* write log record of page from the tlock */
		switch (tlck->type & tlckTYPE) {
		case tlckXTREE:
			xtLog(log, tblk, lrd, tlck);
			break;

		case tlckDTREE:
			dtLog(log, tblk, lrd, tlck);
			break;

		case tlckINODE:
			diLog(log, tblk, lrd, tlck, cd);
			break;

		case tlckMAP:
			mapLog(log, tblk, lrd, tlck);
			break;

		case tlckDATA:
			dataLog(log, tblk, lrd, tlck);
			break;

		default:
			jfs_err("UFO tlock:0x%p", tlck);
		}
	}

	return rc;
}

/*
 *	diLog()
 *
 * function:	log inode tlock and format maplock to update bmap;
 */
static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		 struct tlock * tlck, struct commit * cd)
{
	int rc = 0;
	struct metapage *mp;
	pxd_t *pxd;
	struct pxd_lock *pxdlock;

	mp = tlck->mp;

	/* initialize as REDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);

	pxd = &lrd->log.redopage.pxd;

	/*
	 *	inode after image
	 */
	if (tlck->type & tlckENTRY) {
		/* log after-image for logredo(): */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(pxd, mp->index);
		PXDlength(pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
	} else if (tlck->type & tlckFREE) {
		/*
		 *	free inode extent
		 *
		 * (pages of the freed inode extent have been invalidated and
		 * a maplock for free of the extent has been formatted at
		 * txLock() time);
		 *
		 * the tlock had been acquired on the inode allocation map page
		 * (iag) that specifies the freed extent, even though the map
		 * page is not itself logged, to prevent pageout of the map
		 * page before the log;
		 */

		/* log LOG_NOREDOINOEXT of the freed inode extent for
		 * logredo() to start NoRedoPage filters, and to update
		 * imap and bmap for free of the extent;
		 */
		lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
		/*
		 * For the LOG_NOREDOINOEXT record, we need
		 * to pass the IAG number and inode extent
		 * index (within that IAG) from which the
		 * extent is being released.  These have been
		 * passed to us in iplist[1] and iplist[2].
		 */
		lrd->log.noredoinoext.iagnum =
		    cpu_to_le32((u32) (size_t) cd->iplist[1]);
		lrd->log.noredoinoext.inoext_idx =
		    cpu_to_le32((u32) (size_t) cd->iplist[2]);

		pxdlock = (struct pxd_lock *) & tlck->lock;
		*pxd = pxdlock->pxd;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));

		/* update bmap */
		tlck->flag |= tlckUPDATEMAP;

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
	} else
		jfs_err("diLog: UFO type tlck:0x%p", tlck);
#ifdef  _JFS_WIP
	/*
	 *	alloc/free external EA extent
	 *
	 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
	 * of the extent has been formatted at txLock() time;
	 */
	else {
		assert(tlck->type & tlckEA);

		/* log LOG_UPDATEMAP for logredo() to update bmap for
		 * alloc of new (and free of old) external EA extent;
		 */
		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
		pxdlock = (struct pxd_lock *) & tlck->lock;
		nlock = pxdlock->index;
		for (i = 0; i < nlock; i++, pxdlock++) {
			if (pxdlock->flag & mlckALLOCPXD)
				lrd->log.updatemap.type =
				    cpu_to_le16(LOG_ALLOCPXD);
			else
				lrd->log.updatemap.type =
				    cpu_to_le16(LOG_FREEPXD);
			lrd->log.updatemap.nxd = cpu_to_le16(1);
			lrd->log.updatemap.pxd = pxdlock->pxd;
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
		}

		/* update bmap */
		tlck->flag |= tlckUPDATEMAP;
	}
#endif				/* _JFS_WIP */

	return rc;
}

/*
 *	dataLog()
 *
 * function:	log data tlock
 */
static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	    struct tlock * tlck)
{
	struct metapage *mp;
	pxd_t *pxd;

	mp = tlck->mp;

	/* initialize as REDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);

	pxd = &lrd->log.redopage.pxd;

	/* log after-image for logredo(): */
	lrd->type = cpu_to_le16(LOG_REDOPAGE);

	if (jfs_dirtable_inline(tlck->ip)) {
		/*
		 * The table has been truncated; we must have deleted
		 * the last entry, so don't bother logging this
		 */
		mp->lid = 0;
		grab_metapage(mp);
		metapage_homeok(mp);
		discard_metapage(mp);
		tlck->mp = NULL;
		return 0;
	}

	PXDaddress(pxd, mp->index);
	PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);

	lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

	/* mark page as homeward bound */
	tlck->flag |= tlckWRITEPAGE;

	return 0;
}

/*
 *	dtLog()
 *
 * function:	log dtree tlock and format maplock to update bmap;
 */
static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	   struct tlock * tlck)
{
	struct metapage *mp;
	struct pxd_lock *pxdlock;
	pxd_t *pxd;

	mp = tlck->mp;

	/* initialize as REDOPAGE/NOREDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);

	pxd = &lrd->log.redopage.pxd;

	if (tlck->type & tlckBTROOT)
		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);

	/*
	 *	page extension via relocation: entry insertion;
	 *	page extension in-place: entry insertion;
	 *	new right page from page split, reinitialized in-line
	 *	root from root page split: entry insertion;
	 */
	if (tlck->type & (tlckNEW | tlckEXTEND)) {
		/* log after-image of the new page for logredo():
		 * mark log (LOG_NEW) for logredo() to initialize
		 * freelist and update bmap for alloc of the new page;
		 */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		if (tlck->type & tlckEXTEND)
			lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
		else
			lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
		PXDaddress(pxd, mp->index);
		PXDlength(pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* format a maplock for txUpdateMap() to update bPMAP for
		 * alloc of the new page;
		 */
		if (tlck->type & tlckBTROOT)
			return;
		tlck->flag |= tlckUPDATEMAP;
		pxdlock = (struct pxd_lock *) & tlck->lock;
		pxdlock->flag = mlckALLOCPXD;
		pxdlock->pxd = *pxd;

		pxdlock->index = 1;

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
		return;
	}

	/*
	 *	entry insertion/deletion,
	 *	sibling page link update (old right page before split);
	 */
	if (tlck->type & (tlckENTRY | tlckRELINK)) {
		/* log after-image for logredo(): */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(pxd, mp->index);
		PXDlength(pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
		return;
	}

	/*
	 *	page deletion: page has been invalidated
	 *	page relocation: source extent
	 *
	 *	a maplock for free of the page has been formatted
	 *	at txLock() time);
	 */
	if (tlck->type & (tlckFREE | tlckRELOCATE)) {
		/* log LOG_NOREDOPAGE of the deleted page for logredo()
		 * to start NoRedoPage filter and to update bmap for free
		 * of the deleted page
		 */
		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
		pxdlock = (struct pxd_lock *) & tlck->lock;
		*pxd = pxdlock->pxd;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));

		/* a maplock for txUpdateMap() for free of the page
		 * has been formatted at txLock() time;
		 */
		tlck->flag |= tlckUPDATEMAP;
	}
	return;
}

/*
 *	xtLog()
 *
 * function:	log xtree tlock and format maplock to update bmap;
 */
static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	   struct tlock * tlck)
{
	struct inode *ip;
	struct metapage *mp;
	xtpage_t *p;
	struct xtlock *xtlck;
	struct maplock *maplock;
	struct xdlistlock *xadlock;
	struct pxd_lock *pxdlock;
	pxd_t *page_pxd;
	int next, lwm, hwm;

	ip = tlck->ip;
	mp = tlck->mp;

	/* initialize as REDOPAGE/NOREDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);

	page_pxd = &lrd->log.redopage.pxd;

	if (tlck->type & tlckBTROOT) {
		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
		p = &JFS_IP(ip)->i_xtroot;
		if (S_ISDIR(ip->i_mode))
			lrd->log.redopage.type |=
			    cpu_to_le16(LOG_DIR_XTREE);
	} else
		p = (xtpage_t *) mp->data;
	next = le16_to_cpu(p->header.nextindex);

	xtlck = (struct xtlock *) & tlck->lock;

	maplock = (struct maplock *) & tlck->lock;
	xadlock = (struct xdlistlock *) maplock;

	/*
	 *	entry insertion/extension;
	 *	sibling page link update (old right page before split);
	 */
	if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
		/* log after-image for logredo():
		 * logredo() will update bmap for alloc of new/extended
		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
		 * after-image of XADlist;
		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
		 * applying the after-image to the meta-data page.
		 */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(page_pxd, mp->index);
		PXDlength(page_pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* format a maplock for txUpdateMap() to update bPMAP
		 * for alloc of new/extended extents of XAD[lwm:next)
		 * from the page itself;
		 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
		 */
		lwm = xtlck->lwm.offset;
		if (lwm == 0)
			lwm = XTPAGEMAXSLOT;

		if (lwm == next)
			goto out;
		if (lwm > next) {
			jfs_err("xtLog: lwm > next");
			goto out;
		}
		tlck->flag |= tlckUPDATEMAP;
		xadlock->flag = mlckALLOCXADLIST;
		xadlock->count = next - lwm;
		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
			int i;
			pxd_t *pxd;
			/*
			 * Lazy commit may allow xtree to be modified before
			 * txUpdateMap runs.  Copy xad into linelock to
			 * preserve correct data.
			 *
			 * We can fit twice as many pxd's as xads in the lock
			 */
			xadlock->flag = mlckALLOCPXDLIST;
			pxd = xadlock->xdlist = &xtlck->pxdlock;
			for (i = 0; i < xadlock->count; i++) {
				PXDaddress(pxd, addressXAD(&p->xad[lwm + i]));
				PXDlength(pxd, lengthXAD(&p->xad[lwm + i]));
				p->xad[lwm + i].flag &=
				    ~(XAD_NEW | XAD_EXTENDED);
				pxd++;
			}
		} else {
			/*
			 * xdlist will point into the inode's xtree, ensure
			 * that transaction is not committed lazily.
			 */
			xadlock->flag = mlckALLOCXADLIST;
			xadlock->xdlist = &p->xad[lwm];
			tblk->xflag &= ~COMMIT_LAZY;
		}
		jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d "
			 "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count);

		maplock->index = 1;

	      out:
		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;

		return;
	}

	/*
	 *	page deletion: file deletion/truncation (ref. xtTruncate())
	 *
	 * (page will be invalidated after log is written and bmap
	 * is updated from the page);
	 */
	if (tlck->type & tlckFREE) {
		/* LOG_NOREDOPAGE log for NoRedoPage filter:
		 * if page free from file delete, NoRedoFile filter from
		 * inode image of zero link count will subsume NoRedoPage
		 * filters for each page;
		 * if page free from file truncation, write NoRedoPage
		 * filter;
		 *
		 * update of block allocation map for the page itself:
		 * if page free from deletion and truncation, LOG_UPDATEMAP
		 * log for the page itself is generated from processing
		 * its parent page xad entries;
		 */
		/* if page free from file truncation, log LOG_NOREDOPAGE
		 * of the deleted page for logredo() to start NoRedoPage
		 * filter for the page;
		 */
		if (tblk->xflag & COMMIT_TRUNCATE) {
			/* write NOREDOPAGE for the page */
			lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
			PXDaddress(page_pxd, mp->index);
			PXDlength(page_pxd,
				  mp->logical_size >> tblk->sb->
				  s_blocksize_bits);
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));

			if (tlck->type & tlckBTROOT) {
				/* Empty xtree must be logged */
				lrd->type = cpu_to_le16(LOG_REDOPAGE);
				lrd->backchain =
				    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
			}
		}

		/* init LOG_UPDATEMAP of the freed extents
		 * XAD[XTENTRYSTART:hwm) from the deleted page itself
		 * for logredo() to update bmap;
		 */
		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
		xtlck = (struct xtlock *) & tlck->lock;
		hwm = xtlck->hwm.offset;
		lrd->log.updatemap.nxd =
		    cpu_to_le16(hwm - XTENTRYSTART + 1);
		/* reformat linelock for lmLog() */
		xtlck->header.offset = XTENTRYSTART;
		xtlck->header.length = hwm - XTENTRYSTART + 1;
		xtlck->index = 1;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* format a maplock for txUpdateMap() to update bmap
		 * to free extents of XAD[XTENTRYSTART:hwm) from the
		 * deleted page itself;
		 */
		tlck->flag |= tlckUPDATEMAP;
		xadlock->count = hwm - XTENTRYSTART + 1;
		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
			int i;
			pxd_t *pxd;
			/*
			 * Lazy commit may allow xtree to be modified before
			 * txUpdateMap runs.  Copy xad into linelock to
			 * preserve correct data.
			 *
			 * We can fit twice as many pxd's as xads in the lock
			 */
			xadlock->flag = mlckFREEPXDLIST;
			pxd = xadlock->xdlist = &xtlck->pxdlock;
			for (i = 0; i < xadlock->count; i++) {
				PXDaddress(pxd,
					addressXAD(&p->xad[XTENTRYSTART + i]));
				PXDlength(pxd,
					lengthXAD(&p->xad[XTENTRYSTART + i]));
				pxd++;
			}
		} else {
			/*
			 * xdlist will point into the inode's xtree, ensure
			 * that transaction is not committed lazily.
			 */
			xadlock->flag = mlckFREEXADLIST;
			xadlock->xdlist = &p->xad[XTENTRYSTART];
			tblk->xflag &= ~COMMIT_LAZY;
		}
		jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
			 tlck->ip, mp, xadlock->count);

		maplock->index = 1;

		/* mark page as invalid */
		if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
		    && !(tlck->type & tlckBTROOT))
			tlck->flag |= tlckFREEPAGE;
		/*
		   else (tblk->xflag & COMMIT_PMAP)
		   ? release the page;
		 */
		return;
	}

	/*
	 *	page/entry truncation: file truncation (ref. xtTruncate())
	 *
	 *	|----------+------+------+---------------|
	 *		   |      |      |
	 *		   |      |     hwm - hwm before truncation
	 *		   |     next - truncation point
	 *		  lwm - lwm before truncation
	 * header ?
	 */
	if (tlck->type & tlckTRUNCATE) {
		/* This odd declaration suppresses a bogus gcc warning */
		pxd_t pxd = pxd;	/* truncated extent of xad */
		int twm;

		/*
		 * For truncation the entire linelock may be used, so it would
		 * be difficult to store xad list in linelock itself.
		 * Therefore, we'll just force transaction to be committed
		 * synchronously, so that xtree pages won't be changed before
		 * txUpdateMap runs.
		 */
		tblk->xflag &= ~COMMIT_LAZY;
		lwm = xtlck->lwm.offset;
		if (lwm == 0)
			lwm = XTPAGEMAXSLOT;
		hwm = xtlck->hwm.offset;
		twm = xtlck->twm.offset;

		/*
		 *	write log records
		 */
		/* log after-image for logredo():
		 *
		 * logredo() will update bmap for alloc of new/extended
		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
		 * after-image of XADlist;
		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
		 * applying the after-image to the meta-data page.
		 */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(page_pxd, mp->index);
		PXDlength(page_pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/*
		 * truncate entry XAD[twm == next - 1]:
		 */
		if (twm == next - 1) {
			/* init LOG_UPDATEMAP for logredo() to update bmap for
			 * free of truncated delta extent of the truncated
			 * entry XAD[next - 1]:
			 * (xtlck->pxdlock = truncated delta extent);
			 */
			pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
			/* assert(pxdlock->type & tlckTRUNCATE); */
			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
			lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
			lrd->log.updatemap.nxd = cpu_to_le16(1);
			lrd->log.updatemap.pxd = pxdlock->pxd;
			pxd = pxdlock->pxd;	/* save to format maplock */
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
		}

		/*
		 * free entries XAD[next:hwm]:
		 */
		if (hwm >= next) {
			/* init LOG_UPDATEMAP of the freed extents
			 * XAD[next:hwm] from the deleted page itself
			 * for logredo() to update bmap;
			 */
			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
			lrd->log.updatemap.type =
			    cpu_to_le16(LOG_FREEXADLIST);
			xtlck = (struct xtlock *) & tlck->lock;
			hwm = xtlck->hwm.offset;
			lrd->log.updatemap.nxd =
			    cpu_to_le16(hwm - next + 1);
			/* reformat linelock for lmLog() */
			xtlck->header.offset = next;
			xtlck->header.length = hwm - next + 1;
			xtlck->index = 1;
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
		}

		/*
		 *	format maplock(s) for txUpdateMap() to update bmap
		 */
		maplock->index = 0;

		/*
		 * allocate entries XAD[lwm:next):
		 */
		if (lwm < next) {
			/* format a maplock for txUpdateMap() to update bPMAP
			 * for alloc of new/extended extents of XAD[lwm:next)
			 * from the page itself;
			 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
			 */
			tlck->flag |= tlckUPDATEMAP;
			xadlock->flag = mlckALLOCXADLIST;
			xadlock->count = next - lwm;
			xadlock->xdlist = &p->xad[lwm];

			jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d "
				 "lwm:%d next:%d",
				 tlck->ip, mp, xadlock->count, lwm, next);
			maplock->index++;
			xadlock++;
		}

		/*
		 * truncate entry XAD[twm == next - 1]:
		 */
		if (twm == next - 1) {
			/* format a maplock for txUpdateMap() to update bmap
			 * to free truncated delta extent of the truncated
			 * entry XAD[next - 1];
			 * (xtlck->pxdlock = truncated delta extent);
			 */
			tlck->flag |= tlckUPDATEMAP;
			pxdlock = (struct pxd_lock *) xadlock;
			pxdlock->flag = mlckFREEPXD;
			pxdlock->count = 1;
			pxdlock->pxd = pxd;

			jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d "
				 "hwm:%d", ip, mp, pxdlock->count, hwm);
			maplock->index++;
			xadlock++;
		}

		/*
		 * free entries XAD[next:hwm]:
		 */
		if (hwm >= next) {
			/* format a maplock for txUpdateMap() to update bmap
			 * to free extents of XAD[next:hwm] from the deleted
			 * page itself;
			 */
			tlck->flag |= tlckUPDATEMAP;
			xadlock->flag = mlckFREEXADLIST;
			xadlock->count = hwm - next + 1;
			xadlock->xdlist = &p->xad[next];

			jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d "
				 "next:%d hwm:%d",
				 tlck->ip, mp, xadlock->count, next, hwm);
			maplock->index++;
		}

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
	}
	return;
}

/*
 *	mapLog()
 *
 * function:	log from maplock of freed data extents;
 */
static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		   struct tlock * tlck)
{
	struct pxd_lock *pxdlock;
	int i, nlock;
	pxd_t *pxd;

	/*
	 *	page relocation: free the source page extent
	 *
	 * a maplock for txUpdateMap() for free of the page
	 * has been formatted at txLock() time saving the src
	 * relocated page address;
	 */
	if (tlck->type & tlckRELOCATE) {
		/* log LOG_NOREDOPAGE of the old relocated page
		 * for logredo() to start NoRedoPage filter;
		 */
		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
		pxdlock = (struct pxd_lock *) & tlck->lock;
		pxd = &lrd->log.redopage.pxd;
		*pxd = pxdlock->pxd;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));

		/* (N.B. currently, logredo() does NOT update bmap
		 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
		 * if page free from relocation, LOG_UPDATEMAP log is
		 * specifically generated now for logredo()
		 * to update bmap for free of src relocated page;
		 * (new flag LOG_RELOCATE may be introduced which will
		 * inform logredo() to start NORedoPage filter and also
		 * update block allocation map at the same time, thus
		 * avoiding an extra log write);
		 */
		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
		lrd->log.updatemap.nxd = cpu_to_le16(1);
		lrd->log.updatemap.pxd = pxdlock->pxd;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));

		/* a maplock for txUpdateMap() for free of the page
		 * has been formatted at txLock() time;
		 */
		tlck->flag |= tlckUPDATEMAP;
		return;
	}
	/*
	 * Otherwise it's not a relocate request
	 */
	else {
		/* log LOG_UPDATEMAP for logredo() to update bmap for
		 * free of truncated/relocated delta extent of the data;
		 * e.g.: external EA extent, relocated/truncated extent
		 * from xtTailgate();
		 */
		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
		pxdlock = (struct pxd_lock *) & tlck->lock;
		nlock = pxdlock->index;
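		/*
		 * Note (an assumption based on how txMaplock() formats
		 * the lock area): the pxd_locks are laid out
		 * contiguously in the tlock's lock area, so pxdlock++
		 * below walks all nlock entries in order.
		 */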
		for (i = 0; i < nlock; i++, pxdlock++) {
			if (pxdlock->flag & mlckALLOCPXD)
				lrd->log.updatemap.type =
				    cpu_to_le16(LOG_ALLOCPXD);
			else
				lrd->log.updatemap.type =
				    cpu_to_le16(LOG_FREEPXD);
			lrd->log.updatemap.nxd = cpu_to_le16(1);
			lrd->log.updatemap.pxd = pxdlock->pxd;
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
			jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
				 (ulong) addressPXD(&pxdlock->pxd),
				 lengthPXD(&pxdlock->pxd));
		}

		/* update bmap */
		tlck->flag |= tlckUPDATEMAP;
	}
}

/*
 *	txEA()
 *
 * function:	acquire maplock for EA/ACL extents or
 *		set COMMIT_INLINE flag;
 */
void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
{
	struct tlock *tlck = NULL;
	struct pxd_lock *maplock = NULL, *pxdlock = NULL;

	/*
	 * format maplock for alloc of new EA extent
	 */
	if (newea) {
		/* Since the newea could be a completely zeroed entry we need to
		 * check for the two flags which indicate we should actually
		 * commit new EA data
		 */
		if (newea->flag & DXD_EXTENT) {
			tlck = txMaplock(tid, ip, tlckMAP);
			maplock = (struct pxd_lock *) & tlck->lock;
			pxdlock = (struct pxd_lock *) maplock;
			pxdlock->flag = mlckALLOCPXD;
			PXDaddress(&pxdlock->pxd, addressDXD(newea));
			PXDlength(&pxdlock->pxd, lengthDXD(newea));
			pxdlock++;
			maplock->index = 1;
		} else if (newea->flag & DXD_INLINE) {
			tlck = NULL;

			set_cflag(COMMIT_Inlineea, ip);
		}
	}

	/*
	 * format maplock for free of old EA extent
	 */
	if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
		if (tlck == NULL) {
			tlck = txMaplock(tid, ip, tlckMAP);
			maplock = (struct pxd_lock *) & tlck->lock;
			pxdlock = (struct pxd_lock *) maplock;
			maplock->index = 0;
		}
		pxdlock->flag = mlckFREEPXD;
		PXDaddress(&pxdlock->pxd, addressDXD(oldea));
		PXDlength(&pxdlock->pxd, lengthDXD(oldea));
		maplock->index++;
	}
}
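
/*
 * Hypothetical caller sketch (illustration only, not from this file):
 * when an extended attribute list is replaced, the setter passes both
 * descriptors so the new extent is allocated and the old one freed in
 * the same transaction:
 *
 *	txEA(tid, ip, &JFS_IP(ip)->ea, &new_ea);
 *	JFS_IP(ip)->ea = new_ea;
 */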

/*
 *	txForce()
 *
 * function: synchronously write pages locked by transaction
 *	     after txLog() but before txUpdateMap();
 */
static void txForce(struct tblock * tblk)
{
	struct tlock *tlck;
	lid_t lid, next;
	struct metapage *mp;

	/*
	 * reverse the order of transaction tlocks in
	 * careful update order of address index pages
	 * (right to left, bottom up)
	 */
	tlck = lid_to_tlock(tblk->next);
	lid = tlck->next;
	tlck->next = 0;
	while (lid) {
		tlck = lid_to_tlock(lid);
		next = tlck->next;
		tlck->next = tblk->next;
		tblk->next = lid;
		lid = next;
	}
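
	/*
	 * Illustrative note (not from the original source): the loop
	 * above is a standard singly-linked-list reversal.  If the
	 * chain was A -> B -> C on entry, it is C -> B -> A on exit,
	 * so the writes below proceed in reverse (bottom-up) order.
	 */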

	/*
	 * synchronously write the page, and
	 * hold the page for txUpdateMap();
	 */
	for (lid = tblk->next; lid; lid = next) {
		tlck = lid_to_tlock(lid);
		next = tlck->next;

		if ((mp = tlck->mp) != NULL &&
		    (tlck->type & tlckBTROOT) == 0) {
			assert(mp->xflag & COMMIT_PAGE);

			if (tlck->flag & tlckWRITEPAGE) {
				tlck->flag &= ~tlckWRITEPAGE;

				/* do not release page to freelist */
				force_metapage(mp);
#if 0
				/*
				 * The "right" thing to do here is to
				 * synchronously write the metadata.
				 * With the current implementation this
				 * is hard since write_metapage requires
				 * us to kunmap & remap the page.  If we
				 * have tlocks pointing into the metadata
				 * pages, we don't want to do this.  I think
				 * we can get by with synchronously writing
				 * the pages when they are released.
				 */
				assert(mp->nohomeok);
				set_bit(META_dirty, &mp->flag);
				set_bit(META_sync, &mp->flag);
#endif
			}
		}
	}
}

/*
 *	txUpdateMap()
 *
 * function:	update persistent allocation map (and working map
 *		if appropriate);
 *
 * parameter:
 */
static void txUpdateMap(struct tblock * tblk)
{
	struct inode *ip;
	struct inode *ipimap;
	lid_t lid;
	struct tlock *tlck;
	struct maplock *maplock;
	struct pxd_lock pxdlock;
	int maptype;
	int k, nlock;
	struct metapage *mp = NULL;

	ipimap = JFS_SBI(tblk->sb)->ipimap;

	maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
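
	/*
	 * Note (added for clarity): COMMIT_PWMAP updates both the
	 * persistent map (the on-disk state that logredo() trusts
	 * after a crash) and the working map (the in-memory state the
	 * running allocator consults); COMMIT_PMAP updates only the
	 * persistent map, deferring the working-map free until the
	 * last reference to the object is released.
	 */
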
	/*
	 *	update block allocation map
	 *
	 * update allocation state in pmap (and wmap) and
	 * update lsn of the pmap page;
	 */
	/*
	 * scan each tlock/page of transaction for block allocation/free:
	 *
	 * for each tlock/page of transaction, update map.
	 *  ? are there tlocks for pmap and pwmap at the same time ?
	 */
	for (lid = tblk->next; lid; lid = tlck->next) {
		tlck = lid_to_tlock(lid);

		if ((tlck->flag & tlckUPDATEMAP) == 0)
			continue;

		if (tlck->flag & tlckFREEPAGE) {
			/*
			 * Another thread may attempt to reuse freed space
			 * immediately, so we want to get rid of the metapage
			 * before anyone else has a chance to get it.
			 * Lock metapage, update maps, then invalidate
			 * the metapage.
			 */
			mp = tlck->mp;
			ASSERT(mp->xflag & COMMIT_PAGE);
			grab_metapage(mp);
		}

		/*
		 * extent list:
		 * . in-line PXD list:
		 * . out-of-line XAD list:
		 */
		maplock = (struct maplock *) & tlck->lock;
		nlock = maplock->index;

		for (k = 0; k < nlock; k++, maplock++) {
			/*
			 * allocate blocks in persistent map:
			 *
			 * blocks have been allocated from wmap at alloc time;
			 */
			if (maplock->flag & mlckALLOC) {
				txAllocPMap(ipimap, maplock, tblk);
			}
			/*
			 * free blocks in persistent and working map:
			 * blocks will be freed in pmap and then in wmap;
			 *
			 * ? tblock specifies the PMAP/PWMAP based upon
			 * transaction
			 *
			 * free blocks in persistent map:
			 * blocks will be freed from wmap at last reference
			 * release of the object for regular files;
			 * Always free blocks from both persistent & working
			 * maps for directories
			 */
			else {	/* (maplock->flag & mlckFREE) */

				if (tlck->flag & tlckDIRECTORY)
					txFreeMap(ipimap, maplock,
						  tblk, COMMIT_PWMAP);
				else
					txFreeMap(ipimap, maplock,
						  tblk, maptype);
			}
		}
		if (tlck->flag & tlckFREEPAGE) {
			if (!(tblk->flag & tblkGC_LAZY)) {
				/* This is equivalent to txRelease */
				ASSERT(mp->lid == lid);
				tlck->mp->lid = 0;
			}
			assert(mp->nohomeok == 1);
			metapage_homeok(mp);
			discard_metapage(mp);
			tlck->mp = NULL;
		}
	}
	/*
	 *	update inode allocation map
	 *
	 * update allocation state in pmap and
	 * update lsn of the pmap page;
	 * update in-memory inode flag/state
	 *
	 * unlock mapper/write lock
	 */
	if (tblk->xflag & COMMIT_CREATE) {
		diUpdatePMap(ipimap, tblk->ino, false, tblk);
		/* update persistent block allocation map
		 * for the allocation of inode extent;
		 */
		pxdlock.flag = mlckALLOCPXD;
		pxdlock.pxd = tblk->u.ixpxd;
		pxdlock.index = 1;
		txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
	} else if (tblk->xflag & COMMIT_DELETE) {
		ip = tblk->u.ip;
		diUpdatePMap(ipimap, ip->i_ino, true, tblk);
		iput(ip);
	}
}
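
/*
 * Note (added for clarity): COMMIT_CREATE also marks the inode extent
 * (tblk->u.ixpxd) allocated in the persistent block map, so after a
 * crash logredo() sees the inode map and the block map agreeing that
 * the new inode's extent is in use; COMMIT_DELETE drops the last inode
 * reference only after the persistent inode map has been updated.
 */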

/*
 *	txAllocPMap()
 *
 * function: allocate from persistent map;
 *
 * parameter:
 *	ipbmap	-
 *	maplock	-
 *		xad list:
 *		pxd:
 *
 *	maptype -
 *		allocate from persistent map;
 *		free from persistent map;
 *		(e.g., tmp file - free from working map at release
 *		 of last reference);
 *		free from persistent and working map;
 *
 *	lsn	- log sequence number;
 */
static void txAllocPMap(struct inode *ip, struct maplock * maplock,
			struct tblock * tblk)
{
	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
	struct xdlistlock *xadlistlock;
	xad_t *xad;
	s64 xaddr;
	int xlen;
	struct pxd_lock *pxdlock;
	struct xdlistlock *pxdlistlock;
	pxd_t *pxd;
	int n;

	/*
	 * allocate from persistent map;
	 */
	if (maplock->flag & mlckALLOCXADLIST) {
		xadlistlock = (struct xdlistlock *) maplock;
		xad = xadlistlock->xdlist;
		for (n = 0; n < xadlistlock->count; n++, xad++) {
			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
				xaddr = addressXAD(xad);
				xlen = lengthXAD(xad);
				dbUpdatePMap(ipbmap, false, xaddr,
					     (s64) xlen, tblk);
				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
				jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
					 (ulong) xaddr, xlen);
			}
		}
	} else if (maplock->flag & mlckALLOCPXD) {
		pxdlock = (struct pxd_lock *) maplock;
		xaddr = addressPXD(&pxdlock->pxd);
		xlen = lengthPXD(&pxdlock->pxd);
		dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk);
		jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */

		pxdlistlock = (struct xdlistlock *) maplock;
		pxd = pxdlistlock->xdlist;
		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
			xaddr = addressPXD(pxd);
			xlen = lengthPXD(pxd);
			dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen,
				     tblk);
			jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
				 (ulong) xaddr, xlen);
		}
	}
}
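
/*
 * Note (added for clarity): only XAD_NEW/XAD_EXTENDED entries are
 * pushed to the persistent map in the xadlist case above; extents
 * committed by an earlier transaction are already marked allocated
 * there, so re-marking them would be redundant work.
 */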

/*
 *	txFreeMap()
 *
 * function:	free from persistent and/or working map;
 *
 * todo: optimization
 */
void txFreeMap(struct inode *ip,
	       struct maplock * maplock, struct tblock * tblk, int maptype)
{
	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
	struct xdlistlock *xadlistlock;
	xad_t *xad;
	s64 xaddr;
	int xlen;
	struct pxd_lock *pxdlock;
	struct xdlistlock *pxdlistlock;
	pxd_t *pxd;
	int n;

	jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
		 tblk, maplock, maptype);

	/*
	 * free from persistent map;
	 */
	if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
		if (maplock->flag & mlckFREEXADLIST) {
			xadlistlock = (struct xdlistlock *) maplock;
			xad = xadlistlock->xdlist;
			for (n = 0; n < xadlistlock->count; n++, xad++) {
				if (!(xad->flag & XAD_NEW)) {
					xaddr = addressXAD(xad);
					xlen = lengthXAD(xad);
					dbUpdatePMap(ipbmap, true, xaddr,
						     (s64) xlen, tblk);
					jfs_info("freePMap: xaddr:0x%lx "
						 "xlen:%d",
						 (ulong) xaddr, xlen);
				}
			}
		} else if (maplock->flag & mlckFREEPXD) {
			pxdlock = (struct pxd_lock *) maplock;
			xaddr = addressPXD(&pxdlock->pxd);
			xlen = lengthPXD(&pxdlock->pxd);
			dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen,
				     tblk);
			jfs_info("freePMap: xaddr:0x%lx xlen:%d",
				 (ulong) xaddr, xlen);
		} else {	/* (maplock->flag & mlckFREEPXDLIST) */

			pxdlistlock = (struct xdlistlock *) maplock;
			pxd = pxdlistlock->xdlist;
			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
				xaddr = addressPXD(pxd);
				xlen = lengthPXD(pxd);
				dbUpdatePMap(ipbmap, true, xaddr,
					     (s64) xlen, tblk);
				jfs_info("freePMap: xaddr:0x%lx xlen:%d",
					 (ulong) xaddr, xlen);
			}
		}
	}

	/*
	 * free from working map;
	 */
	if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
		if (maplock->flag & mlckFREEXADLIST) {
			xadlistlock = (struct xdlistlock *) maplock;
			xad = xadlistlock->xdlist;
			for (n = 0; n < xadlistlock->count; n++, xad++) {
				xaddr = addressXAD(xad);
				xlen = lengthXAD(xad);
				dbFree(ip, xaddr, (s64) xlen);
				xad->flag = 0;
				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
					 (ulong) xaddr, xlen);
			}
		} else if (maplock->flag & mlckFREEPXD) {
			pxdlock = (struct pxd_lock *) maplock;
			xaddr = addressPXD(&pxdlock->pxd);
			xlen = lengthPXD(&pxdlock->pxd);
			dbFree(ip, xaddr, (s64) xlen);
			jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
				 (ulong) xaddr, xlen);
		} else {	/* (maplock->flag & mlckFREEPXDLIST) */

			pxdlistlock = (struct xdlistlock *) maplock;
			pxd = pxdlistlock->xdlist;
			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
				xaddr = addressPXD(pxd);
				xlen = lengthPXD(pxd);
				dbFree(ip, xaddr, (s64) xlen);
				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
					 (ulong) xaddr, xlen);
			}
		}
	}
}

/*
 *	txFreelock()
 *
 * function:	remove tlock from inode anonymous locklist
 */
void txFreelock(struct inode *ip)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	struct tlock *xtlck, *tlck;
	lid_t xlid = 0, lid;

	if (!jfs_ip->atlhead)
		return;

	TXN_LOCK();
	xtlck = (struct tlock *) &jfs_ip->atlhead;
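
	/*
	 * Note (added for clarity): treating &jfs_ip->atlhead as a
	 * struct tlock gives the loop below a uniform "previous
	 * element" pointer.  This relies on next being the first
	 * member of struct tlock, so that xtlck->next aliases atlhead
	 * on the first iteration.
	 */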

	while ((lid = xtlck->next) != 0) {
		tlck = lid_to_tlock(lid);
		if (tlck->flag & tlckFREELOCK) {
			xtlck->next = tlck->next;
			txLockFree(lid);
		} else {
			xtlck = tlck;
			xlid = lid;
		}
	}

	if (jfs_ip->atlhead)
		jfs_ip->atltail = xlid;
	else {
		jfs_ip->atltail = 0;
		/*
		 * If inode was on anon_list, remove it
		 */
		list_del_init(&jfs_ip->anon_inode_list);
	}
	TXN_UNLOCK();
}

/*
 *	txAbort()
 *
 * function: abort tx before commit;
 *
 * frees line-locks and segment locks for all
 * segments in comdata structure.
 * Optionally sets state of file-system to FM_DIRTY in super-block.
 * The log age of in-memory page-frames for which the caller holds
 * tlocks is reset to 0 (to avoid logwrap).
 */
void txAbort(tid_t tid, int dirty)
{
	lid_t lid, next;
	struct metapage *mp;
	struct tblock *tblk = tid_to_tblock(tid);
	struct tlock *tlck;

	/*
	 * free tlocks of the transaction
	 */
	for (lid = tblk->next; lid; lid = next) {
		tlck = lid_to_tlock(lid);
		next = tlck->next;
		mp = tlck->mp;
		JFS_IP(tlck->ip)->xtlid = 0;

		if (mp) {
			mp->lid = 0;

			/*
			 * reset lsn of page to avoid logwrap:
			 *
			 * (page may have been previously committed by another
			 * transaction(s) but has not been paged, i.e.,
			 * it may be on logsync list even though it has not
			 * been logged for the current tx.)
			 */
			if (mp->xflag & COMMIT_PAGE && mp->lsn)
				LogSyncRelease(mp);
		}
		/* insert tlock at head of freelist */
		TXN_LOCK();
		txLockFree(lid);
		TXN_UNLOCK();
	}

	/* caller will free the transaction block */

	tblk->next = tblk->last = 0;

	/*
	 * mark filesystem dirty
	 */
	if (dirty)
		jfs_error(tblk->sb, "\n");

	return;
}
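
/*
 * Hypothetical caller sketch (illustration only): an error path in the
 * middle of a transaction typically calls txAbort(tid, 1) to free the
 * tlocks and flag the file system dirty, then txEnd(tid) to release
 * the transaction id.
 */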

/*
 *	txLazyCommit(void)
 *
 *	All transactions except those changing ipimap (COMMIT_FORCE) are
 *	processed by this routine.  This ensures that the inode and block
 *	allocation maps are updated in order.  For synchronous transactions,
 *	let the user thread finish processing after txUpdateMap() is called.
 */
static void txLazyCommit(struct tblock * tblk)
{
	struct jfs_log *log;

	while (((tblk->flag & tblkGC_READY) == 0) &&
	       ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
		/* We must have gotten ahead of the user thread
		 */
		jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
		yield();
	}

	jfs_info("txLazyCommit: processing tblk 0x%p", tblk);

	txUpdateMap(tblk);

	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;

	spin_lock_irq(&log->gclock);	// LOGGC_LOCK

	tblk->flag |= tblkGC_COMMITTED;

	if (tblk->flag & tblkGC_READY)
		log->gcrtc--;

	wake_up_all(&tblk->gcwait);	// LOGGC_WAKEUP

	/*
	 * Can't release log->gclock until we've tested tblk->flag
	 */
	if (tblk->flag & tblkGC_LAZY) {
		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
		txUnlock(tblk);
		tblk->flag &= ~tblkGC_LAZY;
		txEnd(tblk - TxBlock);	/* Convert back to tid */
	} else
		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK

	jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
}
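
/*
 * Note (added for clarity): tblk - TxBlock recovers the tid by pointer
 * arithmetic, since tblocks live in the global TxBlock array and a tid
 * is simply an index into it (the inverse of tid_to_tblock()).
 */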

/*
 *	jfs_lazycommit(void)
 *
 *	To be run as a kernel daemon.  If lbmIODone is called in an interrupt
 *	context, or where blocking is not wanted, this routine will process
 *	committed transactions from the unlock queue.
 */
int jfs_lazycommit(void *arg)
{
	int WorkDone;
	struct tblock *tblk;
	unsigned long flags;
	struct jfs_sb_info *sbi;

	do {
		LAZY_LOCK(flags);
		jfs_commit_thread_waking = 0;	/* OK to wake another thread */
		while (!list_empty(&TxAnchor.unlock_queue)) {
			WorkDone = 0;
			list_for_each_entry(tblk, &TxAnchor.unlock_queue,
					    cqueue) {

				sbi = JFS_SBI(tblk->sb);
				/*
				 * For each volume, the transactions must be
				 * handled in order.  If another commit thread
				 * is handling a tblk for this superblock,
				 * skip it
				 */
				if (sbi->commit_state & IN_LAZYCOMMIT)
					continue;

				sbi->commit_state |= IN_LAZYCOMMIT;
				WorkDone = 1;

				/*
				 * Remove transaction from queue
				 */
				list_del(&tblk->cqueue);

				LAZY_UNLOCK(flags);
				txLazyCommit(tblk);
				LAZY_LOCK(flags);

				sbi->commit_state &= ~IN_LAZYCOMMIT;
				/*
				 * Don't continue in the for loop.  (We can't
				 * anyway, it's unsafe!)  We want to go back to
				 * the beginning of the list.
				 */
				break;
			}

			/* If there was nothing to do, don't continue */
			if (!WorkDone)
				break;
		}
		/* In case a wakeup came while all threads were active */
		jfs_commit_thread_waking = 0;

		if (freezing(current)) {
			LAZY_UNLOCK(flags);
			try_to_freeze();
		} else {
			DECLARE_WAITQUEUE(wq, current);

			add_wait_queue(&jfs_commit_thread_wait, &wq);
			set_current_state(TASK_INTERRUPTIBLE);
			LAZY_UNLOCK(flags);
			schedule();
			remove_wait_queue(&jfs_commit_thread_wait, &wq);
		}
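		/*
		 * Note (added for clarity): the ordering above matters.
		 * The thread is queued and marked TASK_INTERRUPTIBLE
		 * before LAZY_LOCK is dropped, so a wake_up() racing
		 * with the unlock cannot be missed; schedule() then
		 * returns immediately if a wakeup already arrived.
		 */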
	} while (!kthread_should_stop());

	if (!list_empty(&TxAnchor.unlock_queue))
		jfs_err("jfs_lazycommit being killed w/pending transactions!");
	else
		jfs_info("jfs_lazycommit being killed");
	return 0;
}

void txLazyUnlock(struct tblock * tblk)
{
	unsigned long flags;

	LAZY_LOCK(flags);

	list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
	/*
	 * Don't wake up a commit thread if there is already one servicing
	 * this superblock, or if the last one we woke up hasn't started yet.
	 */
	if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) &&
	    !jfs_commit_thread_waking) {
		jfs_commit_thread_waking = 1;
		wake_up(&jfs_commit_thread_wait);
	}
	LAZY_UNLOCK(flags);
}

static void LogSyncRelease(struct metapage * mp)
{
	struct jfs_log *log = mp->log;

	assert(mp->nohomeok);
	assert(log);
	metapage_homeok(mp);
}

/*
 *	txQuiesce
 *
 *	Block all new transactions and push anonymous transactions to
 *	completion
 *
 *	This does almost the same thing as jfs_sync below.  We don't
 *	worry about deadlocking when jfs_tlocks_low is set, since we would
 *	expect jfs_sync to get us out of that jam.
 */
void txQuiesce(struct super_block *sb)
{
	struct inode *ip;
	struct jfs_inode_info *jfs_ip;
	struct jfs_log *log = JFS_SBI(sb)->log;
	tid_t tid;

	set_bit(log_QUIESCE, &log->flag);

	TXN_LOCK();
restart:
	while (!list_empty(&TxAnchor.anon_list)) {
		jfs_ip = list_entry(TxAnchor.anon_list.next,
				    struct jfs_inode_info,
				    anon_inode_list);
		ip = &jfs_ip->vfs_inode;

		/*
		 * inode will be removed from anonymous list
		 * when it is committed
		 */
		TXN_UNLOCK();
		tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
		mutex_lock(&jfs_ip->commit_mutex);
		txCommit(tid, 1, &ip, 0);
		txEnd(tid);
		mutex_unlock(&jfs_ip->commit_mutex);
		/*
		 * Just to be safe.  I don't know how
		 * long we can run without blocking
		 */
		cond_resched();
		TXN_LOCK();
	}

	/*
	 * If jfs_sync is running in parallel, there could be some inodes
	 * on anon_list2.  Let's check.
	 */
	if (!list_empty(&TxAnchor.anon_list2)) {
		list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
		INIT_LIST_HEAD(&TxAnchor.anon_list2);
		goto restart;
	}
	TXN_UNLOCK();

	/*
	 * We may need to kick off the group commit
	 */
	jfs_flush_journal(log, 0);
}

/*
 *	txResume()
 *
 * Allows transactions to start again following txQuiesce
 */
void txResume(struct super_block *sb)
{
	struct jfs_log *log = JFS_SBI(sb)->log;

	clear_bit(log_QUIESCE, &log->flag);
	TXN_WAKEUP(&log->syncwait);
}

/*
 *	jfs_sync(void)
 *
 *	To be run as a kernel daemon.  This is awakened when tlocks run low.
 *	We write any inodes that have anonymous tlocks so they will become
 *	available.
 */
int jfs_sync(void *arg)
{
	struct inode *ip;
	struct jfs_inode_info *jfs_ip;
	tid_t tid;

	do {
		/*
		 * write each inode on the anonymous inode list
		 */
		TXN_LOCK();
		while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
			jfs_ip = list_entry(TxAnchor.anon_list.next,
					    struct jfs_inode_info,
					    anon_inode_list);
			ip = &jfs_ip->vfs_inode;

			if (!igrab(ip)) {
				/*
				 * Inode is being freed
				 */
				list_del_init(&jfs_ip->anon_inode_list);
			} else if (mutex_trylock(&jfs_ip->commit_mutex)) {
				/*
				 * inode will be removed from anonymous list
				 * when it is committed
				 */
				TXN_UNLOCK();
				tid = txBegin(ip->i_sb, COMMIT_INODE);
				txCommit(tid, 1, &ip, 0);
				txEnd(tid);
				mutex_unlock(&jfs_ip->commit_mutex);

				iput(ip);
				/*
				 * Just to be safe.  I don't know how
				 * long we can run without blocking
				 */
				cond_resched();
				TXN_LOCK();
			} else {
				/* We can't get the commit mutex.  It may
				 * be held by a thread waiting for tlock's
				 * so let's not block here.  Save it to
				 * put back on the anon_list.
				 */

				/* Move from anon_list to anon_list2 */
				list_move(&jfs_ip->anon_inode_list,
					  &TxAnchor.anon_list2);

				TXN_UNLOCK();
				iput(ip);
				TXN_LOCK();
			}
		}
		/* Add anon_list2 back to anon_list */
		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);

		if (freezing(current)) {
			TXN_UNLOCK();
			try_to_freeze();
		} else {
			set_current_state(TASK_INTERRUPTIBLE);
			TXN_UNLOCK();
			schedule();
		}
	} while (!kthread_should_stop());

	jfs_info("jfs_sync being killed");
	return 0;
}

#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
{
	char *freewait;
	char *freelockwait;
	char *lowlockwait;

	freewait =
	    waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
	freelockwait =
	    waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
	lowlockwait =
	    waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";

	seq_printf(m,
		       "JFS TxAnchor\n"
		       "============\n"
		       "freetid = %d\n"
		       "freewait = %s\n"
		       "freelock = %d\n"
		       "freelockwait = %s\n"
		       "lowlockwait = %s\n"
		       "tlocksInUse = %d\n"
		       "jfs_tlocks_low = %d\n"
		       "unlock_queue is %sempty\n",
		       TxAnchor.freetid,
		       freewait,
		       TxAnchor.freelock,
		       freelockwait,
		       lowlockwait,
		       TxAnchor.tlocksInUse,
		       jfs_tlocks_low,
		       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
	return 0;
}

static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, jfs_txanchor_proc_show, NULL);
}

const struct file_operations jfs_txanchor_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= jfs_txanchor_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
static int jfs_txstats_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m,
		       "JFS TxStats\n"
		       "===========\n"
		       "calls to txBegin = %d\n"
		       "txBegin blocked by sync barrier = %d\n"
		       "txBegin blocked by tlocks low = %d\n"
		       "txBegin blocked by no free tid = %d\n"
		       "calls to txBeginAnon = %d\n"
		       "txBeginAnon blocked by sync barrier = %d\n"
		       "txBeginAnon blocked by tlocks low = %d\n"
		       "calls to txLockAlloc = %d\n"
		       "txLockAlloc blocked by no free lock = %d\n",
		       TxStat.txBegin,
		       TxStat.txBegin_barrier,
		       TxStat.txBegin_lockslow,
		       TxStat.txBegin_freetid,
		       TxStat.txBeginAnon,
		       TxStat.txBeginAnon_barrier,
		       TxStat.txBeginAnon_lockslow,
		       TxStat.txLockAlloc,
		       TxStat.txLockAlloc_freelock);
	return 0;
}

static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, jfs_txstats_proc_show, NULL);
}

const struct file_operations jfs_txstats_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= jfs_txstats_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif