/*
 * Copyright (C) 2011 Fujitsu.  All rights reserved.
 * Written by Miao Xie <miaox@cn.fujitsu.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
#include "ctree.h"

#define BTRFS_DELAYED_WRITEBACK		512
#define BTRFS_DELAYED_BACKGROUND	128
#define BTRFS_DELAYED_BATCH		16

static struct kmem_cache *delayed_node_cache;

int __init btrfs_delayed_inode_init(void)
{
	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
					sizeof(struct btrfs_delayed_node),
					0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					NULL);
	if (!delayed_node_cache)
		return -ENOMEM;
	return 0;
}

void btrfs_delayed_inode_exit(void)
{
	if (delayed_node_cache)
		kmem_cache_destroy(delayed_node_cache);
}

static inline void btrfs_init_delayed_node(
				struct btrfs_delayed_node *delayed_node,
				struct btrfs_root *root, u64 inode_id)
{
	delayed_node->root = root;
	delayed_node->inode_id = inode_id;
	atomic_set(&delayed_node->refs, 0);
	delayed_node->count = 0;
	delayed_node->flags = 0;
	delayed_node->ins_root = RB_ROOT;
	delayed_node->del_root = RB_ROOT;
	mutex_init(&delayed_node->mutex);
	delayed_node->index_cnt = 0;
	INIT_LIST_HEAD(&delayed_node->n_list);
	INIT_LIST_HEAD(&delayed_node->p_list);
	delayed_node->bytes_reserved = 0;
	memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}

static inline int btrfs_is_continuous_delayed_item(
					struct btrfs_delayed_item *item1,
					struct btrfs_delayed_item *item2)
{
	if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
	    item1->key.objectid == item2->key.objectid &&
	    item1->key.type == item2->key.type &&
	    item1->key.offset + 1 == item2->key.offset)
		return 1;
	return 0;
}

static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
							struct btrfs_root *root)
{
	return root->fs_info->delayed_root;
}

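/*
 * Look up the delayed node of an inode and take a reference on it.  The
 * node is cached in the btrfs inode after the first lookup in the root's
 * radix tree; returns NULL if the inode has no delayed node yet.
 */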
static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
{
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	struct btrfs_root *root = btrfs_inode->root;
	u64 ino = btrfs_ino(inode);
	struct btrfs_delayed_node *node;

	node = ACCESS_ONCE(btrfs_inode->delayed_node);
	if (node) {
		atomic_inc(&node->refs);
		return node;
	}

	spin_lock(&root->inode_lock);
	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
	if (node) {
		if (btrfs_inode->delayed_node) {
			atomic_inc(&node->refs);	/* can be accessed */
			BUG_ON(btrfs_inode->delayed_node != node);
			spin_unlock(&root->inode_lock);
			return node;
		}
		btrfs_inode->delayed_node = node;
		/* can be accessed and cached in the inode */
		atomic_add(2, &node->refs);
		spin_unlock(&root->inode_lock);
		return node;
	}
	spin_unlock(&root->inode_lock);

	return NULL;
}

/* Will return either the node or PTR_ERR(-ENOMEM) */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
							struct inode *inode)
{
	struct btrfs_delayed_node *node;
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	struct btrfs_root *root = btrfs_inode->root;
	u64 ino = btrfs_ino(inode);
	int ret;

again:
	node = btrfs_get_delayed_node(inode);
	if (node)
		return node;

	node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
	if (!node)
		return ERR_PTR(-ENOMEM);
	btrfs_init_delayed_node(node, root, ino);

	/* cached in the btrfs inode and can be accessed */
	atomic_add(2, &node->refs);

	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret) {
		kmem_cache_free(delayed_node_cache, node);
		return ERR_PTR(ret);
	}

	spin_lock(&root->inode_lock);
	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
	if (ret == -EEXIST) {
		spin_unlock(&root->inode_lock);
		kmem_cache_free(delayed_node_cache, node);
		radix_tree_preload_end();
		goto again;
	}
	btrfs_inode->delayed_node = node;
	spin_unlock(&root->inode_lock);
	radix_tree_preload_end();

	return node;
}

/*
 * Call it when holding delayed_node->mutex
 *
 * If mod = 1, add this node into the prepared list.
 */
static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
				     struct btrfs_delayed_node *node,
				     int mod)
{
	spin_lock(&root->lock);
	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		if (!list_empty(&node->p_list))
			list_move_tail(&node->p_list, &root->prepare_list);
		else if (mod)
			list_add_tail(&node->p_list, &root->prepare_list);
	} else {
		list_add_tail(&node->n_list, &root->node_list);
		list_add_tail(&node->p_list, &root->prepare_list);
		atomic_inc(&node->refs);	/* inserted into list */
		root->nodes++;
		set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
	}
	spin_unlock(&root->lock);
}

/* Call it when holding delayed_node->mutex */
static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
				       struct btrfs_delayed_node *node)
{
	spin_lock(&root->lock);
	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		root->nodes--;
		atomic_dec(&node->refs);	/* not in the list */
		list_del_init(&node->n_list);
		if (!list_empty(&node->p_list))
			list_del_init(&node->p_list);
		clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
	}
	spin_unlock(&root->lock);
}

static struct btrfs_delayed_node *btrfs_first_delayed_node(
			struct btrfs_delayed_root *delayed_root)
{
	struct list_head *p;
	struct btrfs_delayed_node *node = NULL;

	spin_lock(&delayed_root->lock);
	if (list_empty(&delayed_root->node_list))
		goto out;

	p = delayed_root->node_list.next;
	node = list_entry(p, struct btrfs_delayed_node, n_list);
	atomic_inc(&node->refs);
out:
	spin_unlock(&delayed_root->lock);

	return node;
}

static struct btrfs_delayed_node *btrfs_next_delayed_node(
						struct btrfs_delayed_node *node)
{
	struct btrfs_delayed_root *delayed_root;
	struct list_head *p;
	struct btrfs_delayed_node *next = NULL;

	delayed_root = node->root->fs_info->delayed_root;
	spin_lock(&delayed_root->lock);
	if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		/* not in the list */
		if (list_empty(&delayed_root->node_list))
			goto out;
		p = delayed_root->node_list.next;
	} else if (list_is_last(&node->n_list, &delayed_root->node_list))
		goto out;
	else
		p = node->n_list.next;

	next = list_entry(p, struct btrfs_delayed_node, n_list);
	atomic_inc(&next->refs);
out:
	spin_unlock(&delayed_root->lock);

	return next;
}

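/*
 * Drop one reference on a delayed node.  If it still holds items it is
 * (re)queued on the node list, otherwise it is dequeued; once the last
 * reference is gone the node is removed from the root's radix tree under
 * inode_lock and freed.
 */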
static void __btrfs_release_delayed_node(
				struct btrfs_delayed_node *delayed_node,
				int mod)
{
	struct btrfs_delayed_root *delayed_root;

	if (!delayed_node)
		return;

	delayed_root = delayed_node->root->fs_info->delayed_root;

	mutex_lock(&delayed_node->mutex);
	if (delayed_node->count)
		btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
	else
		btrfs_dequeue_delayed_node(delayed_root, delayed_node);
	mutex_unlock(&delayed_node->mutex);

	if (atomic_dec_and_test(&delayed_node->refs)) {
		bool free = false;
		struct btrfs_root *root = delayed_node->root;
		spin_lock(&root->inode_lock);
		if (atomic_read(&delayed_node->refs) == 0) {
			radix_tree_delete(&root->delayed_nodes_tree,
					  delayed_node->inode_id);
			free = true;
		}
		spin_unlock(&root->inode_lock);
		if (free)
			kmem_cache_free(delayed_node_cache, delayed_node);
	}
}

static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
{
	__btrfs_release_delayed_node(node, 0);
}

static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
					struct btrfs_delayed_root *delayed_root)
{
	struct list_head *p;
	struct btrfs_delayed_node *node = NULL;

	spin_lock(&delayed_root->lock);
	if (list_empty(&delayed_root->prepare_list))
		goto out;

	p = delayed_root->prepare_list.next;
	list_del_init(p);
	node = list_entry(p, struct btrfs_delayed_node, p_list);
	atomic_inc(&node->refs);
out:
	spin_unlock(&delayed_root->lock);

	return node;
}

static inline void btrfs_release_prepared_delayed_node(
					struct btrfs_delayed_node *node)
{
	__btrfs_release_delayed_node(node, 1);
}

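/*
 * Allocate a delayed item together with data_len bytes of payload, which is
 * stored right after the item structure.
 */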
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
{
	struct btrfs_delayed_item *item;
	item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
	if (item) {
		item->data_len = data_len;
		item->ins_or_del = 0;
		item->bytes_reserved = 0;
		item->delayed_node = NULL;
		atomic_set(&item->refs, 1);
	}
	return item;
}

/*
 * __btrfs_lookup_delayed_item - look up a delayed item by key
 * @root:	the rb-root of the delayed node to search in
 * @key:	the key to look up
 * @prev:	used to store the previous item if the right item isn't found
 * @next:	used to store the next item if the right item isn't found
 *
 * Note: if the right item isn't found, the previous and the next items are
 * returned through @prev and @next instead.
 */
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
				struct rb_root *root,
				struct btrfs_key *key,
				struct btrfs_delayed_item **prev,
				struct btrfs_delayed_item **next)
{
	struct rb_node *node, *prev_node = NULL;
	struct btrfs_delayed_item *delayed_item = NULL;
	int ret = 0;

	node = root->rb_node;

	while (node) {
		delayed_item = rb_entry(node, struct btrfs_delayed_item,
					rb_node);
		prev_node = node;
		ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
		if (ret < 0)
			node = node->rb_right;
		else if (ret > 0)
			node = node->rb_left;
		else
			return delayed_item;
	}

	if (prev) {
		if (!prev_node)
			*prev = NULL;
		else if (ret < 0)
			*prev = delayed_item;
		else if ((node = rb_prev(prev_node)) != NULL) {
			*prev = rb_entry(node, struct btrfs_delayed_item,
					 rb_node);
		} else
			*prev = NULL;
	}

	if (next) {
		if (!prev_node)
			*next = NULL;
		else if (ret > 0)
			*next = delayed_item;
		else if ((node = rb_next(prev_node)) != NULL) {
			*next = rb_entry(node, struct btrfs_delayed_item,
					 rb_node);
		} else
			*next = NULL;
	}
	return NULL;
}

static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
					struct btrfs_delayed_node *delayed_node,
					struct btrfs_key *key)
{
	struct btrfs_delayed_item *item;

	item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
					   NULL, NULL);
	return item;
}

static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
				    struct btrfs_delayed_item *ins,
				    int action)
{
	struct rb_node **p, *node;
	struct rb_node *parent_node = NULL;
	struct rb_root *root;
	struct btrfs_delayed_item *item;
	int cmp;

	if (action == BTRFS_DELAYED_INSERTION_ITEM)
		root = &delayed_node->ins_root;
	else if (action == BTRFS_DELAYED_DELETION_ITEM)
		root = &delayed_node->del_root;
	else
		BUG();
	p = &root->rb_node;
	node = &ins->rb_node;

	while (*p) {
		parent_node = *p;
		item = rb_entry(parent_node, struct btrfs_delayed_item,
				 rb_node);

		cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
		if (cmp < 0)
			p = &(*p)->rb_right;
		else if (cmp > 0)
			p = &(*p)->rb_left;
		else
			return -EEXIST;
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	ins->delayed_node = delayed_node;
	ins->ins_or_del = action;

	if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
	    action == BTRFS_DELAYED_INSERTION_ITEM &&
	    ins->key.offset >= delayed_node->index_cnt)
			delayed_node->index_cnt = ins->key.offset + 1;

	delayed_node->count++;
	atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
	return 0;
}

static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
					      struct btrfs_delayed_item *item)
{
	return __btrfs_add_delayed_item(node, item,
					BTRFS_DELAYED_INSERTION_ITEM);
}

static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
					     struct btrfs_delayed_item *item)
{
	return __btrfs_add_delayed_item(node, item,
					BTRFS_DELAYED_DELETION_ITEM);
}

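/*
 * Account one finished delayed item and wake up anybody throttled on
 * delayed_root->wait, either when the backlog drops below
 * BTRFS_DELAYED_BACKGROUND or after every BTRFS_DELAYED_BATCH items.
 */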
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
	int seq = atomic_inc_return(&delayed_root->items_seq);
	if ((atomic_dec_return(&delayed_root->items) <
	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
	    waitqueue_active(&delayed_root->wait))
		wake_up(&delayed_root->wait);
}

static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
	struct rb_root *root;
	struct btrfs_delayed_root *delayed_root;

	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;

	BUG_ON(!delayed_root);
	BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
	       delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);

	if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
		root = &delayed_item->delayed_node->ins_root;
	else
		root = &delayed_item->delayed_node->del_root;

	rb_erase(&delayed_item->rb_node, root);
	delayed_item->delayed_node->count--;

	finish_one_item(delayed_root);
}

static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
	if (item) {
		__btrfs_remove_delayed_item(item);
		if (atomic_dec_and_test(&item->refs))
			kfree(item);
	}
}

static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
					struct btrfs_delayed_node *delayed_node)
{
	struct rb_node *p;
	struct btrfs_delayed_item *item = NULL;

	p = rb_first(&delayed_node->ins_root);
	if (p)
		item = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return item;
}

static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
					struct btrfs_delayed_node *delayed_node)
{
	struct rb_node *p;
	struct btrfs_delayed_item *item = NULL;

	p = rb_first(&delayed_node->del_root);
	if (p)
		item = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return item;
}

static struct btrfs_delayed_item *__btrfs_next_delayed_item(
						struct btrfs_delayed_item *item)
{
	struct rb_node *p;
	struct btrfs_delayed_item *next = NULL;

	p = rb_next(&item->rb_node);
	if (p)
		next = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return next;
}

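/*
 * Reserve metadata space for one delayed item by migrating the reservation
 * for a single tree operation from the transaction's block rsv into the
 * global delayed_block_rsv.
 */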
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
					       struct btrfs_root *root,
					       struct btrfs_delayed_item *item)
{
	struct btrfs_block_rsv *src_rsv;
	struct btrfs_block_rsv *dst_rsv;
	u64 num_bytes;
	int ret;

	if (!trans->bytes_reserved)
		return 0;

	src_rsv = trans->block_rsv;
	dst_rsv = &root->fs_info->delayed_block_rsv;

	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
	if (!ret) {
		trace_btrfs_space_reservation(root->fs_info, "delayed_item",
					      item->key.objectid,
					      num_bytes, 1);
		item->bytes_reserved = num_bytes;
	}

	return ret;
}

static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
						struct btrfs_delayed_item *item)
{
	struct btrfs_block_rsv *rsv;

	if (!item->bytes_reserved)
		return;

	rsv = &root->fs_info->delayed_block_rsv;
	trace_btrfs_space_reservation(root->fs_info, "delayed_item",
				      item->key.objectid, item->bytes_reserved,
				      0);
	btrfs_block_rsv_release(root, rsv,
				item->bytes_reserved);
}

static int btrfs_delayed_inode_reserve_metadata(
					struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct inode *inode,
					struct btrfs_delayed_node *node)
{
	struct btrfs_block_rsv *src_rsv;
	struct btrfs_block_rsv *dst_rsv;
	u64 num_bytes;
	int ret;
	bool release = false;

	src_rsv = trans->block_rsv;
	dst_rsv = &root->fs_info->delayed_block_rsv;

	num_bytes = btrfs_calc_trans_metadata_size(root, 1);

	/*
	 * btrfs_dirty_inode updates the inode under btrfs_join_transaction(),
	 * which doesn't reserve space, for speed.  This is a problem since we
	 * still need to reserve space for this update, so try to reserve it
	 * here.
	 *
	 * If src_rsv == delalloc_block_rsv we'll just let it steal, since
	 * that space has already been accounted for.
	 */
	if (!src_rsv || (!trans->bytes_reserved &&
			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
					  BTRFS_RESERVE_NO_FLUSH);
		/*
		 * Since we're under a transaction reserve_metadata_bytes could
		 * try to commit the transaction which will make it return
		 * EAGAIN to make us stop the transaction we have, so return
		 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
		 */
		if (ret == -EAGAIN)
			ret = -ENOSPC;
		if (!ret) {
			node->bytes_reserved = num_bytes;
			trace_btrfs_space_reservation(root->fs_info,
						      "delayed_inode",
						      btrfs_ino(inode),
						      num_bytes, 1);
		}
		return ret;
	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
		spin_lock(&BTRFS_I(inode)->lock);
		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
				       &BTRFS_I(inode)->runtime_flags)) {
			spin_unlock(&BTRFS_I(inode)->lock);
			release = true;
			goto migrate;
		}
		spin_unlock(&BTRFS_I(inode)->lock);

		/*
		 * Ok, we didn't have space pre-reserved.  This shouldn't
		 * happen too often, but it can happen if we do delalloc to an
		 * existing inode which gets dirtied because of the time
		 * update, and then isn't touched again until after the
		 * transaction commits and we try to write out the data.
		 * First try to be nice and reserve something strictly for us.
		 * If that fails, try to steal from the delalloc block rsv.
		 */
		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
					  BTRFS_RESERVE_NO_FLUSH);
		if (!ret)
			goto out;

		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
		if (!WARN_ON(ret))
			goto out;

		/*
		 * Ok this is a problem, let's just steal from the global rsv
		 * since this really shouldn't happen that often.
		 */
		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
					      dst_rsv, num_bytes);
		goto out;
	}

migrate:
	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);

out:
	/*
	 * Migrate only takes a reservation, it doesn't touch the size of the
	 * block_rsv.  This keeps things simple for callers that don't normally
	 * have anything migrated from their block rsv.  If they go to release
	 * their reservation, that will decrease the size as well, so if
	 * migrate reduced the size we'd end up with a negative size.  But for
	 * the delalloc_meta_reserved stuff we will only know to drop 1
	 * reservation, while we could in fact do this reserve/migrate dance
	 * several times between the original reservation and the time we'd
	 * clean it up.  To take care of this, release the space for the meta
	 * reservation here.  It may be time for a documentation page on how
	 * block rsvs work.
	 */
	if (!ret) {
		trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
					      btrfs_ino(inode), num_bytes, 1);
		node->bytes_reserved = num_bytes;
	}

	if (release) {
		trace_btrfs_space_reservation(root->fs_info, "delalloc",
					      btrfs_ino(inode), num_bytes, 0);
		btrfs_block_rsv_release(root, src_rsv, num_bytes);
	}

	return ret;
}

static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
						struct btrfs_delayed_node *node)
{
	struct btrfs_block_rsv *rsv;

	if (!node->bytes_reserved)
		return;

	rsv = &root->fs_info->delayed_block_rsv;
	trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
				      node->inode_id, node->bytes_reserved, 0);
	btrfs_block_rsv_release(root, rsv,
				node->bytes_reserved);
	node->bytes_reserved = 0;
}

/*
 * This helper will insert some continuous items into the same leaf according
 * to the free space of the leaf.
 */
static int btrfs_batch_insert_items(struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct btrfs_delayed_item *item)
{
	struct btrfs_delayed_item *curr, *next;
	int free_space;
	int total_data_size = 0, total_size = 0;
	struct extent_buffer *leaf;
	char *data_ptr;
	struct btrfs_key *keys;
	u32 *data_size;
	struct list_head head;
	int slot;
	int nitems;
	int i;
	int ret = 0;

	BUG_ON(!path->nodes[0]);

	leaf = path->nodes[0];
	free_space = btrfs_leaf_free_space(root, leaf);
	INIT_LIST_HEAD(&head);

	next = item;
	nitems = 0;

	/*
	 * count how many contiguous items we can insert in one batch
	 */
	while (total_size + next->data_len + sizeof(struct btrfs_item) <=
	       free_space) {
		total_data_size += next->data_len;
		total_size += next->data_len + sizeof(struct btrfs_item);
		list_add_tail(&next->tree_list, &head);
		nitems++;

		curr = next;
		next = __btrfs_next_delayed_item(curr);
		if (!next)
			break;

		if (!btrfs_is_continuous_delayed_item(curr, next))
			break;
	}

	if (!nitems) {
		ret = 0;
		goto out;
	}

	/*
	 * we need to allocate some memory, but that might cause the task
	 * to sleep, so set all locked nodes in the path to blocking locks
	 * first.
	 */
	btrfs_set_path_blocking(path);

	keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
	if (!keys) {
		ret = -ENOMEM;
		goto out;
	}

	data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
	if (!data_size) {
		ret = -ENOMEM;
		goto error;
	}

	/* get keys of all the delayed items */
	i = 0;
	list_for_each_entry(next, &head, tree_list) {
		keys[i] = next->key;
		data_size[i] = next->data_len;
		i++;
	}

	/* reset all the locked nodes in the path to spinning locks. */
	btrfs_clear_path_blocking(path, NULL, 0);

	/* insert the keys of the items */
	setup_items_for_insert(root, path, keys, data_size,
			       total_data_size, total_size, nitems);

	/* insert the dir index items */
	slot = path->slots[0];
	list_for_each_entry_safe(curr, next, &head, tree_list) {
		data_ptr = btrfs_item_ptr(leaf, slot, char);
		write_extent_buffer(leaf, &curr->data,
				    (unsigned long)data_ptr,
				    curr->data_len);
		slot++;

		btrfs_delayed_item_release_metadata(root, curr);

		list_del(&curr->tree_list);
		btrfs_release_delayed_item(curr);
	}

error:
	kfree(data_size);
	kfree(keys);
out:
	return ret;
}

/*
 * This helper handles simple insertions that don't need to extend an existing
 * item for new data, such as directory name index insertion or inode item
 * insertion.
 */
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct btrfs_delayed_item *delayed_item)
{
	struct extent_buffer *leaf;
	char *ptr;
	int ret;

	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
				      delayed_item->data_len);
	if (ret < 0 && ret != -EEXIST)
		return ret;

	leaf = path->nodes[0];

	ptr = btrfs_item_ptr(leaf, path->slots[0], char);

	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
			    delayed_item->data_len);
	btrfs_mark_buffer_dirty(leaf);

	btrfs_delayed_item_release_metadata(root, delayed_item);
	return 0;
}

/*
 * We insert one item first, then if there are contiguous items following it,
 * we try to insert those items into the same leaf.
 */
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_root *root,
				      struct btrfs_delayed_node *node)
{
	struct btrfs_delayed_item *curr, *prev;
	int ret = 0;

do_again:
	mutex_lock(&node->mutex);
	curr = __btrfs_first_delayed_insertion_item(node);
	if (!curr)
		goto insert_end;

	ret = btrfs_insert_delayed_item(trans, root, path, curr);
	if (ret < 0) {
		btrfs_release_path(path);
		goto insert_end;
	}

	prev = curr;
	curr = __btrfs_next_delayed_item(prev);
	if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
		/* insert the continuous items into the same leaf */
		path->slots[0]++;
		btrfs_batch_insert_items(root, path, curr);
	}
	btrfs_release_delayed_item(prev);
	btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_release_path(path);
	mutex_unlock(&node->mutex);
	goto do_again;

insert_end:
	mutex_unlock(&node->mutex);
	return ret;
}

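/*
 * Delete a batch of contiguous dir index items that all live in the leaf
 * the path currently points to.
 */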
static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct btrfs_delayed_item *item)
{
	struct btrfs_delayed_item *curr, *next;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct list_head head;
	int nitems, i, last_item;
	int ret = 0;

	BUG_ON(!path->nodes[0]);

	leaf = path->nodes[0];

	i = path->slots[0];
	last_item = btrfs_header_nritems(leaf) - 1;
	if (i > last_item)
		return -ENOENT;	/* FIXME: Is errno suitable? */

	next = item;
	INIT_LIST_HEAD(&head);
	btrfs_item_key_to_cpu(leaf, &key, i);
	nitems = 0;
	/*
	 * count the number of the dir index items that we can delete in batch
	 */
	while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
		list_add_tail(&next->tree_list, &head);
		nitems++;

		curr = next;
		next = __btrfs_next_delayed_item(curr);
		if (!next)
			break;

		if (!btrfs_is_continuous_delayed_item(curr, next))
			break;

		i++;
		if (i > last_item)
			break;
		btrfs_item_key_to_cpu(leaf, &key, i);
	}

	if (!nitems)
		return 0;

	ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
	if (ret)
		goto out;

	list_for_each_entry_safe(curr, next, &head, tree_list) {
		btrfs_delayed_item_release_metadata(root, curr);
		list_del(&curr->tree_list);
		btrfs_release_delayed_item(curr);
	}

out:
	return ret;
}

static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_root *root,
				      struct btrfs_delayed_node *node)
{
	struct btrfs_delayed_item *curr, *prev;
	int ret = 0;

do_again:
	mutex_lock(&node->mutex);
	curr = __btrfs_first_delayed_deletion_item(node);
	if (!curr)
		goto delete_fail;

	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
	if (ret < 0)
		goto delete_fail;
	else if (ret > 0) {
		/*
		 * can't find the item which the node points to, so this node
		 * is invalid, just drop it.
		 */
		prev = curr;
		curr = __btrfs_next_delayed_item(prev);
		btrfs_release_delayed_item(prev);
		ret = 0;
		btrfs_release_path(path);
		if (curr) {
			mutex_unlock(&node->mutex);
			goto do_again;
		} else
			goto delete_fail;
	}

	btrfs_batch_delete_items(trans, root, path, curr);
	btrfs_release_path(path);
	mutex_unlock(&node->mutex);
	goto do_again;

delete_fail:
	btrfs_release_path(path);
	mutex_unlock(&node->mutex);
	return ret;
}

static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_delayed_root *delayed_root;

	if (delayed_node &&
	    test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		BUG_ON(!delayed_node->root);
		clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
		delayed_node->count--;

		delayed_root = delayed_node->root->fs_info->delayed_root;
		finish_one_item(delayed_root);
	}
}

static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_delayed_root *delayed_root;

	ASSERT(delayed_node->root);
	clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
	delayed_node->count--;

	delayed_root = delayed_node->root->fs_info->delayed_root;
	finish_one_item(delayed_root);
}

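/*
 * Copy the in-memory inode_item of the delayed node into the inode item in
 * the fs tree, and delete the single inode ref behind it if the node is
 * flagged with BTRFS_DELAYED_NODE_DEL_IREF.
 */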
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct btrfs_delayed_node *node)
{
	struct btrfs_key key;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	int mod;
	int ret;

	key.objectid = node->inode_id;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		mod = -1;
	else
		mod = 1;

	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
	if (ret > 0) {
		btrfs_release_path(path);
		return -ENOENT;
	} else if (ret < 0) {
		return ret;
	}

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
			    sizeof(struct btrfs_inode_item));
	btrfs_mark_buffer_dirty(leaf);

	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		goto no_iref;

	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(leaf))
		goto search;
again:
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != node->inode_id)
		goto out;

	if (key.type != BTRFS_INODE_REF_KEY &&
	    key.type != BTRFS_INODE_EXTREF_KEY)
		goto out;

	/*
	 * Delayed iref deletion is only used for an inode that has a single
	 * link, so there is only one iref; the case of several irefs in the
	 * same item cannot happen.
	 */
	btrfs_del_item(trans, root, path);
out:
	btrfs_release_delayed_iref(node);
no_iref:
	btrfs_release_path(path);
err_out:
	btrfs_delayed_inode_release_metadata(root, node);
	btrfs_release_delayed_inode(node);

	return ret;

search:
	btrfs_release_path(path);

	key.type = BTRFS_INODE_EXTREF_KEY;
	key.offset = -1;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto err_out;
	ASSERT(ret);

	ret = 0;
	leaf = path->nodes[0];
	path->slots[0]--;
	goto again;
}

static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct btrfs_path *path,
					     struct btrfs_delayed_node *node)
{
	int ret;

	mutex_lock(&node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
		mutex_unlock(&node->mutex);
		return 0;
	}

	ret = __btrfs_update_delayed_inode(trans, root, path, node);
	mutex_unlock(&node->mutex);
	return ret;
}

static inline int
__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_delayed_node *node)
{
	int ret;

	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
	if (ret)
		return ret;

	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
	if (ret)
		return ret;

	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
	return ret;
}

/*
 * Called when committing the transaction.
 * Returns 0 on success.
 * Returns < 0 on error and returns with an aborted transaction with any
 * outstanding delayed items cleaned up.
 */
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, int nr)
{
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_delayed_node *curr_node, *prev_node;
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret = 0;
	bool count = (nr > 0);

	if (trans->aborted)
		return -EIO;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->leave_spinning = 1;

	block_rsv = trans->block_rsv;
	trans->block_rsv = &root->fs_info->delayed_block_rsv;

	delayed_root = btrfs_get_delayed_root(root);

	curr_node = btrfs_first_delayed_node(delayed_root);
	while (curr_node && (!count || (count && nr--))) {
		ret = __btrfs_commit_inode_delayed_items(trans, path,
							 curr_node);
		if (ret) {
			btrfs_release_delayed_node(curr_node);
			curr_node = NULL;
			btrfs_abort_transaction(trans, root, ret);
			break;
		}

		prev_node = curr_node;
		curr_node = btrfs_next_delayed_node(curr_node);
		btrfs_release_delayed_node(prev_node);
	}

	if (curr_node)
		btrfs_release_delayed_node(curr_node);
	btrfs_free_path(path);
	trans->block_rsv = block_rsv;

	return ret;
}

int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	return __btrfs_run_delayed_items(trans, root, -1);
}

int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root, int nr)
{
	return __btrfs_run_delayed_items(trans, root, nr);
}

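/*
 * Flush all delayed items (insertions, deletions and the inode update) of
 * one inode in the context of the caller's transaction.
 */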
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
				     struct inode *inode)
{
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (!delayed_node)
		return 0;

	mutex_lock(&delayed_node->mutex);
	if (!delayed_node->count) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return 0;
	}
	mutex_unlock(&delayed_node->mutex);

	path = btrfs_alloc_path();
	if (!path) {
		btrfs_release_delayed_node(delayed_node);
		return -ENOMEM;
	}
	path->leave_spinning = 1;

	block_rsv = trans->block_rsv;
	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

	btrfs_release_delayed_node(delayed_node);
	btrfs_free_path(path);
	trans->block_rsv = block_rsv;

	return ret;
}

int btrfs_commit_inode_delayed_inode(struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (!delayed_node)
		return 0;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return 0;
	}
	mutex_unlock(&delayed_node->mutex);

	trans = btrfs_join_transaction(delayed_node->root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto trans_out;
	}
	path->leave_spinning = 1;

	block_rsv = trans->block_rsv;
	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
						   path, delayed_node);
	else
		ret = 0;
	mutex_unlock(&delayed_node->mutex);

	btrfs_free_path(path);
	trans->block_rsv = block_rsv;
trans_out:
	btrfs_end_transaction(trans, delayed_node->root);
	btrfs_btree_balance_dirty(delayed_node->root);
out:
	btrfs_release_delayed_node(delayed_node);

	return ret;
}

void btrfs_remove_delayed_node(struct inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
	if (!delayed_node)
		return;

	BTRFS_I(inode)->delayed_node = NULL;
	btrfs_release_delayed_node(delayed_node);
}

struct btrfs_async_delayed_work {
	struct btrfs_delayed_root *delayed_root;
	int nr;
	struct btrfs_work work;
};

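/*
 * Background worker: pull prepared delayed nodes off the prepare list and
 * commit their items until the backlog drops below
 * BTRFS_DELAYED_BACKGROUND / 2 or, when async_work->nr is non-zero, until
 * that many nodes have been handled.
 */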
static void btrfs_async_run_delayed_root(struct btrfs_work *work)
{
	struct btrfs_async_delayed_work *async_work;
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_delayed_node *delayed_node = NULL;
	struct btrfs_root *root;
	struct btrfs_block_rsv *block_rsv;
	int total_done = 0;

	async_work = container_of(work, struct btrfs_async_delayed_work, work);
	delayed_root = async_work->delayed_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

again:
	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
		goto free_path;

	delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
	if (!delayed_node)
		goto free_path;

	path->leave_spinning = 1;
	root = delayed_node->root;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		goto release_path;

	block_rsv = trans->block_rsv;
	trans->block_rsv = &root->fs_info->delayed_block_rsv;

	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);

	trans->block_rsv = block_rsv;
	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty_nodelay(root);

release_path:
	btrfs_release_path(path);
	total_done++;

	btrfs_release_prepared_delayed_node(delayed_node);
	if (async_work->nr == 0 || total_done < async_work->nr)
		goto again;

free_path:
	btrfs_free_path(path);
out:
	wake_up(&delayed_root->wait);
	kfree(async_work);
}


static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
				     struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_async_delayed_work *async_work;

	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
		return 0;

	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
	if (!async_work)
		return -ENOMEM;

	async_work->delayed_root = delayed_root;
	btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
			btrfs_async_run_delayed_root, NULL, NULL);
	async_work->nr = nr;

	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
	return 0;
}

void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
{
	struct btrfs_delayed_root *delayed_root;
	delayed_root = btrfs_get_delayed_root(root);
	WARN_ON(btrfs_first_delayed_node(delayed_root));
}

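/*
 * A throttled caller may stop waiting once items_seq has moved away from the
 * snapshot it took by at least a batch, or once the backlog has dropped
 * below BTRFS_DELAYED_BACKGROUND.
 */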
static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
	int val = atomic_read(&delayed_root->items_seq);

	if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
		return 1;

	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
		return 1;

	return 0;
}

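/*
 * Kick background flushing once the number of delayed items reaches
 * BTRFS_DELAYED_BACKGROUND; above BTRFS_DELAYED_WRITEBACK the caller is
 * additionally throttled until could_end_wait() is satisfied.
 */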
void btrfs_balance_delayed_items(struct btrfs_root *root)
{
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_fs_info *fs_info = root->fs_info;

	delayed_root = btrfs_get_delayed_root(root);

	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
		return;

	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
		int seq;
		int ret;

		seq = atomic_read(&delayed_root->items_seq);

		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
		if (ret)
			return;

		wait_event_interruptible(delayed_root->wait,
					 could_end_wait(delayed_root, seq));
		return;
	}

	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}

/* Will return 0 or -ENOMEM */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, const char *name,
				   int name_len, struct inode *dir,
				   struct btrfs_disk_key *disk_key, u8 type,
				   u64 index)
{
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_delayed_item *delayed_item;
	struct btrfs_dir_item *dir_item;
	int ret;

	delayed_node = btrfs_get_or_create_delayed_node(dir);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
	if (!delayed_item) {
		ret = -ENOMEM;
		goto release_node;
	}

	delayed_item->key.objectid = btrfs_ino(dir);
	delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
	delayed_item->key.offset = index;

	dir_item = (struct btrfs_dir_item *)delayed_item->data;
	dir_item->location = *disk_key;
	btrfs_set_stack_dir_transid(dir_item, trans->transid);
	btrfs_set_stack_dir_data_len(dir_item, 0);
	btrfs_set_stack_dir_name_len(dir_item, name_len);
	btrfs_set_stack_dir_type(dir_item, type);
	memcpy((char *)(dir_item + 1), name, name_len);

	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
	/*
	 * we reserved enough space when we started a new transaction,
	 * so failing to reserve metadata here is impossible
	 */
	BUG_ON(ret);

	mutex_lock(&delayed_node->mutex);
	ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
	if (unlikely(ret)) {
		btrfs_err(root->fs_info,
			  "error adding delayed dir index item (name: %.*s) into the insertion tree of the delayed node (root id: %llu, inode id: %llu, errno: %d)",
			  name_len, name, delayed_node->root->objectid,
			  delayed_node->inode_id, ret);
		BUG();
	}
	mutex_unlock(&delayed_node->mutex);

release_node:
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
					       struct btrfs_delayed_node *node,
					       struct btrfs_key *key)
{
	struct btrfs_delayed_item *item;

	mutex_lock(&node->mutex);
	item = __btrfs_lookup_delayed_insertion_item(node, key);
	if (!item) {
		mutex_unlock(&node->mutex);
		return 1;
	}

	btrfs_delayed_item_release_metadata(root, item);
	btrfs_release_delayed_item(item);
	mutex_unlock(&node->mutex);
	return 0;
}

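/*
 * Record the deletion of a dir index item.  If a matching insertion is still
 * pending it is simply cancelled, otherwise a deletion item is queued on the
 * delayed node.
 */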
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct inode *dir,
				   u64 index)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;
	struct btrfs_key item_key;
	int ret;

	node = btrfs_get_or_create_delayed_node(dir);
	if (IS_ERR(node))
		return PTR_ERR(node);

	item_key.objectid = btrfs_ino(dir);
	item_key.type = BTRFS_DIR_INDEX_KEY;
	item_key.offset = index;

	ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
	if (!ret)
		goto end;

	item = btrfs_alloc_delayed_item(0);
	if (!item) {
		ret = -ENOMEM;
		goto end;
	}

	item->key = item_key;

	ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
	/*
	 * we reserved enough space when we started a new transaction,
	 * so failing to reserve metadata here is impossible.
	 */
	BUG_ON(ret);

	mutex_lock(&node->mutex);
	ret = __btrfs_add_delayed_deletion_item(node, item);
	if (unlikely(ret)) {
		btrfs_err(root->fs_info,
			  "error adding delayed dir index item (index: %llu) into the deletion tree of the delayed node (root id: %llu, inode id: %llu, errno: %d)",
			  index, node->root->objectid, node->inode_id, ret);
		BUG();
	}
	mutex_unlock(&node->mutex);
end:
	btrfs_release_delayed_node(node);
	return ret;
}

int btrfs_inode_delayed_dir_index_count(struct inode *inode)
{
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);

	if (!delayed_node)
		return -ENOENT;

	/*
	 * We hold i_mutex of this directory, so no new directory index can be
	 * added to the delayed node and index_cnt cannot be updated right now.
	 * Therefore we don't need to lock the delayed node.
	 */
	if (!delayed_node->index_cnt) {
		btrfs_release_delayed_node(delayed_node);
		return -EINVAL;
	}

	BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
			     struct list_head *del_list)
{
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_delayed_item *item;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return;

	mutex_lock(&delayed_node->mutex);
	item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (item) {
		atomic_inc(&item->refs);
		list_add_tail(&item->readdir_list, ins_list);
		item = __btrfs_next_delayed_item(item);
	}

	item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (item) {
		atomic_inc(&item->refs);
		list_add_tail(&item->readdir_list, del_list);
		item = __btrfs_next_delayed_item(item);
	}
	mutex_unlock(&delayed_node->mutex);
	/*
	 * This delayed node is still cached in the btrfs inode, so refs
	 * must be > 1 now, and we don't need to check whether it is going
	 * to be freed or not.
	 *
	 * Besides that, this function is only used for readdir and we do
	 * not insert/delete delayed items during that time, so we don't
	 * need to requeue or dequeue this delayed node either.
	 */
	atomic_dec(&delayed_node->refs);
}

void btrfs_put_delayed_items(struct list_head *ins_list,
			     struct list_head *del_list)
{
	struct btrfs_delayed_item *curr, *next;

	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
		list_del(&curr->readdir_list);
		if (atomic_dec_and_test(&curr->refs))
			kfree(curr);
	}

	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
		list_del(&curr->readdir_list);
		if (atomic_dec_and_test(&curr->refs))
			kfree(curr);
	}
}

int btrfs_should_delete_dir_index(struct list_head *del_list,
				  u64 index)
{
	struct btrfs_delayed_item *curr, *next;
	int ret;

	if (list_empty(del_list))
		return 0;

	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
		if (curr->key.offset > index)
			break;

		list_del(&curr->readdir_list);
		ret = (curr->key.offset == index);

		if (atomic_dec_and_test(&curr->refs))
			kfree(curr);

		if (ret)
			return 1;
		else
			continue;
	}
	return 0;
}

/*
 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
 *
 */
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
				    struct list_head *ins_list, bool *emitted)
{
	struct btrfs_dir_item *di;
	struct btrfs_delayed_item *curr, *next;
	struct btrfs_key location;
	char *name;
	int name_len;
	int over = 0;
	unsigned char d_type;

	if (list_empty(ins_list))
		return 0;

	/*
	 * The data of a delayed item never changes after it is created, so
	 * there is no need to lock it.  We also hold i_mutex of the
	 * directory, so nobody can delete any directory index right now.
	 */
	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
		list_del(&curr->readdir_list);

		if (curr->key.offset < ctx->pos) {
			if (atomic_dec_and_test(&curr->refs))
				kfree(curr);
			continue;
		}

		ctx->pos = curr->key.offset;

		di = (struct btrfs_dir_item *)curr->data;
		name = (char *)(di + 1);
		name_len = btrfs_stack_dir_name_len(di);

		d_type = btrfs_filetype_table[di->type];
		btrfs_disk_key_to_cpu(&location, &di->location);

		over = !dir_emit(ctx, name, name_len,
			       location.objectid, d_type);

		if (atomic_dec_and_test(&curr->refs))
			kfree(curr);

		if (over)
			return 1;
		*emitted = true;
	}
	return 0;
}

static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
				  struct btrfs_inode_item *inode_item,
				  struct inode *inode)
{
	btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
	btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
	btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
	btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
	btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
	btrfs_set_stack_inode_generation(inode_item,
					 BTRFS_I(inode)->generation);
	btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
	btrfs_set_stack_inode_transid(inode_item, trans->transid);
	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
	btrfs_set_stack_inode_block_group(inode_item, 0);

	btrfs_set_stack_timespec_sec(&inode_item->atime,
				     inode->i_atime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->atime,
				      inode->i_atime.tv_nsec);

	btrfs_set_stack_timespec_sec(&inode_item->mtime,
				     inode->i_mtime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
				      inode->i_mtime.tv_nsec);

	btrfs_set_stack_timespec_sec(&inode_item->ctime,
				     inode->i_ctime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
				      inode->i_ctime.tv_nsec);

	btrfs_set_stack_timespec_sec(&inode_item->otime,
				     BTRFS_I(inode)->i_otime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->otime,
				     BTRFS_I(inode)->i_otime.tv_nsec);
}

int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_inode_item *inode_item;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return -ENOENT;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return -ENOENT;
	}

	inode_item = &delayed_node->inode_item;

	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
	btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
	inode->i_mode = btrfs_stack_inode_mode(inode_item);
	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
	BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);

	inode->i_version = btrfs_stack_inode_sequence(inode_item);
	inode->i_rdev = 0;
	*rdev = btrfs_stack_inode_rdev(inode_item);
	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);

	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);

	inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
	inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);

	inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
	inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);

	BTRFS_I(inode)->i_otime.tv_sec =
		btrfs_stack_timespec_sec(&inode_item->otime);
	BTRFS_I(inode)->i_otime.tv_nsec =
		btrfs_stack_timespec_nsec(&inode_item->otime);

	inode->i_generation = BTRFS_I(inode)->generation;
	BTRFS_I(inode)->index_cnt = (u64)-1;

	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_delayed_node *delayed_node;
	int ret = 0;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
		goto release_node;
	}

	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
						   delayed_node);
	if (ret)
		goto release_node;

	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
	set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&root->fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

int btrfs_delayed_delete_inode_ref(struct inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	/*
	 * we don't do delayed inode updates during log recovery because it
	 * leads to enospc problems.  This means we also can't do
	 * delayed inode refs.
	 */
	if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
		return -EAGAIN;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	/*
	 * We don't reserve space for inode ref deletion because:
	 * - We ONLY do async inode ref deletion for an inode that has only
	 *   one link (i_nlink == 1), which means there is only one inode ref.
	 *   In most cases the inode ref and the inode item are in the same
	 *   leaf, and we will deal with them at the same time.  Since we are
	 *   sure we will reserve space for the inode item, it is unnecessary
	 *   to reserve space for the inode ref deletion.
	 * - If the inode ref and the inode item are not in the same leaf,
	 *   we also needn't worry about an enospc problem, because we reserve
	 *   much more space for the inode update than it needs.
	 * - In the worst case, we can steal some space from the global
	 *   reservation.  This is very rare.
	 */
	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
		goto release_node;

	set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

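/*
 * Drop every pending insertion/deletion item of the node, release its
 * reserved metadata and clear the iref/inode-dirty state.  Helper for
 * btrfs_kill_delayed_inode_items() and btrfs_kill_all_delayed_nodes() below.
 */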
static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_root *root = delayed_node->root;
	struct btrfs_delayed_item *curr_item, *prev_item;

	mutex_lock(&delayed_node->mutex);
	curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (curr_item) {
		btrfs_delayed_item_release_metadata(root, curr_item);
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (curr_item) {
		btrfs_delayed_item_release_metadata(root, curr_item);
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
		btrfs_release_delayed_iref(delayed_node);

	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		btrfs_delayed_inode_release_metadata(root, delayed_node);
		btrfs_release_delayed_inode(delayed_node);
	}
	mutex_unlock(&delayed_node->mutex);
}

void btrfs_kill_delayed_inode_items(struct inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return;

	__btrfs_kill_delayed_node(delayed_node);
	btrfs_release_delayed_node(delayed_node);
}

void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
	u64 inode_id = 0;
	struct btrfs_delayed_node *delayed_nodes[8];
	int i, n;

	while (1) {
		spin_lock(&root->inode_lock);
		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
					   (void **)delayed_nodes, inode_id,
					   ARRAY_SIZE(delayed_nodes));
		if (!n) {
			spin_unlock(&root->inode_lock);
			break;
		}

		inode_id = delayed_nodes[n - 1]->inode_id + 1;

		for (i = 0; i < n; i++)
			atomic_inc(&delayed_nodes[i]->refs);
		spin_unlock(&root->inode_lock);

		for (i = 0; i < n; i++) {
			__btrfs_kill_delayed_node(delayed_nodes[i]);
			btrfs_release_delayed_node(delayed_nodes[i]);
		}
	}
}

void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
{
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_delayed_node *curr_node, *prev_node;

	delayed_root = btrfs_get_delayed_root(root);

	curr_node = btrfs_first_delayed_node(delayed_root);
	while (curr_node) {
		__btrfs_kill_delayed_node(curr_node);

		prev_node = curr_node;
		curr_node = btrfs_next_delayed_node(curr_node);
		btrfs_release_delayed_node(prev_node);
	}
}