1/*
2 *  Generic process-grouping system.
3 *
4 *  Based originally on the cpuset system, extracted by Paul Menage
5 *  Copyright (C) 2006 Google, Inc
6 *
7 *  Notifications support
8 *  Copyright (C) 2009 Nokia Corporation
9 *  Author: Kirill A. Shutemov
10 *
11 *  Copyright notices from the original cpuset code:
12 *  --------------------------------------------------
13 *  Copyright (C) 2003 BULL SA.
14 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 *
16 *  Portions derived from Patrick Mochel's sysfs code.
17 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
18 *
19 *  2003-10-10 Written by Simon Derr.
20 *  2003-10-22 Updates by Stephen Hemminger.
21 *  2004 May-July Rework by Paul Jackson.
22 *  ---------------------------------------------------
23 *
24 *  This file is subject to the terms and conditions of the GNU General Public
25 *  License.  See the file COPYING in the main directory of the Linux
26 *  distribution for more details.
27 */
28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include <linux/cgroup.h>
32#include <linux/cred.h>
33#include <linux/ctype.h>
34#include <linux/errno.h>
35#include <linux/init_task.h>
36#include <linux/kernel.h>
37#include <linux/list.h>
38#include <linux/magic.h>
39#include <linux/mm.h>
40#include <linux/mutex.h>
41#include <linux/mount.h>
42#include <linux/pagemap.h>
43#include <linux/proc_fs.h>
44#include <linux/rcupdate.h>
45#include <linux/sched.h>
46#include <linux/slab.h>
47#include <linux/spinlock.h>
48#include <linux/rwsem.h>
49#include <linux/string.h>
50#include <linux/sort.h>
51#include <linux/kmod.h>
52#include <linux/delayacct.h>
53#include <linux/cgroupstats.h>
54#include <linux/hashtable.h>
55#include <linux/pid_namespace.h>
56#include <linux/idr.h>
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/kthread.h>
59#include <linux/delay.h>
60
61#include <linux/atomic.h>
62
63/*
64 * pidlists linger for the following amount of time before being destroyed.
65 * The goal is to avoid frequent destruction in the middle of consecutive
66 * read calls.  Expiring in the middle is a performance problem, not a
67 * correctness one.  1 sec should be enough.
68 */
69#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
70
71#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
72					 MAX_CFTYPE_NAME + 2)
73
74/*
75 * cgroup_mutex is the master lock.  Any modification to cgroup or its
76 * hierarchy must be performed while holding it.
77 *
78 * css_set_rwsem protects task->cgroups pointer, the list of css_set
79 * objects, and the chain of tasks off each css_set.
80 *
81 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
82 * cgroup.h can use them for lockdep annotations.
83 */
84#ifdef CONFIG_PROVE_RCU
85DEFINE_MUTEX(cgroup_mutex);
86DECLARE_RWSEM(css_set_rwsem);
87EXPORT_SYMBOL_GPL(cgroup_mutex);
88EXPORT_SYMBOL_GPL(css_set_rwsem);
89#else
90static DEFINE_MUTEX(cgroup_mutex);
91static DECLARE_RWSEM(css_set_rwsem);
92#endif
93
94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
101 * Protects cgroup_root->release_agent_path.  Modifying it also requires
102 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
103 */
104static DEFINE_SPINLOCK(release_agent_path_lock);
105
106#define cgroup_assert_mutex_or_rcu_locked()				\
107	rcu_lockdep_assert(rcu_read_lock_held() ||			\
108			   lockdep_is_held(&cgroup_mutex),		\
109			   "cgroup_mutex or RCU read lock required");
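/*
 * Example (illustrative sketch, not part of the original file): a helper
 * that dereferences css pointers is expected to assert the locking
 * requirement up front, e.g.
 *
 *	static struct cgroup_subsys_state *
 *	example_lookup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 *	{
 *		cgroup_assert_mutex_or_rcu_locked();
 *		return cgroup_css(cgrp, ss);
 *	}
 *
 * "example_lookup" is a hypothetical name used only for illustration.
 */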
110
111/*
112 * cgroup destruction makes heavy use of work items and there can be a lot
113 * of concurrent destructions.  Use a separate workqueue so that cgroup
114 * destruction work items don't end up filling up max_active of system_wq
115 * which may lead to deadlock.
116 */
117static struct workqueue_struct *cgroup_destroy_wq;
118
119/*
120 * pidlist destructions need to be flushed on cgroup destruction.  Use a
121 * separate workqueue as flush domain.
122 */
123static struct workqueue_struct *cgroup_pidlist_destroy_wq;
124
125/* generate an array of cgroup subsystem pointers */
126#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
127static struct cgroup_subsys *cgroup_subsys[] = {
128#include <linux/cgroup_subsys.h>
129};
130#undef SUBSYS
131
132/* array of cgroup subsystem names */
133#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
134static const char *cgroup_subsys_name[] = {
135#include <linux/cgroup_subsys.h>
136};
137#undef SUBSYS
138
139/*
140 * The default hierarchy, reserved for the subsystems that are otherwise
141 * unattached - it never has more than a single cgroup, and all tasks are
142 * part of that cgroup.
143 */
144struct cgroup_root cgrp_dfl_root;
145
146/*
147 * The default hierarchy always exists but is hidden until mounted for the
148 * first time.  This is for backward compatibility.
149 */
150static bool cgrp_dfl_root_visible;
151
152/*
153 * Set by the boot param of the same name and makes subsystems with NULL
154 * ->dfl_files use ->legacy_files on the default hierarchy.
155 */
156static bool cgroup_legacy_files_on_dfl;
157
158/* some controllers are not supported in the default hierarchy */
159static unsigned int cgrp_dfl_root_inhibit_ss_mask;
160
161/* The list of hierarchy roots */
162
163static LIST_HEAD(cgroup_roots);
164static int cgroup_root_count;
165
166/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
167static DEFINE_IDR(cgroup_hierarchy_idr);
168
169/*
170 * Assign a monotonically increasing serial number to csses.  It guarantees
171 * csses with higher serial numbers are newer than those with lower numbers.
172 * Also, as csses are always appended to the parent's ->children list, it
173 * guarantees that sibling csses are always sorted in the ascending serial
174 * number order on the list.  Protected by cgroup_mutex.
175 */
176static u64 css_serial_nr_next = 1;
177
178/*
179 * This flag indicates whether tasks in the fork and exit paths should
180 * check for fork/exit handlers to call.  This avoids extra work in the
181 * fork/exit path if none of the subsystems need to be called.
182 */
183static int need_forkexit_callback __read_mostly;
184
185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[];
187
188static int rebind_subsystems(struct cgroup_root *dst_root,
189			     unsigned int ss_mask);
190static int cgroup_destroy_locked(struct cgroup *cgrp);
191static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
192		      bool visible);
193static void css_release(struct percpu_ref *ref);
194static void kill_css(struct cgroup_subsys_state *css);
195static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
196			      bool is_add);
197
198/* IDR wrappers which synchronize using cgroup_idr_lock */
199static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
200			    gfp_t gfp_mask)
201{
202	int ret;
203
204	idr_preload(gfp_mask);
205	spin_lock_bh(&cgroup_idr_lock);
206	ret = idr_alloc(idr, ptr, start, end, gfp_mask);
207	spin_unlock_bh(&cgroup_idr_lock);
208	idr_preload_end();
209	return ret;
210}
211
212static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
213{
214	void *ret;
215
216	spin_lock_bh(&cgroup_idr_lock);
217	ret = idr_replace(idr, ptr, id);
218	spin_unlock_bh(&cgroup_idr_lock);
219	return ret;
220}
221
222static void cgroup_idr_remove(struct idr *idr, int id)
223{
224	spin_lock_bh(&cgroup_idr_lock);
225	idr_remove(idr, id);
226	spin_unlock_bh(&cgroup_idr_lock);
227}
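/*
 * Example (illustrative sketch, not part of the original file): a typical
 * allocate/release pairing through the wrappers above.  "example_idr" and
 * "obj" are hypothetical names.
 *
 *	id = cgroup_idr_alloc(&example_idr, obj, 1, 0, GFP_KERNEL);
 *	if (id < 0)
 *		return id;
 *	...
 *	cgroup_idr_remove(&example_idr, id);
 *
 * The point of the wrappers is that an ID can be released while holding
 * only cgroup_idr_lock, without grabbing cgroup_mutex.
 */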
228
229static struct cgroup *cgroup_parent(struct cgroup *cgrp)
230{
231	struct cgroup_subsys_state *parent_css = cgrp->self.parent;
232
233	if (parent_css)
234		return container_of(parent_css, struct cgroup, self);
235	return NULL;
236}
237
238/**
239 * cgroup_css - obtain a cgroup's css for the specified subsystem
240 * @cgrp: the cgroup of interest
241 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
242 *
243 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
244 * function must be called either under cgroup_mutex or rcu_read_lock() and
245 * the caller is responsible for pinning the returned css if it wants to
246 * keep accessing it outside the said locks.  This function may return
247 * %NULL if @cgrp doesn't have @ss enabled.
248 */
249static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
250					      struct cgroup_subsys *ss)
251{
252	if (ss)
253		return rcu_dereference_check(cgrp->subsys[ss->id],
254					lockdep_is_held(&cgroup_mutex));
255	else
256		return &cgrp->self;
257}
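/*
 * Example (illustrative sketch, not part of the original file): a caller
 * that wants to keep using the returned css after dropping rcu_read_lock()
 * must pin it first, e.g.
 *
 *	rcu_read_lock();
 *	css = cgroup_css(cgrp, ss);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 *	...
 *	if (css)
 *		css_put(css);
 */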
258
259/**
260 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
261 * @cgrp: the cgroup of interest
262 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
263 *
264 * Similar to cgroup_css() but returns the effective css, which is defined
265 * as the matching css of the nearest ancestor including self which has @ss
266 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
267 * function is guaranteed to return a non-NULL css.
268 */
269static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
270						struct cgroup_subsys *ss)
271{
272	lockdep_assert_held(&cgroup_mutex);
273
274	if (!ss)
275		return &cgrp->self;
276
277	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
278		return NULL;
279
280	/*
281	 * This function is used while updating css associations and thus
282	 * can't test the csses directly.  Use ->child_subsys_mask.
283	 */
284	while (cgroup_parent(cgrp) &&
285	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
286		cgrp = cgroup_parent(cgrp);
287
288	return cgroup_css(cgrp, ss);
289}
290
291/**
292 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
293 * @cgrp: the cgroup of interest
294 * @ss: the subsystem of interest
295 *
296 * Find and get the effective css of @cgrp for @ss.  The effective css is
297 * defined as the matching css of the nearest ancestor including self which
298 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
299 * the root css is returned, so this function always returns a valid css.
300 * The returned css must be put using css_put().
301 */
302struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
303					     struct cgroup_subsys *ss)
304{
305	struct cgroup_subsys_state *css;
306
307	rcu_read_lock();
308
309	do {
310		css = cgroup_css(cgrp, ss);
311
312		if (css && css_tryget_online(css))
313			goto out_unlock;
314		cgrp = cgroup_parent(cgrp);
315	} while (cgrp);
316
317	css = init_css_set.subsys[ss->id];
318	css_get(css);
319out_unlock:
320	rcu_read_unlock();
321	return css;
322}
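/*
 * Example (illustrative sketch, not part of the original file): because
 * cgroup_get_e_css() always returns a pinned css, the usual pattern is
 * simply
 *
 *	css = cgroup_get_e_css(cgrp, ss);
 *	... use css ...
 *	css_put(css);
 *
 * with no RCU locking required in the caller.
 */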
323
324/* convenient tests for these bits */
325static inline bool cgroup_is_dead(const struct cgroup *cgrp)
326{
327	return !(cgrp->self.flags & CSS_ONLINE);
328}
329
330struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
331{
332	struct cgroup *cgrp = of->kn->parent->priv;
333	struct cftype *cft = of_cft(of);
334
335	/*
336	 * This is an open and unprotected implementation of cgroup_css().
337	 * of_css() is only called from a kernfs file operation which has
338	 * an active reference on the file.  Because all the subsystem
339	 * files are drained before a css is disassociated from a cgroup,
340	 * the matching css from the cgroup's subsys table is guaranteed to
341	 * be and stay valid until the enclosing operation is complete.
342	 */
343	if (cft->ss)
344		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
345	else
346		return &cgrp->self;
347}
348EXPORT_SYMBOL_GPL(of_css);
349
350/**
351 * cgroup_is_descendant - test ancestry
352 * @cgrp: the cgroup to be tested
353 * @ancestor: possible ancestor of @cgrp
354 *
355 * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
356 * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
357 * and @ancestor are accessible.
358 */
359bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
360{
361	while (cgrp) {
362		if (cgrp == ancestor)
363			return true;
364		cgrp = cgroup_parent(cgrp);
365	}
366	return false;
367}
368
369static int notify_on_release(const struct cgroup *cgrp)
370{
371	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
372}
373
374/**
375 * for_each_css - iterate all css's of a cgroup
376 * @css: the iteration cursor
377 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
378 * @cgrp: the target cgroup to iterate css's of
379 *
380 * Should be called under cgroup_mutex.
381 */
382#define for_each_css(css, ssid, cgrp)					\
383	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
384		if (!((css) = rcu_dereference_check(			\
385				(cgrp)->subsys[(ssid)],			\
386				lockdep_is_held(&cgroup_mutex)))) { }	\
387		else
388
389/**
390 * for_each_e_css - iterate all effective css's of a cgroup
391 * @css: the iteration cursor
392 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
393 * @cgrp: the target cgroup to iterate css's of
394 *
395 * Should be called under cgroup_mutex.
396 */
397#define for_each_e_css(css, ssid, cgrp)					\
398	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
399		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
400			;						\
401		else
402
403/**
404 * for_each_subsys - iterate all enabled cgroup subsystems
405 * @ss: the iteration cursor
406 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
407 */
408#define for_each_subsys(ss, ssid)					\
409	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
410	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
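/*
 * Example (illustrative sketch, not part of the original file): typical use
 * of the iterators above, e.g. logging every compiled-in subsystem:
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	for_each_subsys(ss, ssid)
 *		pr_info("subsys %d: %s\n", ssid, ss->name);
 */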
411
412/* iterate across the hierarchies */
413#define for_each_root(root)						\
414	list_for_each_entry((root), &cgroup_roots, root_list)
415
416/* iterate over child cgrps, cgroup_mutex should be held throughout iteration */
417#define cgroup_for_each_live_child(child, cgrp)				\
418	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
419		if (({ lockdep_assert_held(&cgroup_mutex);		\
420		       cgroup_is_dead(child); }))			\
421			;						\
422		else
423
424static void cgroup_release_agent(struct work_struct *work);
425static void check_for_release(struct cgroup *cgrp);
426
427/*
428 * A cgroup can be associated with multiple css_sets as different tasks may
429 * belong to different cgroups on different hierarchies.  In the other
430 * direction, a css_set is naturally associated with multiple cgroups.
431 * This M:N relationship is represented by the following link structure
432 * which exists for each association and allows traversing the associations
433 * from both sides.
434 */
435struct cgrp_cset_link {
436	/* the cgroup and css_set this link associates */
437	struct cgroup		*cgrp;
438	struct css_set		*cset;
439
440	/* list of cgrp_cset_links anchored at cgrp->cset_links */
441	struct list_head	cset_link;
442
443	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
444	struct list_head	cgrp_link;
445};
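/*
 * Example (illustrative sketch, not part of the original file): the link
 * structure is walked from either side, typically under css_set_rwsem:
 *
 *	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 *		...	every css_set associated with @cgrp
 *
 *	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 *		...	every cgroup associated with @cset
 */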
446
447/*
448 * The default css_set - used by init and its children prior to any
449 * hierarchies being mounted. It contains a pointer to the root state
450 * for each subsystem. Also used to anchor the list of css_sets. Not
451 * reference-counted, to improve performance when child cgroups
452 * haven't been created.
453 */
454struct css_set init_css_set = {
455	.refcount		= ATOMIC_INIT(1),
456	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
457	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
458	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
459	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
460	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
461};
462
463static int css_set_count	= 1;	/* 1 for init_css_set */
464
465/**
466 * cgroup_update_populated - update the populated count of a cgroup
467 * @cgrp: the target cgroup
468 * @populated: inc or dec populated count
469 *
470 * @cgrp is either getting the first task (css_set) or losing the last.
471 * Update @cgrp->populated_cnt accordingly.  The count is propagated
472 * towards root so that a given cgroup's populated_cnt is zero iff the
473 * cgroup and all its descendants are empty.
474 *
475 * @cgrp's interface file "cgroup.populated" is zero if
476 * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
477 * changes from or to zero, userland is notified that the content of the
478 * interface file has changed.  This can be used to detect when @cgrp and
479 * its descendants become populated or empty.
480 */
481static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
482{
483	lockdep_assert_held(&css_set_rwsem);
484
485	do {
486		bool trigger;
487
488		if (populated)
489			trigger = !cgrp->populated_cnt++;
490		else
491			trigger = !--cgrp->populated_cnt;
492
493		if (!trigger)
494			break;
495
496		if (cgrp->populated_kn)
497			kernfs_notify(cgrp->populated_kn);
498		cgrp = cgroup_parent(cgrp);
499	} while (cgrp);
500}
501
502/*
503 * hash table for css_sets.  This improves the performance of finding
504 * an existing css_set. This hash doesn't (currently) take into
505 * account cgroups in empty hierarchies.
506 */
507#define CSS_SET_HASH_BITS	7
508static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
509
510static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
511{
512	unsigned long key = 0UL;
513	struct cgroup_subsys *ss;
514	int i;
515
516	for_each_subsys(ss, i)
517		key += (unsigned long)css[i];
518	key = (key >> 16) ^ key;
519
520	return key;
521}
522
523static void put_css_set_locked(struct css_set *cset)
524{
525	struct cgrp_cset_link *link, *tmp_link;
526	struct cgroup_subsys *ss;
527	int ssid;
528
529	lockdep_assert_held(&css_set_rwsem);
530
531	if (!atomic_dec_and_test(&cset->refcount))
532		return;
533
534	/* This css_set is dead.  Unlink it and release cgroup refcounts. */
535	for_each_subsys(ss, ssid)
536		list_del(&cset->e_cset_node[ssid]);
537	hash_del(&cset->hlist);
538	css_set_count--;
539
540	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
541		struct cgroup *cgrp = link->cgrp;
542
543		list_del(&link->cset_link);
544		list_del(&link->cgrp_link);
545
546		/* @cgrp can't go away while we're holding css_set_rwsem */
547		if (list_empty(&cgrp->cset_links)) {
548			cgroup_update_populated(cgrp, false);
549			check_for_release(cgrp);
550		}
551
552		kfree(link);
553	}
554
555	kfree_rcu(cset, rcu_head);
556}
557
558static void put_css_set(struct css_set *cset)
559{
560	/*
561	 * Ensure that the refcount doesn't hit zero while any readers
562	 * can see it. Similar to atomic_dec_and_lock(), but for an
563	 * rwsem.
564	 */
565	if (atomic_add_unless(&cset->refcount, -1, 1))
566		return;
567
568	down_write(&css_set_rwsem);
569	put_css_set_locked(cset);
570	up_write(&css_set_rwsem);
571}
572
573/*
574 * refcounted get/put for css_set objects
575 */
576static inline void get_css_set(struct css_set *cset)
577{
578	atomic_inc(&cset->refcount);
579}
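/*
 * Example (illustrative sketch, not part of the original file): temporary
 * users of a css_set bracket the access with the helpers above:
 *
 *	get_css_set(cset);
 *	... use cset ...
 *	put_css_set(cset);
 *
 * put_css_set() only takes css_set_rwsem when the count may hit zero.
 */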
580
581/**
582 * compare_css_sets - helper function for find_existing_css_set().
583 * @cset: candidate css_set being tested
584 * @old_cset: existing css_set for a task
585 * @new_cgrp: cgroup that's being entered by the task
586 * @template: desired set of css pointers in css_set (pre-calculated)
587 *
588 * Returns true if "cset" matches "old_cset" except for the hierarchy
589 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
590 */
591static bool compare_css_sets(struct css_set *cset,
592			     struct css_set *old_cset,
593			     struct cgroup *new_cgrp,
594			     struct cgroup_subsys_state *template[])
595{
596	struct list_head *l1, *l2;
597
598	/*
599	 * On the default hierarchy, there can be csets which are
600	 * associated with the same set of cgroups but different csses.
601	 * Let's first ensure that csses match.
602	 */
603	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
604		return false;
605
606	/*
607	 * Compare cgroup pointers in order to distinguish between
608	 * different cgroups in hierarchies.  As different cgroups may
609	 * share the same effective css, this comparison is always
610	 * necessary.
611	 */
612	l1 = &cset->cgrp_links;
613	l2 = &old_cset->cgrp_links;
614	while (1) {
615		struct cgrp_cset_link *link1, *link2;
616		struct cgroup *cgrp1, *cgrp2;
617
618		l1 = l1->next;
619		l2 = l2->next;
620		/* See if we reached the end - both lists are equal length. */
621		if (l1 == &cset->cgrp_links) {
622			BUG_ON(l2 != &old_cset->cgrp_links);
623			break;
624		} else {
625			BUG_ON(l2 == &old_cset->cgrp_links);
626		}
627		/* Locate the cgroups associated with these links. */
628		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
629		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
630		cgrp1 = link1->cgrp;
631		cgrp2 = link2->cgrp;
632		/* Hierarchies should be linked in the same order. */
633		BUG_ON(cgrp1->root != cgrp2->root);
634
635		/*
636		 * If this hierarchy is the hierarchy of the cgroup
637		 * that's changing, then we need to check that this
638		 * css_set points to the new cgroup; if it's any other
639		 * hierarchy, then this css_set should point to the
640		 * same cgroup as the old css_set.
641		 */
642		if (cgrp1->root == new_cgrp->root) {
643			if (cgrp1 != new_cgrp)
644				return false;
645		} else {
646			if (cgrp1 != cgrp2)
647				return false;
648		}
649	}
650	return true;
651}
652
653/**
654 * find_existing_css_set - init css array and find the matching css_set
655 * @old_cset: the css_set that we're using before the cgroup transition
656 * @cgrp: the cgroup that we're moving into
657 * @template: out param for the new set of csses, should be clear on entry
658 */
659static struct css_set *find_existing_css_set(struct css_set *old_cset,
660					struct cgroup *cgrp,
661					struct cgroup_subsys_state *template[])
662{
663	struct cgroup_root *root = cgrp->root;
664	struct cgroup_subsys *ss;
665	struct css_set *cset;
666	unsigned long key;
667	int i;
668
669	/*
670	 * Build the set of subsystem state objects that we want to see in the
671	 * new css_set.  While subsystems can change globally, the entries here
672	 * won't change, so no need for locking.
673	 */
674	for_each_subsys(ss, i) {
675		if (root->subsys_mask & (1UL << i)) {
676			/*
677			 * @ss is in this hierarchy, so we want the
678			 * effective css from @cgrp.
679			 */
680			template[i] = cgroup_e_css(cgrp, ss);
681		} else {
682			/*
683			 * @ss is not in this hierarchy, so we don't want
684			 * to change the css.
685			 */
686			template[i] = old_cset->subsys[i];
687		}
688	}
689
690	key = css_set_hash(template);
691	hash_for_each_possible(css_set_table, cset, hlist, key) {
692		if (!compare_css_sets(cset, old_cset, cgrp, template))
693			continue;
694
695		/* This css_set matches what we need */
696		return cset;
697	}
698
699	/* No existing css_set matched */
700	return NULL;
701}
702
703static void free_cgrp_cset_links(struct list_head *links_to_free)
704{
705	struct cgrp_cset_link *link, *tmp_link;
706
707	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
708		list_del(&link->cset_link);
709		kfree(link);
710	}
711}
712
713/**
714 * allocate_cgrp_cset_links - allocate cgrp_cset_links
715 * @count: the number of links to allocate
716 * @tmp_links: list_head the allocated links are put on
717 *
718 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
719 * through ->cset_link.  Returns 0 on success or -errno.
720 */
721static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
722{
723	struct cgrp_cset_link *link;
724	int i;
725
726	INIT_LIST_HEAD(tmp_links);
727
728	for (i = 0; i < count; i++) {
729		link = kzalloc(sizeof(*link), GFP_KERNEL);
730		if (!link) {
731			free_cgrp_cset_links(tmp_links);
732			return -ENOMEM;
733		}
734		list_add(&link->cset_link, tmp_links);
735	}
736	return 0;
737}
738
739/**
740 * link_css_set - a helper function to link a css_set to a cgroup
741 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
742 * @cset: the css_set to be linked
743 * @cgrp: the destination cgroup
744 */
745static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
746			 struct cgroup *cgrp)
747{
748	struct cgrp_cset_link *link;
749
750	BUG_ON(list_empty(tmp_links));
751
752	if (cgroup_on_dfl(cgrp))
753		cset->dfl_cgrp = cgrp;
754
755	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
756	link->cset = cset;
757	link->cgrp = cgrp;
758
759	if (list_empty(&cgrp->cset_links))
760		cgroup_update_populated(cgrp, true);
761	list_move(&link->cset_link, &cgrp->cset_links);
762
763	/*
764	 * Always add links to the tail of the list so that the list
765	 * is sorted by order of hierarchy creation.
766	 */
767	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
768}
769
770/**
771 * find_css_set - return a new css_set with one cgroup updated
772 * @old_cset: the baseline css_set
773 * @cgrp: the cgroup to be updated
774 *
775 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
776 * substituted into the appropriate hierarchy.
777 */
778static struct css_set *find_css_set(struct css_set *old_cset,
779				    struct cgroup *cgrp)
780{
781	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
782	struct css_set *cset;
783	struct list_head tmp_links;
784	struct cgrp_cset_link *link;
785	struct cgroup_subsys *ss;
786	unsigned long key;
787	int ssid;
788
789	lockdep_assert_held(&cgroup_mutex);
790
791	/* First see if we already have a css_set that matches
792	 * the desired set */
793	down_read(&css_set_rwsem);
794	cset = find_existing_css_set(old_cset, cgrp, template);
795	if (cset)
796		get_css_set(cset);
797	up_read(&css_set_rwsem);
798
799	if (cset)
800		return cset;
801
802	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
803	if (!cset)
804		return NULL;
805
806	/* Allocate all the cgrp_cset_link objects that we'll need */
807	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
808		kfree(cset);
809		return NULL;
810	}
811
812	atomic_set(&cset->refcount, 1);
813	INIT_LIST_HEAD(&cset->cgrp_links);
814	INIT_LIST_HEAD(&cset->tasks);
815	INIT_LIST_HEAD(&cset->mg_tasks);
816	INIT_LIST_HEAD(&cset->mg_preload_node);
817	INIT_LIST_HEAD(&cset->mg_node);
818	INIT_HLIST_NODE(&cset->hlist);
819
820	/* Copy the set of subsystem state objects generated in
821	 * find_existing_css_set() */
822	memcpy(cset->subsys, template, sizeof(cset->subsys));
823
824	down_write(&css_set_rwsem);
825	/* Add reference counts and links from the new css_set. */
826	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
827		struct cgroup *c = link->cgrp;
828
829		if (c->root == cgrp->root)
830			c = cgrp;
831		link_css_set(&tmp_links, cset, c);
832	}
833
834	BUG_ON(!list_empty(&tmp_links));
835
836	css_set_count++;
837
838	/* Add @cset to the hash table */
839	key = css_set_hash(cset->subsys);
840	hash_add(css_set_table, &cset->hlist, key);
841
842	for_each_subsys(ss, ssid)
843		list_add_tail(&cset->e_cset_node[ssid],
844			      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
845
846	up_write(&css_set_rwsem);
847
848	return cset;
849}
850
851static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
852{
853	struct cgroup *root_cgrp = kf_root->kn->priv;
854
855	return root_cgrp->root;
856}
857
858static int cgroup_init_root_id(struct cgroup_root *root)
859{
860	int id;
861
862	lockdep_assert_held(&cgroup_mutex);
863
864	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
865	if (id < 0)
866		return id;
867
868	root->hierarchy_id = id;
869	return 0;
870}
871
872static void cgroup_exit_root_id(struct cgroup_root *root)
873{
874	lockdep_assert_held(&cgroup_mutex);
875
876	if (root->hierarchy_id) {
877		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
878		root->hierarchy_id = 0;
879	}
880}
881
882static void cgroup_free_root(struct cgroup_root *root)
883{
884	if (root) {
885		/* hierarchy ID should already have been released */
886		WARN_ON_ONCE(root->hierarchy_id);
887
888		idr_destroy(&root->cgroup_idr);
889		kfree(root);
890	}
891}
892
893static void cgroup_destroy_root(struct cgroup_root *root)
894{
895	struct cgroup *cgrp = &root->cgrp;
896	struct cgrp_cset_link *link, *tmp_link;
897
898	mutex_lock(&cgroup_mutex);
899
900	BUG_ON(atomic_read(&root->nr_cgrps));
901	BUG_ON(!list_empty(&cgrp->self.children));
902
903	/* Rebind all subsystems back to the default hierarchy */
904	rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
905
906	/*
907	 * Release all the links from cset_links to this hierarchy's
908	 * root cgroup
909	 */
910	down_write(&css_set_rwsem);
911
912	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
913		list_del(&link->cset_link);
914		list_del(&link->cgrp_link);
915		kfree(link);
916	}
917	up_write(&css_set_rwsem);
918
919	if (!list_empty(&root->root_list)) {
920		list_del(&root->root_list);
921		cgroup_root_count--;
922	}
923
924	cgroup_exit_root_id(root);
925
926	mutex_unlock(&cgroup_mutex);
927
928	kernfs_destroy_root(root->kf_root);
929	cgroup_free_root(root);
930}
931
932/* look up cgroup associated with given css_set on the specified hierarchy */
933static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
934					    struct cgroup_root *root)
935{
936	struct cgroup *res = NULL;
937
938	lockdep_assert_held(&cgroup_mutex);
939	lockdep_assert_held(&css_set_rwsem);
940
941	if (cset == &init_css_set) {
942		res = &root->cgrp;
943	} else {
944		struct cgrp_cset_link *link;
945
946		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
947			struct cgroup *c = link->cgrp;
948
949			if (c->root == root) {
950				res = c;
951				break;
952			}
953		}
954	}
955
956	BUG_ON(!res);
957	return res;
958}
959
960/*
961 * Return the cgroup for "task" from the given hierarchy. Must be
962 * called with cgroup_mutex and css_set_rwsem held.
963 */
964static struct cgroup *task_cgroup_from_root(struct task_struct *task,
965					    struct cgroup_root *root)
966{
967	/*
968	 * No need to lock the task - since we hold cgroup_mutex the
969	 * task can't change groups, so the only thing that can happen
970	 * is that it exits and its css is set back to init_css_set.
971	 */
972	return cset_cgroup_from_root(task_css_set(task), root);
973}
974
975/*
976 * A task must hold cgroup_mutex to modify cgroups.
977 *
978 * Any task can increment and decrement the count field without lock.
979 * So in general, code holding cgroup_mutex can't rely on the count
980 * field not changing.  However, if the count goes to zero, then only
981 * cgroup_attach_task() can increment it again.  Because a count of zero
982 * means that no tasks are currently attached, therefore there is no
983 * way a task attached to that cgroup can fork (the other way to
984 * increment the count).  So code holding cgroup_mutex can safely
985 * assume that if the count is zero, it will stay zero. Similarly, if
986 * a task holds cgroup_mutex on a cgroup with zero count, it
987 * knows that the cgroup won't be removed, as cgroup_rmdir()
988 * needs that mutex.
989 *
990 * A cgroup can only be deleted if both its 'count' of using tasks
991 * is zero, and its list of 'children' cgroups is empty.  Since all
992 * tasks in the system use _some_ cgroup, and since there is always at
993 * least one task in the system (init, pid == 1), the root cgroup
994 * always has either child cgroups and/or using tasks.  So we don't
995 * need a special hack to ensure that the root cgroup cannot be deleted.
996 *
997 * P.S.  One more locking exception.  RCU is used to guard the
998 * update of a task's cgroup pointer by cgroup_attach_task().
999 */
1000
1001static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
1002static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1003static const struct file_operations proc_cgroupstats_operations;
1004
1005static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1006			      char *buf)
1007{
1008	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1009	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1010		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1011			 cft->ss->name, cft->name);
1012	else
1013		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1014	return buf;
1015}
1016
1017/**
1018 * cgroup_file_mode - deduce file mode of a control file
1019 * @cft: the control file in question
1020 *
1021 * returns cft->mode if ->mode is not 0
1022 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1023 * returns S_IRUGO if it has only a read handler
1024 * returns S_IWUSR if it has only a write handler
1025 */
1026static umode_t cgroup_file_mode(const struct cftype *cft)
1027{
1028	umode_t mode = 0;
1029
1030	if (cft->mode)
1031		return cft->mode;
1032
1033	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1034		mode |= S_IRUGO;
1035
1036	if (cft->write_u64 || cft->write_s64 || cft->write)
1037		mode |= S_IWUSR;
1038
1039	return mode;
1040}
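/*
 * Example (illustrative sketch, not part of the original file): with the
 * rules above, a cftype that only sets .seq_show gets S_IRUGO (0444) while
 * one that also sets .write gets S_IRUGO | S_IWUSR (0644), e.g.
 *
 *	static struct cftype example_files[] = {
 *		{
 *			.name = "example.current",
 *			.seq_show = example_show,
 *		},
 *		{
 *			.name = "example.max",
 *			.seq_show = example_show,
 *			.write = example_write,
 *		},
 *		{ }
 *	};
 *
 * The empty entry terminates the array; all "example_*" names are
 * hypothetical.
 */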
1041
1042static void cgroup_get(struct cgroup *cgrp)
1043{
1044	WARN_ON_ONCE(cgroup_is_dead(cgrp));
1045	css_get(&cgrp->self);
1046}
1047
1048static bool cgroup_tryget(struct cgroup *cgrp)
1049{
1050	return css_tryget(&cgrp->self);
1051}
1052
1053static void cgroup_put(struct cgroup *cgrp)
1054{
1055	css_put(&cgrp->self);
1056}
1057
1058/**
1059 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1060 * @cgrp: the target cgroup
1061 * @subtree_control: the new subtree_control mask to consider
1062 *
1063 * On the default hierarchy, a subsystem may request other subsystems to be
1064 * enabled together through its ->depends_on mask.  In such cases, more
1065 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1066 *
1067 * This function calculates which subsystems need to be enabled if
1068 * @subtree_control is to be applied to @cgrp.  The returned mask is always
1069 * a superset of @subtree_control and follows the usual hierarchy rules.
1070 */
1071static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072						  unsigned int subtree_control)
1073{
1074	struct cgroup *parent = cgroup_parent(cgrp);
1075	unsigned int cur_ss_mask = subtree_control;
1076	struct cgroup_subsys *ss;
1077	int ssid;
1078
1079	lockdep_assert_held(&cgroup_mutex);
1080
1081	if (!cgroup_on_dfl(cgrp))
1082		return cur_ss_mask;
1083
1084	while (true) {
1085		unsigned int new_ss_mask = cur_ss_mask;
1086
1087		for_each_subsys(ss, ssid)
1088			if (cur_ss_mask & (1 << ssid))
1089				new_ss_mask |= ss->depends_on;
1090
1091		/*
1092		 * Mask out subsystems which aren't available.  This can
1093		 * happen only if some depended-upon subsystems were bound
1094		 * to non-default hierarchies.
1095		 */
1096		if (parent)
1097			new_ss_mask &= parent->child_subsys_mask;
1098		else
1099			new_ss_mask &= cgrp->root->subsys_mask;
1100
1101		if (new_ss_mask == cur_ss_mask)
1102			break;
1103		cur_ss_mask = new_ss_mask;
1104	}
1105
1106	return cur_ss_mask;
1107}
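/*
 * Example (illustrative sketch, not part of the original file): suppose
 * "cgroup.subtree_control" enables only controller A and A's ->depends_on
 * contains B.  The loop above then converges as:
 *
 *	pass 1:	cur = {A},    new = {A} | depends_on(A) = {A, B}
 *	pass 2:	cur = {A, B}, new = {A, B}  ->  stable, loop exits
 *
 * subject to the masking against the parent's child_subsys_mask (or the
 * root's subsys_mask), which drops B again if it isn't available on this
 * hierarchy.  A and B are placeholders, not real controller names.
 */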
1108
1109/**
1110 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1111 * @cgrp: the target cgroup
1112 *
1113 * Update @cgrp->child_subsys_mask according to the current
1114 * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
1115 */
1116static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1117{
1118	cgrp->child_subsys_mask =
1119		cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
1120}
1121
1122/**
1123 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1124 * @kn: the kernfs_node being serviced
1125 *
1126 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1127 * the method finishes if locking succeeded.  Note that once this function
1128 * returns, the cgroup returned by cgroup_kn_lock_live() may become
1129 * inaccessible at any time.  If the caller intends to continue to access the
1130 * cgroup, it should pin it before invoking this function.
1131 */
1132static void cgroup_kn_unlock(struct kernfs_node *kn)
1133{
1134	struct cgroup *cgrp;
1135
1136	if (kernfs_type(kn) == KERNFS_DIR)
1137		cgrp = kn->priv;
1138	else
1139		cgrp = kn->parent->priv;
1140
1141	mutex_unlock(&cgroup_mutex);
1142
1143	kernfs_unbreak_active_protection(kn);
1144	cgroup_put(cgrp);
1145}
1146
1147/**
1148 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1149 * @kn: the kernfs_node being serviced
1150 *
1151 * This helper is to be used by a cgroup kernfs method currently servicing
1152 * @kn.  It breaks the active protection, performs cgroup locking and
1153 * verifies that the associated cgroup is alive.  Returns the cgroup if
1154 * alive; otherwise, %NULL.  A successful return should be undone by a
1155 * matching cgroup_kn_unlock() invocation.
1156 *
1157 * Any cgroup kernfs method implementation which requires locking the
1158 * associated cgroup should use this helper.  It avoids nesting cgroup
1159 * locking under kernfs active protection and allows all kernfs operations
1160 * including self-removal.
1161 */
1162static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1163{
1164	struct cgroup *cgrp;
1165
1166	if (kernfs_type(kn) == KERNFS_DIR)
1167		cgrp = kn->priv;
1168	else
1169		cgrp = kn->parent->priv;
1170
1171	/*
1172	 * We're gonna grab cgroup_mutex which nests outside kernfs
1173	 * active_ref.  cgroup liveliness check alone provides enough
1174	 * protection against removal.  Ensure @cgrp stays accessible and
1175	 * break the active_ref protection.
1176	 */
1177	if (!cgroup_tryget(cgrp))
1178		return NULL;
1179	kernfs_break_active_protection(kn);
1180
1181	mutex_lock(&cgroup_mutex);
1182
1183	if (!cgroup_is_dead(cgrp))
1184		return cgrp;
1185
1186	cgroup_kn_unlock(kn);
1187	return NULL;
1188}
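/*
 * Example (illustrative sketch, not part of the original file): the
 * expected calling pattern from a cgroup kernfs method:
 *
 *	cgrp = cgroup_kn_lock_live(kn);
 *	if (!cgrp)
 *		return -ENODEV;
 *	... operate on @cgrp under cgroup_mutex ...
 *	cgroup_kn_unlock(kn);
 *
 * The exact error code returned for a dead cgroup is up to the caller.
 */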
1189
1190static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1191{
1192	char name[CGROUP_FILE_NAME_MAX];
1193
1194	lockdep_assert_held(&cgroup_mutex);
1195	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1196}
1197
1198/**
1199 * cgroup_clear_dir - remove subsys files in a cgroup directory
1200 * @cgrp: target cgroup
1201 * @subsys_mask: mask of the subsystem ids whose files should be removed
1202 */
1203static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
1204{
1205	struct cgroup_subsys *ss;
1206	int i;
1207
1208	for_each_subsys(ss, i) {
1209		struct cftype *cfts;
1210
1211		if (!(subsys_mask & (1 << i)))
1212			continue;
1213		list_for_each_entry(cfts, &ss->cfts, node)
1214			cgroup_addrm_files(cgrp, cfts, false);
1215	}
1216}
1217
1218static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1219{
1220	struct cgroup_subsys *ss;
1221	unsigned int tmp_ss_mask;
1222	int ssid, i, ret;
1223
1224	lockdep_assert_held(&cgroup_mutex);
1225
1226	for_each_subsys(ss, ssid) {
1227		if (!(ss_mask & (1 << ssid)))
1228			continue;
1229
1230		/* if @ss has non-root csses attached to it, can't move */
1231		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
1232			return -EBUSY;
1233
1234		/* can't move between two non-dummy roots either */
1235		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1236			return -EBUSY;
1237	}
1238
1239	/* skip creating root files on dfl_root for inhibited subsystems */
1240	tmp_ss_mask = ss_mask;
1241	if (dst_root == &cgrp_dfl_root)
1242		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1243
1244	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1245	if (ret) {
1246		if (dst_root != &cgrp_dfl_root)
1247			return ret;
1248
1249		/*
1250		 * Rebinding back to the default root is not allowed to
1251		 * fail.  Using both default and non-default roots should
1252		 * be rare.  Moving subsystems back and forth even more so.
1253		 * Just warn about it and continue.
1254		 */
1255		if (cgrp_dfl_root_visible) {
1256			pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1257				ret, ss_mask);
1258			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1259		}
1260	}
1261
1262	/*
1263	 * Nothing can fail from this point on.  Remove files for the
1264	 * removed subsystems and rebind each subsystem.
1265	 */
1266	for_each_subsys(ss, ssid)
1267		if (ss_mask & (1 << ssid))
1268			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1269
1270	for_each_subsys(ss, ssid) {
1271		struct cgroup_root *src_root;
1272		struct cgroup_subsys_state *css;
1273		struct css_set *cset;
1274
1275		if (!(ss_mask & (1 << ssid)))
1276			continue;
1277
1278		src_root = ss->root;
1279		css = cgroup_css(&src_root->cgrp, ss);
1280
1281		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1282
1283		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1284		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1285		ss->root = dst_root;
1286		css->cgroup = &dst_root->cgrp;
1287
1288		down_write(&css_set_rwsem);
1289		hash_for_each(css_set_table, i, cset, hlist)
1290			list_move_tail(&cset->e_cset_node[ss->id],
1291				       &dst_root->cgrp.e_csets[ss->id]);
1292		up_write(&css_set_rwsem);
1293
1294		src_root->subsys_mask &= ~(1 << ssid);
1295		src_root->cgrp.subtree_control &= ~(1 << ssid);
1296		cgroup_refresh_child_subsys_mask(&src_root->cgrp);
1297
1298		/* default hierarchy doesn't enable controllers by default */
1299		dst_root->subsys_mask |= 1 << ssid;
1300		if (dst_root != &cgrp_dfl_root) {
1301			dst_root->cgrp.subtree_control |= 1 << ssid;
1302			cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
1303		}
1304
1305		if (ss->bind)
1306			ss->bind(css);
1307	}
1308
1309	kernfs_activate(dst_root->cgrp.kn);
1310	return 0;
1311}
1312
1313static int cgroup_show_options(struct seq_file *seq,
1314			       struct kernfs_root *kf_root)
1315{
1316	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1317	struct cgroup_subsys *ss;
1318	int ssid;
1319
1320	for_each_subsys(ss, ssid)
1321		if (root->subsys_mask & (1 << ssid))
1322			seq_show_option(seq, ss->name, NULL);
1323	if (root->flags & CGRP_ROOT_NOPREFIX)
1324		seq_puts(seq, ",noprefix");
1325	if (root->flags & CGRP_ROOT_XATTR)
1326		seq_puts(seq, ",xattr");
1327
1328	spin_lock(&release_agent_path_lock);
1329	if (strlen(root->release_agent_path))
1330		seq_show_option(seq, "release_agent",
1331				root->release_agent_path);
1332	spin_unlock(&release_agent_path_lock);
1333
1334	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1335		seq_puts(seq, ",clone_children");
1336	if (strlen(root->name))
1337		seq_show_option(seq, "name", root->name);
1338	return 0;
1339}
1340
1341struct cgroup_sb_opts {
1342	unsigned int subsys_mask;
1343	unsigned int flags;
1344	char *release_agent;
1345	bool cpuset_clone_children;
1346	char *name;
1347	/* User explicitly requested empty subsystem */
1348	bool none;
1349};
1350
1351static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1352{
1353	char *token, *o = data;
1354	bool all_ss = false, one_ss = false;
1355	unsigned int mask = -1U;
1356	struct cgroup_subsys *ss;
1357	int nr_opts = 0;
1358	int i;
1359
1360#ifdef CONFIG_CPUSETS
1361	mask = ~(1U << cpuset_cgrp_id);
1362#endif
1363
1364	memset(opts, 0, sizeof(*opts));
1365
1366	while ((token = strsep(&o, ",")) != NULL) {
1367		nr_opts++;
1368
1369		if (!*token)
1370			return -EINVAL;
1371		if (!strcmp(token, "none")) {
1372			/* Explicitly have no subsystems */
1373			opts->none = true;
1374			continue;
1375		}
1376		if (!strcmp(token, "all")) {
1377			/* Mutually exclusive option 'all' + subsystem name */
1378			if (one_ss)
1379				return -EINVAL;
1380			all_ss = true;
1381			continue;
1382		}
1383		if (!strcmp(token, "__DEVEL__sane_behavior")) {
1384			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1385			continue;
1386		}
1387		if (!strcmp(token, "noprefix")) {
1388			opts->flags |= CGRP_ROOT_NOPREFIX;
1389			continue;
1390		}
1391		if (!strcmp(token, "clone_children")) {
1392			opts->cpuset_clone_children = true;
1393			continue;
1394		}
1395		if (!strcmp(token, "xattr")) {
1396			opts->flags |= CGRP_ROOT_XATTR;
1397			continue;
1398		}
1399		if (!strncmp(token, "release_agent=", 14)) {
1400			/* Specifying two release agents is forbidden */
1401			if (opts->release_agent)
1402				return -EINVAL;
1403			opts->release_agent =
1404				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1405			if (!opts->release_agent)
1406				return -ENOMEM;
1407			continue;
1408		}
1409		if (!strncmp(token, "name=", 5)) {
1410			const char *name = token + 5;
1411			/* Can't specify an empty name */
1412			if (!strlen(name))
1413				return -EINVAL;
1414			/* Must match [\w.-]+ */
1415			for (i = 0; i < strlen(name); i++) {
1416				char c = name[i];
1417				if (isalnum(c))
1418					continue;
1419				if ((c == '.') || (c == '-') || (c == '_'))
1420					continue;
1421				return -EINVAL;
1422			}
1423			/* Specifying two names is forbidden */
1424			if (opts->name)
1425				return -EINVAL;
1426			opts->name = kstrndup(name,
1427					      MAX_CGROUP_ROOT_NAMELEN - 1,
1428					      GFP_KERNEL);
1429			if (!opts->name)
1430				return -ENOMEM;
1431
1432			continue;
1433		}
1434
1435		for_each_subsys(ss, i) {
1436			if (strcmp(token, ss->name))
1437				continue;
1438			if (ss->disabled)
1439				continue;
1440
1441			/* Mutually exclusive option 'all' + subsystem name */
1442			if (all_ss)
1443				return -EINVAL;
1444			opts->subsys_mask |= (1 << i);
1445			one_ss = true;
1446
1447			break;
1448		}
1449		if (i == CGROUP_SUBSYS_COUNT)
1450			return -ENOENT;
1451	}
1452
1453	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1454		pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1455		if (nr_opts != 1) {
1456			pr_err("sane_behavior: no other mount options allowed\n");
1457			return -EINVAL;
1458		}
1459		return 0;
1460	}
1461
1462	/*
1463	 * If the 'all' option was specified, select all the subsystems.
1464	 * Otherwise, if none of 'none', 'name=' or a subsystem name was
1465	 * specified, default to 'all'.
1466	 */
1467	if (all_ss || (!one_ss && !opts->none && !opts->name))
1468		for_each_subsys(ss, i)
1469			if (!ss->disabled)
1470				opts->subsys_mask |= (1 << i);
1471
1472	/*
1473	 * We either have to specify by name or by subsystems. (So all
1474	 * empty hierarchies must have a name).
1475	 */
1476	if (!opts->subsys_mask && !opts->name)
1477		return -EINVAL;
1478
1479	/*
1480	 * Option noprefix was introduced just for backward compatibility
1481	 * with the old cpuset, so we allow noprefix only if mounting just
1482	 * the cpuset subsystem.
1483	 */
1484	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1485		return -EINVAL;
1486
1487	/* Can't specify "none" and some subsystems */
1488	if (opts->subsys_mask && opts->none)
1489		return -EINVAL;
1490
1491	return 0;
1492}
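/*
 * Example (illustrative sketch, not part of the original file): for mount
 * data such as
 *
 *	"cpuset,noprefix,name=boot"
 *
 * the parser above sets the cpuset bit in opts->subsys_mask, sets
 * CGRP_ROOT_NOPREFIX in opts->flags and duplicates "boot" into opts->name.
 * noprefix is accepted here only because cpuset is the sole subsystem
 * requested; combining it with other subsystems fails with -EINVAL.
 */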
1493
1494static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1495{
1496	int ret = 0;
1497	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1498	struct cgroup_sb_opts opts;
1499	unsigned int added_mask, removed_mask;
1500
1501	if (root == &cgrp_dfl_root) {
1502		pr_err("remount is not allowed\n");
1503		return -EINVAL;
1504	}
1505
1506	mutex_lock(&cgroup_mutex);
1507
1508	/* See what subsystems are wanted */
1509	ret = parse_cgroupfs_options(data, &opts);
1510	if (ret)
1511		goto out_unlock;
1512
1513	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1514		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1515			task_tgid_nr(current), current->comm);
1516
1517	added_mask = opts.subsys_mask & ~root->subsys_mask;
1518	removed_mask = root->subsys_mask & ~opts.subsys_mask;
1519
1520	/* Don't allow flags or name to change at remount */
1521	if ((opts.flags ^ root->flags) ||
1522	    (opts.name && strcmp(opts.name, root->name))) {
1523		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1524		       opts.flags, opts.name ?: "", root->flags, root->name);
1525		ret = -EINVAL;
1526		goto out_unlock;
1527	}
1528
1529	/* remounting is not allowed for populated hierarchies */
1530	if (!list_empty(&root->cgrp.self.children)) {
1531		ret = -EBUSY;
1532		goto out_unlock;
1533	}
1534
1535	ret = rebind_subsystems(root, added_mask);
1536	if (ret)
1537		goto out_unlock;
1538
1539	rebind_subsystems(&cgrp_dfl_root, removed_mask);
1540
1541	if (opts.release_agent) {
1542		spin_lock(&release_agent_path_lock);
1543		strcpy(root->release_agent_path, opts.release_agent);
1544		spin_unlock(&release_agent_path_lock);
1545	}
1546 out_unlock:
1547	kfree(opts.release_agent);
1548	kfree(opts.name);
1549	mutex_unlock(&cgroup_mutex);
1550	return ret;
1551}
1552
1553/*
1554 * To reduce the fork() overhead for systems that are not actually using
1555 * their cgroups capability, we don't maintain the lists running through
1556 * each css_set to its tasks until we see the list actually used - in other
1557 * words after the first mount.
1558 */
1559static bool use_task_css_set_links __read_mostly;
1560
1561static void cgroup_enable_task_cg_lists(void)
1562{
1563	struct task_struct *p, *g;
1564
1565	down_write(&css_set_rwsem);
1566
1567	if (use_task_css_set_links)
1568		goto out_unlock;
1569
1570	use_task_css_set_links = true;
1571
1572	/*
1573	 * We need tasklist_lock because RCU is not safe against
1574	 * while_each_thread(). Besides, a forking task that has passed
1575	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1576	 * is not guaranteed to have its child immediately visible in the
1577	 * tasklist if we walk through it with RCU.
1578	 */
1579	read_lock(&tasklist_lock);
1580	do_each_thread(g, p) {
1581		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1582			     task_css_set(p) != &init_css_set);
1583
1584		/*
1585		 * We should check if the process is exiting, otherwise
1586		 * it will race with cgroup_exit() in that the list
1587		 * entry won't be deleted though the process has exited.
1588		 * Do it while holding siglock so that we don't end up
1589		 * racing against cgroup_exit().
1590		 */
1591		spin_lock_irq(&p->sighand->siglock);
1592		if (!(p->flags & PF_EXITING)) {
1593			struct css_set *cset = task_css_set(p);
1594
1595			list_add(&p->cg_list, &cset->tasks);
1596			get_css_set(cset);
1597		}
1598		spin_unlock_irq(&p->sighand->siglock);
1599	} while_each_thread(g, p);
1600	read_unlock(&tasklist_lock);
1601out_unlock:
1602	up_write(&css_set_rwsem);
1603}
1604
1605static void init_cgroup_housekeeping(struct cgroup *cgrp)
1606{
1607	struct cgroup_subsys *ss;
1608	int ssid;
1609
1610	INIT_LIST_HEAD(&cgrp->self.sibling);
1611	INIT_LIST_HEAD(&cgrp->self.children);
1612	INIT_LIST_HEAD(&cgrp->cset_links);
1613	INIT_LIST_HEAD(&cgrp->pidlists);
1614	mutex_init(&cgrp->pidlist_mutex);
1615	cgrp->self.cgroup = cgrp;
1616	cgrp->self.flags |= CSS_ONLINE;
1617
1618	for_each_subsys(ss, ssid)
1619		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1620
1621	init_waitqueue_head(&cgrp->offline_waitq);
1622	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1623}
1624
1625static void init_cgroup_root(struct cgroup_root *root,
1626			     struct cgroup_sb_opts *opts)
1627{
1628	struct cgroup *cgrp = &root->cgrp;
1629
1630	INIT_LIST_HEAD(&root->root_list);
1631	atomic_set(&root->nr_cgrps, 1);
1632	cgrp->root = root;
1633	init_cgroup_housekeeping(cgrp);
1634	idr_init(&root->cgroup_idr);
1635
1636	root->flags = opts->flags;
1637	if (opts->release_agent)
1638		strcpy(root->release_agent_path, opts->release_agent);
1639	if (opts->name)
1640		strcpy(root->name, opts->name);
1641	if (opts->cpuset_clone_children)
1642		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1643}
1644
1645static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1646{
1647	LIST_HEAD(tmp_links);
1648	struct cgroup *root_cgrp = &root->cgrp;
1649	struct cftype *base_files;
1650	struct css_set *cset;
1651	int i, ret;
1652
1653	lockdep_assert_held(&cgroup_mutex);
1654
1655	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1656	if (ret < 0)
1657		goto out;
1658	root_cgrp->id = ret;
1659
1660	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1661			      GFP_KERNEL);
1662	if (ret)
1663		goto out;
1664
1665	/*
1666	 * We're accessing css_set_count without locking css_set_rwsem here,
1667	 * but that's OK - it can only be increased by someone holding
1668	 * cgroup_mutex, and that's us.  The worst that can happen is that we
1669	 * have some link structures left over.
1670	 */
1671	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1672	if (ret)
1673		goto cancel_ref;
1674
1675	ret = cgroup_init_root_id(root);
1676	if (ret)
1677		goto cancel_ref;
1678
1679	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1680					   KERNFS_ROOT_CREATE_DEACTIVATED,
1681					   root_cgrp);
1682	if (IS_ERR(root->kf_root)) {
1683		ret = PTR_ERR(root->kf_root);
1684		goto exit_root_id;
1685	}
1686	root_cgrp->kn = root->kf_root->kn;
1687
1688	if (root == &cgrp_dfl_root)
1689		base_files = cgroup_dfl_base_files;
1690	else
1691		base_files = cgroup_legacy_base_files;
1692
1693	ret = cgroup_addrm_files(root_cgrp, base_files, true);
1694	if (ret)
1695		goto destroy_root;
1696
1697	ret = rebind_subsystems(root, ss_mask);
1698	if (ret)
1699		goto destroy_root;
1700
1701	/*
1702	 * There must be no failure case after here, since rebinding takes
1703	 * care of subsystems' refcounts, which are explicitly dropped in
1704	 * the failure exit path.
1705	 */
1706	list_add(&root->root_list, &cgroup_roots);
1707	cgroup_root_count++;
1708
1709	/*
1710	 * Link the root cgroup in this hierarchy into all the css_set
1711	 * objects.
1712	 */
1713	down_write(&css_set_rwsem);
1714	hash_for_each(css_set_table, i, cset, hlist)
1715		link_css_set(&tmp_links, cset, root_cgrp);
1716	up_write(&css_set_rwsem);
1717
1718	BUG_ON(!list_empty(&root_cgrp->self.children));
1719	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1720
1721	kernfs_activate(root_cgrp->kn);
1722	ret = 0;
1723	goto out;
1724
1725destroy_root:
1726	kernfs_destroy_root(root->kf_root);
1727	root->kf_root = NULL;
1728exit_root_id:
1729	cgroup_exit_root_id(root);
1730cancel_ref:
1731	percpu_ref_exit(&root_cgrp->self.refcnt);
1732out:
1733	free_cgrp_cset_links(&tmp_links);
1734	return ret;
1735}
1736
1737static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1738			 int flags, const char *unused_dev_name,
1739			 void *data)
1740{
1741	struct super_block *pinned_sb = NULL;
1742	struct cgroup_subsys *ss;
1743	struct cgroup_root *root;
1744	struct cgroup_sb_opts opts;
1745	struct dentry *dentry;
1746	int ret;
1747	int i;
1748	bool new_sb;
1749
1750	/*
1751	 * The first time anyone tries to mount a cgroup, enable the list
1752	 * linking each css_set to its tasks and fix up all existing tasks.
1753	 */
1754	if (!use_task_css_set_links)
1755		cgroup_enable_task_cg_lists();
1756
1757	mutex_lock(&cgroup_mutex);
1758
1759	/* First find the desired set of subsystems */
1760	ret = parse_cgroupfs_options(data, &opts);
1761	if (ret)
1762		goto out_unlock;
1763
1764	/* look for a matching existing root */
1765	if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
1766		cgrp_dfl_root_visible = true;
1767		root = &cgrp_dfl_root;
1768		cgroup_get(&root->cgrp);
1769		ret = 0;
1770		goto out_unlock;
1771	}
1772
1773	/*
1774	 * Destruction of cgroup root is asynchronous, so subsystems may
1775	 * still be dying after the previous unmount.  Let's drain the
1776	 * dying subsystems.  We just need to ensure that the ones
1777	 * unmounted previously finish dying and don't care about new ones
1778	 * starting.  Testing ref liveliness is good enough.
1779	 */
1780	for_each_subsys(ss, i) {
1781		if (!(opts.subsys_mask & (1 << i)) ||
1782		    ss->root == &cgrp_dfl_root)
1783			continue;
1784
1785		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1786			mutex_unlock(&cgroup_mutex);
1787			msleep(10);
1788			ret = restart_syscall();
1789			goto out_free;
1790		}
1791		cgroup_put(&ss->root->cgrp);
1792	}
1793
1794	for_each_root(root) {
1795		bool name_match = false;
1796
1797		if (root == &cgrp_dfl_root)
1798			continue;
1799
1800		/*
1801		 * If we asked for a name then it must match.  Also, if
1802		 * name matches but subsys_mask doesn't, we should fail.
1803		 * Remember whether name matched.
1804		 */
1805		if (opts.name) {
1806			if (strcmp(opts.name, root->name))
1807				continue;
1808			name_match = true;
1809		}
1810
1811		/*
1812		 * If we asked for subsystems (or explicitly for no
1813		 * subsystems) then they must match.
1814		 */
1815		if ((opts.subsys_mask || opts.none) &&
1816		    (opts.subsys_mask != root->subsys_mask)) {
1817			if (!name_match)
1818				continue;
1819			ret = -EBUSY;
1820			goto out_unlock;
1821		}
1822
1823		if (root->flags ^ opts.flags)
1824			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1825
1826		/*
1827		 * We want to reuse @root whose lifetime is governed by its
1828		 * ->cgrp.  Let's check whether @root is alive and keep it
1829		 * that way.  As cgroup_kill_sb() can happen anytime, we
1830		 * want to block it by pinning the sb so that @root doesn't
1831		 * get killed before mount is complete.
1832		 *
1833		 * With the sb pinned, tryget_live can reliably indicate
1834		 * whether @root can be reused.  If it's being killed,
1835		 * drain it.  We can use wait_queue for the wait but this
1836		 * path is super cold.  Let's just sleep a bit and retry.
1837		 */
1838		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
1839		if (IS_ERR(pinned_sb) ||
1840		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1841			mutex_unlock(&cgroup_mutex);
1842			if (!IS_ERR_OR_NULL(pinned_sb))
1843				deactivate_super(pinned_sb);
1844			msleep(10);
1845			ret = restart_syscall();
1846			goto out_free;
1847		}
1848
1849		ret = 0;
1850		goto out_unlock;
1851	}
1852
1853	/*
1854	 * No such thing, create a new one.  name= matching without subsys
1855	 * specification is allowed for already existing hierarchies but we
1856	 * can't create a new one without subsys specification.
1857	 */
1858	if (!opts.subsys_mask && !opts.none) {
1859		ret = -EINVAL;
1860		goto out_unlock;
1861	}
1862
1863	root = kzalloc(sizeof(*root), GFP_KERNEL);
1864	if (!root) {
1865		ret = -ENOMEM;
1866		goto out_unlock;
1867	}
1868
1869	init_cgroup_root(root, &opts);
1870
1871	ret = cgroup_setup_root(root, opts.subsys_mask);
1872	if (ret)
1873		cgroup_free_root(root);
1874
1875out_unlock:
1876	mutex_unlock(&cgroup_mutex);
1877out_free:
1878	kfree(opts.release_agent);
1879	kfree(opts.name);
1880
1881	if (ret)
1882		return ERR_PTR(ret);
1883
1884	dentry = kernfs_mount(fs_type, flags, root->kf_root,
1885				CGROUP_SUPER_MAGIC, &new_sb);
1886	if (IS_ERR(dentry) || !new_sb)
1887		cgroup_put(&root->cgrp);
1888
1889	/*
1890	 * If @pinned_sb, we're reusing an existing root and holding an
1891	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
1892	 */
1893	if (pinned_sb) {
1894		WARN_ON(new_sb);
1895		deactivate_super(pinned_sb);
1896	}
1897
1898	return dentry;
1899}
1900
1901static void cgroup_kill_sb(struct super_block *sb)
1902{
1903	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1904	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1905
1906	/*
1907	 * If @root doesn't have any mounts or children, start killing it.
1908	 * This prevents new mounts by disabling percpu_ref_tryget_live().
1909	 * cgroup_mount() may wait for @root's release.
1910	 *
1911	 * And don't kill the default root.
1912	 */
1913	if (!list_empty(&root->cgrp.self.children) ||
1914	    root == &cgrp_dfl_root)
1915		cgroup_put(&root->cgrp);
1916	else
1917		percpu_ref_kill(&root->cgrp.self.refcnt);
1918
1919	kernfs_kill_sb(sb);
1920}
1921
1922static struct file_system_type cgroup_fs_type = {
1923	.name = "cgroup",
1924	.mount = cgroup_mount,
1925	.kill_sb = cgroup_kill_sb,
1926};
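
/*
 * For illustration only (paths and controller names are examples, not
 * requirements), the root-matching logic above is what makes mounts
 * such as the following behave as expected:
 *
 *	mount -t cgroup -o cpu,cpuacct none /sys/fs/cgroup/cpu
 *	mount -t cgroup -o none,name=systemd none /sys/fs/cgroup/systemd
 *
 * Mounting again with the same controller set reuses the existing
 * cgroup_root, while asking for an existing name with a different
 * controller set fails with -EBUSY as handled in cgroup_mount().
 */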
1927
1928/**
1929 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1930 * @task: target task
1931 * @buf: the buffer to write the path into
1932 * @buflen: the length of the buffer
1933 *
1934 * Determine @task's cgroup on the first (the one with the lowest non-zero
1935 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1936 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1937 * cgroup controller callbacks.
1938 *
1939 * Return value is the same as kernfs_path().
1940 */
1941char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1942{
1943	struct cgroup_root *root;
1944	struct cgroup *cgrp;
1945	int hierarchy_id = 1;
1946	char *path = NULL;
1947
1948	mutex_lock(&cgroup_mutex);
1949	down_read(&css_set_rwsem);
1950
1951	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1952
1953	if (root) {
1954		cgrp = task_cgroup_from_root(task, root);
1955		path = cgroup_path(cgrp, buf, buflen);
1956	} else {
1957		/* if no hierarchy exists, everyone is in "/" */
1958		if (strlcpy(buf, "/", buflen) < buflen)
1959			path = buf;
1960	}
1961
1962	up_read(&css_set_rwsem);
1963	mutex_unlock(&cgroup_mutex);
1964	return path;
1965}
1966EXPORT_SYMBOL_GPL(task_cgroup_path);
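
/*
 * A minimal caller sketch (the buffer handling is the caller's choice
 * and purely illustrative):
 *
 *	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
 *
 *	if (buf && task_cgroup_path(task, buf, PATH_MAX))
 *		pr_info("task is in %s\n", buf);
 *	kfree(buf);
 */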
1967
1968/* used to track tasks and other necessary states during migration */
1969struct cgroup_taskset {
1970	/* the src and dst cset list running through cset->mg_node */
1971	struct list_head	src_csets;
1972	struct list_head	dst_csets;
1973
1974	/*
1975	 * Fields for cgroup_taskset_*() iteration.
1976	 *
1977	 * Before migration is committed, the target migration tasks are on
1978	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
1979	 * the csets on ->dst_csets.  ->csets points to either ->src_csets
1980	 * or ->dst_csets depending on whether migration is committed.
1981	 *
1982	 * ->cur_cset and ->cur_task point to the current task position
1983	 * during iteration.
1984	 */
1985	struct list_head	*csets;
1986	struct css_set		*cur_cset;
1987	struct task_struct	*cur_task;
1988};
1989
1990/**
1991 * cgroup_taskset_first - reset taskset and return the first task
1992 * @tset: taskset of interest
1993 *
1994 * @tset iteration is initialized and the first task is returned.
1995 */
1996struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1997{
1998	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1999	tset->cur_task = NULL;
2000
2001	return cgroup_taskset_next(tset);
2002}
2003
2004/**
2005 * cgroup_taskset_next - iterate to the next task in taskset
2006 * @tset: taskset of interest
2007 *
2008 * Return the next task in @tset.  Iteration must have been initialized
2009 * with cgroup_taskset_first().
2010 */
2011struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
2012{
2013	struct css_set *cset = tset->cur_cset;
2014	struct task_struct *task = tset->cur_task;
2015
2016	while (&cset->mg_node != tset->csets) {
2017		if (!task)
2018			task = list_first_entry(&cset->mg_tasks,
2019						struct task_struct, cg_list);
2020		else
2021			task = list_next_entry(task, cg_list);
2022
2023		if (&task->cg_list != &cset->mg_tasks) {
2024			tset->cur_cset = cset;
2025			tset->cur_task = task;
2026			return task;
2027		}
2028
2029		cset = list_next_entry(cset, mg_node);
2030		task = NULL;
2031	}
2032
2033	return NULL;
2034}
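
/*
 * Illustrative sketch (not part of this file) of how a controller's
 * ->can_attach()/->attach() callbacks typically walk a taskset;
 * per_task_work() is a hypothetical helper:
 *
 *	struct task_struct *task;
 *
 *	for (task = cgroup_taskset_first(tset); task;
 *	     task = cgroup_taskset_next(tset))
 *		per_task_work(task);
 */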
2035
2036/**
2037 * cgroup_task_migrate - move a task from one cgroup to another.
2038 * @old_cgrp: the cgroup @tsk is being migrated from
2039 * @tsk: the task being migrated
2040 * @new_cset: the new css_set @tsk is being attached to
2041 *
2042 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
2043 */
2044static void cgroup_task_migrate(struct cgroup *old_cgrp,
2045				struct task_struct *tsk,
2046				struct css_set *new_cset)
2047{
2048	struct css_set *old_cset;
2049
2050	lockdep_assert_held(&cgroup_mutex);
2051	lockdep_assert_held(&css_set_rwsem);
2052
2053	/*
2054	 * We are synchronized through threadgroup_lock() against PF_EXITING
2055	 * setting such that we can't race against cgroup_exit() changing the
2056	 * css_set to init_css_set and dropping the old one.
2057	 */
2058	WARN_ON_ONCE(tsk->flags & PF_EXITING);
2059	old_cset = task_css_set(tsk);
2060
2061	get_css_set(new_cset);
2062	rcu_assign_pointer(tsk->cgroups, new_cset);
2063
2064	/*
2065	 * Use move_tail so that cgroup_taskset_first() still returns the
2066	 * leader after migration.  This works because cgroup_migrate()
2067	 * ensures that the dst_cset of the leader is the first on the
2068	 * tset's dst_csets list.
2069	 */
2070	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
2071
2072	/*
2073	 * We just gained a reference on old_cset by taking it from the
2074	 * task. As trading it for new_cset is protected by cgroup_mutex,
2075	 * we're safe to drop it here; it will be freed under RCU.
2076	 */
2077	put_css_set_locked(old_cset);
2078}
2079
2080/**
2081 * cgroup_migrate_finish - cleanup after attach
2082 * @preloaded_csets: list of preloaded css_sets
2083 *
2084 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2085 * those functions for details.
2086 */
2087static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2088{
2089	struct css_set *cset, *tmp_cset;
2090
2091	lockdep_assert_held(&cgroup_mutex);
2092
2093	down_write(&css_set_rwsem);
2094	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2095		cset->mg_src_cgrp = NULL;
2096		cset->mg_dst_cset = NULL;
2097		list_del_init(&cset->mg_preload_node);
2098		put_css_set_locked(cset);
2099	}
2100	up_write(&css_set_rwsem);
2101}
2102
2103/**
2104 * cgroup_migrate_add_src - add a migration source css_set
2105 * @src_cset: the source css_set to add
2106 * @dst_cgrp: the destination cgroup
2107 * @preloaded_csets: list of preloaded css_sets
2108 *
2109 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2110 * @src_cset and add it to @preloaded_csets, which should later be cleaned
2111 * up by cgroup_migrate_finish().
2112 *
2113 * This function may be called without holding threadgroup_lock even if the
2114 * target is a process.  Threads may be created and destroyed but as long
2115 * as cgroup_mutex is not dropped, no new css_set can be put into play and
2116 * the preloaded css_sets are guaranteed to cover all migrations.
2117 */
2118static void cgroup_migrate_add_src(struct css_set *src_cset,
2119				   struct cgroup *dst_cgrp,
2120				   struct list_head *preloaded_csets)
2121{
2122	struct cgroup *src_cgrp;
2123
2124	lockdep_assert_held(&cgroup_mutex);
2125	lockdep_assert_held(&css_set_rwsem);
2126
2127	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2128
2129	if (!list_empty(&src_cset->mg_preload_node))
2130		return;
2131
2132	WARN_ON(src_cset->mg_src_cgrp);
2133	WARN_ON(!list_empty(&src_cset->mg_tasks));
2134	WARN_ON(!list_empty(&src_cset->mg_node));
2135
2136	src_cset->mg_src_cgrp = src_cgrp;
2137	get_css_set(src_cset);
2138	list_add(&src_cset->mg_preload_node, preloaded_csets);
2139}
2140
2141/**
2142 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2143 * @dst_cgrp: the destination cgroup (may be %NULL)
2144 * @preloaded_csets: list of preloaded source css_sets
2145 *
2146 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
2147 * have been preloaded to @preloaded_csets.  This function looks up and
2148 * pins all destination css_sets, links each to its source, and appends them
2149 * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
2150 * source css_set is assumed to be its cgroup on the default hierarchy.
2151 *
2152 * This function must be called after cgroup_migrate_add_src() has been
2153 * called on each migration source css_set.  After migration is performed
2154 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2155 * @preloaded_csets.
2156 */
2157static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2158				      struct list_head *preloaded_csets)
2159{
2160	LIST_HEAD(csets);
2161	struct css_set *src_cset, *tmp_cset;
2162
2163	lockdep_assert_held(&cgroup_mutex);
2164
2165	/*
2166	 * Except for the root, child_subsys_mask must be zero for a cgroup
2167	 * with tasks so that child cgroups don't compete against tasks.
2168	 */
2169	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2170	    dst_cgrp->child_subsys_mask)
2171		return -EBUSY;
2172
2173	/* look up the dst cset for each src cset and link it to src */
2174	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2175		struct css_set *dst_cset;
2176
2177		dst_cset = find_css_set(src_cset,
2178					dst_cgrp ?: src_cset->dfl_cgrp);
2179		if (!dst_cset)
2180			goto err;
2181
2182		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2183
2184		/*
2185		 * If src cset equals dst, it's a noop.  Drop the src.
2186		 * cgroup_migrate() will skip the cset too.  Note that we
2187		 * can't handle src == dst as some nodes are used by both.
2188		 */
2189		if (src_cset == dst_cset) {
2190			src_cset->mg_src_cgrp = NULL;
2191			list_del_init(&src_cset->mg_preload_node);
2192			put_css_set(src_cset);
2193			put_css_set(dst_cset);
2194			continue;
2195		}
2196
2197		src_cset->mg_dst_cset = dst_cset;
2198
2199		if (list_empty(&dst_cset->mg_preload_node))
2200			list_add(&dst_cset->mg_preload_node, &csets);
2201		else
2202			put_css_set(dst_cset);
2203	}
2204
2205	list_splice_tail(&csets, preloaded_csets);
2206	return 0;
2207err:
2208	cgroup_migrate_finish(&csets);
2209	return -ENOMEM;
2210}
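
/*
 * Taken together, a migration follows this sequence, which is exactly
 * what cgroup_attach_task() below does:
 *
 *	cgroup_migrate_add_src()	- pin each source css_set
 *	cgroup_migrate_prepare_dst()	- look up and pin destination css_sets
 *	cgroup_migrate()		- move the tasks, invoke callbacks
 *	cgroup_migrate_finish()		- drop the pinned css_sets
 */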
2211
2212/**
2213 * cgroup_migrate - migrate a process or task to a cgroup
2214 * @cgrp: the destination cgroup
2215 * @leader: the leader of the process or the task to migrate
2216 * @threadgroup: whether @leader points to the whole process or a single task
2217 *
2218 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
2219 * process, the caller must be holding threadgroup_lock of @leader.  The
2220 * caller is also responsible for invoking cgroup_migrate_add_src() and
2221 * cgroup_migrate_prepare_dst() on the targets before invoking this
2222 * function and following up with cgroup_migrate_finish().
2223 *
2224 * As long as a controller's ->can_attach() doesn't fail, this function is
2225 * guaranteed to succeed.  This means that, excluding ->can_attach()
2226 * failure, when migrating multiple targets, the success or failure can be
2227 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
2228 * actually starting the migration.
2229 */
2230static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2231			  bool threadgroup)
2232{
2233	struct cgroup_taskset tset = {
2234		.src_csets	= LIST_HEAD_INIT(tset.src_csets),
2235		.dst_csets	= LIST_HEAD_INIT(tset.dst_csets),
2236		.csets		= &tset.src_csets,
2237	};
2238	struct cgroup_subsys_state *css, *failed_css = NULL;
2239	struct css_set *cset, *tmp_cset;
2240	struct task_struct *task, *tmp_task;
2241	int i, ret;
2242
2243	/*
2244	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2245	 * already PF_EXITING could be freed from underneath us unless we
2246	 * take an rcu_read_lock.
2247	 */
2248	down_write(&css_set_rwsem);
2249	rcu_read_lock();
2250	task = leader;
2251	do {
2252		/* @task either already exited or can't exit until the end */
2253		if (task->flags & PF_EXITING)
2254			goto next;
2255
2256		/* leave @task alone if post_fork() hasn't linked it yet */
2257		if (list_empty(&task->cg_list))
2258			goto next;
2259
2260		cset = task_css_set(task);
2261		if (!cset->mg_src_cgrp)
2262			goto next;
2263
2264		/*
2265		 * cgroup_taskset_first() must always return the leader.
2266		 * Take care to avoid disturbing the ordering.
2267		 */
2268		list_move_tail(&task->cg_list, &cset->mg_tasks);
2269		if (list_empty(&cset->mg_node))
2270			list_add_tail(&cset->mg_node, &tset.src_csets);
2271		if (list_empty(&cset->mg_dst_cset->mg_node))
2272			list_move_tail(&cset->mg_dst_cset->mg_node,
2273				       &tset.dst_csets);
2274	next:
2275		if (!threadgroup)
2276			break;
2277	} while_each_thread(leader, task);
2278	rcu_read_unlock();
2279	up_write(&css_set_rwsem);
2280
2281	/* methods shouldn't be called if no task is actually migrating */
2282	if (list_empty(&tset.src_csets))
2283		return 0;
2284
2285	/* check that we can legitimately attach to the cgroup */
2286	for_each_e_css(css, i, cgrp) {
2287		if (css->ss->can_attach) {
2288			ret = css->ss->can_attach(css, &tset);
2289			if (ret) {
2290				failed_css = css;
2291				goto out_cancel_attach;
2292			}
2293		}
2294	}
2295
2296	/*
2297	 * Now that we're guaranteed success, proceed to move all tasks to
2298	 * the new cgroup.  There are no failure cases after here, so this
2299	 * is the commit point.
2300	 */
2301	down_write(&css_set_rwsem);
2302	list_for_each_entry(cset, &tset.src_csets, mg_node) {
2303		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2304			cgroup_task_migrate(cset->mg_src_cgrp, task,
2305					    cset->mg_dst_cset);
2306	}
2307	up_write(&css_set_rwsem);
2308
2309	/*
2310	 * Migration is committed, all target tasks are now on dst_csets.
2311	 * Nothing is sensitive to fork() after this point.  Notify
2312	 * controllers that migration is complete.
2313	 */
2314	tset.csets = &tset.dst_csets;
2315
2316	for_each_e_css(css, i, cgrp)
2317		if (css->ss->attach)
2318			css->ss->attach(css, &tset);
2319
2320	ret = 0;
2321	goto out_release_tset;
2322
2323out_cancel_attach:
2324	for_each_e_css(css, i, cgrp) {
2325		if (css == failed_css)
2326			break;
2327		if (css->ss->cancel_attach)
2328			css->ss->cancel_attach(css, &tset);
2329	}
2330out_release_tset:
2331	down_write(&css_set_rwsem);
2332	list_splice_init(&tset.dst_csets, &tset.src_csets);
2333	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2334		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2335		list_del_init(&cset->mg_node);
2336	}
2337	up_write(&css_set_rwsem);
2338	return ret;
2339}
2340
2341/**
2342 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2343 * @dst_cgrp: the cgroup to attach to
2344 * @leader: the task or the leader of the threadgroup to be attached
2345 * @threadgroup: attach the whole threadgroup?
2346 *
2347 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2348 */
2349static int cgroup_attach_task(struct cgroup *dst_cgrp,
2350			      struct task_struct *leader, bool threadgroup)
2351{
2352	LIST_HEAD(preloaded_csets);
2353	struct task_struct *task;
2354	int ret;
2355
2356	/* look up all src csets */
2357	down_read(&css_set_rwsem);
2358	rcu_read_lock();
2359	task = leader;
2360	do {
2361		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2362				       &preloaded_csets);
2363		if (!threadgroup)
2364			break;
2365	} while_each_thread(leader, task);
2366	rcu_read_unlock();
2367	up_read(&css_set_rwsem);
2368
2369	/* prepare dst csets and commit */
2370	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2371	if (!ret)
2372		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2373
2374	cgroup_migrate_finish(&preloaded_csets);
2375	return ret;
2376}
2377
2378/*
2379 * Find the task_struct of the task to attach by vpid and pass it along to
2380 * cgroup_attach_task() to attach either the task itself or its whole
2381 * threadgroup.  Locks cgroup_mutex and the threadgroup.
2382 */
2383static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2384				    size_t nbytes, loff_t off, bool threadgroup)
2385{
2386	struct task_struct *tsk;
2387	const struct cred *cred = current_cred(), *tcred;
2388	struct cgroup *cgrp;
2389	pid_t pid;
2390	int ret;
2391
2392	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2393		return -EINVAL;
2394
2395	cgrp = cgroup_kn_lock_live(of->kn);
2396	if (!cgrp)
2397		return -ENODEV;
2398
2399retry_find_task:
2400	rcu_read_lock();
2401	if (pid) {
2402		tsk = find_task_by_vpid(pid);
2403		if (!tsk) {
2404			rcu_read_unlock();
2405			ret = -ESRCH;
2406			goto out_unlock_cgroup;
2407		}
2408		/*
2409		 * Even if we're attaching all tasks in the thread group, we
2410		 * only need to check permissions on one of them.
2411		 */
2412		tcred = __task_cred(tsk);
2413		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2414		    !uid_eq(cred->euid, tcred->uid) &&
2415		    !uid_eq(cred->euid, tcred->suid)) {
2416			rcu_read_unlock();
2417			ret = -EACCES;
2418			goto out_unlock_cgroup;
2419		}
2420	} else
2421		tsk = current;
2422
2423	if (threadgroup)
2424		tsk = tsk->group_leader;
2425
2426	/*
2427	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2428	 * trapped in a cpuset, or an RT worker may be born in a cgroup
2429	 * with no rt_runtime allocated.  Just say no.
2430	 */
2431	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2432		ret = -EINVAL;
2433		rcu_read_unlock();
2434		goto out_unlock_cgroup;
2435	}
2436
2437	get_task_struct(tsk);
2438	rcu_read_unlock();
2439
2440	threadgroup_lock(tsk);
2441	if (threadgroup) {
2442		if (!thread_group_leader(tsk)) {
2443			/*
2444			 * A race with de_thread() from another thread's exec()
2445			 * may strip us of our leadership.  If this happens,
2446			 * there is no choice but to throw this task away and
2447			 * try again; this is
2448			 * "double-double-toil-and-trouble-check locking".
2449			 */
2450			threadgroup_unlock(tsk);
2451			put_task_struct(tsk);
2452			goto retry_find_task;
2453		}
2454	}
2455
2456	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2457
2458	threadgroup_unlock(tsk);
2459
2460	put_task_struct(tsk);
2461out_unlock_cgroup:
2462	cgroup_kn_unlock(of->kn);
2463	return ret ?: nbytes;
2464}
2465
2466/**
2467 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2468 * @from: attach to all cgroups of a given task
2469 * @tsk: the task to be attached
2470 */
2471int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2472{
2473	struct cgroup_root *root;
2474	int retval = 0;
2475
2476	mutex_lock(&cgroup_mutex);
2477	for_each_root(root) {
2478		struct cgroup *from_cgrp;
2479
2480		if (root == &cgrp_dfl_root)
2481			continue;
2482
2483		down_read(&css_set_rwsem);
2484		from_cgrp = task_cgroup_from_root(from, root);
2485		up_read(&css_set_rwsem);
2486
2487		retval = cgroup_attach_task(from_cgrp, tsk, false);
2488		if (retval)
2489			break;
2490	}
2491	mutex_unlock(&cgroup_mutex);
2492
2493	return retval;
2494}
2495EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2496
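/*
 * The two handlers below back the "tasks" and "cgroup.procs" files.
 * From userspace (the mount point and group name are only examples):
 *
 *	echo $TID > /sys/fs/cgroup/cpu/mygrp/tasks		- move one thread
 *	echo $PID > /sys/fs/cgroup/cpu/mygrp/cgroup.procs	- move a process
 */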
2497static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2498				  char *buf, size_t nbytes, loff_t off)
2499{
2500	return __cgroup_procs_write(of, buf, nbytes, off, false);
2501}
2502
2503static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2504				  char *buf, size_t nbytes, loff_t off)
2505{
2506	return __cgroup_procs_write(of, buf, nbytes, off, true);
2507}
2508
2509static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2510					  char *buf, size_t nbytes, loff_t off)
2511{
2512	struct cgroup *cgrp;
2513
2514	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2515
2516	cgrp = cgroup_kn_lock_live(of->kn);
2517	if (!cgrp)
2518		return -ENODEV;
2519	spin_lock(&release_agent_path_lock);
2520	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2521		sizeof(cgrp->root->release_agent_path));
2522	spin_unlock(&release_agent_path_lock);
2523	cgroup_kn_unlock(of->kn);
2524	return nbytes;
2525}
2526
2527static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2528{
2529	struct cgroup *cgrp = seq_css(seq)->cgroup;
2530
2531	spin_lock(&release_agent_path_lock);
2532	seq_puts(seq, cgrp->root->release_agent_path);
2533	spin_unlock(&release_agent_path_lock);
2534	seq_putc(seq, '\n');
2535	return 0;
2536}
2537
2538static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2539{
2540	seq_puts(seq, "0\n");
2541	return 0;
2542}
2543
2544static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2545{
2546	struct cgroup_subsys *ss;
2547	bool printed = false;
2548	int ssid;
2549
2550	for_each_subsys(ss, ssid) {
2551		if (ss_mask & (1 << ssid)) {
2552			if (printed)
2553				seq_putc(seq, ' ');
2554			seq_printf(seq, "%s", ss->name);
2555			printed = true;
2556		}
2557	}
2558	if (printed)
2559		seq_putc(seq, '\n');
2560}
2561
2562/* show controllers which are currently attached to the default hierarchy */
2563static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2564{
2565	struct cgroup *cgrp = seq_css(seq)->cgroup;
2566
2567	cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2568			     ~cgrp_dfl_root_inhibit_ss_mask);
2569	return 0;
2570}
2571
2572/* show controllers which are enabled from the parent */
2573static int cgroup_controllers_show(struct seq_file *seq, void *v)
2574{
2575	struct cgroup *cgrp = seq_css(seq)->cgroup;
2576
2577	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
2578	return 0;
2579}
2580
2581/* show controllers which are enabled for a given cgroup's children */
2582static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2583{
2584	struct cgroup *cgrp = seq_css(seq)->cgroup;
2585
2586	cgroup_print_ss_mask(seq, cgrp->subtree_control);
2587	return 0;
2588}
2589
2590/**
2591 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2592 * @cgrp: root of the subtree to update csses for
2593 *
2594 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2595 * css associations need to be updated accordingly.  This function looks up
2596 * all css_sets which are attached to the subtree, creates the matching
2597 * updated css_sets and migrates the tasks to the new ones.
2598 */
2599static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2600{
2601	LIST_HEAD(preloaded_csets);
2602	struct cgroup_subsys_state *css;
2603	struct css_set *src_cset;
2604	int ret;
2605
2606	lockdep_assert_held(&cgroup_mutex);
2607
2608	/* look up all csses currently attached to @cgrp's subtree */
2609	down_read(&css_set_rwsem);
2610	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2611		struct cgrp_cset_link *link;
2612
2613		/* self is not affected by child_subsys_mask change */
2614		if (css->cgroup == cgrp)
2615			continue;
2616
2617		list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2618			cgroup_migrate_add_src(link->cset, cgrp,
2619					       &preloaded_csets);
2620	}
2621	up_read(&css_set_rwsem);
2622
2623	/* NULL dst indicates self on default hierarchy */
2624	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2625	if (ret)
2626		goto out_finish;
2627
2628	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2629		struct task_struct *last_task = NULL, *task;
2630
2631		/* src_csets precede dst_csets, break on the first dst_cset */
2632		if (!src_cset->mg_src_cgrp)
2633			break;
2634
2635		/*
2636		 * All tasks in src_cset need to be migrated to the
2637		 * matching dst_cset.  Empty it process by process.  We
2638		 * walk tasks but migrate processes.  The leader might even
2639		 * belong to a different cset but such src_cset would also
2640		 * be among the target src_csets because the default
2641		 * hierarchy enforces per-process membership.
2642		 */
2643		while (true) {
2644			down_read(&css_set_rwsem);
2645			task = list_first_entry_or_null(&src_cset->tasks,
2646						struct task_struct, cg_list);
2647			if (task) {
2648				task = task->group_leader;
2649				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2650				get_task_struct(task);
2651			}
2652			up_read(&css_set_rwsem);
2653
2654			if (!task)
2655				break;
2656
2657			/* guard against possible infinite loop */
2658			if (WARN(last_task == task,
2659				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2660				goto out_finish;
2661			last_task = task;
2662
2663			threadgroup_lock(task);
2664			/* raced against de_thread() from another thread? */
2665			if (!thread_group_leader(task)) {
2666				threadgroup_unlock(task);
2667				put_task_struct(task);
2668				continue;
2669			}
2670
2671			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2672
2673			threadgroup_unlock(task);
2674			put_task_struct(task);
2675
2676			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2677				goto out_finish;
2678		}
2679	}
2680
2681out_finish:
2682	cgroup_migrate_finish(&preloaded_csets);
2683	return ret;
2684}
2685
2686/* change the enabled child controllers for a cgroup in the default hierarchy */
2687static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2688					    char *buf, size_t nbytes,
2689					    loff_t off)
2690{
2691	unsigned int enable = 0, disable = 0;
2692	unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2693	struct cgroup *cgrp, *child;
2694	struct cgroup_subsys *ss;
2695	char *tok;
2696	int ssid, ret;
2697
2698	/*
2699	 * Parse input - space separated list of subsystem names prefixed
2700	 * with either + or -.
2701	 */
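	/*
	 * e.g. writing "+memory -pids" (controller names are illustrative)
	 * enables memory and disables pids for @cgrp's children.
	 */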
2702	buf = strstrip(buf);
2703	while ((tok = strsep(&buf, " "))) {
2704		if (tok[0] == '\0')
2705			continue;
2706		for_each_subsys(ss, ssid) {
2707			if (ss->disabled || strcmp(tok + 1, ss->name) ||
2708			    ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2709				continue;
2710
2711			if (*tok == '+') {
2712				enable |= 1 << ssid;
2713				disable &= ~(1 << ssid);
2714			} else if (*tok == '-') {
2715				disable |= 1 << ssid;
2716				enable &= ~(1 << ssid);
2717			} else {
2718				return -EINVAL;
2719			}
2720			break;
2721		}
2722		if (ssid == CGROUP_SUBSYS_COUNT)
2723			return -EINVAL;
2724	}
2725
2726	cgrp = cgroup_kn_lock_live(of->kn);
2727	if (!cgrp)
2728		return -ENODEV;
2729
2730	for_each_subsys(ss, ssid) {
2731		if (enable & (1 << ssid)) {
2732			if (cgrp->subtree_control & (1 << ssid)) {
2733				enable &= ~(1 << ssid);
2734				continue;
2735			}
2736
2737			/* unavailable or not enabled on the parent? */
2738			if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2739			    (cgroup_parent(cgrp) &&
2740			     !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
2741				ret = -ENOENT;
2742				goto out_unlock;
2743			}
2744		} else if (disable & (1 << ssid)) {
2745			if (!(cgrp->subtree_control & (1 << ssid))) {
2746				disable &= ~(1 << ssid);
2747				continue;
2748			}
2749
2750			/* a child has it enabled? */
2751			cgroup_for_each_live_child(child, cgrp) {
2752				if (child->subtree_control & (1 << ssid)) {
2753					ret = -EBUSY;
2754					goto out_unlock;
2755				}
2756			}
2757		}
2758	}
2759
2760	if (!enable && !disable) {
2761		ret = 0;
2762		goto out_unlock;
2763	}
2764
2765	/*
2766	 * Except for the root, subtree_control must be zero for a cgroup
2767	 * with tasks so that child cgroups don't compete against tasks.
2768	 */
2769	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2770		ret = -EBUSY;
2771		goto out_unlock;
2772	}
2773
2774	/*
2775	 * Update subsys masks and calculate what needs to be done.  More
2776	 * subsystems than specified may need to be enabled or disabled
2777	 * depending on subsystem dependencies.
2778	 */
2779	old_sc = cgrp->subtree_control;
2780	old_ss = cgrp->child_subsys_mask;
2781	new_sc = (old_sc | enable) & ~disable;
2782	new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
2783
2784	css_enable = ~old_ss & new_ss;
2785	css_disable = old_ss & ~new_ss;
2786	enable |= css_enable;
2787	disable |= css_disable;
2788
2789	/*
2790	 * Because css offlining is asynchronous, userland might try to
2791	 * re-enable the same controller while the previous instance is
2792	 * still around.  In such cases, wait till it's gone using
2793	 * offline_waitq.
2794	 */
2795	for_each_subsys(ss, ssid) {
2796		if (!(css_enable & (1 << ssid)))
2797			continue;
2798
2799		cgroup_for_each_live_child(child, cgrp) {
2800			DEFINE_WAIT(wait);
2801
2802			if (!cgroup_css(child, ss))
2803				continue;
2804
2805			cgroup_get(child);
2806			prepare_to_wait(&child->offline_waitq, &wait,
2807					TASK_UNINTERRUPTIBLE);
2808			cgroup_kn_unlock(of->kn);
2809			schedule();
2810			finish_wait(&child->offline_waitq, &wait);
2811			cgroup_put(child);
2812
2813			return restart_syscall();
2814		}
2815	}
2816
2817	cgrp->subtree_control = new_sc;
2818	cgrp->child_subsys_mask = new_ss;
2819
2820	/*
2821	 * Create new csses or make the existing ones visible.  A css is
2822	 * created invisible if it's being implicitly enabled through
2823	 * dependency.  An invisible css is made visible when the userland
2824	 * explicitly enables it.
2825	 */
2826	for_each_subsys(ss, ssid) {
2827		if (!(enable & (1 << ssid)))
2828			continue;
2829
2830		cgroup_for_each_live_child(child, cgrp) {
2831			if (css_enable & (1 << ssid))
2832				ret = create_css(child, ss,
2833					cgrp->subtree_control & (1 << ssid));
2834			else
2835				ret = cgroup_populate_dir(child, 1 << ssid);
2836			if (ret)
2837				goto err_undo_css;
2838		}
2839	}
2840
2841	/*
2842	 * At this point, cgroup_e_css() results reflect the new csses, so
2843	 * the following cgroup_update_dfl_csses() can properly update the
2844	 * css associations of all tasks in the subtree.
2845	 */
2846	ret = cgroup_update_dfl_csses(cgrp);
2847	if (ret)
2848		goto err_undo_css;
2849
2850	/*
2851	 * All tasks are migrated out of disabled csses.  Kill or hide
2852	 * them.  A css is hidden when the userland requests it to be
2853	 * disabled while other subsystems are still depending on it.  The
2854	 * css must not actively control resources and must be in the vanilla
2855	 * state if it's made visible again later.  Controllers which may
2856	 * be depended upon should provide ->css_reset() for this purpose.
2857	 */
2858	for_each_subsys(ss, ssid) {
2859		if (!(disable & (1 << ssid)))
2860			continue;
2861
2862		cgroup_for_each_live_child(child, cgrp) {
2863			struct cgroup_subsys_state *css = cgroup_css(child, ss);
2864
2865			if (css_disable & (1 << ssid)) {
2866				kill_css(css);
2867			} else {
2868				cgroup_clear_dir(child, 1 << ssid);
2869				if (ss->css_reset)
2870					ss->css_reset(css);
2871			}
2872		}
2873	}
2874
2875	/*
2876	 * The effective csses of all the descendants (excluding @cgrp) may
2877	 * have changed.  Subsystems can optionally subscribe to this event
2878	 * by implementing ->css_e_css_changed() which is invoked if any of
2879	 * the effective csses seen from the css's cgroup may have changed.
2880	 */
2881	for_each_subsys(ss, ssid) {
2882		struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
2883		struct cgroup_subsys_state *css;
2884
2885		if (!ss->css_e_css_changed || !this_css)
2886			continue;
2887
2888		css_for_each_descendant_pre(css, this_css)
2889			if (css != this_css)
2890				ss->css_e_css_changed(css);
2891	}
2892
2893	kernfs_activate(cgrp->kn);
2894	ret = 0;
2895out_unlock:
2896	cgroup_kn_unlock(of->kn);
2897	return ret ?: nbytes;
2898
2899err_undo_css:
2900	cgrp->subtree_control = old_sc;
2901	cgrp->child_subsys_mask = old_ss;
2902
2903	for_each_subsys(ss, ssid) {
2904		if (!(enable & (1 << ssid)))
2905			continue;
2906
2907		cgroup_for_each_live_child(child, cgrp) {
2908			struct cgroup_subsys_state *css = cgroup_css(child, ss);
2909
2910			if (!css)
2911				continue;
2912
2913			if (css_enable & (1 << ssid))
2914				kill_css(css);
2915			else
2916				cgroup_clear_dir(child, 1 << ssid);
2917		}
2918	}
2919	goto out_unlock;
2920}
2921
2922static int cgroup_populated_show(struct seq_file *seq, void *v)
2923{
2924	seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2925	return 0;
2926}
2927
2928static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2929				 size_t nbytes, loff_t off)
2930{
2931	struct cgroup *cgrp = of->kn->parent->priv;
2932	struct cftype *cft = of->kn->priv;
2933	struct cgroup_subsys_state *css;
2934	int ret;
2935
2936	if (cft->write)
2937		return cft->write(of, buf, nbytes, off);
2938
2939	/*
2940	 * kernfs guarantees that a file isn't deleted with operations in
2941	 * flight, which means that the matching css is and stays alive and
2942	 * doesn't need to be pinned.  The RCU locking is not necessary
2943	 * either.  It's just for the convenience of using cgroup_css().
2944	 */
2945	rcu_read_lock();
2946	css = cgroup_css(cgrp, cft->ss);
2947	rcu_read_unlock();
2948
2949	if (cft->write_u64) {
2950		unsigned long long v;
2951		ret = kstrtoull(buf, 0, &v);
2952		if (!ret)
2953			ret = cft->write_u64(css, cft, v);
2954	} else if (cft->write_s64) {
2955		long long v;
2956		ret = kstrtoll(buf, 0, &v);
2957		if (!ret)
2958			ret = cft->write_s64(css, cft, v);
2959	} else {
2960		ret = -EINVAL;
2961	}
2962
2963	return ret ?: nbytes;
2964}
2965
2966static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2967{
2968	return seq_cft(seq)->seq_start(seq, ppos);
2969}
2970
2971static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2972{
2973	return seq_cft(seq)->seq_next(seq, v, ppos);
2974}
2975
2976static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2977{
2978	seq_cft(seq)->seq_stop(seq, v);
2979}
2980
2981static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2982{
2983	struct cftype *cft = seq_cft(m);
2984	struct cgroup_subsys_state *css = seq_css(m);
2985
2986	if (cft->seq_show)
2987		return cft->seq_show(m, arg);
2988
2989	if (cft->read_u64)
2990		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2991	else if (cft->read_s64)
2992		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2993	else
2994		return -EINVAL;
2995	return 0;
2996}
2997
2998static struct kernfs_ops cgroup_kf_single_ops = {
2999	.atomic_write_len	= PAGE_SIZE,
3000	.write			= cgroup_file_write,
3001	.seq_show		= cgroup_seqfile_show,
3002};
3003
3004static struct kernfs_ops cgroup_kf_ops = {
3005	.atomic_write_len	= PAGE_SIZE,
3006	.write			= cgroup_file_write,
3007	.seq_start		= cgroup_seqfile_start,
3008	.seq_next		= cgroup_seqfile_next,
3009	.seq_stop		= cgroup_seqfile_stop,
3010	.seq_show		= cgroup_seqfile_show,
3011};
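
/*
 * cgroup_init_cftypes() below picks cgroup_kf_ops for cftypes which
 * supply their own seq_start/seq_next/seq_stop iterators and
 * cgroup_kf_single_ops for everything else.
 */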
3012
3013/*
3014 * cgroup_rename - Only allow simple rename of directories in place.
3015 */
3016static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
3017			 const char *new_name_str)
3018{
3019	struct cgroup *cgrp = kn->priv;
3020	int ret;
3021
3022	if (kernfs_type(kn) != KERNFS_DIR)
3023		return -ENOTDIR;
3024	if (kn->parent != new_parent)
3025		return -EIO;
3026
3027	/*
3028	 * This isn't a proper migration and its usefulness is very
3029	 * limited.  Disallow on the default hierarchy.
3030	 */
3031	if (cgroup_on_dfl(cgrp))
3032		return -EPERM;
3033
3034	/*
3035	 * We're gonna grab cgroup_mutex which nests outside kernfs
3036	 * active_ref.  kernfs_rename() doesn't require active_ref
3037	 * protection.  Break them before grabbing cgroup_mutex.
3038	 */
3039	kernfs_break_active_protection(new_parent);
3040	kernfs_break_active_protection(kn);
3041
3042	mutex_lock(&cgroup_mutex);
3043
3044	ret = kernfs_rename(kn, new_parent, new_name_str);
3045
3046	mutex_unlock(&cgroup_mutex);
3047
3048	kernfs_unbreak_active_protection(kn);
3049	kernfs_unbreak_active_protection(new_parent);
3050	return ret;
3051}
3052
3053/* set uid and gid of cgroup dirs and files to that of the creator */
3054static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3055{
3056	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3057			       .ia_uid = current_fsuid(),
3058			       .ia_gid = current_fsgid(), };
3059
3060	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3061	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3062		return 0;
3063
3064	return kernfs_setattr(kn, &iattr);
3065}
3066
3067static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3068{
3069	char name[CGROUP_FILE_NAME_MAX];
3070	struct kernfs_node *kn;
3071	struct lock_class_key *key = NULL;
3072	int ret;
3073
3074#ifdef CONFIG_DEBUG_LOCK_ALLOC
3075	key = &cft->lockdep_key;
3076#endif
3077	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3078				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3079				  NULL, key);
3080	if (IS_ERR(kn))
3081		return PTR_ERR(kn);
3082
3083	ret = cgroup_kn_set_ugid(kn);
3084	if (ret) {
3085		kernfs_remove(kn);
3086		return ret;
3087	}
3088
3089	if (cft->seq_show == cgroup_populated_show)
3090		cgrp->populated_kn = kn;
3091	return 0;
3092}
3093
3094/**
3095 * cgroup_addrm_files - add or remove files to a cgroup directory
3096 * @cgrp: the target cgroup
3097 * @cfts: array of cftypes to be added
3098 * @is_add: whether to add or remove
3099 *
3100 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3101 * For removals, this function never fails.  If addition fails, this
3102 * function doesn't remove files already added.  The caller is responsible
3103 * for cleaning up.
3104 */
3105static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
3106			      bool is_add)
3107{
3108	struct cftype *cft;
3109	int ret;
3110
3111	lockdep_assert_held(&cgroup_mutex);
3112
3113	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3114		/* does cft->flags tell us to skip this file on @cgrp? */
3115		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3116			continue;
3117		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3118			continue;
3119		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3120			continue;
3121		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3122			continue;
3123
3124		if (is_add) {
3125			ret = cgroup_add_file(cgrp, cft);
3126			if (ret) {
3127				pr_warn("%s: failed to add %s, err=%d\n",
3128					__func__, cft->name, ret);
3129				return ret;
3130			}
3131		} else {
3132			cgroup_rm_file(cgrp, cft);
3133		}
3134	}
3135	return 0;
3136}
3137
3138static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3139{
3140	LIST_HEAD(pending);
3141	struct cgroup_subsys *ss = cfts[0].ss;
3142	struct cgroup *root = &ss->root->cgrp;
3143	struct cgroup_subsys_state *css;
3144	int ret = 0;
3145
3146	lockdep_assert_held(&cgroup_mutex);
3147
3148	/* add/rm files for all cgroups created before */
3149	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3150		struct cgroup *cgrp = css->cgroup;
3151
3152		if (cgroup_is_dead(cgrp))
3153			continue;
3154
3155		ret = cgroup_addrm_files(cgrp, cfts, is_add);
3156		if (ret)
3157			break;
3158	}
3159
3160	if (is_add && !ret)
3161		kernfs_activate(root->kn);
3162	return ret;
3163}
3164
3165static void cgroup_exit_cftypes(struct cftype *cfts)
3166{
3167	struct cftype *cft;
3168
3169	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3170		/* free copy for custom atomic_write_len, see cgroup_init_cftypes() */
3171		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3172			kfree(cft->kf_ops);
3173		cft->kf_ops = NULL;
3174		cft->ss = NULL;
3175
3176		/* revert flags set by cgroup core while adding @cfts */
3177		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3178	}
3179}
3180
3181static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3182{
3183	struct cftype *cft;
3184
3185	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3186		struct kernfs_ops *kf_ops;
3187
3188		WARN_ON(cft->ss || cft->kf_ops);
3189
3190		if (cft->seq_start)
3191			kf_ops = &cgroup_kf_ops;
3192		else
3193			kf_ops = &cgroup_kf_single_ops;
3194
3195		/*
3196		 * Ugh... if @cft wants a custom max_write_len, we need to
3197		 * make a copy of kf_ops to set its atomic_write_len.
3198		 */
3199		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3200			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3201			if (!kf_ops) {
3202				cgroup_exit_cftypes(cfts);
3203				return -ENOMEM;
3204			}
3205			kf_ops->atomic_write_len = cft->max_write_len;
3206		}
3207
3208		cft->kf_ops = kf_ops;
3209		cft->ss = ss;
3210	}
3211
3212	return 0;
3213}
3214
3215static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3216{
3217	lockdep_assert_held(&cgroup_mutex);
3218
3219	if (!cfts || !cfts[0].ss)
3220		return -ENOENT;
3221
3222	list_del(&cfts->node);
3223	cgroup_apply_cftypes(cfts, false);
3224	cgroup_exit_cftypes(cfts);
3225	return 0;
3226}
3227
3228/**
3229 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3230 * @cfts: zero-length name terminated array of cftypes
3231 *
3232 * Unregister @cfts.  Files described by @cfts are removed from all
3233 * existing cgroups and all future cgroups won't have them either.  This
3234 * function can be called anytime whether @cfts' subsys is attached or not.
3235 *
3236 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3237 * registered.
3238 */
3239int cgroup_rm_cftypes(struct cftype *cfts)
3240{
3241	int ret;
3242
3243	mutex_lock(&cgroup_mutex);
3244	ret = cgroup_rm_cftypes_locked(cfts);
3245	mutex_unlock(&cgroup_mutex);
3246	return ret;
3247}
3248
3249/**
3250 * cgroup_add_cftypes - add an array of cftypes to a subsystem
3251 * @ss: target cgroup subsystem
3252 * @cfts: zero-length name terminated array of cftypes
3253 *
3254 * Register @cfts to @ss.  Files described by @cfts are created for all
3255 * existing cgroups to which @ss is attached and all future cgroups will
3256 * have them too.  This function can be called anytime whether @ss is
3257 * attached or not.
3258 *
3259 * Returns 0 on successful registration, -errno on failure.  Note that this
3260 * function currently returns 0 as long as @cfts registration is successful
3261 * even if some file creation attempts on existing cgroups fail.
3262 */
3263static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3264{
3265	int ret;
3266
3267	if (ss->disabled)
3268		return 0;
3269
3270	if (!cfts || cfts[0].name[0] == '\0')
3271		return 0;
3272
3273	ret = cgroup_init_cftypes(ss, cfts);
3274	if (ret)
3275		return ret;
3276
3277	mutex_lock(&cgroup_mutex);
3278
3279	list_add_tail(&cfts->node, &ss->cfts);
3280	ret = cgroup_apply_cftypes(cfts, true);
3281	if (ret)
3282		cgroup_rm_cftypes_locked(cfts);
3283
3284	mutex_unlock(&cgroup_mutex);
3285	return ret;
3286}
3287
3288/**
3289 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3290 * @ss: target cgroup subsystem
3291 * @cfts: zero-length name terminated array of cftypes
3292 *
3293 * Similar to cgroup_add_cftypes() but the added files are only used for
3294 * the default hierarchy.
3295 */
3296int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3297{
3298	struct cftype *cft;
3299
3300	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3301		cft->flags |= __CFTYPE_ONLY_ON_DFL;
3302	return cgroup_add_cftypes(ss, cfts);
3303}
3304
3305/**
3306 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3307 * @ss: target cgroup subsystem
3308 * @cfts: zero-length name terminated array of cftypes
3309 *
3310 * Similar to cgroup_add_cftypes() but the added files are only used for
3311 * the legacy hierarchies.
3312 */
3313int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3314{
3315	struct cftype *cft;
3316
3317	/*
3318	 * If cgroup_legacy_files_on_dfl, we want to show the legacy files
3319	 * on the dfl hierarchy, but only if the target subsystem hasn't
3320	 * been updated for the dfl hierarchy yet.
3321	 */
3322	if (!cgroup_legacy_files_on_dfl ||
3323	    ss->dfl_cftypes != ss->legacy_cftypes) {
3324		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3325			cft->flags |= __CFTYPE_NOT_ON_DFL;
3326	}
3327
3328	return cgroup_add_cftypes(ss, cfts);
3329}
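
/*
 * A minimal registration sketch; the "example" subsystem, file name and
 * callbacks are hypothetical:
 *
 *	static struct cftype example_files[] = {
 *		{
 *			.name = "example.value",
 *			.read_u64 = example_read_u64,
 *			.write_u64 = example_write_u64,
 *		},
 *		{ }	<- zero-length name terminates the array
 *	};
 *
 *	cgroup_add_legacy_cftypes(&example_cgrp_subsys, example_files);
 */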
3330
3331/**
3332 * cgroup_task_count - count the number of tasks in a cgroup.
3333 * @cgrp: the cgroup in question
3334 *
3335 * Return the number of tasks in the cgroup.
3336 */
3337static int cgroup_task_count(const struct cgroup *cgrp)
3338{
3339	int count = 0;
3340	struct cgrp_cset_link *link;
3341
3342	down_read(&css_set_rwsem);
3343	list_for_each_entry(link, &cgrp->cset_links, cset_link)
3344		count += atomic_read(&link->cset->refcount);
3345	up_read(&css_set_rwsem);
3346	return count;
3347}
3348
3349/**
3350 * css_next_child - find the next child of a given css
3351 * @pos: the current position (%NULL to initiate traversal)
3352 * @parent: css whose children to walk
3353 *
3354 * This function returns the next child of @parent and should be called
3355 * under either cgroup_mutex or RCU read lock.  The only requirement is
3356 * that @parent and @pos are accessible.  The next sibling is guaranteed to
3357 * be returned regardless of their states.
3358 *
3359 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3360 * css which finished ->css_online() is guaranteed to be visible in the
3361 * future iterations and will stay visible until the last reference is put.
3362 * A css which hasn't finished ->css_online() or already finished
3363 * ->css_offline() may show up during traversal.  It's each subsystem's
3364 * responsibility to synchronize against on/offlining.
3365 */
3366struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3367					   struct cgroup_subsys_state *parent)
3368{
3369	struct cgroup_subsys_state *next;
3370
3371	cgroup_assert_mutex_or_rcu_locked();
3372
3373	/*
3374	 * @pos could already have been unlinked from the sibling list.
3375	 * Once a cgroup is removed, its ->sibling.next is no longer
3376	 * updated when its next sibling changes.  CSS_RELEASED is set when
3377	 * @pos is taken off list, at which time its next pointer is valid,
3378	 * and, as releases are serialized, the one pointed to by the next
3379	 * pointer is guaranteed to not have started release yet.  This
3380	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
3381	 * critical section, the one pointed to by its next pointer is
3382	 * guaranteed to not have finished its RCU grace period even if we
3383	 * have dropped rcu_read_lock() in between iterations.
3384	 *
3385	 * If @pos has CSS_RELEASED set, its next pointer can't be
3386	 * dereferenced; however, as each css is given a monotonically
3387	 * increasing unique serial number and always appended to the
3388	 * sibling list, the next one can be found by walking the parent's
3389	 * children until the first css with higher serial number than
3390	 * @pos's.  While this path can be slower, it happens iff iteration
3391	 * races against release and the race window is very small.
3392	 */
3393	if (!pos) {
3394		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3395	} else if (likely(!(pos->flags & CSS_RELEASED))) {
3396		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3397	} else {
3398		list_for_each_entry_rcu(next, &parent->children, sibling)
3399			if (next->serial_nr > pos->serial_nr)
3400				break;
3401	}
3402
3403	/*
3404	 * @next, if not pointing to the head, can be dereferenced and is
3405	 * the next sibling.
3406	 */
3407	if (&next->sibling != &parent->children)
3408		return next;
3409	return NULL;
3410}
3411
3412/**
3413 * css_next_descendant_pre - find the next descendant for pre-order walk
3414 * @pos: the current position (%NULL to initiate traversal)
3415 * @root: css whose descendants to walk
3416 *
3417 * To be used by css_for_each_descendant_pre().  Find the next descendant
3418 * to visit for pre-order traversal of @root's descendants.  @root is
3419 * included in the iteration and the first node to be visited.
3420 *
3421 * While this function requires cgroup_mutex or RCU read locking, it
3422 * doesn't require the whole traversal to be contained in a single critical
3423 * section.  This function will return the correct next descendant as long
3424 * as both @pos and @root are accessible and @pos is a descendant of @root.
3425 *
3426 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3427 * css which finished ->css_online() is guaranteed to be visible in the
3428 * future iterations and will stay visible until the last reference is put.
3429 * A css which hasn't finished ->css_online() or already finished
3430 * ->css_offline() may show up during traversal.  It's each subsystem's
3431 * responsibility to synchronize against on/offlining.
3432 */
3433struct cgroup_subsys_state *
3434css_next_descendant_pre(struct cgroup_subsys_state *pos,
3435			struct cgroup_subsys_state *root)
3436{
3437	struct cgroup_subsys_state *next;
3438
3439	cgroup_assert_mutex_or_rcu_locked();
3440
3441	/* if first iteration, visit @root */
3442	if (!pos)
3443		return root;
3444
3445	/* visit the first child if it exists */
3446	next = css_next_child(NULL, pos);
3447	if (next)
3448		return next;
3449
3450	/* no child, visit my or the closest ancestor's next sibling */
3451	while (pos != root) {
3452		next = css_next_child(pos, pos->parent);
3453		if (next)
3454			return next;
3455		pos = pos->parent;
3456	}
3457
3458	return NULL;
3459}
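
/*
 * Typical use is through the css_for_each_descendant_pre() wrapper, as
 * elsewhere in this file, e.g. under the RCU read lock; visit() is a
 * hypothetical per-css helper:
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_pre(pos, root_css)
 *		visit(pos);
 *	rcu_read_unlock();
 */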
3460
3461/**
3462 * css_rightmost_descendant - return the rightmost descendant of a css
3463 * @pos: css of interest
3464 *
3465 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3466 * is returned.  This can be used during pre-order traversal to skip
3467 * subtree of @pos.
3468 *
3469 * While this function requires cgroup_mutex or RCU read locking, it
3470 * doesn't require the whole traversal to be contained in a single critical
3471 * section.  This function will return the correct rightmost descendant as
3472 * long as @pos is accessible.
3473 */
3474struct cgroup_subsys_state *
3475css_rightmost_descendant(struct cgroup_subsys_state *pos)
3476{
3477	struct cgroup_subsys_state *last, *tmp;
3478
3479	cgroup_assert_mutex_or_rcu_locked();
3480
3481	do {
3482		last = pos;
3483		/* ->prev isn't RCU safe, walk ->next till the end */
3484		pos = NULL;
3485		css_for_each_child(tmp, last)
3486			pos = tmp;
3487	} while (pos);
3488
3489	return last;
3490}
3491
3492static struct cgroup_subsys_state *
3493css_leftmost_descendant(struct cgroup_subsys_state *pos)
3494{
3495	struct cgroup_subsys_state *last;
3496
3497	do {
3498		last = pos;
3499		pos = css_next_child(NULL, pos);
3500	} while (pos);
3501
3502	return last;
3503}
3504
3505/**
3506 * css_next_descendant_post - find the next descendant for post-order walk
3507 * @pos: the current position (%NULL to initiate traversal)
3508 * @root: css whose descendants to walk
3509 *
3510 * To be used by css_for_each_descendant_post().  Find the next descendant
3511 * to visit for post-order traversal of @root's descendants.  @root is
3512 * included in the iteration and the last node to be visited.
3513 *
3514 * While this function requires cgroup_mutex or RCU read locking, it
3515 * doesn't require the whole traversal to be contained in a single critical
3516 * section.  This function will return the correct next descendant as long
3517 * as both @pos and @root are accessible and @pos is a descendant of
3518 * @root.
3519 *
3520 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3521 * css which finished ->css_online() is guaranteed to be visible in the
3522 * future iterations and will stay visible until the last reference is put.
3523 * A css which hasn't finished ->css_online() or already finished
3524 * ->css_offline() may show up during traversal.  It's each subsystem's
3525 * responsibility to synchronize against on/offlining.
3526 */
3527struct cgroup_subsys_state *
3528css_next_descendant_post(struct cgroup_subsys_state *pos,
3529			 struct cgroup_subsys_state *root)
3530{
3531	struct cgroup_subsys_state *next;
3532
3533	cgroup_assert_mutex_or_rcu_locked();
3534
3535	/* if first iteration, visit leftmost descendant which may be @root */
3536	if (!pos)
3537		return css_leftmost_descendant(root);
3538
3539	/* if we visited @root, we're done */
3540	if (pos == root)
3541		return NULL;
3542
3543	/* if there's an unvisited sibling, visit its leftmost descendant */
3544	next = css_next_child(pos, pos->parent);
3545	if (next)
3546		return css_leftmost_descendant(next);
3547
3548	/* no sibling left, visit parent */
3549	return pos->parent;
3550}
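
/*
 * Since children are always visited before their parent, post-order
 * iteration (css_for_each_descendant_post()) is the natural order when
 * a subtree needs to be torn down children-first.
 */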
3551
3552/**
3553 * css_has_online_children - does a css have online children
3554 * @css: the target css
3555 *
3556 * Returns %true if @css has any online children; otherwise, %false.  This
3557 * function can be called from any context but the caller is responsible
3558 * for synchronizing against on/offlining as necessary.
3559 */
3560bool css_has_online_children(struct cgroup_subsys_state *css)
3561{
3562	struct cgroup_subsys_state *child;
3563	bool ret = false;
3564
3565	rcu_read_lock();
3566	css_for_each_child(child, css) {
3567		if (child->flags & CSS_ONLINE) {
3568			ret = true;
3569			break;
3570		}
3571	}
3572	rcu_read_unlock();
3573	return ret;
3574}
3575
3576/**
3577 * css_advance_task_iter - advance a task iterator to the next css_set
3578 * @it: the iterator to advance
3579 *
3580 * Advance @it to the next css_set to walk.
3581 */
3582static void css_advance_task_iter(struct css_task_iter *it)
3583{
3584	struct list_head *l = it->cset_pos;
3585	struct cgrp_cset_link *link;
3586	struct css_set *cset;
3587
3588	/* Advance to the next non-empty css_set */
3589	do {
3590		l = l->next;
3591		if (l == it->cset_head) {
3592			it->cset_pos = NULL;
3593			return;
3594		}
3595
3596		if (it->ss) {
3597			cset = container_of(l, struct css_set,
3598					    e_cset_node[it->ss->id]);
3599		} else {
3600			link = list_entry(l, struct cgrp_cset_link, cset_link);
3601			cset = link->cset;
3602		}
3603	} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
3604
3605	it->cset_pos = l;
3606
3607	if (!list_empty(&cset->tasks))
3608		it->task_pos = cset->tasks.next;
3609	else
3610		it->task_pos = cset->mg_tasks.next;
3611
3612	it->tasks_head = &cset->tasks;
3613	it->mg_tasks_head = &cset->mg_tasks;
3614}
3615
3616/**
3617 * css_task_iter_start - initiate task iteration
3618 * @css: the css to walk tasks of
3619 * @it: the task iterator to use
3620 *
3621 * Initiate iteration through the tasks of @css.  The caller can call
3622 * css_task_iter_next() to walk through the tasks until the function
3623 * returns NULL.  On completion of iteration, css_task_iter_end() must be
3624 * called.
3625 *
3626 * Note that this function acquires a lock which is released when the
3627 * iteration finishes.  The caller can't sleep while iteration is in
3628 * progress.
3629 */
3630void css_task_iter_start(struct cgroup_subsys_state *css,
3631			 struct css_task_iter *it)
3632	__acquires(css_set_rwsem)
3633{
3634	/* no one should try to iterate before mounting cgroups */
3635	WARN_ON_ONCE(!use_task_css_set_links);
3636
3637	down_read(&css_set_rwsem);
3638
3639	it->ss = css->ss;
3640
3641	if (it->ss)
3642		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3643	else
3644		it->cset_pos = &css->cgroup->cset_links;
3645
3646	it->cset_head = it->cset_pos;
3647
3648	css_advance_task_iter(it);
3649}
3650
3651/**
3652 * css_task_iter_next - return the next task for the iterator
3653 * @it: the task iterator being iterated
3654 *
3655 * The "next" function for task iteration.  @it should have been
3656 * initialized via css_task_iter_start().  Returns NULL when the iteration
3657 * reaches the end.
3658 */
3659struct task_struct *css_task_iter_next(struct css_task_iter *it)
3660{
3661	struct task_struct *res;
3662	struct list_head *l = it->task_pos;
3663
3664	/* If the iterator cg is NULL, we have no tasks */
3665	if (!it->cset_pos)
3666		return NULL;
3667	res = list_entry(l, struct task_struct, cg_list);
3668
3669	/*
3670	 * Advance iterator to find next entry.  cset->tasks is consumed
3671	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3672	 * next cset.
3673	 */
3674	l = l->next;
3675
3676	if (l == it->tasks_head)
3677		l = it->mg_tasks_head->next;
3678
3679	if (l == it->mg_tasks_head)
3680		css_advance_task_iter(it);
3681	else
3682		it->task_pos = l;
3683
3684	return res;
3685}
3686
3687/**
3688 * css_task_iter_end - finish task iteration
3689 * @it: the task iterator to finish
3690 *
3691 * Finish task iteration started by css_task_iter_start().
3692 */
3693void css_task_iter_end(struct css_task_iter *it)
3694	__releases(css_set_rwsem)
3695{
3696	up_read(&css_set_rwsem);
3697}
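
/*
 * Illustrative sketch (not part of the original source): walking every
 * task attached to a css with the iterator trio above.  process_task() is
 * a hypothetical callback; as noted above, the caller must not sleep
 * while the iteration is in progress.
 *
 *	struct css_task_iter it;
 *	struct task_struct *task;
 *
 *	css_task_iter_start(css, &it);
 *	while ((task = css_task_iter_next(&it)))
 *		process_task(task);
 *	css_task_iter_end(&it);
 */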
3698
3699/**
3700 * cgroup_transfer_tasks - move tasks from one cgroup to another
3701 * @to: cgroup to which the tasks will be moved
3702 * @from: cgroup in which the tasks currently reside
3703 *
3704 * Locking rules between cgroup_post_fork() and the migration path
3705 * guarantee that, if a task is forking while being migrated, the new child
3706 * is guaranteed to be either visible in the source cgroup after the
3707 * parent's migration is complete or put into the target cgroup.  No task
3708 * can slip out of migration through forking.
3709 */
3710int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3711{
3712	LIST_HEAD(preloaded_csets);
3713	struct cgrp_cset_link *link;
3714	struct css_task_iter it;
3715	struct task_struct *task;
3716	int ret;
3717
3718	mutex_lock(&cgroup_mutex);
3719
3720	/* all tasks in @from are being moved, all csets are source */
3721	down_read(&css_set_rwsem);
3722	list_for_each_entry(link, &from->cset_links, cset_link)
3723		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3724	up_read(&css_set_rwsem);
3725
3726	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3727	if (ret)
3728		goto out_err;
3729
3730	/*
3731	 * Migrate tasks one-by-one until @from is empty.  This fails iff
3732	 * ->can_attach() fails.
3733	 */
3734	do {
3735		css_task_iter_start(&from->self, &it);
3736		task = css_task_iter_next(&it);
3737		if (task)
3738			get_task_struct(task);
3739		css_task_iter_end(&it);
3740
3741		if (task) {
3742			ret = cgroup_migrate(to, task, false);
3743			put_task_struct(task);
3744		}
3745	} while (task && !ret);
3746out_err:
3747	cgroup_migrate_finish(&preloaded_csets);
3748	mutex_unlock(&cgroup_mutex);
3749	return ret;
3750}
3751
3752/*
3753 * Stuff for reading the 'tasks'/'procs' files.
3754 *
3755 * Reading this file can return large amounts of data if a cgroup has
3756 * *lots* of attached tasks. So it may need several calls to read(),
3757 * but we cannot guarantee that the information we produce is correct
3758 * unless we produce it entirely atomically.
3759 *
3760 */
3761
3762/* which pidlist file are we talking about? */
3763enum cgroup_filetype {
3764	CGROUP_FILE_PROCS,
3765	CGROUP_FILE_TASKS,
3766};
3767
3768/*
3769 * A pidlist is a list of pids that virtually represents the contents of one
3770 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3771 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3772 * to the cgroup.
3773 */
3774struct cgroup_pidlist {
3775	/*
3776	 * used to find which pidlist is wanted. doesn't change as long as
3777	 * this particular list stays in the list.
3778	 */
3779	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3780	/* array of pids (tgids for "procs", pids for "tasks") */
3781	pid_t *list;
3782	/* how many elements the above list has */
3783	int length;
3784	/* each of these stored in a list by its cgroup */
3785	struct list_head links;
3786	/* pointer to the cgroup we belong to, for list removal purposes */
3787	struct cgroup *owner;
3788	/* for delayed destruction */
3789	struct delayed_work destroy_dwork;
3790};
3791
3792/*
3793 * The following two functions "fix" the issue where there are more pids
3794 * than kmalloc will give memory for; in such cases, we use vmalloc/kvfree.
3795 * TODO: replace with a kernel-wide solution to this problem
3796 */
3797#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3798static void *pidlist_allocate(int count)
3799{
3800	if (PIDLIST_TOO_LARGE(count))
3801		return vmalloc(count * sizeof(pid_t));
3802	else
3803		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3804}
3805
3806static void pidlist_free(void *p)
3807{
3808	kvfree(p);
3809}
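
/*
 * Worked example (assuming PAGE_SIZE == 4096 and sizeof(pid_t) == 4):
 * PIDLIST_TOO_LARGE() triggers above 2048 entries, so a 3000-pid cgroup
 * needs 3000 * 4 = 12000 bytes > 8192 and pidlist_allocate() falls back
 * to vmalloc(); kvfree() in pidlist_free() frees either kind correctly.
 */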
3810
3811/*
3812 * Used to destroy all pidlists still lingering on the destroy timer.  None
3813 * should be left afterwards.
3814 */
3815static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3816{
3817	struct cgroup_pidlist *l, *tmp_l;
3818
3819	mutex_lock(&cgrp->pidlist_mutex);
3820	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3821		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3822	mutex_unlock(&cgrp->pidlist_mutex);
3823
3824	flush_workqueue(cgroup_pidlist_destroy_wq);
3825	BUG_ON(!list_empty(&cgrp->pidlists));
3826}
3827
3828static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3829{
3830	struct delayed_work *dwork = to_delayed_work(work);
3831	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3832						destroy_dwork);
3833	struct cgroup_pidlist *tofree = NULL;
3834
3835	mutex_lock(&l->owner->pidlist_mutex);
3836
3837	/*
3838	 * Destroy iff we didn't get queued again.  The state won't change
3839	 * as destroy_dwork can only be queued while locked.
3840	 */
3841	if (!delayed_work_pending(dwork)) {
3842		list_del(&l->links);
3843		pidlist_free(l->list);
3844		put_pid_ns(l->key.ns);
3845		tofree = l;
3846	}
3847
3848	mutex_unlock(&l->owner->pidlist_mutex);
3849	kfree(tofree);
3850}
3851
3852/*
3853 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3854 * Returns the number of unique elements.
3855 */
3856static int pidlist_uniq(pid_t *list, int length)
3857{
3858	int src, dest = 1;
3859
3860	/*
3861	 * we presume the 0th element is unique, so src starts at 1. Handle the
3862	 * trivial edge cases first; no work needs to be done for either.
3863	 */
3864	if (length == 0 || length == 1)
3865		return length;
3866	/* src and dest walk down the list; dest counts unique elements */
3867	for (src = 1; src < length; src++) {
3868		/* find next unique element */
3869		while (list[src] == list[src-1]) {
3870			src++;
3871			if (src == length)
3872				goto after;
3873		}
3874		/* dest always points to where the next unique element goes */
3875		list[dest] = list[src];
3876		dest++;
3877	}
3878after:
3879	return dest;
3880}
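
/*
 * Worked example: for the sorted input {3, 3, 5, 7, 7, 7} the loop above
 * compacts the array in place to {3, 5, 7, ...} and returns 3; entries
 * beyond the returned length are simply ignored by the caller.
 */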
3881
3882/*
3883 * The two pid files - tasks and cgroup.procs - guaranteed that the result
3884 * is sorted, which forced this whole pidlist fiasco.  As pid order is
3885 * different per namespace, each namespace needs a differently sorted list,
3886 * making it impossible to use, for example, a single rbtree of member tasks
3887 * sorted by task pointer.  As pidlists can be fairly large, allocating one
3888 * per open file is dangerous, so cgroup had to implement a shared pool of
3889 * pidlists keyed by cgroup and namespace.
3890 *
3891 * All this extra complexity was caused by the original implementation
3892 * committing to an entirely unnecessary property.  In the long term, we
3893 * want to do away with it.  Explicitly scramble sort order if on the
3894 * default hierarchy so that no such expectation exists in the new
3895 * interface.
3896 *
3897 * Scrambling is done by swapping every two consecutive bits, which is a
3898 * non-identity one-to-one mapping that disturbs the sort order sufficiently.
3899 */
3900static pid_t pid_fry(pid_t pid)
3901{
3902	unsigned a = pid & 0x55555555;
3903	unsigned b = pid & 0xAAAAAAAA;
3904
3905	return (a << 1) | (b >> 1);
3906}
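
/*
 * Worked example: pid 6 (0b0110) fries to 9 (0b1001) - the even-position
 * bit 2 shifts up to bit 3 and the odd-position bit 1 shifts down to bit 0.
 * Applying pid_fry() again maps 9 back to 6, i.e. the scrambling is its
 * own inverse.
 */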
3907
3908static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3909{
3910	if (cgroup_on_dfl(cgrp))
3911		return pid_fry(pid);
3912	else
3913		return pid;
3914}
3915
3916static int cmppid(const void *a, const void *b)
3917{
3918	return *(pid_t *)a - *(pid_t *)b;
3919}
3920
3921static int fried_cmppid(const void *a, const void *b)
3922{
3923	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3924}
3925
3926static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3927						  enum cgroup_filetype type)
3928{
3929	struct cgroup_pidlist *l;
3930	/* don't need task_nsproxy() if we're looking at ourself */
3931	struct pid_namespace *ns = task_active_pid_ns(current);
3932
3933	lockdep_assert_held(&cgrp->pidlist_mutex);
3934
3935	list_for_each_entry(l, &cgrp->pidlists, links)
3936		if (l->key.type == type && l->key.ns == ns)
3937			return l;
3938	return NULL;
3939}
3940
3941/*
3942 * find the appropriate pidlist for our purpose (given procs vs tasks)
3943 * returns with the lock on that pidlist already held, and takes care
3944 * of the use count, or returns NULL with no locks held if we're out of
3945 * memory.
3946 */
3947static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3948						enum cgroup_filetype type)
3949{
3950	struct cgroup_pidlist *l;
3951
3952	lockdep_assert_held(&cgrp->pidlist_mutex);
3953
3954	l = cgroup_pidlist_find(cgrp, type);
3955	if (l)
3956		return l;
3957
3958	/* entry not found; create a new one */
3959	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3960	if (!l)
3961		return l;
3962
3963	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3964	l->key.type = type;
3965	/* don't need task_nsproxy() if we're looking at ourself */
3966	l->key.ns = get_pid_ns(task_active_pid_ns(current));
3967	l->owner = cgrp;
3968	list_add(&l->links, &cgrp->pidlists);
3969	return l;
3970}
3971
3972/*
3973 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3974 */
3975static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3976			      struct cgroup_pidlist **lp)
3977{
3978	pid_t *array;
3979	int length;
3980	int pid, n = 0; /* used for populating the array */
3981	struct css_task_iter it;
3982	struct task_struct *tsk;
3983	struct cgroup_pidlist *l;
3984
3985	lockdep_assert_held(&cgrp->pidlist_mutex);
3986
3987	/*
3988	 * If cgroup gets more users after we read count, we won't have
3989	 * enough space - tough.  This race is indistinguishable to the
3990	 * caller from the case that the additional cgroup users didn't
3991	 * show up until sometime later on.
3992	 */
3993	length = cgroup_task_count(cgrp);
3994	array = pidlist_allocate(length);
3995	if (!array)
3996		return -ENOMEM;
3997	/* now, populate the array */
3998	css_task_iter_start(&cgrp->self, &it);
3999	while ((tsk = css_task_iter_next(&it))) {
4000		if (unlikely(n == length))
4001			break;
4002		/* get tgid or pid for procs or tasks file respectively */
4003		if (type == CGROUP_FILE_PROCS)
4004			pid = task_tgid_vnr(tsk);
4005		else
4006			pid = task_pid_vnr(tsk);
4007		if (pid > 0) /* make sure to only use valid results */
4008			array[n++] = pid;
4009	}
4010	css_task_iter_end(&it);
4011	length = n;
4012	/* now sort & (if procs) strip out duplicates */
4013	if (cgroup_on_dfl(cgrp))
4014		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
4015	else
4016		sort(array, length, sizeof(pid_t), cmppid, NULL);
4017	if (type == CGROUP_FILE_PROCS)
4018		length = pidlist_uniq(array, length);
4019
4020	l = cgroup_pidlist_find_create(cgrp, type);
4021	if (!l) {
4022		pidlist_free(array);
4023		return -ENOMEM;
4024	}
4025
4026	/* store array, freeing old if necessary */
4027	pidlist_free(l->list);
4028	l->list = array;
4029	l->length = length;
4030	*lp = l;
4031	return 0;
4032}
4033
4034/**
4035 * cgroupstats_build - build and fill cgroupstats
4036 * @stats: cgroupstats to fill information into
4037 * @dentry: a dentry of the cgroup for which stats have been requested
4039 *
4040 * Build and fill cgroupstats so that taskstats can export it to user
4041 * space.
4042 */
4043int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
4044{
4045	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4046	struct cgroup *cgrp;
4047	struct css_task_iter it;
4048	struct task_struct *tsk;
4049
4050	/* it should be a kernfs_node of cgroupfs and a directory */
4051	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4052	    kernfs_type(kn) != KERNFS_DIR)
4053		return -EINVAL;
4054
4055	mutex_lock(&cgroup_mutex);
4056
4057	/*
4058	 * We aren't being called from kernfs and there's no guarantee on
4059	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
4060	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
4061	 */
4062	rcu_read_lock();
4063	cgrp = rcu_dereference(kn->priv);
4064	if (!cgrp || cgroup_is_dead(cgrp)) {
4065		rcu_read_unlock();
4066		mutex_unlock(&cgroup_mutex);
4067		return -ENOENT;
4068	}
4069	rcu_read_unlock();
4070
4071	css_task_iter_start(&cgrp->self, &it);
4072	while ((tsk = css_task_iter_next(&it))) {
4073		switch (tsk->state) {
4074		case TASK_RUNNING:
4075			stats->nr_running++;
4076			break;
4077		case TASK_INTERRUPTIBLE:
4078			stats->nr_sleeping++;
4079			break;
4080		case TASK_UNINTERRUPTIBLE:
4081			stats->nr_uninterruptible++;
4082			break;
4083		case TASK_STOPPED:
4084			stats->nr_stopped++;
4085			break;
4086		default:
4087			if (delayacct_is_task_waiting_on_io(tsk))
4088				stats->nr_io_wait++;
4089			break;
4090		}
4091	}
4092	css_task_iter_end(&it);
4093
4094	mutex_unlock(&cgroup_mutex);
4095	return 0;
4096}
4097
4098
4099/*
4100 * seq_file methods for the tasks/procs files. The seq_file position is the
4101 * next pid to display; the seq_file iterator is a pointer to the pid
4102 * in the cgroup->l->list array.
4103 */
4104
4105static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
4106{
4107	/*
4108	 * Initially we receive a position value that corresponds to
4109	 * one more than the last pid shown (or 0 on the first call or
4110	 * after a seek to the start). Use a binary search to find the
4111	 * next pid to display, if any.
4112	 */
4113	struct kernfs_open_file *of = s->private;
4114	struct cgroup *cgrp = seq_css(s)->cgroup;
4115	struct cgroup_pidlist *l;
4116	enum cgroup_filetype type = seq_cft(s)->private;
4117	int index = 0, pid = *pos;
4118	int *iter, ret;
4119
4120	mutex_lock(&cgrp->pidlist_mutex);
4121
4122	/*
4123	 * A non-NULL @of->priv indicates that this isn't the first start()
4124	 * after open.  If the matching pidlist is still around, we can use it.
4125	 * Look for it.  Note that @of->priv can't be used directly.  It
4126	 * could already have been destroyed.
4127	 */
4128	if (of->priv)
4129		of->priv = cgroup_pidlist_find(cgrp, type);
4130
4131	/*
4132	 * Either this is the first start() after open or the matching
4133	 * pidlist has been destroyed in between.  Create a new one.
4134	 */
4135	if (!of->priv) {
4136		ret = pidlist_array_load(cgrp, type,
4137					 (struct cgroup_pidlist **)&of->priv);
4138		if (ret)
4139			return ERR_PTR(ret);
4140	}
4141	l = of->priv;
4142
4143	if (pid) {
4144		int end = l->length;
4145
4146		while (index < end) {
4147			int mid = (index + end) / 2;
4148			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
4149				index = mid;
4150				break;
4151			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
4152				index = mid + 1;
4153			else
4154				end = mid;
4155		}
4156	}
4157	/* If we're off the end of the array, we're done */
4158	if (index >= l->length)
4159		return NULL;
4160	/* Update the abstract position to be the actual pid that we found */
4161	iter = l->list + index;
4162	*pos = cgroup_pid_fry(cgrp, *iter);
4163	return iter;
4164}
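
/*
 * Worked example of the restart logic above, on a legacy hierarchy where
 * cgroup_pid_fry() is the identity: with l->list == {2, 9, 16} and a
 * resumed position *pos == 9, the binary search lands on index 1 and
 * output continues from pid 9; if *pos == 10 because that pid has since
 * disappeared from the list, the search falls through to index 2 and
 * output resumes at the next remaining pid, 16.
 */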
4165
4166static void cgroup_pidlist_stop(struct seq_file *s, void *v)
4167{
4168	struct kernfs_open_file *of = s->private;
4169	struct cgroup_pidlist *l = of->priv;
4170
4171	if (l)
4172		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
4173				 CGROUP_PIDLIST_DESTROY_DELAY);
4174	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
4175}
4176
4177static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
4178{
4179	struct kernfs_open_file *of = s->private;
4180	struct cgroup_pidlist *l = of->priv;
4181	pid_t *p = v;
4182	pid_t *end = l->list + l->length;
4183	/*
4184	 * Advance to the next pid in the array. If this goes off the
4185	 * end, we're done
4186	 */
4187	p++;
4188	if (p >= end) {
4189		return NULL;
4190	} else {
4191		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
4192		return p;
4193	}
4194}
4195
4196static int cgroup_pidlist_show(struct seq_file *s, void *v)
4197{
4198	seq_printf(s, "%d\n", *(int *)v);
4199
4200	return 0;
4201}
4202
4203static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4204					 struct cftype *cft)
4205{
4206	return notify_on_release(css->cgroup);
4207}
4208
4209static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4210					  struct cftype *cft, u64 val)
4211{
4212	if (val)
4213		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4214	else
4215		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4216	return 0;
4217}
4218
4219static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4220				      struct cftype *cft)
4221{
4222	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4223}
4224
4225static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4226				       struct cftype *cft, u64 val)
4227{
4228	if (val)
4229		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4230	else
4231		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4232	return 0;
4233}
4234
4235/* cgroup core interface files for the default hierarchy */
4236static struct cftype cgroup_dfl_base_files[] = {
4237	{
4238		.name = "cgroup.procs",
4239		.seq_start = cgroup_pidlist_start,
4240		.seq_next = cgroup_pidlist_next,
4241		.seq_stop = cgroup_pidlist_stop,
4242		.seq_show = cgroup_pidlist_show,
4243		.private = CGROUP_FILE_PROCS,
4244		.write = cgroup_procs_write,
4245		.mode = S_IRUGO | S_IWUSR,
4246	},
4247	{
4248		.name = "cgroup.controllers",
4249		.flags = CFTYPE_ONLY_ON_ROOT,
4250		.seq_show = cgroup_root_controllers_show,
4251	},
4252	{
4253		.name = "cgroup.controllers",
4254		.flags = CFTYPE_NOT_ON_ROOT,
4255		.seq_show = cgroup_controllers_show,
4256	},
4257	{
4258		.name = "cgroup.subtree_control",
4259		.seq_show = cgroup_subtree_control_show,
4260		.write = cgroup_subtree_control_write,
4261	},
4262	{
4263		.name = "cgroup.populated",
4264		.flags = CFTYPE_NOT_ON_ROOT,
4265		.seq_show = cgroup_populated_show,
4266	},
4267	{ }	/* terminate */
4268};
4269
4270/* cgroup core interface files for the legacy hierarchies */
4271static struct cftype cgroup_legacy_base_files[] = {
4272	{
4273		.name = "cgroup.procs",
4274		.seq_start = cgroup_pidlist_start,
4275		.seq_next = cgroup_pidlist_next,
4276		.seq_stop = cgroup_pidlist_stop,
4277		.seq_show = cgroup_pidlist_show,
4278		.private = CGROUP_FILE_PROCS,
4279		.write = cgroup_procs_write,
4280		.mode = S_IRUGO | S_IWUSR,
4281	},
4282	{
4283		.name = "cgroup.clone_children",
4284		.read_u64 = cgroup_clone_children_read,
4285		.write_u64 = cgroup_clone_children_write,
4286	},
4287	{
4288		.name = "cgroup.sane_behavior",
4289		.flags = CFTYPE_ONLY_ON_ROOT,
4290		.seq_show = cgroup_sane_behavior_show,
4291	},
4292	{
4293		.name = "tasks",
4294		.seq_start = cgroup_pidlist_start,
4295		.seq_next = cgroup_pidlist_next,
4296		.seq_stop = cgroup_pidlist_stop,
4297		.seq_show = cgroup_pidlist_show,
4298		.private = CGROUP_FILE_TASKS,
4299		.write = cgroup_tasks_write,
4300		.mode = S_IRUGO | S_IWUSR,
4301	},
4302	{
4303		.name = "notify_on_release",
4304		.read_u64 = cgroup_read_notify_on_release,
4305		.write_u64 = cgroup_write_notify_on_release,
4306	},
4307	{
4308		.name = "release_agent",
4309		.flags = CFTYPE_ONLY_ON_ROOT,
4310		.seq_show = cgroup_release_agent_show,
4311		.write = cgroup_release_agent_write,
4312		.max_write_len = PATH_MAX - 1,
4313	},
4314	{ }	/* terminate */
4315};
4316
4317/**
4318 * cgroup_populate_dir - create subsys files in a cgroup directory
4319 * @cgrp: target cgroup
4320 * @subsys_mask: mask of the subsystem ids whose files should be added
4321 *
4322 * On failure, no file is added.
4323 */
4324static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
4325{
4326	struct cgroup_subsys *ss;
4327	int i, ret = 0;
4328
4329	/* process cftsets of each subsystem */
4330	for_each_subsys(ss, i) {
4331		struct cftype *cfts;
4332
4333		if (!(subsys_mask & (1 << i)))
4334			continue;
4335
4336		list_for_each_entry(cfts, &ss->cfts, node) {
4337			ret = cgroup_addrm_files(cgrp, cfts, true);
4338			if (ret < 0)
4339				goto err;
4340		}
4341	}
4342	return 0;
4343err:
4344	cgroup_clear_dir(cgrp, subsys_mask);
4345	return ret;
4346}
4347
4348/*
4349 * css destruction is four-stage process.
4350 *
4351 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4352 *    Implemented in kill_css().
4353 *
4354 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4355 *    and thus css_tryget_online() is guaranteed to fail, the css can be
4356 *    offlined by invoking offline_css().  After offlining, the base ref is
4357 *    put.  Implemented in css_killed_work_fn().
4358 *
4359 * 3. When the percpu_ref reaches zero, the only possible remaining
4360 *    accessors are inside RCU read sections.  css_release() schedules the
4361 *    RCU callback.
4362 *
4363 * 4. After the grace period, the css can be freed.  Implemented in
4364 *    css_free_work_fn().
4365 *
4366 * It is actually hairier because both steps 2 and 4 require process context
4367 * and thus involve punting to css->destroy_work, adding two additional
4368 * steps to the already complex sequence.
4369 */
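
/*
 * A rough sketch of the resulting call chain, derived from the functions
 * below (both punts to process context run off cgroup_destroy_wq):
 *
 *	kill_css()
 *	  percpu_ref_kill_and_confirm()
 *	    css_killed_ref_fn()           confirmation, atomic context
 *	      css_killed_work_fn()        offline_css() + put the base ref
 *	        css_release()             percpu_ref reaches zero
 *	          css_release_work_fn()   unlink the css, then call_rcu()
 *	            css_free_rcu_fn()     after the RCU grace period
 *	              css_free_work_fn()  final freeing
 */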
4370static void css_free_work_fn(struct work_struct *work)
4371{
4372	struct cgroup_subsys_state *css =
4373		container_of(work, struct cgroup_subsys_state, destroy_work);
4374	struct cgroup_subsys *ss = css->ss;
4375	struct cgroup *cgrp = css->cgroup;
4376
4377	percpu_ref_exit(&css->refcnt);
4378
4379	if (ss) {
4380		/* css free path */
4381		int id = css->id;
4382
4383		if (css->parent)
4384			css_put(css->parent);
4385
4386		ss->css_free(css);
4387		cgroup_idr_remove(&ss->css_idr, id);
4388		cgroup_put(cgrp);
4389	} else {
4390		/* cgroup free path */
4391		atomic_dec(&cgrp->root->nr_cgrps);
4392		cgroup_pidlist_destroy_all(cgrp);
4393		cancel_work_sync(&cgrp->release_agent_work);
4394
4395		if (cgroup_parent(cgrp)) {
4396			/*
4397			 * We get a ref to the parent, and put the ref when
4398			 * this cgroup is being freed, so it's guaranteed
4399			 * that the parent won't be destroyed before its
4400			 * children.
4401			 */
4402			cgroup_put(cgroup_parent(cgrp));
4403			kernfs_put(cgrp->kn);
4404			kfree(cgrp);
4405		} else {
4406			/*
4407			 * This is root cgroup's refcnt reaching zero,
4408			 * which indicates that the root should be
4409			 * released.
4410			 */
4411			cgroup_destroy_root(cgrp->root);
4412		}
4413	}
4414}
4415
4416static void css_free_rcu_fn(struct rcu_head *rcu_head)
4417{
4418	struct cgroup_subsys_state *css =
4419		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4420
4421	INIT_WORK(&css->destroy_work, css_free_work_fn);
4422	queue_work(cgroup_destroy_wq, &css->destroy_work);
4423}
4424
4425static void css_release_work_fn(struct work_struct *work)
4426{
4427	struct cgroup_subsys_state *css =
4428		container_of(work, struct cgroup_subsys_state, destroy_work);
4429	struct cgroup_subsys *ss = css->ss;
4430	struct cgroup *cgrp = css->cgroup;
4431
4432	mutex_lock(&cgroup_mutex);
4433
4434	css->flags |= CSS_RELEASED;
4435	list_del_rcu(&css->sibling);
4436
4437	if (ss) {
4438		/* css release path */
4439		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4440		if (ss->css_released)
4441			ss->css_released(css);
4442	} else {
4443		/* cgroup release path */
4444		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4445		cgrp->id = -1;
4446
4447		/*
4448		 * There are two control paths which try to determine
4449		 * cgroup from dentry without going through kernfs -
4450		 * cgroupstats_build() and css_tryget_online_from_dir().
4451		 * Those are supported by RCU protecting clearing of
4452		 * cgrp->kn->priv backpointer.
4453		 */
4454		RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4455	}
4456
4457	mutex_unlock(&cgroup_mutex);
4458
4459	call_rcu(&css->rcu_head, css_free_rcu_fn);
4460}
4461
4462static void css_release(struct percpu_ref *ref)
4463{
4464	struct cgroup_subsys_state *css =
4465		container_of(ref, struct cgroup_subsys_state, refcnt);
4466
4467	INIT_WORK(&css->destroy_work, css_release_work_fn);
4468	queue_work(cgroup_destroy_wq, &css->destroy_work);
4469}
4470
4471static void init_and_link_css(struct cgroup_subsys_state *css,
4472			      struct cgroup_subsys *ss, struct cgroup *cgrp)
4473{
4474	lockdep_assert_held(&cgroup_mutex);
4475
4476	cgroup_get(cgrp);
4477
4478	memset(css, 0, sizeof(*css));
4479	css->cgroup = cgrp;
4480	css->ss = ss;
4481	INIT_LIST_HEAD(&css->sibling);
4482	INIT_LIST_HEAD(&css->children);
4483	css->serial_nr = css_serial_nr_next++;
4484	atomic_set(&css->online_cnt, 0);
4485
4486	if (cgroup_parent(cgrp)) {
4487		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4488		css_get(css->parent);
4489	}
4490
4491	BUG_ON(cgroup_css(cgrp, ss));
4492}
4493
4494/* invoke ->css_online() on a new CSS and mark it online if successful */
4495static int online_css(struct cgroup_subsys_state *css)
4496{
4497	struct cgroup_subsys *ss = css->ss;
4498	int ret = 0;
4499
4500	lockdep_assert_held(&cgroup_mutex);
4501
4502	if (ss->css_online)
4503		ret = ss->css_online(css);
4504	if (!ret) {
4505		css->flags |= CSS_ONLINE;
4506		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4507
4508		atomic_inc(&css->online_cnt);
4509		if (css->parent)
4510			atomic_inc(&css->parent->online_cnt);
4511	}
4512	return ret;
4513}
4514
4515/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4516static void offline_css(struct cgroup_subsys_state *css)
4517{
4518	struct cgroup_subsys *ss = css->ss;
4519
4520	lockdep_assert_held(&cgroup_mutex);
4521
4522	if (!(css->flags & CSS_ONLINE))
4523		return;
4524
4525	if (ss->css_offline)
4526		ss->css_offline(css);
4527
4528	css->flags &= ~CSS_ONLINE;
4529	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4530
4531	wake_up_all(&css->cgroup->offline_waitq);
4532}
4533
4534/**
4535 * create_css - create a cgroup_subsys_state
4536 * @cgrp: the cgroup new css will be associated with
4537 * @ss: the subsys of new css
4538 * @visible: whether to create control knobs for the new css or not
4539 *
4540 * Create a new css associated with @cgrp - @ss pair.  On success, the new
4541 * css is online and installed in @cgrp with all interface files created if
4542 * @visible.  Returns 0 on success, -errno on failure.
4543 */
4544static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4545		      bool visible)
4546{
4547	struct cgroup *parent = cgroup_parent(cgrp);
4548	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4549	struct cgroup_subsys_state *css;
4550	int err;
4551
4552	lockdep_assert_held(&cgroup_mutex);
4553
4554	css = ss->css_alloc(parent_css);
4555	if (IS_ERR(css))
4556		return PTR_ERR(css);
4557
4558	init_and_link_css(css, ss, cgrp);
4559
4560	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4561	if (err)
4562		goto err_free_css;
4563
4564	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4565	if (err < 0)
4566		goto err_free_percpu_ref;
4567	css->id = err;
4568
4569	if (visible) {
4570		err = cgroup_populate_dir(cgrp, 1 << ss->id);
4571		if (err)
4572			goto err_free_id;
4573	}
4574
4575	/* @css is ready to be brought online now, make it visible */
4576	list_add_tail_rcu(&css->sibling, &parent_css->children);
4577	cgroup_idr_replace(&ss->css_idr, css, css->id);
4578
4579	err = online_css(css);
4580	if (err)
4581		goto err_list_del;
4582
4583	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4584	    cgroup_parent(parent)) {
4585		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4586			current->comm, current->pid, ss->name);
4587		if (!strcmp(ss->name, "memory"))
4588			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4589		ss->warned_broken_hierarchy = true;
4590	}
4591
4592	return 0;
4593
4594err_list_del:
4595	list_del_rcu(&css->sibling);
4596	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4597err_free_id:
4598	cgroup_idr_remove(&ss->css_idr, css->id);
4599err_free_percpu_ref:
4600	percpu_ref_exit(&css->refcnt);
4601err_free_css:
4602	call_rcu(&css->rcu_head, css_free_rcu_fn);
4603	return err;
4604}
4605
4606static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4607			umode_t mode)
4608{
4609	struct cgroup *parent, *cgrp;
4610	struct cgroup_root *root;
4611	struct cgroup_subsys *ss;
4612	struct kernfs_node *kn;
4613	struct cftype *base_files;
4614	int ssid, ret;
4615
4616	/* don't accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
4618	if (strchr(name, '\n'))
4619		return -EINVAL;
4620
4621	parent = cgroup_kn_lock_live(parent_kn);
4622	if (!parent)
4623		return -ENODEV;
4624	root = parent->root;
4625
4626	/* allocate the cgroup and its ID, 0 is reserved for the root */
4627	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4628	if (!cgrp) {
4629		ret = -ENOMEM;
4630		goto out_unlock;
4631	}
4632
4633	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4634	if (ret)
4635		goto out_free_cgrp;
4636
4637	/*
4638	 * Temporarily set the pointer to NULL, so idr_find() won't return
4639	 * a half-baked cgroup.
4640	 */
4641	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
4642	if (cgrp->id < 0) {
4643		ret = -ENOMEM;
4644		goto out_cancel_ref;
4645	}
4646
4647	init_cgroup_housekeeping(cgrp);
4648
4649	cgrp->self.parent = &parent->self;
4650	cgrp->root = root;
4651
4652	if (notify_on_release(parent))
4653		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4654
4655	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4656		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4657
4658	/* create the directory */
4659	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4660	if (IS_ERR(kn)) {
4661		ret = PTR_ERR(kn);
4662		goto out_free_id;
4663	}
4664	cgrp->kn = kn;
4665
4666	/*
4667	 * This extra ref will be put in cgroup_free_fn() and guarantees
4668	 * that @cgrp->kn is always accessible.
4669	 */
4670	kernfs_get(kn);
4671
4672	cgrp->self.serial_nr = css_serial_nr_next++;
4673
4674	/* allocation complete, commit to creation */
4675	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4676	atomic_inc(&root->nr_cgrps);
4677	cgroup_get(parent);
4678
4679	/*
4680	 * @cgrp is now fully operational.  If something fails after this
4681	 * point, it'll be released via the normal destruction path.
4682	 */
4683	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4684
4685	ret = cgroup_kn_set_ugid(kn);
4686	if (ret)
4687		goto out_destroy;
4688
4689	if (cgroup_on_dfl(cgrp))
4690		base_files = cgroup_dfl_base_files;
4691	else
4692		base_files = cgroup_legacy_base_files;
4693
4694	ret = cgroup_addrm_files(cgrp, base_files, true);
4695	if (ret)
4696		goto out_destroy;
4697
4698	/* let's create and online css's */
4699	for_each_subsys(ss, ssid) {
4700		if (parent->child_subsys_mask & (1 << ssid)) {
4701			ret = create_css(cgrp, ss,
4702					 parent->subtree_control & (1 << ssid));
4703			if (ret)
4704				goto out_destroy;
4705		}
4706	}
4707
4708	/*
4709	 * On the default hierarchy, a child doesn't automatically inherit
4710	 * subtree_control from the parent.  Each is configured manually.
4711	 */
4712	if (!cgroup_on_dfl(cgrp)) {
4713		cgrp->subtree_control = parent->subtree_control;
4714		cgroup_refresh_child_subsys_mask(cgrp);
4715	}
4716
4717	kernfs_activate(kn);
4718
4719	ret = 0;
4720	goto out_unlock;
4721
4722out_free_id:
4723	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4724out_cancel_ref:
4725	percpu_ref_exit(&cgrp->self.refcnt);
4726out_free_cgrp:
4727	kfree(cgrp);
4728out_unlock:
4729	cgroup_kn_unlock(parent_kn);
4730	return ret;
4731
4732out_destroy:
4733	cgroup_destroy_locked(cgrp);
4734	goto out_unlock;
4735}
4736
4737/*
4738 * This is called when the refcnt of a css is confirmed to be killed.
4739 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
4740 * initiate destruction and put the css ref from kill_css().
4741 */
4742static void css_killed_work_fn(struct work_struct *work)
4743{
4744	struct cgroup_subsys_state *css =
4745		container_of(work, struct cgroup_subsys_state, destroy_work);
4746
4747	mutex_lock(&cgroup_mutex);
4748
4749	do {
4750		offline_css(css);
4751		css_put(css);
4752		/* @css can't go away while we're holding cgroup_mutex */
4753		css = css->parent;
4754	} while (css && atomic_dec_and_test(&css->online_cnt));
4755
4756	mutex_unlock(&cgroup_mutex);
4757}
4758
4759/* css kill confirmation processing requires process context, bounce */
4760static void css_killed_ref_fn(struct percpu_ref *ref)
4761{
4762	struct cgroup_subsys_state *css =
4763		container_of(ref, struct cgroup_subsys_state, refcnt);
4764
4765	if (atomic_dec_and_test(&css->online_cnt)) {
4766		INIT_WORK(&css->destroy_work, css_killed_work_fn);
4767		queue_work(cgroup_destroy_wq, &css->destroy_work);
4768	}
4769}
4770
4771/**
4772 * kill_css - destroy a css
4773 * @css: css to destroy
4774 *
4775 * This function initiates destruction of @css by removing cgroup interface
4776 * files and putting its base reference.  ->css_offline() will be invoked
4777 * asynchronously once css_tryget_online() is guaranteed to fail and when
4778 * the reference count reaches zero, @css will be released.
4779 */
4780static void kill_css(struct cgroup_subsys_state *css)
4781{
4782	lockdep_assert_held(&cgroup_mutex);
4783
4784	/*
4785	 * This must happen before css is disassociated with its cgroup.
4786	 * See seq_css() for details.
4787	 */
4788	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4789
4790	/*
4791	 * Killing would put the base ref, but we need to keep it alive
4792	 * until after ->css_offline().
4793	 */
4794	css_get(css);
4795
4796	/*
4797	 * cgroup core guarantees that, by the time ->css_offline() is
4798	 * invoked, no new css reference will be given out via
4799	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
4800	 * proceed to offlining css's because percpu_ref_kill() doesn't
4801	 * guarantee that the ref is seen as killed on all CPUs on return.
4802	 *
4803	 * Use percpu_ref_kill_and_confirm() to get notifications as each
4804	 * css is confirmed to be seen as killed on all CPUs.
4805	 */
4806	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4807}
4808
4809/**
4810 * cgroup_destroy_locked - the first stage of cgroup destruction
4811 * @cgrp: cgroup to be destroyed
4812 *
4813 * css's make use of percpu refcnts whose killing latency shouldn't be
4814 * exposed to userland and are RCU protected.  Also, cgroup core needs to
4815 * guarantee that css_tryget_online() won't succeed by the time
4816 * ->css_offline() is invoked.  To satisfy all the requirements,
4817 * destruction is implemented in the following two steps.
4818 *
4819 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
4820 *     userland visible parts and start killing the percpu refcnts of
4821 *     css's.  Set up so that the next stage will be kicked off once all
4822 *     the percpu refcnts are confirmed to be killed.
4823 *
4824 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4825 *     rest of destruction.  Once all cgroup references are gone, the
4826 *     cgroup is RCU-freed.
4827 *
4828 * This function implements s1.  After this step, @cgrp is gone as far as
4829 * the userland is concerned and a new cgroup with the same name may be
4830 * created.  As cgroup doesn't care about the names internally, this
4831 * doesn't cause any problem.
4832 */
4833static int cgroup_destroy_locked(struct cgroup *cgrp)
4834	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4835{
4836	struct cgroup_subsys_state *css;
4837	bool empty;
4838	int ssid;
4839
4840	lockdep_assert_held(&cgroup_mutex);
4841
4842	/*
4843	 * css_set_rwsem synchronizes access to ->cset_links and prevents
4844	 * @cgrp from being removed while put_css_set() is in progress.
4845	 */
4846	down_read(&css_set_rwsem);
4847	empty = list_empty(&cgrp->cset_links);
4848	up_read(&css_set_rwsem);
4849	if (!empty)
4850		return -EBUSY;
4851
4852	/*
4853	 * Make sure there are no live children.  We can't test emptiness of
4854	 * ->self.children as dead children linger on it while being
4855	 * drained; otherwise, "rmdir parent/child parent" may fail.
4856	 */
4857	if (css_has_online_children(&cgrp->self))
4858		return -EBUSY;
4859
4860	/*
4861	 * Mark @cgrp dead.  This prevents further task migration and child
4862	 * creation by disabling cgroup_lock_live_group().
4863	 */
4864	cgrp->self.flags &= ~CSS_ONLINE;
4865
4866	/* initiate massacre of all css's */
4867	for_each_css(css, ssid, cgrp)
4868		kill_css(css);
4869
4870	/*
4871	 * Remove @cgrp directory along with the base files.  @cgrp has an
4872	 * extra ref on its kn.
4873	 */
4874	kernfs_remove(cgrp->kn);
4875
4876	check_for_release(cgroup_parent(cgrp));
4877
4878	/* put the base reference */
4879	percpu_ref_kill(&cgrp->self.refcnt);
4880
4881	return 0;
4882}
4883
4884static int cgroup_rmdir(struct kernfs_node *kn)
4885{
4886	struct cgroup *cgrp;
4887	int ret = 0;
4888
4889	cgrp = cgroup_kn_lock_live(kn);
4890	if (!cgrp)
4891		return 0;
4892
4893	ret = cgroup_destroy_locked(cgrp);
4894
4895	cgroup_kn_unlock(kn);
4896	return ret;
4897}
4898
4899static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4900	.remount_fs		= cgroup_remount,
4901	.show_options		= cgroup_show_options,
4902	.mkdir			= cgroup_mkdir,
4903	.rmdir			= cgroup_rmdir,
4904	.rename			= cgroup_rename,
4905};
4906
4907static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4908{
4909	struct cgroup_subsys_state *css;
4910
4911	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4912
4913	mutex_lock(&cgroup_mutex);
4914
4915	idr_init(&ss->css_idr);
4916	INIT_LIST_HEAD(&ss->cfts);
4917
4918	/* Create the root cgroup state for this subsystem */
4919	ss->root = &cgrp_dfl_root;
4920	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4921	/* We don't handle early failures gracefully */
4922	BUG_ON(IS_ERR(css));
4923	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4924
4925	/*
4926	 * Root csses are never destroyed and we can't initialize
4927	 * percpu_ref during early init.  Disable refcnting.
4928	 */
4929	css->flags |= CSS_NO_REF;
4930
4931	if (early) {
4932		/* allocation can't be done safely during early init */
4933		css->id = 1;
4934	} else {
4935		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4936		BUG_ON(css->id < 0);
4937	}
4938
4939	/* Update the init_css_set to contain a subsys
4940	 * pointer to this state - since the subsystem is
4941	 * newly registered, all tasks and hence the
4942	 * init_css_set are in the subsystem's root cgroup. */
4943	init_css_set.subsys[ss->id] = css;
4944
4945	need_forkexit_callback |= ss->fork || ss->exit;
4946
4947	/* At system boot, before all subsystems have been
4948	 * registered, no tasks have been forked, so we don't
4949	 * need to invoke fork callbacks here. */
4950	BUG_ON(!list_empty(&init_task.tasks));
4951
4952	BUG_ON(online_css(css));
4953
4954	mutex_unlock(&cgroup_mutex);
4955}
4956
4957/**
4958 * cgroup_init_early - cgroup initialization at system boot
4959 *
4960 * Initialize cgroups at system boot, and initialize any
4961 * subsystems that request early init.
4962 */
4963int __init cgroup_init_early(void)
4964{
4965	static struct cgroup_sb_opts __initdata opts;
4966	struct cgroup_subsys *ss;
4967	int i;
4968
4969	init_cgroup_root(&cgrp_dfl_root, &opts);
4970	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4971
4972	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4973
4974	for_each_subsys(ss, i) {
4975		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4976		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4977		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4978		     ss->id, ss->name);
4979		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4980		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4981
4982		ss->id = i;
4983		ss->name = cgroup_subsys_name[i];
4984
4985		if (ss->early_init)
4986			cgroup_init_subsys(ss, true);
4987	}
4988	return 0;
4989}
4990
4991/**
4992 * cgroup_init - cgroup initialization
4993 *
4994 * Register cgroup filesystem and /proc file, and initialize
4995 * any subsystems that didn't request early init.
4996 */
4997int __init cgroup_init(void)
4998{
4999	struct cgroup_subsys *ss;
5000	unsigned long key;
5001	int ssid, err;
5002
5003	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
5004	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5005
5006	mutex_lock(&cgroup_mutex);
5007
5008	/* Add init_css_set to the hash table */
5009	key = css_set_hash(init_css_set.subsys);
5010	hash_add(css_set_table, &init_css_set.hlist, key);
5011
5012	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5013
5014	mutex_unlock(&cgroup_mutex);
5015
5016	for_each_subsys(ss, ssid) {
5017		if (ss->early_init) {
5018			struct cgroup_subsys_state *css =
5019				init_css_set.subsys[ss->id];
5020
5021			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5022						   GFP_KERNEL);
5023			BUG_ON(css->id < 0);
5024		} else {
5025			cgroup_init_subsys(ss, false);
5026		}
5027
5028		list_add_tail(&init_css_set.e_cset_node[ssid],
5029			      &cgrp_dfl_root.cgrp.e_csets[ssid]);
5030
5031		/*
5032		 * Setting dfl_root subsys_mask needs to consider the
5033		 * disabled flag and cftype registration needs kmalloc,
5034		 * both of which aren't available during early_init.
5035		 */
5036		if (ss->disabled)
5037			continue;
5038
5039		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5040
5041		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
5042			ss->dfl_cftypes = ss->legacy_cftypes;
5043
5044		if (!ss->dfl_cftypes)
5045			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
5046
5047		if (ss->dfl_cftypes == ss->legacy_cftypes) {
5048			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5049		} else {
5050			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5051			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5052		}
5053
5054		if (ss->bind)
5055			ss->bind(init_css_set.subsys[ssid]);
5056	}
5057
5058	err = sysfs_create_mount_point(fs_kobj, "cgroup");
5059	if (err)
5060		return err;
5061
5062	err = register_filesystem(&cgroup_fs_type);
5063	if (err < 0) {
5064		sysfs_remove_mount_point(fs_kobj, "cgroup");
5065		return err;
5066	}
5067
5068	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
5069	return 0;
5070}
5071
5072static int __init cgroup_wq_init(void)
5073{
5074	/*
5075	 * There isn't much point in executing the destruction path in
5076	 * parallel.  A good chunk of it is serialized with cgroup_mutex anyway.
5077	 * Use 1 for @max_active.
5078	 *
5079	 * We would prefer to do this in cgroup_init() above, but that
5080	 * is called before init_workqueues(): so leave this until after.
5081	 */
5082	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5083	BUG_ON(!cgroup_destroy_wq);
5084
5085	/*
5086	 * Used to destroy pidlists and separate to serve as flush domain.
5087	 * Cap @max_active to 1 too.
5088	 */
5089	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
5090						    0, 1);
5091	BUG_ON(!cgroup_pidlist_destroy_wq);
5092
5093	return 0;
5094}
5095core_initcall(cgroup_wq_init);
5096
5097/*
5098 * proc_cgroup_show()
5099 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
5100 *  - Used for /proc/<pid>/cgroup.
5101 */
5102int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5103		     struct pid *pid, struct task_struct *tsk)
5104{
5105	char *buf, *path;
5106	int retval;
5107	struct cgroup_root *root;
5108
5109	retval = -ENOMEM;
5110	buf = kmalloc(PATH_MAX, GFP_KERNEL);
5111	if (!buf)
5112		goto out;
5113
5114	mutex_lock(&cgroup_mutex);
5115	down_read(&css_set_rwsem);
5116
5117	for_each_root(root) {
5118		struct cgroup_subsys *ss;
5119		struct cgroup *cgrp;
5120		int ssid, count = 0;
5121
5122		if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
5123			continue;
5124
5125		seq_printf(m, "%d:", root->hierarchy_id);
5126		for_each_subsys(ss, ssid)
5127			if (root->subsys_mask & (1 << ssid))
5128				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
5129		if (strlen(root->name))
5130			seq_printf(m, "%sname=%s", count ? "," : "",
5131				   root->name);
5132		seq_putc(m, ':');
5133		cgrp = task_cgroup_from_root(tsk, root);
5134		path = cgroup_path(cgrp, buf, PATH_MAX);
5135		if (!path) {
5136			retval = -ENAMETOOLONG;
5137			goto out_unlock;
5138		}
5139		seq_puts(m, path);
5140		seq_putc(m, '\n');
5141	}
5142
5143	retval = 0;
5144out_unlock:
5145	up_read(&css_set_rwsem);
5146	mutex_unlock(&cgroup_mutex);
5147	kfree(buf);
5148out:
5149	return retval;
5150}
5151
5152/* Display information about each subsystem and each hierarchy */
5153static int proc_cgroupstats_show(struct seq_file *m, void *v)
5154{
5155	struct cgroup_subsys *ss;
5156	int i;
5157
5158	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
5159	/*
5160	 * ideally we don't want subsystems moving around while we do this.
5161	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
5162	 * subsys/hierarchy state.
5163	 */
5164	mutex_lock(&cgroup_mutex);
5165
5166	for_each_subsys(ss, i)
5167		seq_printf(m, "%s\t%d\t%d\t%d\n",
5168			   ss->name, ss->root->hierarchy_id,
5169			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
5170
5171	mutex_unlock(&cgroup_mutex);
5172	return 0;
5173}
5174
5175static int cgroupstats_open(struct inode *inode, struct file *file)
5176{
5177	return single_open(file, proc_cgroupstats_show, NULL);
5178}
5179
5180static const struct file_operations proc_cgroupstats_operations = {
5181	.open = cgroupstats_open,
5182	.read = seq_read,
5183	.llseek = seq_lseek,
5184	.release = single_release,
5185};
5186
5187/**
5188 * cgroup_fork - initialize cgroup related fields during copy_process()
5189 * @child: pointer to task_struct of the child process being forked
5190 *
5191 * A task is associated with the init_css_set until cgroup_post_fork()
5192 * attaches it to the parent's css_set.  An empty cg_list indicates that
5193 * @child isn't holding a reference to its css_set.
5194 */
5195void cgroup_fork(struct task_struct *child)
5196{
5197	RCU_INIT_POINTER(child->cgroups, &init_css_set);
5198	INIT_LIST_HEAD(&child->cg_list);
5199}
5200
5201/**
5202 * cgroup_post_fork - called on a new task after adding it to the task list
5203 * @child: the task in question
5204 *
5205 * Adds the task to the list running through its css_set if necessary and
5206 * calls the subsystem fork() callbacks.  This has to happen after the task
5207 * is visible on the task list in case we race with the first call to
5208 * css_task_iter_start() - to guarantee that the new task ends up on its
5209 * list.
5210 */
5211void cgroup_post_fork(struct task_struct *child)
5212{
5213	struct cgroup_subsys *ss;
5214	int i;
5215
5216	/*
5217	 * This may race against cgroup_enable_task_cg_lists().  As that
5218	 * function sets use_task_css_set_links before grabbing
5219	 * tasklist_lock and we just went through tasklist_lock to add
5220	 * @child, it's guaranteed that either we see the set
5221	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5222	 * @child during its iteration.
5223	 *
5224	 * If we won the race, @child is associated with %current's
5225	 * css_set.  Grabbing css_set_rwsem guarantees both that the
5226	 * association is stable, and, on completion of the parent's
5227	 * migration, @child is visible in the source of migration or
5228	 * already in the destination cgroup.  This guarantee is necessary
5229	 * when implementing operations which need to migrate all tasks of
5230	 * a cgroup to another.
5231	 *
5232	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5233	 * will remain in init_css_set.  This is safe because all tasks are
5234	 * in the init_css_set before cg_links is enabled and there's no
5235	 * operation which transfers all tasks out of init_css_set.
5236	 */
5237	if (use_task_css_set_links) {
5238		struct css_set *cset;
5239
5240		down_write(&css_set_rwsem);
5241		cset = task_css_set(current);
5242		if (list_empty(&child->cg_list)) {
5243			rcu_assign_pointer(child->cgroups, cset);
5244			list_add(&child->cg_list, &cset->tasks);
5245			get_css_set(cset);
5246		}
5247		up_write(&css_set_rwsem);
5248	}
5249
5250	/*
5251	 * Call ss->fork().  This must happen after @child is linked on
5252	 * css_set; otherwise, @child might change state between ->fork()
5253	 * and addition to css_set.
5254	 */
5255	if (need_forkexit_callback) {
5256		for_each_subsys(ss, i)
5257			if (ss->fork)
5258				ss->fork(child);
5259	}
5260}
5261
5262/**
5263 * cgroup_exit - detach cgroup from exiting task
5264 * @tsk: pointer to task_struct of exiting process
5265 *
5266 * Description: Detach cgroup from @tsk and release it.
5267 *
5268 * Note that cgroups marked notify_on_release force every task in
5269 * them to take the global cgroup_mutex mutex when exiting.
5270 * This could impact scaling on very large systems.  Be reluctant to
5271 * use notify_on_release cgroups where very high task exit scaling
5272 * is required on large systems.
5273 *
5274 * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
5275 * call cgroup_exit() while the task is still competent to handle
5276 * notify_on_release(), then leave the task attached to the root cgroup in
5277 * each hierarchy for the remainder of its exit.  No need to bother with
5278 * init_css_set refcnting.  init_css_set never goes away and we can't race
5279 * with migration path - PF_EXITING is visible to migration path.
5280 */
5281void cgroup_exit(struct task_struct *tsk)
5282{
5283	struct cgroup_subsys *ss;
5284	struct css_set *cset;
5285	bool put_cset = false;
5286	int i;
5287
5288	/*
5289	 * Unlink @tsk from its css_set.  As the migration path can't race
5290	 * with us, we can check cg_list without grabbing css_set_rwsem.
5291	 */
5292	if (!list_empty(&tsk->cg_list)) {
5293		down_write(&css_set_rwsem);
5294		list_del_init(&tsk->cg_list);
5295		up_write(&css_set_rwsem);
5296		put_cset = true;
5297	}
5298
5299	/* Reassign the task to the init_css_set. */
5300	cset = task_css_set(tsk);
5301	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5302
5303	if (need_forkexit_callback) {
5304		/* see cgroup_post_fork() for details */
5305		for_each_subsys(ss, i) {
5306			if (ss->exit) {
5307				struct cgroup_subsys_state *old_css = cset->subsys[i];
5308				struct cgroup_subsys_state *css = task_css(tsk, i);
5309
5310				ss->exit(css, old_css, tsk);
5311			}
5312		}
5313	}
5314
5315	if (put_cset)
5316		put_css_set(cset);
5317}
5318
5319static void check_for_release(struct cgroup *cgrp)
5320{
5321	if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
5322	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5323		schedule_work(&cgrp->release_agent_work);
5324}
5325
5326/*
5327 * Notify userspace when a cgroup is released, by running the
5328 * configured release agent with the name of the cgroup (path
5329 * relative to the root of cgroup file system) as the argument.
5330 *
5331 * Most likely, this user command will try to rmdir this cgroup.
5332 *
5333 * This races with the possibility that some other task will be
5334 * attached to this cgroup before it is removed, or that some other
5335 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
5336 * The presumed 'rmdir' will fail quietly if this cgroup is in use
5337 * again, and this cgroup will be reprieved from its death sentence,
5338 * to continue to serve a useful existence.  Next time it's released,
5339 * we will get notified again, if it still has 'notify_on_release' set.
5340 *
5341 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5342 * means only wait until the task is successfully execve()'d.  The
5343 * separate release agent task is forked by call_usermodehelper(),
5344 * then control in this thread returns here, without waiting for the
5345 * release agent task.  We don't bother to wait because the caller of
5346 * this routine has no use for the exit status of the release agent
5347 * task, so no sense holding our caller up for that.
5348 */
5349static void cgroup_release_agent(struct work_struct *work)
5350{
5351	struct cgroup *cgrp =
5352		container_of(work, struct cgroup, release_agent_work);
5353	char *pathbuf = NULL, *agentbuf = NULL, *path;
5354	char *argv[3], *envp[3];
5355
5356	mutex_lock(&cgroup_mutex);
5357
5358	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5359	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5360	if (!pathbuf || !agentbuf)
5361		goto out;
5362
5363	path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5364	if (!path)
5365		goto out;
5366
5367	argv[0] = agentbuf;
5368	argv[1] = path;
5369	argv[2] = NULL;
5370
5371	/* minimal command environment */
5372	envp[0] = "HOME=/";
5373	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5374	envp[2] = NULL;
5375
5376	mutex_unlock(&cgroup_mutex);
5377	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5378	goto out_free;
5379out:
5380	mutex_unlock(&cgroup_mutex);
5381out_free:
5382	kfree(agentbuf);
5383	kfree(pathbuf);
5384}
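
/*
 * Illustrative sketch (hypothetical paths): with release_agent_path set to
 * "/usr/local/sbin/cgroup-release" and a just-released cgroup whose path
 * within its hierarchy is "/foo/bar", the helper above is invoked roughly as
 *
 *	argv[] = { "/usr/local/sbin/cgroup-release", "/foo/bar", NULL }
 *	envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }
 *
 * i.e. the agent receives the cgroup's path relative to the hierarchy root
 * as its only argument and is not waited for beyond the execve().
 */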
5385
5386static int __init cgroup_disable(char *str)
5387{
5388	struct cgroup_subsys *ss;
5389	char *token;
5390	int i;
5391
5392	while ((token = strsep(&str, ",")) != NULL) {
5393		if (!*token)
5394			continue;
5395
5396		for_each_subsys(ss, i) {
5397			if (!strcmp(token, ss->name)) {
5398				ss->disabled = 1;
5399				printk(KERN_INFO "Disabling %s control group subsystem\n",
5400				       ss->name);
5401				break;
5402			}
5403		}
5404	}
5405	return 1;
5406}
5407__setup("cgroup_disable=", cgroup_disable);
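
/*
 * Example (illustrative): booting with "cgroup_disable=memory,cpuset"
 * marks both of those controllers disabled before any hierarchy can be
 * mounted; unknown names in the list are silently ignored.
 */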
5408
5409static int __init cgroup_set_legacy_files_on_dfl(char *str)
5410{
5411	printk(KERN_INFO "cgroup: using legacy files on the default hierarchy\n");
5412	cgroup_legacy_files_on_dfl = true;
5413	return 0;
5414}
5415__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

	/* is @dentry a cgroup dir? */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(kn->priv);
	if (cgrp)
		css = cgroup_css(cgrp, ss);

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}
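
/*
 * A typical caller resolves a cgroup directory file descriptor handed in
 * from userspace and pins the css for its own subsystem.  The sketch
 * below is illustrative only; error handling is trimmed and
 * "example_cgrp_subsys" stands in for whichever subsystem the caller
 * belongs to:
 *
 *	struct fd f = fdget(fd);
 *	struct cgroup_subsys_state *css;
 *
 *	if (!f.file)
 *		return -EBADF;
 *	css = css_tryget_online_from_dir(f.file->f_path.dentry,
 *					 &example_cgrp_subsys);
 *	fdput(f);
 *	if (IS_ERR(css))
 *		return PTR_ERR(css);
 *	// ... use css ...
 *	css_put(css);
 */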

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Returns the css if there's a valid one with @id, otherwise returns NULL.
 * Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
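
/*
 * The returned css is only guaranteed to stay around for the current RCU
 * read-side critical section, so callers that need it afterwards must
 * pin it themselves.  Illustrative sketch (the subsystem chosen here is
 * arbitrary):
 *
 *	struct cgroup_subsys_state *css;
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 *	// if non-NULL, css is now pinned and must be released with css_put()
 */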

#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return cgroup_task_count(css->cgroup);
}

static u64 current_css_set_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&task_css_set(current)->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;
	char *name_buf;

	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name_buf)
		return -ENOMEM;

	down_read(&css_set_rwsem);
	rcu_read_lock();
	cset = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		cgroup_name(c, name_buf, NAME_MAX + 1);
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name_buf);
	}
	rcu_read_unlock();
	up_read(&css_set_rwsem);
	kfree(name_buf);
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
	struct cgroup_subsys_state *css = seq_css(seq);
	struct cgrp_cset_link *link;

	down_read(&css_set_rwsem);
	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
		struct css_set *cset = link->cset;
		struct task_struct *task;
		int count = 0;

		seq_printf(seq, "css_set %p\n", cset);

		list_for_each_entry(task, &cset->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}

		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				goto overflow;
			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
		}
		continue;
	overflow:
		seq_puts(seq, "  ...\n");
	}
	up_read(&css_set_rwsem);
	return 0;
}

static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return (!cgroup_has_tasks(css->cgroup) &&
		!css_has_online_children(&css->cgroup->self));
}

static struct cftype debug_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.seq_show = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.seq_show = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }	/* terminate */
};

struct cgroup_subsys debug_cgrp_subsys = {
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.legacy_cftypes = debug_files,
};
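
/*
 * Usage note: since these are legacy_cftypes, the files only show up on
 * legacy hierarchies that mount the "debug" controller, where they
 * normally appear with the subsystem prefix, e.g. "debug.taskcount" and
 * "debug.cgroup_css_links".
 */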
#endif /* CONFIG_CGROUP_DEBUG */
