1/*
2 * kernel/workqueue.c - generic async execution with shared worker pool
3 *
4 * Copyright (C) 2002		Ingo Molnar
5 *
6 *   Derived from the taskqueue/keventd code by:
7 *     David Woodhouse <dwmw2@infradead.org>
8 *     Andrew Morton
9 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 *     Theodore Ts'o <tytso@mit.edu>
11 *
12 * Made to use alloc_percpu by Christoph Lameter.
13 *
14 * Copyright (C) 2010		SUSE Linux Products GmbH
15 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
16 *
17 * This is the generic async execution mechanism.  Work items are
18 * executed in process context.  The worker pool is shared and
19 * automatically managed.  There are two worker pools for each CPU (one for
20 * normal work items and the other for high priority ones) and some extra
21 * pools for workqueues which are not bound to any specific CPU - the
22 * number of these backing pools is dynamic.
23 *
24 * Please read Documentation/workqueue.txt for details.
25 */
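/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * minimal client-side usage of the API implemented below.  The names
 * example_work, example_work_fn and example_submit are hypothetical; the
 * block is kept under #if 0 and relies on the headers included below.
 */
#if 0
static void example_work_fn(struct work_struct *work)
{
	pr_info("workqueue: example work item ran in process context\n");
}

static DECLARE_WORK(example_work, example_work_fn);

static void example_submit(void)
{
	/* hand the item to the shared per-cpu pool via system_wq */
	schedule_work(&example_work);
}
#endif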
26
27#include <linux/export.h>
28#include <linux/kernel.h>
29#include <linux/sched.h>
30#include <linux/init.h>
31#include <linux/signal.h>
32#include <linux/completion.h>
33#include <linux/workqueue.h>
34#include <linux/slab.h>
35#include <linux/cpu.h>
36#include <linux/notifier.h>
37#include <linux/kthread.h>
38#include <linux/hardirq.h>
39#include <linux/mempolicy.h>
40#include <linux/freezer.h>
41#include <linux/kallsyms.h>
42#include <linux/debug_locks.h>
43#include <linux/lockdep.h>
44#include <linux/idr.h>
45#include <linux/jhash.h>
46#include <linux/hashtable.h>
47#include <linux/rculist.h>
48#include <linux/nodemask.h>
49#include <linux/moduleparam.h>
50#include <linux/uaccess.h>
51
52#include "workqueue_internal.h"
53
54enum {
55	/*
56	 * worker_pool flags
57	 *
58	 * A bound pool is either associated with or disassociated from its CPU.
59	 * While associated (!DISASSOCIATED), all workers are bound to the
60	 * CPU and none has %WORKER_UNBOUND set and concurrency management
61	 * is in effect.
62	 *
63	 * While DISASSOCIATED, the cpu may be offline and all workers have
64	 * %WORKER_UNBOUND set and concurrency management disabled, and may
65	 * be executing on any CPU.  The pool behaves as an unbound one.
66	 *
67	 * Note that DISASSOCIATED should be flipped only while holding
68	 * attach_mutex to avoid changing binding state while
69	 * worker_attach_to_pool() is in progress.
70	 */
71	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
72
73	/* worker flags */
74	WORKER_DIE		= 1 << 1,	/* die die die */
75	WORKER_IDLE		= 1 << 2,	/* is idle */
76	WORKER_PREP		= 1 << 3,	/* preparing to run works */
77	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
78	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
79	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */
80
81	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
82				  WORKER_UNBOUND | WORKER_REBOUND,
83
84	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */
85
86	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
87	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
88
89	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
90	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
91
92	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
93						/* call for help after 10ms
94						   (min two ticks) */
95	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
96	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */
97
98	/*
99	 * Rescue workers are used only in emergencies and are shared by
100	 * all cpus.  Give them MIN_NICE.
101	 */
102	RESCUER_NICE_LEVEL	= MIN_NICE,
103	HIGHPRI_NICE_LEVEL	= MIN_NICE,
104
105	WQ_NAME_LEN		= 24,
106};
107
108/*
109 * Structure fields follow one of the following exclusion rules.
110 *
111 * I: Modifiable by initialization/destruction paths and read-only for
112 *    everyone else.
113 *
114 * P: Preemption protected.  Disabling preemption is enough; the field
115 *    should only be modified and accessed from the local cpu.
116 *
117 * L: pool->lock protected.  Access with pool->lock held.
118 *
119 * X: During normal operation, modification requires pool->lock and should
120 *    be done only from local cpu.  Either disabling preemption on local
121 *    cpu or grabbing pool->lock is enough for read access.  If
122 *    POOL_DISASSOCIATED is set, it's identical to L.
123 *
124 * A: pool->attach_mutex protected.
125 *
126 * PL: wq_pool_mutex protected.
127 *
128 * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
129 *
130 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
131 *
132 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
133 *      sched-RCU for reads.
134 *
135 * WQ: wq->mutex protected.
136 *
137 * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
138 *
139 * MD: wq_mayday_lock protected.
140 */
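/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a PR/WR annotated list is typically read.  Holding the sched-RCU
 * read lock is enough for traversal; wq_pool_mutex is needed only to
 * modify the list.  example_dump_workqueue_names() is hypothetical and
 * refers to the workqueues list declared further below.
 */
#if 0
static void example_dump_workqueue_names(void)
{
	struct workqueue_struct *wq;

	rcu_read_lock_sched();
	list_for_each_entry_rcu(wq, &workqueues, list)
		pr_info("workqueue: %s\n", wq->name);
	rcu_read_unlock_sched();
}
#endif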
141
142/* struct worker is defined in workqueue_internal.h */
143
144struct worker_pool {
145	spinlock_t		lock;		/* the pool lock */
146	int			cpu;		/* I: the associated cpu */
147	int			node;		/* I: the associated node ID */
148	int			id;		/* I: pool ID */
149	unsigned int		flags;		/* X: flags */
150
151	struct list_head	worklist;	/* L: list of pending works */
152	int			nr_workers;	/* L: total number of workers */
153
154	/* nr_idle includes the ones off idle_list for rebinding */
155	int			nr_idle;	/* L: currently idle ones */
156
157	struct list_head	idle_list;	/* X: list of idle workers */
158	struct timer_list	idle_timer;	/* L: worker idle timeout */
159	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
160
161	/* a worker is either on busy_hash or idle_list, or is the manager */
162	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
163						/* L: hash of busy workers */
164
165	/* see manage_workers() for details on the two manager mutexes */
166	struct mutex		manager_arb;	/* manager arbitration */
167	struct worker		*manager;	/* L: purely informational */
168	struct mutex		attach_mutex;	/* attach/detach exclusion */
169	struct list_head	workers;	/* A: attached workers */
170	struct completion	*detach_completion; /* all workers detached */
171
172	struct ida		worker_ida;	/* worker IDs for task name */
173
174	struct workqueue_attrs	*attrs;		/* I: worker attributes */
175	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
176	int			refcnt;		/* PL: refcnt for unbound pools */
177
178	/*
179	 * The current concurrency level.  As it's likely to be accessed
180	 * from other CPUs during try_to_wake_up(), put it in a separate
181	 * cacheline.
182	 */
183	atomic_t		nr_running ____cacheline_aligned_in_smp;
184
185	/*
186	 * Destruction of pool is sched-RCU protected to allow dereferences
187	 * from get_work_pool().
188	 */
189	struct rcu_head		rcu;
190} ____cacheline_aligned_in_smp;
191
192/*
193 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
194 * of work_struct->data are used for flags and the remaining high bits
195 * point to the pwq; thus, pwqs need to be aligned at two's power of the
196 * number of flag bits.
197 */
198struct pool_workqueue {
199	struct worker_pool	*pool;		/* I: the associated pool */
200	struct workqueue_struct *wq;		/* I: the owning workqueue */
201	int			work_color;	/* L: current color */
202	int			flush_color;	/* L: flushing color */
203	int			refcnt;		/* L: reference count */
204	int			nr_in_flight[WORK_NR_COLORS];
205						/* L: nr of in_flight works */
206	int			nr_active;	/* L: nr of active works */
207	int			max_active;	/* L: max active works */
208	struct list_head	delayed_works;	/* L: delayed works */
209	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
210	struct list_head	mayday_node;	/* MD: node on wq->maydays */
211
212	/*
213	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
214	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
215	 * itself is also sched-RCU protected so that the first pwq can be
216	 * determined without grabbing wq->mutex.
217	 */
218	struct work_struct	unbound_release_work;
219	struct rcu_head		rcu;
220} __aligned(1 << WORK_STRUCT_FLAG_BITS);
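/*
 * Worked example (editorial addition, not part of the original file): with
 * WORK_STRUCT_FLAG_BITS == 8 (the exact value is configuration dependent),
 * the __aligned() above places every pool_workqueue on a 256-byte boundary,
 * so the low 8 bits of its address are always zero and the pwq pointer and
 * the flag bits can share the single work->data word:
 *
 *	data = (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ;
 *	pwq  = (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
 */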
221
222/*
223 * Structure used to wait for workqueue flush.
224 */
225struct wq_flusher {
226	struct list_head	list;		/* WQ: list of flushers */
227	int			flush_color;	/* WQ: flush color waiting for */
228	struct completion	done;		/* flush completion */
229};
230
231struct wq_device;
232
233/*
234 * The externally visible workqueue.  It relays the issued work items to
235 * the appropriate worker_pool through its pool_workqueues.
236 */
237struct workqueue_struct {
238	struct list_head	pwqs;		/* WR: all pwqs of this wq */
239	struct list_head	list;		/* PR: list of all workqueues */
240
241	struct mutex		mutex;		/* protects this wq */
242	int			work_color;	/* WQ: current work color */
243	int			flush_color;	/* WQ: current flush color */
244	atomic_t		nr_pwqs_to_flush; /* flush in progress */
245	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
246	struct list_head	flusher_queue;	/* WQ: flush waiters */
247	struct list_head	flusher_overflow; /* WQ: flush overflow list */
248
249	struct list_head	maydays;	/* MD: pwqs requesting rescue */
250	struct worker		*rescuer;	/* I: rescue worker */
251
252	int			nr_drainers;	/* WQ: drain in progress */
253	int			saved_max_active; /* WQ: saved pwq max_active */
254
255	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
256	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */
257
258#ifdef CONFIG_SYSFS
259	struct wq_device	*wq_dev;	/* I: for sysfs interface */
260#endif
261#ifdef CONFIG_LOCKDEP
262	struct lockdep_map	lockdep_map;
263#endif
264	char			name[WQ_NAME_LEN]; /* I: workqueue name */
265
266	/*
267	 * Destruction of workqueue_struct is sched-RCU protected to allow
268	 * walking the workqueues list without grabbing wq_pool_mutex.
269	 * This is used to dump all workqueues from sysrq.
270	 */
271	struct rcu_head		rcu;
272
273	/* hot fields used during command issue, aligned to cacheline */
274	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
275	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
276	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
277};
278
279static struct kmem_cache *pwq_cache;
280
281static cpumask_var_t *wq_numa_possible_cpumask;
282					/* possible CPUs of each node */
283
284static bool wq_disable_numa;
285module_param_named(disable_numa, wq_disable_numa, bool, 0444);
286
287/* see the comment above the definition of WQ_POWER_EFFICIENT */
288#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
289static bool wq_power_efficient = true;
290#else
291static bool wq_power_efficient;
292#endif
293
294module_param_named(power_efficient, wq_power_efficient, bool, 0444);
295
296static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
297
298/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
299static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
300
301static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
302static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
303
304static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
305static bool workqueue_freezing;		/* PL: have wqs started freezing? */
306
307/* the per-cpu worker pools */
308static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
309				     cpu_worker_pools);
310
311static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */
312
313/* PL: hash of all unbound pools keyed by pool->attrs */
314static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
315
316/* I: attributes used when instantiating standard unbound pools on demand */
317static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
318
319/* I: attributes used when instantiating ordered pools on demand */
320static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
321
322struct workqueue_struct *system_wq __read_mostly;
323EXPORT_SYMBOL(system_wq);
324struct workqueue_struct *system_highpri_wq __read_mostly;
325EXPORT_SYMBOL_GPL(system_highpri_wq);
326struct workqueue_struct *system_long_wq __read_mostly;
327EXPORT_SYMBOL_GPL(system_long_wq);
328struct workqueue_struct *system_unbound_wq __read_mostly;
329EXPORT_SYMBOL_GPL(system_unbound_wq);
330struct workqueue_struct *system_freezable_wq __read_mostly;
331EXPORT_SYMBOL_GPL(system_freezable_wq);
332struct workqueue_struct *system_power_efficient_wq __read_mostly;
333EXPORT_SYMBOL_GPL(system_power_efficient_wq);
334struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
335EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
336
337static int worker_thread(void *__worker);
338static void copy_workqueue_attrs(struct workqueue_attrs *to,
339				 const struct workqueue_attrs *from);
340static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
341
342#define CREATE_TRACE_POINTS
343#include <trace/events/workqueue.h>
344
345#define assert_rcu_or_pool_mutex()					\
346	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
347			   lockdep_is_held(&wq_pool_mutex),		\
348			   "sched RCU or wq_pool_mutex should be held")
349
350#define assert_rcu_or_wq_mutex(wq)					\
351	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
352			   lockdep_is_held(&wq->mutex),			\
353			   "sched RCU or wq->mutex should be held")
354
355#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
356	rcu_lockdep_assert(rcu_read_lock_sched_held() ||		\
357			   lockdep_is_held(&wq->mutex) ||		\
358			   lockdep_is_held(&wq_pool_mutex),		\
359			   "sched RCU, wq->mutex or wq_pool_mutex should be held")
360
361#define for_each_cpu_worker_pool(pool, cpu)				\
362	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
363	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
364	     (pool)++)
365
366/**
367 * for_each_pool - iterate through all worker_pools in the system
368 * @pool: iteration cursor
369 * @pi: integer used for iteration
370 *
371 * This must be called either with wq_pool_mutex held or sched RCU read
372 * locked.  If the pool needs to be used beyond the locking in effect, the
373 * caller is responsible for guaranteeing that the pool stays online.
374 *
375 * The if/else clause exists only for the lockdep assertion and can be
376 * ignored.
377 */
378#define for_each_pool(pool, pi)						\
379	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
380		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
381		else
382
383/**
384 * for_each_pool_worker - iterate through all workers of a worker_pool
385 * @worker: iteration cursor
386 * @pool: worker_pool to iterate workers of
387 *
388 * This must be called with @pool->attach_mutex held.
389 *
390 * The if/else clause exists only for the lockdep assertion and can be
391 * ignored.
392 */
393#define for_each_pool_worker(worker, pool)				\
394	list_for_each_entry((worker), &(pool)->workers, node)		\
395		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
396		else
397
398/**
399 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
400 * @pwq: iteration cursor
401 * @wq: the target workqueue
402 *
403 * This must be called either with wq->mutex held or sched RCU read locked.
404 * If the pwq needs to be used beyond the locking in effect, the caller is
405 * responsible for guaranteeing that the pwq stays online.
406 *
407 * The if/else clause exists only for the lockdep assertion and can be
408 * ignored.
409 */
410#define for_each_pwq(pwq, wq)						\
411	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)		\
412		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
413		else
414
415#ifdef CONFIG_DEBUG_OBJECTS_WORK
416
417static struct debug_obj_descr work_debug_descr;
418
419static void *work_debug_hint(void *addr)
420{
421	return ((struct work_struct *) addr)->func;
422}
423
424/*
425 * fixup_init is called when:
426 * - an active object is initialized
427 */
428static int work_fixup_init(void *addr, enum debug_obj_state state)
429{
430	struct work_struct *work = addr;
431
432	switch (state) {
433	case ODEBUG_STATE_ACTIVE:
434		cancel_work_sync(work);
435		debug_object_init(work, &work_debug_descr);
436		return 1;
437	default:
438		return 0;
439	}
440}
441
442/*
443 * fixup_activate is called when:
444 * - an active object is activated
445 * - an unknown object is activated (might be a statically initialized object)
446 */
447static int work_fixup_activate(void *addr, enum debug_obj_state state)
448{
449	struct work_struct *work = addr;
450
451	switch (state) {
452
453	case ODEBUG_STATE_NOTAVAILABLE:
454		/*
455		 * This is not really a fixup. The work struct was
456		 * statically initialized. We just make sure that it
457		 * is tracked in the object tracker.
458		 */
459		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
460			debug_object_init(work, &work_debug_descr);
461			debug_object_activate(work, &work_debug_descr);
462			return 0;
463		}
464		WARN_ON_ONCE(1);
465		return 0;
466
467	case ODEBUG_STATE_ACTIVE:
468		WARN_ON(1);
469
470	default:
471		return 0;
472	}
473}
474
475/*
476 * fixup_free is called when:
477 * - an active object is freed
478 */
479static int work_fixup_free(void *addr, enum debug_obj_state state)
480{
481	struct work_struct *work = addr;
482
483	switch (state) {
484	case ODEBUG_STATE_ACTIVE:
485		cancel_work_sync(work);
486		debug_object_free(work, &work_debug_descr);
487		return 1;
488	default:
489		return 0;
490	}
491}
492
493static struct debug_obj_descr work_debug_descr = {
494	.name		= "work_struct",
495	.debug_hint	= work_debug_hint,
496	.fixup_init	= work_fixup_init,
497	.fixup_activate	= work_fixup_activate,
498	.fixup_free	= work_fixup_free,
499};
500
501static inline void debug_work_activate(struct work_struct *work)
502{
503	debug_object_activate(work, &work_debug_descr);
504}
505
506static inline void debug_work_deactivate(struct work_struct *work)
507{
508	debug_object_deactivate(work, &work_debug_descr);
509}
510
511void __init_work(struct work_struct *work, int onstack)
512{
513	if (onstack)
514		debug_object_init_on_stack(work, &work_debug_descr);
515	else
516		debug_object_init(work, &work_debug_descr);
517}
518EXPORT_SYMBOL_GPL(__init_work);
519
520void destroy_work_on_stack(struct work_struct *work)
521{
522	debug_object_free(work, &work_debug_descr);
523}
524EXPORT_SYMBOL_GPL(destroy_work_on_stack);
525
526void destroy_delayed_work_on_stack(struct delayed_work *work)
527{
528	destroy_timer_on_stack(&work->timer);
529	debug_object_free(&work->work, &work_debug_descr);
530}
531EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
532
533#else
534static inline void debug_work_activate(struct work_struct *work) { }
535static inline void debug_work_deactivate(struct work_struct *work) { }
536#endif
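/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * typical lifetime of an on-stack work item when CONFIG_DEBUG_OBJECTS_WORK
 * is enabled.  example_onstack_fn() and example_run_onstack() are
 * hypothetical.
 */
#if 0
static void example_onstack_fn(struct work_struct *work)
{
	pr_info("workqueue: on-stack work executed\n");
}

static void example_run_onstack(void)
{
	struct work_struct work;

	INIT_WORK_ONSTACK(&work, example_onstack_fn);
	schedule_work(&work);
	/* the item must finish before the stack frame goes away */
	flush_work(&work);
	/* pairs with INIT_WORK_ONSTACK() for the object debugging code */
	destroy_work_on_stack(&work);
}
#endif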
537
538/**
539 * worker_pool_assign_id - allocate ID and assign it to @pool
540 * @pool: the pool pointer of interest
541 *
542 * Return: 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
543 * successfully, -errno on failure.
544 */
545static int worker_pool_assign_id(struct worker_pool *pool)
546{
547	int ret;
548
549	lockdep_assert_held(&wq_pool_mutex);
550
551	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
552			GFP_KERNEL);
553	if (ret >= 0) {
554		pool->id = ret;
555		return 0;
556	}
557	return ret;
558}
559
560/**
561 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
562 * @wq: the target workqueue
563 * @node: the node ID
564 *
565 * This must be called with any of wq_pool_mutex, wq->mutex or the sched
566 * RCU read lock held.
567 * If the pwq needs to be used beyond the locking in effect, the caller is
568 * responsible for guaranteeing that the pwq stays online.
569 *
570 * Return: The unbound pool_workqueue for @node.
571 */
572static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
573						  int node)
574{
575	assert_rcu_or_wq_mutex_or_pool_mutex(wq);
576
577	/*
578	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
579	 * delayed item is pending.  The plan is to keep CPU -> NODE
580	 * mapping valid and stable across CPU on/offlines.  Once that
581	 * happens, this workaround can be removed.
582	 */
583	if (unlikely(node == NUMA_NO_NODE))
584		return wq->dfl_pwq;
585
586	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
587}
588
589static unsigned int work_color_to_flags(int color)
590{
591	return color << WORK_STRUCT_COLOR_SHIFT;
592}
593
594static int get_work_color(struct work_struct *work)
595{
596	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
597		((1 << WORK_STRUCT_COLOR_BITS) - 1);
598}
599
600static int work_next_color(int color)
601{
602	return (color + 1) % WORK_NR_COLORS;
603}
604
605/*
606 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
607 * contain the pointer to the queued pwq.  Once execution starts, the flag
608 * is cleared and the high bits contain OFFQ flags and pool ID.
609 *
610 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
611 * and clear_work_data() can be used to set the pwq, pool or clear
612 * work->data.  These functions should only be called while the work is
613 * owned - ie. while the PENDING bit is set.
614 *
615 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
616 * corresponding to a work.  Pool is available once the work has been
617 * queued anywhere after initialization until it is sync canceled.  pwq is
618 * available only while the work item is queued.
619 *
620 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
621 * canceled.  While being canceled, a work item may have its PENDING set
622 * but stay off timer and worklist for arbitrarily long and nobody should
623 * try to steal the PENDING bit.
624 */
625static inline void set_work_data(struct work_struct *work, unsigned long data,
626				 unsigned long flags)
627{
628	WARN_ON_ONCE(!work_pending(work));
629	atomic_long_set(&work->data, data | flags | work_static(work));
630}
631
632static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
633			 unsigned long extra_flags)
634{
635	set_work_data(work, (unsigned long)pwq,
636		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
637}
638
639static void set_work_pool_and_keep_pending(struct work_struct *work,
640					   int pool_id)
641{
642	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
643		      WORK_STRUCT_PENDING);
644}
645
646static void set_work_pool_and_clear_pending(struct work_struct *work,
647					    int pool_id)
648{
649	/*
650	 * The following wmb is paired with the implied mb in
651	 * test_and_set_bit(PENDING) and ensures all updates to @work made
652	 * here are visible to and precede any updates by the next PENDING
653	 * owner.
654	 */
655	smp_wmb();
656	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
657	/*
658	 * The following mb guarantees that previous clear of a PENDING bit
659	 * will not be reordered with any speculative LOADS or STORES from
660	 * work->current_func, which is executed afterwards.  This possible
661	 * reordering can lead to a missed execution on an attempt to queue
662	 * the same @work.  E.g. consider this case:
663	 *
664	 *   CPU#0                         CPU#1
665	 *   ----------------------------  --------------------------------
666	 *
667	 * 1  STORE event_indicated
668	 * 2  queue_work_on() {
669	 * 3    test_and_set_bit(PENDING)
670	 * 4 }                             set_..._and_clear_pending() {
671	 * 5                                 set_work_data() # clear bit
672	 * 6                                 smp_mb()
673	 * 7                               work->current_func() {
674	 * 8				      LOAD event_indicated
675	 *				   }
676	 *
677	 * Without an explicit full barrier speculative LOAD on line 8 can
678	 * be executed before CPU#0 does STORE on line 1.  If that happens,
679	 * CPU#0 observes the PENDING bit is still set and new execution of
680	 * a @work is not queued in the hope that CPU#1 will eventually
681	 * finish the queued @work.  Meanwhile CPU#1 does not see
682	 * event_indicated is set, because speculative LOAD was executed
683	 * before actual STORE.
684	 */
685	smp_mb();
686}
687
688static void clear_work_data(struct work_struct *work)
689{
690	smp_wmb();	/* see set_work_pool_and_clear_pending() */
691	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
692}
693
694static struct pool_workqueue *get_work_pwq(struct work_struct *work)
695{
696	unsigned long data = atomic_long_read(&work->data);
697
698	if (data & WORK_STRUCT_PWQ)
699		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
700	else
701		return NULL;
702}
703
704/**
705 * get_work_pool - return the worker_pool a given work was associated with
706 * @work: the work item of interest
707 *
708 * Pools are created and destroyed under wq_pool_mutex, and allow read
709 * access under sched-RCU read lock.  As such, this function should be
710 * called under wq_pool_mutex or with preemption disabled.
711 *
712 * All fields of the returned pool are accessible as long as the above
713 * mentioned locking is in effect.  If the returned pool needs to be used
714 * beyond the critical section, the caller is responsible for ensuring the
715 * returned pool is and stays online.
716 *
717 * Return: The worker_pool @work was last associated with.  %NULL if none.
718 */
719static struct worker_pool *get_work_pool(struct work_struct *work)
720{
721	unsigned long data = atomic_long_read(&work->data);
722	int pool_id;
723
724	assert_rcu_or_pool_mutex();
725
726	if (data & WORK_STRUCT_PWQ)
727		return ((struct pool_workqueue *)
728			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;
729
730	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
731	if (pool_id == WORK_OFFQ_POOL_NONE)
732		return NULL;
733
734	return idr_find(&worker_pool_idr, pool_id);
735}
736
737/**
738 * get_work_pool_id - return the worker pool ID a given work is associated with
739 * @work: the work item of interest
740 *
741 * Return: The worker_pool ID @work was last associated with.
742 * %WORK_OFFQ_POOL_NONE if none.
743 */
744static int get_work_pool_id(struct work_struct *work)
745{
746	unsigned long data = atomic_long_read(&work->data);
747
748	if (data & WORK_STRUCT_PWQ)
749		return ((struct pool_workqueue *)
750			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
751
752	return data >> WORK_OFFQ_POOL_SHIFT;
753}
754
755static void mark_work_canceling(struct work_struct *work)
756{
757	unsigned long pool_id = get_work_pool_id(work);
758
759	pool_id <<= WORK_OFFQ_POOL_SHIFT;
760	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
761}
762
763static bool work_is_canceling(struct work_struct *work)
764{
765	unsigned long data = atomic_long_read(&work->data);
766
767	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
768}
769
770/*
771 * Policy functions.  These define the policies on how the global worker
772 * pools are managed.  Unless noted otherwise, these functions assume that
773 * they're being called with pool->lock held.
774 */
775
776static bool __need_more_worker(struct worker_pool *pool)
777{
778	return !atomic_read(&pool->nr_running);
779}
780
781/*
782 * Need to wake up a worker?  Called from anything but currently
783 * running workers.
784 *
785 * Note that, because unbound workers never contribute to nr_running, this
786 * function will always return %true for unbound pools as long as the
787 * worklist isn't empty.
788 */
789static bool need_more_worker(struct worker_pool *pool)
790{
791	return !list_empty(&pool->worklist) && __need_more_worker(pool);
792}
793
794/* Can I start working?  Called from busy but !running workers. */
795static bool may_start_working(struct worker_pool *pool)
796{
797	return pool->nr_idle;
798}
799
800/* Do I need to keep working?  Called from currently running workers. */
801static bool keep_working(struct worker_pool *pool)
802{
803	return !list_empty(&pool->worklist) &&
804		atomic_read(&pool->nr_running) <= 1;
805}
806
807/* Do we need a new worker?  Called from manager. */
808static bool need_to_create_worker(struct worker_pool *pool)
809{
810	return need_more_worker(pool) && !may_start_working(pool);
811}
812
813/* Do we have too many workers and should some go away? */
814static bool too_many_workers(struct worker_pool *pool)
815{
816	bool managing = mutex_is_locked(&pool->manager_arb);
817	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
818	int nr_busy = pool->nr_workers - nr_idle;
819
820	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
821}
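/*
 * Worked example (editorial addition, not part of the original file): with
 * MAX_IDLE_WORKERS_RATIO == 4, a pool with 32 busy workers may keep at most
 * 9 idle workers: once nr_idle reaches 10, (10 - 2) * 4 >= 32 holds and
 * too_many_workers() returns %true, which lets the idle timer start
 * reaping the surplus.
 */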
822
823/*
824 * Wake up functions.
825 */
826
827/* Return the first idle worker.  Safe with preemption disabled */
828static struct worker *first_idle_worker(struct worker_pool *pool)
829{
830	if (unlikely(list_empty(&pool->idle_list)))
831		return NULL;
832
833	return list_first_entry(&pool->idle_list, struct worker, entry);
834}
835
836/**
837 * wake_up_worker - wake up an idle worker
838 * @pool: worker pool to wake worker from
839 *
840 * Wake up the first idle worker of @pool.
841 *
842 * CONTEXT:
843 * spin_lock_irq(pool->lock).
844 */
845static void wake_up_worker(struct worker_pool *pool)
846{
847	struct worker *worker = first_idle_worker(pool);
848
849	if (likely(worker))
850		wake_up_process(worker->task);
851}
852
853/**
854 * wq_worker_waking_up - a worker is waking up
855 * @task: task waking up
856 * @cpu: CPU @task is waking up to
857 *
858 * This function is called during try_to_wake_up() when a worker is
859 * being awoken.
860 *
861 * CONTEXT:
862 * spin_lock_irq(rq->lock)
863 */
864void wq_worker_waking_up(struct task_struct *task, int cpu)
865{
866	struct worker *worker = kthread_data(task);
867
868	if (!(worker->flags & WORKER_NOT_RUNNING)) {
869		WARN_ON_ONCE(worker->pool->cpu != cpu);
870		atomic_inc(&worker->pool->nr_running);
871	}
872}
873
874/**
875 * wq_worker_sleeping - a worker is going to sleep
876 * @task: task going to sleep
877 * @cpu: CPU in question, must be the current CPU number
878 *
879 * This function is called during schedule() when a busy worker is
880 * going to sleep.  A worker on the same cpu can be woken up by
881 * returning a pointer to its task.
882 *
883 * CONTEXT:
884 * spin_lock_irq(rq->lock)
885 *
886 * Return:
887 * Worker task on @cpu to wake up, %NULL if none.
888 */
889struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
890{
891	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
892	struct worker_pool *pool;
893
894	/*
895	 * Rescuers, which may not have all the fields set up like normal
896	 * workers, also reach here; let's not access anything before
897	 * checking NOT_RUNNING.
898	 */
899	if (worker->flags & WORKER_NOT_RUNNING)
900		return NULL;
901
902	pool = worker->pool;
903
904	/* this can only happen on the local cpu */
905	if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
906		return NULL;
907
908	/*
909	 * The counterpart of the following dec_and_test, implied mb,
910	 * worklist not empty test sequence is in insert_work().
911	 * Please read comment there.
912	 *
913	 * NOT_RUNNING is clear.  This means that we're bound to and
914	 * running on the local cpu w/ rq lock held and preemption
915	 * disabled, which in turn means that no one else could be
916	 * manipulating idle_list, so dereferencing idle_list without pool
917	 * lock is safe.
918	 */
919	if (atomic_dec_and_test(&pool->nr_running) &&
920	    !list_empty(&pool->worklist))
921		to_wakeup = first_idle_worker(pool);
922	return to_wakeup ? to_wakeup->task : NULL;
923}
924
925/**
926 * worker_set_flags - set worker flags and adjust nr_running accordingly
927 * @worker: self
928 * @flags: flags to set
929 *
930 * Set @flags in @worker->flags and adjust nr_running accordingly.
931 *
932 * CONTEXT:
933 * spin_lock_irq(pool->lock)
934 */
935static inline void worker_set_flags(struct worker *worker, unsigned int flags)
936{
937	struct worker_pool *pool = worker->pool;
938
939	WARN_ON_ONCE(worker->task != current);
940
941	/* If transitioning into NOT_RUNNING, adjust nr_running. */
942	if ((flags & WORKER_NOT_RUNNING) &&
943	    !(worker->flags & WORKER_NOT_RUNNING)) {
944		atomic_dec(&pool->nr_running);
945	}
946
947	worker->flags |= flags;
948}
949
950/**
951 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
952 * @worker: self
953 * @flags: flags to clear
954 *
955 * Clear @flags in @worker->flags and adjust nr_running accordingly.
956 *
957 * CONTEXT:
958 * spin_lock_irq(pool->lock)
959 */
960static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
961{
962	struct worker_pool *pool = worker->pool;
963	unsigned int oflags = worker->flags;
964
965	WARN_ON_ONCE(worker->task != current);
966
967	worker->flags &= ~flags;
968
969	/*
970	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
971	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
972	 * of multiple flags, not a single flag.
973	 */
974	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
975		if (!(worker->flags & WORKER_NOT_RUNNING))
976			atomic_inc(&pool->nr_running);
977}
978
979/**
980 * find_worker_executing_work - find worker which is executing a work
981 * @pool: pool of interest
982 * @work: work to find worker for
983 *
984 * Find a worker which is executing @work on @pool by searching
985 * @pool->busy_hash which is keyed by the address of @work.  For a worker
986 * to match, its current execution should match the address of @work and
987 * its work function.  This is to avoid unwanted dependency between
988 * unrelated work executions through a work item being recycled while still
989 * being executed.
990 *
991 * This is a bit tricky.  A work item may be freed once its execution
992 * starts and nothing prevents the freed area from being recycled for
993 * another work item.  If the same work item address ends up being reused
994 * before the original execution finishes, workqueue will identify the
995 * recycled work item as currently executing and make it wait until the
996 * current execution finishes, introducing an unwanted dependency.
997 *
998 * This function checks the work item address and work function to avoid
999 * false positives.  Note that this isn't complete as one may construct a
1000 * work function which can introduce dependency onto itself through a
1001 * recycled work item.  Well, if somebody wants to shoot oneself in the
1002 * foot that badly, there's only so much we can do, and if such deadlock
1003 * actually occurs, it should be easy to locate the culprit work function.
1004 *
1005 * CONTEXT:
1006 * spin_lock_irq(pool->lock).
1007 *
1008 * Return:
1009 * Pointer to worker which is executing @work if found, %NULL
1010 * otherwise.
1011 */
1012static struct worker *find_worker_executing_work(struct worker_pool *pool,
1013						 struct work_struct *work)
1014{
1015	struct worker *worker;
1016
1017	hash_for_each_possible(pool->busy_hash, worker, hentry,
1018			       (unsigned long)work)
1019		if (worker->current_work == work &&
1020		    worker->current_func == work->func)
1021			return worker;
1022
1023	return NULL;
1024}
1025
1026/**
1027 * move_linked_works - move linked works to a list
1028 * @work: start of series of works to be scheduled
1029 * @head: target list to append @work to
1030 * @nextp: out parameter for nested worklist walking
1031 *
1032 * Schedule linked works starting from @work to @head.  Work series to
1033 * be scheduled starts at @work and includes any consecutive work with
1034 * WORK_STRUCT_LINKED set in its predecessor.
1035 *
1036 * If @nextp is not NULL, it's updated to point to the next work of
1037 * the last scheduled work.  This allows move_linked_works() to be
1038 * nested inside outer list_for_each_entry_safe().
1039 *
1040 * CONTEXT:
1041 * spin_lock_irq(pool->lock).
1042 */
1043static void move_linked_works(struct work_struct *work, struct list_head *head,
1044			      struct work_struct **nextp)
1045{
1046	struct work_struct *n;
1047
1048	/*
1049	 * A linked worklist will always end before the end of the list,
1050	 * so use NULL for the list head.
1051	 */
1052	list_for_each_entry_safe_from(work, n, NULL, entry) {
1053		list_move_tail(&work->entry, head);
1054		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1055			break;
1056	}
1057
1058	/*
1059	 * If we're already inside safe list traversal and have moved
1060	 * multiple works to the scheduled queue, the next position
1061	 * needs to be updated.
1062	 */
1063	if (nextp)
1064		*nextp = n;
1065}
1066
1067/**
1068 * get_pwq - get an extra reference on the specified pool_workqueue
1069 * @pwq: pool_workqueue to get
1070 *
1071 * Obtain an extra reference on @pwq.  The caller should guarantee that
1072 * @pwq has positive refcnt and be holding the matching pool->lock.
1073 */
1074static void get_pwq(struct pool_workqueue *pwq)
1075{
1076	lockdep_assert_held(&pwq->pool->lock);
1077	WARN_ON_ONCE(pwq->refcnt <= 0);
1078	pwq->refcnt++;
1079}
1080
1081/**
1082 * put_pwq - put a pool_workqueue reference
1083 * @pwq: pool_workqueue to put
1084 *
1085 * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
1086 * destruction.  The caller should be holding the matching pool->lock.
1087 */
1088static void put_pwq(struct pool_workqueue *pwq)
1089{
1090	lockdep_assert_held(&pwq->pool->lock);
1091	if (likely(--pwq->refcnt))
1092		return;
1093	if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
1094		return;
1095	/*
1096	 * @pwq can't be released under pool->lock, bounce to
1097	 * pwq_unbound_release_workfn().  This never recurses on the same
1098	 * pool->lock as this path is taken only for unbound workqueues and
1099	 * the release work item is scheduled on a per-cpu workqueue.  To
1100	 * avoid lockdep warning, unbound pool->locks are given lockdep
1101	 * subclass of 1 in get_unbound_pool().
1102	 */
1103	schedule_work(&pwq->unbound_release_work);
1104}
1105
1106/**
1107 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1108 * @pwq: pool_workqueue to put (can be %NULL)
1109 *
1110 * put_pwq() with locking.  This function also allows %NULL @pwq.
1111 */
1112static void put_pwq_unlocked(struct pool_workqueue *pwq)
1113{
1114	if (pwq) {
1115		/*
1116		 * As both pwqs and pools are sched-RCU protected, the
1117		 * following lock operations are safe.
1118		 */
1119		spin_lock_irq(&pwq->pool->lock);
1120		put_pwq(pwq);
1121		spin_unlock_irq(&pwq->pool->lock);
1122	}
1123}
1124
1125static void pwq_activate_delayed_work(struct work_struct *work)
1126{
1127	struct pool_workqueue *pwq = get_work_pwq(work);
1128
1129	trace_workqueue_activate_work(work);
1130	move_linked_works(work, &pwq->pool->worklist, NULL);
1131	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1132	pwq->nr_active++;
1133}
1134
1135static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
1136{
1137	struct work_struct *work = list_first_entry(&pwq->delayed_works,
1138						    struct work_struct, entry);
1139
1140	pwq_activate_delayed_work(work);
1141}
1142
1143/**
1144 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1145 * @pwq: pwq of interest
1146 * @color: color of work which left the queue
1147 *
1148 * A work either has completed or is removed from pending queue,
1149 * decrement nr_in_flight of its pwq and handle workqueue flushing.
1150 *
1151 * CONTEXT:
1152 * spin_lock_irq(pool->lock).
1153 */
1154static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1155{
1156	/* uncolored work items don't participate in flushing or nr_active */
1157	if (color == WORK_NO_COLOR)
1158		goto out_put;
1159
1160	pwq->nr_in_flight[color]--;
1161
1162	pwq->nr_active--;
1163	if (!list_empty(&pwq->delayed_works)) {
1164		/* one down, submit a delayed one */
1165		if (pwq->nr_active < pwq->max_active)
1166			pwq_activate_first_delayed(pwq);
1167	}
1168
1169	/* is flush in progress and are we at the flushing tip? */
1170	if (likely(pwq->flush_color != color))
1171		goto out_put;
1172
1173	/* are there still in-flight works? */
1174	if (pwq->nr_in_flight[color])
1175		goto out_put;
1176
1177	/* this pwq is done, clear flush_color */
1178	pwq->flush_color = -1;
1179
1180	/*
1181	 * If this was the last pwq, wake up the first flusher.  It
1182	 * will handle the rest.
1183	 */
1184	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1185		complete(&pwq->wq->first_flusher->done);
1186out_put:
1187	put_pwq(pwq);
1188}
1189
1190/**
1191 * try_to_grab_pending - steal work item from worklist and disable irq
1192 * @work: work item to steal
1193 * @is_dwork: @work is a delayed_work
1194 * @flags: place to store irq state
1195 *
1196 * Try to grab PENDING bit of @work.  This function can handle @work in any
1197 * stable state - idle, on timer or on worklist.
1198 *
1199 * Return:
1200 *  1		if @work was pending and we successfully stole PENDING
1201 *  0		if @work was idle and we claimed PENDING
1202 *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
1203 *  -ENOENT	if someone else is canceling @work, this state may persist
1204 *		for arbitrarily long
1205 *
1206 * Note:
1207 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
1208 * interrupted while holding PENDING and @work off queue, irq must be
1209 * disabled on entry.  This, combined with delayed_work->timer being
1210 * irqsafe, ensures that we return -EAGAIN only for a short, finite period.
1211 *
1212 * On a successful (>= 0) return, irq is disabled and the caller is
1213 * responsible for releasing it using local_irq_restore(*@flags).
1214 *
1215 * This function is safe to call from any context including IRQ handler.
1216 */
1217static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1218			       unsigned long *flags)
1219{
1220	struct worker_pool *pool;
1221	struct pool_workqueue *pwq;
1222
1223	local_irq_save(*flags);
1224
1225	/* try to steal the timer if it exists */
1226	if (is_dwork) {
1227		struct delayed_work *dwork = to_delayed_work(work);
1228
1229		/*
1230		 * dwork->timer is irqsafe.  If del_timer() fails, it's
1231		 * guaranteed that the timer is not queued anywhere and not
1232		 * running on the local CPU.
1233		 */
1234		if (likely(del_timer(&dwork->timer)))
1235			return 1;
1236	}
1237
1238	/* try to claim PENDING the normal way */
1239	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1240		return 0;
1241
1242	/*
1243	 * The queueing is in progress, or it is already queued. Try to
1244	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1245	 */
1246	pool = get_work_pool(work);
1247	if (!pool)
1248		goto fail;
1249
1250	spin_lock(&pool->lock);
1251	/*
1252	 * work->data is guaranteed to point to pwq only while the work
1253	 * item is queued on pwq->wq, and both updating work->data to point
1254	 * to pwq on queueing and to pool on dequeueing are done under
1255	 * pwq->pool->lock.  This in turn guarantees that, if work->data
1256	 * points to pwq which is associated with a locked pool, the work
1257	 * item is currently queued on that pool.
1258	 */
1259	pwq = get_work_pwq(work);
1260	if (pwq && pwq->pool == pool) {
1261		debug_work_deactivate(work);
1262
1263		/*
1264		 * A delayed work item cannot be grabbed directly because
1265		 * it might have linked NO_COLOR work items which, if left
1266		 * on the delayed_list, will confuse pwq->nr_active
1267		 * management later on and cause stall.  Make sure the work
1268		 * management later on and cause a stall.  Make sure the work
1269		 */
1270		if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1271			pwq_activate_delayed_work(work);
1272
1273		list_del_init(&work->entry);
1274		pwq_dec_nr_in_flight(pwq, get_work_color(work));
1275
1276		/* work->data points to pwq iff queued, point to pool */
1277		set_work_pool_and_keep_pending(work, pool->id);
1278
1279		spin_unlock(&pool->lock);
1280		return 1;
1281	}
1282	spin_unlock(&pool->lock);
1283fail:
1284	local_irq_restore(*flags);
1285	if (work_is_canceling(work))
1286		return -ENOENT;
1287	cpu_relax();
1288	return -EAGAIN;
1289}
1290
1291/**
1292 * insert_work - insert a work into a pool
1293 * @pwq: pwq @work belongs to
1294 * @work: work to insert
1295 * @head: insertion point
1296 * @extra_flags: extra WORK_STRUCT_* flags to set
1297 *
1298 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
1299 * work_struct flags.
1300 *
1301 * CONTEXT:
1302 * spin_lock_irq(pool->lock).
1303 */
1304static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1305			struct list_head *head, unsigned int extra_flags)
1306{
1307	struct worker_pool *pool = pwq->pool;
1308
1309	/* we own @work, set data and link */
1310	set_work_pwq(work, pwq, extra_flags);
1311	list_add_tail(&work->entry, head);
1312	get_pwq(pwq);
1313
1314	/*
1315	 * Ensure either wq_worker_sleeping() sees the above
1316	 * list_add_tail() or we see zero nr_running to avoid workers lying
1317	 * around lazily while there are works to be processed.
1318	 */
1319	smp_mb();
1320
1321	if (__need_more_worker(pool))
1322		wake_up_worker(pool);
1323}
1324
1325/*
1326 * Test whether @work is being queued from another work executing on the
1327 * same workqueue.
1328 */
1329static bool is_chained_work(struct workqueue_struct *wq)
1330{
1331	struct worker *worker;
1332
1333	worker = current_wq_worker();
1334	/*
1335	 * Return %true iff I'm a worker executing a work item on @wq.  If
1336	 * I'm @worker, it's safe to dereference it without locking.
1337	 */
1338	return worker && worker->current_pwq->wq == wq;
1339}
1340
1341static void __queue_work(int cpu, struct workqueue_struct *wq,
1342			 struct work_struct *work)
1343{
1344	struct pool_workqueue *pwq;
1345	struct worker_pool *last_pool;
1346	struct list_head *worklist;
1347	unsigned int work_flags;
1348	unsigned int req_cpu = cpu;
1349
1350	/*
1351	 * While a work item is PENDING && off queue, a task trying to
1352	 * steal the PENDING will busy-loop waiting for it to either get
1353	 * queued or lose PENDING.  Grabbing PENDING and queueing should
1354	 * happen with IRQ disabled.
1355	 */
1356	WARN_ON_ONCE(!irqs_disabled());
1357
1358	debug_work_activate(work);
1359
1360	/* if draining, only works from the same workqueue are allowed */
1361	if (unlikely(wq->flags & __WQ_DRAINING) &&
1362	    WARN_ON_ONCE(!is_chained_work(wq)))
1363		return;
1364retry:
1365	if (req_cpu == WORK_CPU_UNBOUND)
1366		cpu = raw_smp_processor_id();
1367
1368	/* pwq which will be used unless @work is executing elsewhere */
1369	if (!(wq->flags & WQ_UNBOUND))
1370		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
1371	else
1372		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
1373
1374	/*
1375	 * If @work was previously on a different pool, it might still be
1376	 * running there, in which case the work needs to be queued on that
1377	 * pool to guarantee non-reentrancy.
1378	 */
1379	last_pool = get_work_pool(work);
1380	if (last_pool && last_pool != pwq->pool) {
1381		struct worker *worker;
1382
1383		spin_lock(&last_pool->lock);
1384
1385		worker = find_worker_executing_work(last_pool, work);
1386
1387		if (worker && worker->current_pwq->wq == wq) {
1388			pwq = worker->current_pwq;
1389		} else {
1390			/* meh... not running there, queue here */
1391			spin_unlock(&last_pool->lock);
1392			spin_lock(&pwq->pool->lock);
1393		}
1394	} else {
1395		spin_lock(&pwq->pool->lock);
1396	}
1397
1398	/*
1399	 * pwq is determined and locked.  For unbound pools, we could have
1400	 * raced with pwq release and it could already be dead.  If its
1401	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die
1402	 * without another pwq replacing it in the numa_pwq_tbl or while
1403	 * work items are executing on it, so the retrying is guaranteed to
1404	 * make forward-progress.
1405	 */
1406	if (unlikely(!pwq->refcnt)) {
1407		if (wq->flags & WQ_UNBOUND) {
1408			spin_unlock(&pwq->pool->lock);
1409			cpu_relax();
1410			goto retry;
1411		}
1412		/* oops */
1413		WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
1414			  wq->name, cpu);
1415	}
1416
1417	/* pwq determined, queue */
1418	trace_workqueue_queue_work(req_cpu, pwq, work);
1419
1420	if (WARN_ON(!list_empty(&work->entry))) {
1421		spin_unlock(&pwq->pool->lock);
1422		return;
1423	}
1424
1425	pwq->nr_in_flight[pwq->work_color]++;
1426	work_flags = work_color_to_flags(pwq->work_color);
1427
1428	if (likely(pwq->nr_active < pwq->max_active)) {
1429		trace_workqueue_activate_work(work);
1430		pwq->nr_active++;
1431		worklist = &pwq->pool->worklist;
1432	} else {
1433		work_flags |= WORK_STRUCT_DELAYED;
1434		worklist = &pwq->delayed_works;
1435	}
1436
1437	insert_work(pwq, work, worklist, work_flags);
1438
1439	spin_unlock(&pwq->pool->lock);
1440}
1441
1442/**
1443 * queue_work_on - queue work on specific cpu
1444 * @cpu: CPU number to execute work on
1445 * @wq: workqueue to use
1446 * @work: work to queue
1447 *
1448 * We queue the work to a specific CPU; the caller must ensure it
1449 * can't go away.
1450 *
1451 * Return: %false if @work was already on a queue, %true otherwise.
1452 */
1453bool queue_work_on(int cpu, struct workqueue_struct *wq,
1454		   struct work_struct *work)
1455{
1456	bool ret = false;
1457	unsigned long flags;
1458
1459	local_irq_save(flags);
1460
1461	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1462		__queue_work(cpu, wq, work);
1463		ret = true;
1464	}
1465
1466	local_irq_restore(flags);
1467	return ret;
1468}
1469EXPORT_SYMBOL(queue_work_on);
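/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * pinning a work item to a specific CPU with queue_work_on().  The names
 * and the target CPU are hypothetical; the caller is responsible for the
 * CPU staying online.
 */
#if 0
static void example_percpu_fn(struct work_struct *work)
{
	pr_info("workqueue: running on cpu%d\n", raw_smp_processor_id());
}

static DECLARE_WORK(example_percpu_work, example_percpu_fn);

static void example_queue_on_cpu1(void)
{
	if (cpu_online(1))
		queue_work_on(1, system_wq, &example_percpu_work);
}
#endif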
1470
1471void delayed_work_timer_fn(unsigned long __data)
1472{
1473	struct delayed_work *dwork = (struct delayed_work *)__data;
1474
1475	/* should have been called from irqsafe timer with irq already off */
1476	__queue_work(dwork->cpu, dwork->wq, &dwork->work);
1477}
1478EXPORT_SYMBOL(delayed_work_timer_fn);
1479
1480static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1481				struct delayed_work *dwork, unsigned long delay)
1482{
1483	struct timer_list *timer = &dwork->timer;
1484	struct work_struct *work = &dwork->work;
1485
1486	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1487		     timer->data != (unsigned long)dwork);
1488	WARN_ON_ONCE(timer_pending(timer));
1489	WARN_ON_ONCE(!list_empty(&work->entry));
1490
1491	/*
1492	 * If @delay is 0, queue @dwork->work immediately.  This is for
1493	 * both optimization and correctness.  The earliest @timer can
1494	 * expire is on the closest next tick, and delayed_work users depend
1495	 * on there being no such delay when @delay is 0.
1496	 */
1497	if (!delay) {
1498		__queue_work(cpu, wq, &dwork->work);
1499		return;
1500	}
1501
1502	timer_stats_timer_set_start_info(&dwork->timer);
1503
1504	dwork->wq = wq;
1505	dwork->cpu = cpu;
1506	timer->expires = jiffies + delay;
1507
1508	if (unlikely(cpu != WORK_CPU_UNBOUND))
1509		add_timer_on(timer, cpu);
1510	else
1511		add_timer(timer);
1512}
1513
1514/**
1515 * queue_delayed_work_on - queue work on specific CPU after delay
1516 * @cpu: CPU number to execute work on
1517 * @wq: workqueue to use
1518 * @dwork: work to queue
1519 * @delay: number of jiffies to wait before queueing
1520 *
1521 * Return: %false if @work was already on a queue, %true otherwise.  If
1522 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1523 * execution.
1524 */
1525bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1526			   struct delayed_work *dwork, unsigned long delay)
1527{
1528	struct work_struct *work = &dwork->work;
1529	bool ret = false;
1530	unsigned long flags;
1531
1532	/* read the comment in __queue_work() */
1533	local_irq_save(flags);
1534
1535	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1536		__queue_delayed_work(cpu, wq, dwork, delay);
1537		ret = true;
1538	}
1539
1540	local_irq_restore(flags);
1541	return ret;
1542}
1543EXPORT_SYMBOL(queue_delayed_work_on);
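/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * scheduling delayed work roughly 100ms from now on the local CPU's pwq.
 * example_dwork_fn() is hypothetical; msecs_to_jiffies() converts the
 * delay into the jiffies this API expects.
 */
#if 0
static void example_dwork_fn(struct work_struct *work)
{
	pr_info("workqueue: delayed work fired\n");
}

static DECLARE_DELAYED_WORK(example_dwork, example_dwork_fn);

static void example_schedule_later(void)
{
	queue_delayed_work(system_wq, &example_dwork, msecs_to_jiffies(100));
}
#endif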
1544
1545/**
1546 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1547 * @cpu: CPU number to execute work on
1548 * @wq: workqueue to use
1549 * @dwork: work to queue
1550 * @delay: number of jiffies to wait before queueing
1551 *
1552 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1553 * modify @dwork's timer so that it expires after @delay.  If @delay is
1554 * zero, @work is guaranteed to be scheduled immediately regardless of its
1555 * current state.
1556 *
1557 * Return: %false if @dwork was idle and queued, %true if @dwork was
1558 * pending and its timer was modified.
1559 *
1560 * This function is safe to call from any context including IRQ handler.
1561 * See try_to_grab_pending() for details.
1562 */
1563bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1564			 struct delayed_work *dwork, unsigned long delay)
1565{
1566	unsigned long flags;
1567	int ret;
1568
1569	do {
1570		ret = try_to_grab_pending(&dwork->work, true, &flags);
1571	} while (unlikely(ret == -EAGAIN));
1572
1573	if (likely(ret >= 0)) {
1574		__queue_delayed_work(cpu, wq, dwork, delay);
1575		local_irq_restore(flags);
1576	}
1577
1578	/* -ENOENT from try_to_grab_pending() becomes %true */
1579	return ret;
1580}
1581EXPORT_SYMBOL_GPL(mod_delayed_work_on);
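/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * using mod_delayed_work() to debounce a burst of events so the handler
 * runs once, one second after the last example_event() call.  All names
 * are hypothetical.
 */
#if 0
static void example_flush_fn(struct work_struct *work)
{
	pr_info("workqueue: burst settled, flushing\n");
}

static DECLARE_DELAYED_WORK(example_flush_work, example_flush_fn);

static void example_event(void)
{
	/* re-arm the timer on every event; safe even from IRQ context */
	mod_delayed_work(system_wq, &example_flush_work, HZ);
}
#endif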
1582
1583/**
1584 * worker_enter_idle - enter idle state
1585 * @worker: worker which is entering idle state
1586 *
1587 * @worker is entering idle state.  Update stats and idle timer if
1588 * necessary.
1589 *
1590 * LOCKING:
1591 * spin_lock_irq(pool->lock).
1592 */
1593static void worker_enter_idle(struct worker *worker)
1594{
1595	struct worker_pool *pool = worker->pool;
1596
1597	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1598	    WARN_ON_ONCE(!list_empty(&worker->entry) &&
1599			 (worker->hentry.next || worker->hentry.pprev)))
1600		return;
1601
1602	/* can't use worker_set_flags(), also called from create_worker() */
1603	worker->flags |= WORKER_IDLE;
1604	pool->nr_idle++;
1605	worker->last_active = jiffies;
1606
1607	/* idle_list is LIFO */
1608	list_add(&worker->entry, &pool->idle_list);
1609
1610	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1611		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1612
1613	/*
1614	 * Sanity check nr_running.  Because wq_unbind_fn() releases
1615	 * pool->lock between setting %WORKER_UNBOUND and zapping
1616	 * nr_running, the warning may trigger spuriously.  Check iff
1617	 * unbind is not in progress.
1618	 */
1619	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1620		     pool->nr_workers == pool->nr_idle &&
1621		     atomic_read(&pool->nr_running));
1622}
1623
1624/**
1625 * worker_leave_idle - leave idle state
1626 * @worker: worker which is leaving idle state
1627 *
1628 * @worker is leaving idle state.  Update stats.
1629 *
1630 * LOCKING:
1631 * spin_lock_irq(pool->lock).
1632 */
1633static void worker_leave_idle(struct worker *worker)
1634{
1635	struct worker_pool *pool = worker->pool;
1636
1637	if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1638		return;
1639	worker_clr_flags(worker, WORKER_IDLE);
1640	pool->nr_idle--;
1641	list_del_init(&worker->entry);
1642}
1643
1644static struct worker *alloc_worker(int node)
1645{
1646	struct worker *worker;
1647
1648	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
1649	if (worker) {
1650		INIT_LIST_HEAD(&worker->entry);
1651		INIT_LIST_HEAD(&worker->scheduled);
1652		INIT_LIST_HEAD(&worker->node);
1653		/* on creation a worker is in !idle && prep state */
1654		worker->flags = WORKER_PREP;
1655	}
1656	return worker;
1657}
1658
1659/**
1660 * worker_attach_to_pool() - attach a worker to a pool
1661 * @worker: worker to be attached
1662 * @pool: the target pool
1663 *
1664 * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
1665 * cpu-binding of @worker are kept coordinated with the pool across
1666 * cpu-[un]hotplugs.
1667 */
1668static void worker_attach_to_pool(struct worker *worker,
1669				   struct worker_pool *pool)
1670{
1671	mutex_lock(&pool->attach_mutex);
1672
1673	/*
1674	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1675	 * online CPUs.  It'll be re-applied when any of the CPUs come up.
1676	 */
1677	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1678
1679	/*
1680	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
1681	 * stable across this function.  See the comments above the
1682	 * flag definition for details.
1683	 */
1684	if (pool->flags & POOL_DISASSOCIATED)
1685		worker->flags |= WORKER_UNBOUND;
1686
1687	list_add_tail(&worker->node, &pool->workers);
1688
1689	mutex_unlock(&pool->attach_mutex);
1690}
1691
1692/**
1693 * worker_detach_from_pool() - detach a worker from its pool
1694 * @worker: worker which is attached to its pool
1695 * @pool: the pool @worker is attached to
1696 *
1697 * Undo the attaching which had been done in worker_attach_to_pool().  The
1698 * caller worker shouldn't access the pool after detaching unless it has
1699 * another reference to the pool.
1700 */
1701static void worker_detach_from_pool(struct worker *worker,
1702				    struct worker_pool *pool)
1703{
1704	struct completion *detach_completion = NULL;
1705
1706	mutex_lock(&pool->attach_mutex);
1707	list_del(&worker->node);
1708	if (list_empty(&pool->workers))
1709		detach_completion = pool->detach_completion;
1710	mutex_unlock(&pool->attach_mutex);
1711
1712	/* clear leftover flags without pool->lock after it is detached */
1713	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
1714
1715	if (detach_completion)
1716		complete(detach_completion);
1717}
1718
1719/**
1720 * create_worker - create a new workqueue worker
1721 * @pool: pool the new worker will belong to
1722 *
1723 * Create and start a new worker which is attached to @pool.
1724 *
1725 * CONTEXT:
1726 * Might sleep.  Does GFP_KERNEL allocations.
1727 *
1728 * Return:
1729 * Pointer to the newly created worker, or %NULL on failure.
1730 */
1731static struct worker *create_worker(struct worker_pool *pool)
1732{
1733	struct worker *worker = NULL;
1734	int id = -1;
1735	char id_buf[16];
1736
1737	/* ID is needed to determine kthread name */
1738	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
1739	if (id < 0)
1740		goto fail;
1741
1742	worker = alloc_worker(pool->node);
1743	if (!worker)
1744		goto fail;
1745
1746	worker->pool = pool;
1747	worker->id = id;
1748
1749	if (pool->cpu >= 0)
1750		snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
1751			 pool->attrs->nice < 0  ? "H" : "");
1752	else
1753		snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
1754
1755	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
1756					      "kworker/%s", id_buf);
1757	if (IS_ERR(worker->task))
1758		goto fail;
1759
1760	set_user_nice(worker->task, pool->attrs->nice);
1761
1762	/* prevent userland from meddling with cpumask of workqueue workers */
1763	worker->task->flags |= PF_NO_SETAFFINITY;
1764
1765	/* successful, attach the worker to the pool */
1766	worker_attach_to_pool(worker, pool);
1767
1768	/* start the newly created worker */
1769	spin_lock_irq(&pool->lock);
1770	worker->pool->nr_workers++;
1771	worker_enter_idle(worker);
1772	wake_up_process(worker->task);
1773	spin_unlock_irq(&pool->lock);
1774
1775	return worker;
1776
1777fail:
1778	if (id >= 0)
1779		ida_simple_remove(&pool->worker_ida, id);
1780	kfree(worker);
1781	return NULL;
1782}
1783
1784/**
1785 * destroy_worker - destroy a workqueue worker
1786 * @worker: worker to be destroyed
1787 *
1788 * Destroy @worker and adjust the stats of the pool it belongs to
1789 * accordingly.  The worker should be idle.
1790 *
1791 * CONTEXT:
1792 * spin_lock_irq(pool->lock).
1793 */
1794static void destroy_worker(struct worker *worker)
1795{
1796	struct worker_pool *pool = worker->pool;
1797
1798	lockdep_assert_held(&pool->lock);
1799
1800	/* sanity check frenzy */
1801	if (WARN_ON(worker->current_work) ||
1802	    WARN_ON(!list_empty(&worker->scheduled)) ||
1803	    WARN_ON(!(worker->flags & WORKER_IDLE)))
1804		return;
1805
1806	pool->nr_workers--;
1807	pool->nr_idle--;
1808
1809	list_del_init(&worker->entry);
1810	worker->flags |= WORKER_DIE;
1811	wake_up_process(worker->task);
1812}
1813
1814static void idle_worker_timeout(unsigned long __pool)
1815{
1816	struct worker_pool *pool = (void *)__pool;
1817
1818	spin_lock_irq(&pool->lock);
1819
1820	while (too_many_workers(pool)) {
1821		struct worker *worker;
1822		unsigned long expires;
1823
1824		/* idle_list is kept in LIFO order, check the last one */
1825		worker = list_entry(pool->idle_list.prev, struct worker, entry);
1826		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1827
1828		if (time_before(jiffies, expires)) {
1829			mod_timer(&pool->idle_timer, expires);
1830			break;
1831		}
1832
1833		destroy_worker(worker);
1834	}
1835
1836	spin_unlock_irq(&pool->lock);
1837}
1838
1839static void send_mayday(struct work_struct *work)
1840{
1841	struct pool_workqueue *pwq = get_work_pwq(work);
1842	struct workqueue_struct *wq = pwq->wq;
1843
1844	lockdep_assert_held(&wq_mayday_lock);
1845
1846	if (!wq->rescuer)
1847		return;
1848
1849	/* mayday mayday mayday */
1850	if (list_empty(&pwq->mayday_node)) {
1851		/*
1852		 * If @pwq is for an unbound wq, its base ref may be put at
1853		 * any time due to an attribute change.  Pin @pwq until the
1854		 * rescuer is done with it.
1855		 */
1856		get_pwq(pwq);
1857		list_add_tail(&pwq->mayday_node, &wq->maydays);
1858		wake_up_process(wq->rescuer->task);
1859	}
1860}
1861
1862static void pool_mayday_timeout(unsigned long __pool)
1863{
1864	struct worker_pool *pool = (void *)__pool;
1865	struct work_struct *work;
1866
1867	spin_lock_irq(&pool->lock);
1868	spin_lock(&wq_mayday_lock);		/* for wq->maydays */
1869
1870	if (need_to_create_worker(pool)) {
1871		/*
1872		 * We've been trying to create a new worker but
1873		 * haven't been successful.  We might be hitting an
1874		 * allocation deadlock.  Send distress signals to
1875		 * rescuers.
1876		 */
1877		list_for_each_entry(work, &pool->worklist, entry)
1878			send_mayday(work);
1879	}
1880
1881	spin_unlock(&wq_mayday_lock);
1882	spin_unlock_irq(&pool->lock);
1883
1884	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1885}
1886
1887/**
1888 * maybe_create_worker - create a new worker if necessary
1889 * @pool: pool to create a new worker for
1890 *
1891 * Create a new worker for @pool if necessary.  @pool is guaranteed to
1892 * have at least one idle worker on return from this function.  If
1893 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1894 * sent to all rescuers with works scheduled on @pool to resolve
1895 * possible allocation deadlock.
1896 *
1897 * On return, need_to_create_worker() is guaranteed to be %false and
1898 * may_start_working() %true.
1899 *
1900 * LOCKING:
1901 * spin_lock_irq(pool->lock) which may be released and regrabbed
1902 * multiple times.  Does GFP_KERNEL allocations.  Called only from
1903 * manager.
1904 */
1905static void maybe_create_worker(struct worker_pool *pool)
1906__releases(&pool->lock)
1907__acquires(&pool->lock)
1908{
1909restart:
1910	spin_unlock_irq(&pool->lock);
1911
1912	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1913	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1914
1915	while (true) {
1916		if (create_worker(pool) || !need_to_create_worker(pool))
1917			break;
1918
1919		schedule_timeout_interruptible(CREATE_COOLDOWN);
1920
1921		if (!need_to_create_worker(pool))
1922			break;
1923	}
1924
1925	del_timer_sync(&pool->mayday_timer);
1926	spin_lock_irq(&pool->lock);
1927	/*
1928	 * This is necessary even after a new worker was just successfully
1929	 * created as @pool->lock was dropped and the new worker might have
1930	 * already become busy.
1931	 */
1932	if (need_to_create_worker(pool))
1933		goto restart;
1934}
1935
1936/**
1937 * manage_workers - manage worker pool
1938 * @worker: self
1939 *
1940 * Assume the manager role and manage the worker pool @worker belongs
1941 * to.  At any given time, there can be only zero or one manager per
1942 * pool.  The exclusion is handled automatically by this function.
1943 *
1944 * The caller can safely start processing works on false return.  On
1945 * true return, it's guaranteed that need_to_create_worker() is false
1946 * and may_start_working() is true.
1947 *
1948 * CONTEXT:
1949 * spin_lock_irq(pool->lock) which may be released and regrabbed
1950 * multiple times.  Does GFP_KERNEL allocations.
1951 *
1952 * Return:
1953 * %false if the pool doesn't need management and the caller can safely
1954 * start processing works, %true if management function was performed and
1955 * the conditions that the caller verified before calling the function may
1956 * no longer be true.
1957 */
1958static bool manage_workers(struct worker *worker)
1959{
1960	struct worker_pool *pool = worker->pool;
1961
1962	/*
1963	 * Anyone who successfully grabs manager_arb wins the arbitration
1964	 * and becomes the manager.  mutex_trylock() on pool->manager_arb
1965	 * failure while holding pool->lock reliably indicates that someone
1966	 * else is managing the pool and the worker which failed trylock
1967	 * can proceed to executing work items.  This means that anyone
1968	 * grabbing manager_arb is responsible for actually performing
1969	 * manager duties.  If manager_arb is grabbed and released without
1970	 * actual management, the pool may stall indefinitely.
1971	 */
1972	if (!mutex_trylock(&pool->manager_arb))
1973		return false;
1974	pool->manager = worker;
1975
1976	maybe_create_worker(pool);
1977
1978	pool->manager = NULL;
1979	mutex_unlock(&pool->manager_arb);
1980	return true;
1981}
1982
1983/**
1984 * process_one_work - process single work
1985 * @worker: self
1986 * @work: work to process
1987 *
1988 * Process @work.  This function contains all the logic necessary to
1989 * process a single work item including synchronization against and
1990 * interaction with other workers on the same cpu, queueing and
1991 * flushing.  As long as the context requirement is met, any worker can
1992 * call this function to process a work item.
1993 *
1994 * CONTEXT:
1995 * spin_lock_irq(pool->lock) which is released and regrabbed.
1996 */
1997static void process_one_work(struct worker *worker, struct work_struct *work)
1998__releases(&pool->lock)
1999__acquires(&pool->lock)
2000{
2001	struct pool_workqueue *pwq = get_work_pwq(work);
2002	struct worker_pool *pool = worker->pool;
2003	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2004	int work_color;
2005	struct worker *collision;
2006#ifdef CONFIG_LOCKDEP
2007	/*
2008	 * It is permissible to free the struct work_struct from
2009	 * inside the function that is called from it; we need to take
2010	 * this into account for lockdep too.  To avoid bogus "held
2011	 * lock freed" warnings as well as problems when looking into
2012	 * work->lockdep_map, make a copy and use that here.
2013	 */
2014	struct lockdep_map lockdep_map;
2015
2016	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2017#endif
2018	/* ensure we're on the correct CPU */
2019	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
2020		     raw_smp_processor_id() != pool->cpu);
2021
2022	/*
2023	 * A single work shouldn't be executed concurrently by
2024	 * multiple workers on a single cpu.  Check whether anyone is
2025	 * already processing the work.  If so, defer the work to the
2026	 * currently executing one.
2027	 */
2028	collision = find_worker_executing_work(pool, work);
2029	if (unlikely(collision)) {
2030		move_linked_works(work, &collision->scheduled, NULL);
2031		return;
2032	}
2033
2034	/* claim and dequeue */
2035	debug_work_deactivate(work);
2036	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2037	worker->current_work = work;
2038	worker->current_func = work->func;
2039	worker->current_pwq = pwq;
2040	work_color = get_work_color(work);
2041
2042	list_del_init(&work->entry);
2043
2044	/*
2045	 * CPU intensive works don't participate in concurrency management.
2046	 * They're the scheduler's responsibility.  This takes @worker out
2047	 * of concurrency management and the next code block will chain
2048	 * execution of the pending work items.
2049	 */
2050	if (unlikely(cpu_intensive))
2051		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
2052
2053	/*
2054	 * Wake up another worker if necessary.  The condition is always
2055	 * false for normal per-cpu workers since nr_running would always
2056	 * be >= 1 at this point.  This is used to chain execution of the
2057	 * pending work items for WORKER_NOT_RUNNING workers such as the
2058	 * UNBOUND and CPU_INTENSIVE ones.
2059	 */
2060	if (need_more_worker(pool))
2061		wake_up_worker(pool);
2062
2063	/*
2064	 * Record the last pool and clear PENDING which should be the last
2065	 * update to @work.  Also, do this inside @pool->lock so that
2066	 * PENDING and queued state changes happen together while IRQ is
2067	 * disabled.
2068	 */
2069	set_work_pool_and_clear_pending(work, pool->id);
2070
2071	spin_unlock_irq(&pool->lock);
2072
2073	lock_map_acquire_read(&pwq->wq->lockdep_map);
2074	lock_map_acquire(&lockdep_map);
2075	trace_workqueue_execute_start(work);
2076	worker->current_func(work);
2077	/*
2078	 * While we must be careful to not use "work" after this, the trace
2079	 * point will only record its address.
2080	 */
2081	trace_workqueue_execute_end(work);
2082	lock_map_release(&lockdep_map);
2083	lock_map_release(&pwq->wq->lockdep_map);
2084
2085	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2086		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2087		       "     last function: %pf\n",
2088		       current->comm, preempt_count(), task_pid_nr(current),
2089		       worker->current_func);
2090		debug_show_held_locks(current);
2091		dump_stack();
2092	}
2093
2094	/*
2095	 * The following prevents a kworker from hogging CPU on !PREEMPT
2096	 * kernels, where a requeueing work item waiting for something to
2097	 * happen could deadlock with stop_machine as such work item could
2098	 * indefinitely requeue itself while all other CPUs are trapped in
2099	 * stop_machine. At the same time, report a quiescent RCU state so
2100	 * the same condition doesn't freeze RCU.
2101	 */
2102	cond_resched_rcu_qs();
2103
2104	spin_lock_irq(&pool->lock);
2105
2106	/* clear cpu intensive status */
2107	if (unlikely(cpu_intensive))
2108		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2109
2110	/* we're done with it, release */
2111	hash_del(&worker->hentry);
2112	worker->current_work = NULL;
2113	worker->current_func = NULL;
2114	worker->current_pwq = NULL;
2115	worker->desc_valid = false;
2116	pwq_dec_nr_in_flight(pwq, work_color);
2117}
2118
2119/**
2120 * process_scheduled_works - process scheduled works
2121 * @worker: self
2122 *
2123 * Process all scheduled works.  Please note that the scheduled list
2124 * may change while processing a work, so this function repeatedly
2125 * fetches a work from the top and executes it.
2126 *
2127 * CONTEXT:
2128 * spin_lock_irq(pool->lock) which may be released and regrabbed
2129 * multiple times.
2130 */
2131static void process_scheduled_works(struct worker *worker)
2132{
2133	while (!list_empty(&worker->scheduled)) {
2134		struct work_struct *work = list_first_entry(&worker->scheduled,
2135						struct work_struct, entry);
2136		process_one_work(worker, work);
2137	}
2138}
2139
2140/**
2141 * worker_thread - the worker thread function
2142 * @__worker: self
2143 *
2144 * The worker thread function.  All workers belong to a worker_pool -
2145 * either a per-cpu one or a dynamic unbound one.  These workers process
2146 * all work items regardless of their specific target workqueue.  The only
2147 * exception is work items which belong to workqueues with a rescuer, which
2148 * is explained in rescuer_thread().
2149 *
2150 * Return: 0
2151 */
2152static int worker_thread(void *__worker)
2153{
2154	struct worker *worker = __worker;
2155	struct worker_pool *pool = worker->pool;
2156
2157	/* tell the scheduler that this is a workqueue worker */
2158	worker->task->flags |= PF_WQ_WORKER;
2159woke_up:
2160	spin_lock_irq(&pool->lock);
2161
2162	/* am I supposed to die? */
2163	if (unlikely(worker->flags & WORKER_DIE)) {
2164		spin_unlock_irq(&pool->lock);
2165		WARN_ON_ONCE(!list_empty(&worker->entry));
2166		worker->task->flags &= ~PF_WQ_WORKER;
2167
2168		set_task_comm(worker->task, "kworker/dying");
2169		ida_simple_remove(&pool->worker_ida, worker->id);
2170		worker_detach_from_pool(worker, pool);
2171		kfree(worker);
2172		return 0;
2173	}
2174
2175	worker_leave_idle(worker);
2176recheck:
2177	/* no more worker necessary? */
2178	if (!need_more_worker(pool))
2179		goto sleep;
2180
2181	/* do we need to manage? */
2182	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
2183		goto recheck;
2184
2185	/*
2186	 * ->scheduled list can only be filled while a worker is
2187	 * preparing to process a work or actually processing it.
2188	 * Make sure nobody diddled with it while I was sleeping.
2189	 */
2190	WARN_ON_ONCE(!list_empty(&worker->scheduled));
2191
2192	/*
2193	 * Finish PREP stage.  We're guaranteed to have at least one idle
2194	 * worker or that someone else has already assumed the manager
2195	 * role.  This is where @worker starts participating in concurrency
2196	 * management if applicable and concurrency management is restored
2197	 * after being rebound.  See rebind_workers() for details.
2198	 */
2199	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
2200
2201	do {
2202		struct work_struct *work =
2203			list_first_entry(&pool->worklist,
2204					 struct work_struct, entry);
2205
2206		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
2207			/* optimization path, not strictly necessary */
2208			process_one_work(worker, work);
2209			if (unlikely(!list_empty(&worker->scheduled)))
2210				process_scheduled_works(worker);
2211		} else {
2212			move_linked_works(work, &worker->scheduled, NULL);
2213			process_scheduled_works(worker);
2214		}
2215	} while (keep_working(pool));
2216
2217	worker_set_flags(worker, WORKER_PREP);
2218sleep:
2219	/*
2220	 * pool->lock is held and there's no work to process and no need to
2221	 * manage, sleep.  Workers are woken up only while holding
2222	 * pool->lock or from local cpu, so setting the current state
2223	 * before releasing pool->lock is enough to prevent losing any
2224	 * event.
2225	 */
2226	worker_enter_idle(worker);
2227	__set_current_state(TASK_INTERRUPTIBLE);
2228	spin_unlock_irq(&pool->lock);
2229	schedule();
2230	goto woke_up;
2231}
2232
2233/**
2234 * rescuer_thread - the rescuer thread function
2235 * @__rescuer: self
2236 *
2237 * Workqueue rescuer thread function.  There's one rescuer for each
2238 * workqueue which has WQ_MEM_RECLAIM set.
2239 *
2240 * Regular work processing on a pool may block trying to create a new
2241 * worker, which uses a GFP_KERNEL allocation and has a slight chance of
2242 * developing into a deadlock if some works currently on the same queue
2243 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
2244 * the problem the rescuer solves.
2245 *
2246 * When such a condition is possible, the pool summons the rescuers of all
2247 * workqueues which have works queued on the pool and lets them process
2248 * those works so that forward progress can be guaranteed.
2249 *
2250 * This should happen rarely.
2251 *
2252 * Return: 0
2253 */
2254static int rescuer_thread(void *__rescuer)
2255{
2256	struct worker *rescuer = __rescuer;
2257	struct workqueue_struct *wq = rescuer->rescue_wq;
2258	struct list_head *scheduled = &rescuer->scheduled;
2259	bool should_stop;
2260
2261	set_user_nice(current, RESCUER_NICE_LEVEL);
2262
2263	/*
2264	 * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
2265	 * doesn't participate in concurrency management.
2266	 */
2267	rescuer->task->flags |= PF_WQ_WORKER;
2268repeat:
2269	set_current_state(TASK_INTERRUPTIBLE);
2270
2271	/*
2272	 * By the time the rescuer is requested to stop, the workqueue
2273	 * shouldn't have any work pending, but @wq->maydays may still have
2274	 * pwq(s) queued.  This can happen if non-rescuer workers consume
2275	 * all the work items before the rescuer gets to them.  Go through
2276	 * @wq->maydays processing before acting on should_stop so that the
2277	 * list is always empty on exit.
2278	 */
2279	should_stop = kthread_should_stop();
2280
2281	/* see whether any pwq is asking for help */
2282	spin_lock_irq(&wq_mayday_lock);
2283
2284	while (!list_empty(&wq->maydays)) {
2285		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2286					struct pool_workqueue, mayday_node);
2287		struct worker_pool *pool = pwq->pool;
2288		struct work_struct *work, *n;
2289
2290		__set_current_state(TASK_RUNNING);
2291		list_del_init(&pwq->mayday_node);
2292
2293		spin_unlock_irq(&wq_mayday_lock);
2294
2295		worker_attach_to_pool(rescuer, pool);
2296
2297		spin_lock_irq(&pool->lock);
2298		rescuer->pool = pool;
2299
2300		/*
2301		 * Slurp in all works issued via this workqueue and
2302		 * process'em.
2303		 */
2304		WARN_ON_ONCE(!list_empty(scheduled));
2305		list_for_each_entry_safe(work, n, &pool->worklist, entry)
2306			if (get_work_pwq(work) == pwq)
2307				move_linked_works(work, scheduled, &n);
2308
2309		if (!list_empty(scheduled)) {
2310			process_scheduled_works(rescuer);
2311
2312			/*
2313			 * The above execution of rescued work items could
2314			 * have created more to rescue through
2315			 * pwq_activate_first_delayed() or chained
2316			 * queueing.  Let's put @pwq back on mayday list so
2317			 * that such back-to-back work items, which may be
2318			 * being used to relieve memory pressure, don't
2319			 * incur MAYDAY_INTERVAL delay in between.
2320			 */
2321			if (need_to_create_worker(pool)) {
2322				spin_lock(&wq_mayday_lock);
2323				get_pwq(pwq);
2324				list_move_tail(&pwq->mayday_node, &wq->maydays);
2325				spin_unlock(&wq_mayday_lock);
2326			}
2327		}
2328
2329		/*
2330		 * Put the reference grabbed by send_mayday().  @pool won't
2331		 * go away while we're still attached to it.
2332		 */
2333		put_pwq(pwq);
2334
2335		/*
2336		 * Leave this pool.  If need_more_worker() is %true, notify a
2337		 * regular worker; otherwise, we end up with 0 concurrency
2338		 * and stalling the execution.
2339		 */
2340		if (need_more_worker(pool))
2341			wake_up_worker(pool);
2342
2343		rescuer->pool = NULL;
2344		spin_unlock_irq(&pool->lock);
2345
2346		worker_detach_from_pool(rescuer, pool);
2347
2348		spin_lock_irq(&wq_mayday_lock);
2349	}
2350
2351	spin_unlock_irq(&wq_mayday_lock);
2352
2353	if (should_stop) {
2354		__set_current_state(TASK_RUNNING);
2355		rescuer->task->flags &= ~PF_WQ_WORKER;
2356		return 0;
2357	}
2358
2359	/* rescuers should never participate in concurrency management */
2360	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2361	schedule();
2362	goto repeat;
2363}
2364
2365struct wq_barrier {
2366	struct work_struct	work;
2367	struct completion	done;
2368	struct task_struct	*task;	/* purely informational */
2369};
2370
2371static void wq_barrier_func(struct work_struct *work)
2372{
2373	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2374	complete(&barr->done);
2375}
2376
2377/**
2378 * insert_wq_barrier - insert a barrier work
2379 * @pwq: pwq to insert barrier into
2380 * @barr: wq_barrier to insert
2381 * @target: target work to attach @barr to
2382 * @worker: worker currently executing @target, NULL if @target is not executing
2383 *
2384 * @barr is linked to @target such that @barr is completed only after
2385 * @target finishes execution.  Please note that the ordering
2386 * guarantee is observed only with respect to @target and on the local
2387 * cpu.
2388 *
2389 * Currently, a queued barrier can't be canceled.  This is because
2390 * try_to_grab_pending() can't determine whether the work to be
2391 * grabbed is at the head of the queue and thus can't clear LINKED
2392 * flag of the previous work while there must be a valid next work
2393 * after a work with LINKED flag set.
2394 *
2395 * Note that when @worker is non-NULL, @target may be modified
2396 * underneath us, so we can't reliably determine pwq from @target.
2397 *
2398 * CONTEXT:
2399 * spin_lock_irq(pool->lock).
2400 */
2401static void insert_wq_barrier(struct pool_workqueue *pwq,
2402			      struct wq_barrier *barr,
2403			      struct work_struct *target, struct worker *worker)
2404{
2405	struct list_head *head;
2406	unsigned int linked = 0;
2407
2408	/*
2409	 * debugobject calls are safe here even with pool->lock locked
2410	 * as we know for sure that this will not trigger any of the
2411	 * checks and call back into the fixup functions where we
2412	 * might deadlock.
2413	 */
2414	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2415	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2416	init_completion(&barr->done);
2417	barr->task = current;
2418
2419	/*
2420	 * If @target is currently being executed, schedule the
2421	 * barrier to the worker; otherwise, put it after @target.
2422	 */
2423	if (worker)
2424		head = worker->scheduled.next;
2425	else {
2426		unsigned long *bits = work_data_bits(target);
2427
2428		head = target->entry.next;
2429		/* there can already be other linked works, inherit and set */
2430		linked = *bits & WORK_STRUCT_LINKED;
2431		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
2432	}
2433
2434	debug_work_activate(&barr->work);
2435	insert_work(pwq, &barr->work, head,
2436		    work_color_to_flags(WORK_NO_COLOR) | linked);
2437}
2438
2439/**
2440 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2441 * @wq: workqueue being flushed
2442 * @flush_color: new flush color, < 0 for no-op
2443 * @work_color: new work color, < 0 for no-op
2444 *
2445 * Prepare pwqs for workqueue flushing.
2446 *
2447 * If @flush_color is non-negative, flush_color on all pwqs should be
2448 * -1.  If no pwq has in-flight commands at the specified color, all
2449 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
2450 * has in-flight commands, its pwq->flush_color is set to
2451 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2452 * wakeup logic is armed and %true is returned.
2453 *
2454 * The caller should have initialized @wq->first_flusher prior to
2455 * calling this function with non-negative @flush_color.  If
2456 * @flush_color is negative, no flush color update is done and %false
2457 * is returned.
2458 *
2459 * If @work_color is non-negative, all pwqs should have the same
2460 * work_color which is previous to @work_color and all will be
2461 * advanced to @work_color.
2462 *
2463 * CONTEXT:
2464 * mutex_lock(wq->mutex).
2465 *
2466 * Return:
2467 * %true if @flush_color >= 0 and there's something to flush.  %false
2468 * otherwise.
2469 */
2470static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2471				      int flush_color, int work_color)
2472{
2473	bool wait = false;
2474	struct pool_workqueue *pwq;
2475
2476	if (flush_color >= 0) {
2477		WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
2478		atomic_set(&wq->nr_pwqs_to_flush, 1);
2479	}
2480
2481	for_each_pwq(pwq, wq) {
2482		struct worker_pool *pool = pwq->pool;
2483
2484		spin_lock_irq(&pool->lock);
2485
2486		if (flush_color >= 0) {
2487			WARN_ON_ONCE(pwq->flush_color != -1);
2488
2489			if (pwq->nr_in_flight[flush_color]) {
2490				pwq->flush_color = flush_color;
2491				atomic_inc(&wq->nr_pwqs_to_flush);
2492				wait = true;
2493			}
2494		}
2495
2496		if (work_color >= 0) {
2497			WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
2498			pwq->work_color = work_color;
2499		}
2500
2501		spin_unlock_irq(&pool->lock);
2502	}
2503
2504	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2505		complete(&wq->first_flusher->done);
2506
2507	return wait;
2508}
2509
2510/**
2511 * flush_workqueue - ensure that any scheduled work has run to completion.
2512 * @wq: workqueue to flush
2513 *
2514 * This function sleeps until all work items which were queued on entry
2515 * have finished execution, but it is not livelocked by new incoming ones.
2516 */
2517void flush_workqueue(struct workqueue_struct *wq)
2518{
2519	struct wq_flusher this_flusher = {
2520		.list = LIST_HEAD_INIT(this_flusher.list),
2521		.flush_color = -1,
2522		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2523	};
2524	int next_color;
2525
2526	lock_map_acquire(&wq->lockdep_map);
2527	lock_map_release(&wq->lockdep_map);
2528
2529	mutex_lock(&wq->mutex);
2530
2531	/*
2532	 * Start-to-wait phase
2533	 */
2534	next_color = work_next_color(wq->work_color);
2535
2536	if (next_color != wq->flush_color) {
2537		/*
2538		 * Color space is not full.  The current work_color
2539		 * becomes our flush_color and work_color is advanced
2540		 * by one.
2541		 */
2542		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
2543		this_flusher.flush_color = wq->work_color;
2544		wq->work_color = next_color;
2545
2546		if (!wq->first_flusher) {
2547			/* no flush in progress, become the first flusher */
2548			WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2549
2550			wq->first_flusher = &this_flusher;
2551
2552			if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2553						       wq->work_color)) {
2554				/* nothing to flush, done */
2555				wq->flush_color = next_color;
2556				wq->first_flusher = NULL;
2557				goto out_unlock;
2558			}
2559		} else {
2560			/* wait in queue */
2561			WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
2562			list_add_tail(&this_flusher.list, &wq->flusher_queue);
2563			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2564		}
2565	} else {
2566		/*
2567		 * Oops, color space is full, wait on overflow queue.
2568		 * The next flush completion will assign us
2569		 * flush_color and transfer to flusher_queue.
2570		 */
2571		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2572	}
2573
2574	mutex_unlock(&wq->mutex);
2575
2576	wait_for_completion(&this_flusher.done);
2577
2578	/*
2579	 * Wake-up-and-cascade phase
2580	 *
2581	 * First flushers are responsible for cascading flushes and
2582	 * handling overflow.  Non-first flushers can simply return.
2583	 */
2584	if (wq->first_flusher != &this_flusher)
2585		return;
2586
2587	mutex_lock(&wq->mutex);
2588
2589	/* we might have raced, check again with mutex held */
2590	if (wq->first_flusher != &this_flusher)
2591		goto out_unlock;
2592
2593	wq->first_flusher = NULL;
2594
2595	WARN_ON_ONCE(!list_empty(&this_flusher.list));
2596	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2597
2598	while (true) {
2599		struct wq_flusher *next, *tmp;
2600
2601		/* complete all the flushers sharing the current flush color */
2602		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2603			if (next->flush_color != wq->flush_color)
2604				break;
2605			list_del_init(&next->list);
2606			complete(&next->done);
2607		}
2608
2609		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
2610			     wq->flush_color != work_next_color(wq->work_color));
2611
2612		/* this flush_color is finished, advance by one */
2613		wq->flush_color = work_next_color(wq->flush_color);
2614
2615		/* one color has been freed, handle overflow queue */
2616		if (!list_empty(&wq->flusher_overflow)) {
2617			/*
2618			 * Assign the same color to all overflowed
2619			 * flushers, advance work_color and append to
2620			 * flusher_queue.  This is the start-to-wait
2621			 * phase for these overflowed flushers.
2622			 */
2623			list_for_each_entry(tmp, &wq->flusher_overflow, list)
2624				tmp->flush_color = wq->work_color;
2625
2626			wq->work_color = work_next_color(wq->work_color);
2627
2628			list_splice_tail_init(&wq->flusher_overflow,
2629					      &wq->flusher_queue);
2630			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2631		}
2632
2633		if (list_empty(&wq->flusher_queue)) {
2634			WARN_ON_ONCE(wq->flush_color != wq->work_color);
2635			break;
2636		}
2637
2638		/*
2639		 * Need to flush more colors.  Make the next flusher
2640		 * the new first flusher and arm pwqs.
2641		 */
2642		WARN_ON_ONCE(wq->flush_color == wq->work_color);
2643		WARN_ON_ONCE(wq->flush_color != next->flush_color);
2644
2645		list_del_init(&next->list);
2646		wq->first_flusher = next;
2647
2648		if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2649			break;
2650
2651		/*
2652		 * Meh... this color is already done, clear first
2653		 * flusher and repeat cascading.
2654		 */
2655		wq->first_flusher = NULL;
2656	}
2657
2658out_unlock:
2659	mutex_unlock(&wq->mutex);
2660}
2661EXPORT_SYMBOL_GPL(flush_workqueue);
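
/*
 * Illustrative usage sketch (invented names, not part of the original
 * source): flushing a dedicated workqueue before tearing down the state
 * its work items operate on.
 *
 *	static void my_work_fn(struct work_struct *work);
 *	static DECLARE_WORK(my_work, my_work_fn);
 *	static struct workqueue_struct *my_wq;
 *
 *	my_wq = alloc_workqueue("my_wq", 0, 0);
 *	queue_work(my_wq, &my_work);
 *	...
 *	flush_workqueue(my_wq);		all items queued above have finished
 *	destroy_workqueue(my_wq);
 */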
2662
2663/**
2664 * drain_workqueue - drain a workqueue
2665 * @wq: workqueue to drain
2666 *
2667 * Wait until the workqueue becomes empty.  While draining is in progress,
2668 * only chain queueing is allowed.  IOW, only currently pending or running
2669 * work items on @wq can queue further work items on it.  @wq is flushed
2670 * repeatedly until it becomes empty.  The number of flushes is determined
2671 * by the depth of chaining and should be relatively small.  Whine if it
2672 * takes too long.
2673 */
2674void drain_workqueue(struct workqueue_struct *wq)
2675{
2676	unsigned int flush_cnt = 0;
2677	struct pool_workqueue *pwq;
2678
2679	/*
2680	 * __queue_work() needs to test whether there are drainers; it is much
2681	 * hotter than drain_workqueue() and already looks at @wq->flags.
2682	 * Use __WQ_DRAINING so that queueing doesn't have to check nr_drainers.
2683	 */
2684	mutex_lock(&wq->mutex);
2685	if (!wq->nr_drainers++)
2686		wq->flags |= __WQ_DRAINING;
2687	mutex_unlock(&wq->mutex);
2688reflush:
2689	flush_workqueue(wq);
2690
2691	mutex_lock(&wq->mutex);
2692
2693	for_each_pwq(pwq, wq) {
2694		bool drained;
2695
2696		spin_lock_irq(&pwq->pool->lock);
2697		drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2698		spin_unlock_irq(&pwq->pool->lock);
2699
2700		if (drained)
2701			continue;
2702
2703		if (++flush_cnt == 10 ||
2704		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2705			pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
2706				wq->name, flush_cnt);
2707
2708		mutex_unlock(&wq->mutex);
2709		goto reflush;
2710	}
2711
2712	if (!--wq->nr_drainers)
2713		wq->flags &= ~__WQ_DRAINING;
2714	mutex_unlock(&wq->mutex);
2715}
2716EXPORT_SYMBOL_GPL(drain_workqueue);
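
/*
 * Illustrative sketch (invented names): drain_workqueue() is the tool to
 * quiesce a workqueue whose work items may chain-queue further work onto
 * the same workqueue, where a single flush_workqueue() wouldn't be enough.
 *
 *	static void my_chain_fn(struct work_struct *work)
 *	{
 *		if (my_more_to_do())
 *			queue_work(my_wq, work);	chain queueing is OK
 *	}
 *	...
 *	drain_workqueue(my_wq);		returns only once my_wq is empty
 */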
2717
2718static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2719{
2720	struct worker *worker = NULL;
2721	struct worker_pool *pool;
2722	struct pool_workqueue *pwq;
2723
2724	might_sleep();
2725
2726	local_irq_disable();
2727	pool = get_work_pool(work);
2728	if (!pool) {
2729		local_irq_enable();
2730		return false;
2731	}
2732
2733	spin_lock(&pool->lock);
2734	/* see the comment in try_to_grab_pending() with the same code */
2735	pwq = get_work_pwq(work);
2736	if (pwq) {
2737		if (unlikely(pwq->pool != pool))
2738			goto already_gone;
2739	} else {
2740		worker = find_worker_executing_work(pool, work);
2741		if (!worker)
2742			goto already_gone;
2743		pwq = worker->current_pwq;
2744	}
2745
2746	insert_wq_barrier(pwq, barr, work, worker);
2747	spin_unlock_irq(&pool->lock);
2748
2749	/*
2750	 * If @max_active is 1 or rescuer is in use, flushing another work
2751	 * item on the same workqueue may lead to deadlock.  Make sure the
2752	 * flusher is not running on the same workqueue by verifying write
2753	 * access.
2754	 */
2755	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
2756		lock_map_acquire(&pwq->wq->lockdep_map);
2757	else
2758		lock_map_acquire_read(&pwq->wq->lockdep_map);
2759	lock_map_release(&pwq->wq->lockdep_map);
2760
2761	return true;
2762already_gone:
2763	spin_unlock_irq(&pool->lock);
2764	return false;
2765}
2766
2767/**
2768 * flush_work - wait for a work to finish executing the last queueing instance
2769 * @work: the work to flush
2770 *
2771 * Wait until @work has finished execution.  @work is guaranteed to be idle
2772 * on return if it hasn't been requeued since flush started.
2773 *
2774 * Return:
2775 * %true if flush_work() waited for the work to finish execution,
2776 * %false if it was already idle.
2777 */
2778bool flush_work(struct work_struct *work)
2779{
2780	struct wq_barrier barr;
2781
2782	lock_map_acquire(&work->lockdep_map);
2783	lock_map_release(&work->lockdep_map);
2784
2785	if (start_flush_work(work, &barr)) {
2786		wait_for_completion(&barr.done);
2787		destroy_work_on_stack(&barr.work);
2788		return true;
2789	} else {
2790		return false;
2791	}
2792}
2793EXPORT_SYMBOL_GPL(flush_work);
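
/*
 * Illustrative usage sketch (hypothetical driver, invented names): the
 * typical flush_work() pairing in a teardown path, waiting for the last
 * queueing instance to finish before freeing data it may be using.
 *
 *	static void my_dev_remove(struct my_dev *dev)
 *	{
 *		...
 *		flush_work(&dev->reset_work);
 *		kfree(dev->reset_ctx);
 *	}
 *
 * flush_work() doesn't prevent future requeueing; when the work item may
 * re-arm itself, cancel_work_sync() below is the right call.
 */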
2794
2795struct cwt_wait {
2796	wait_queue_t		wait;
2797	struct work_struct	*work;
2798};
2799
2800static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
2801{
2802	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
2803
2804	if (cwait->work != key)
2805		return 0;
2806	return autoremove_wake_function(wait, mode, sync, key);
2807}
2808
2809static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2810{
2811	static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
2812	unsigned long flags;
2813	int ret;
2814
2815	do {
2816		ret = try_to_grab_pending(work, is_dwork, &flags);
2817		/*
2818		 * If someone else is already canceling, wait for it to
2819		 * finish.  flush_work() doesn't work for PREEMPT_NONE
2820		 * because we may get scheduled between @work's completion
2821		 * and the other canceling task resuming and clearing
2822		 * CANCELING: flush_work() will return false immediately
2823		 * as @work is no longer busy, try_to_grab_pending() will
2824		 * return -ENOENT as @work is still being canceled, and the
2825		 * other canceling task won't be able to clear CANCELING as
2826		 * we're hogging the CPU.
2827		 *
2828		 * Let's wait for completion using a waitqueue.  As this
2829		 * may lead to the thundering herd problem, use a custom
2830		 * wake function which matches @work along with exclusive
2831		 * wait and wakeup.
2832		 */
2833		if (unlikely(ret == -ENOENT)) {
2834			struct cwt_wait cwait;
2835
2836			init_wait(&cwait.wait);
2837			cwait.wait.func = cwt_wakefn;
2838			cwait.work = work;
2839
2840			prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
2841						  TASK_UNINTERRUPTIBLE);
2842			if (work_is_canceling(work))
2843				schedule();
2844			finish_wait(&cancel_waitq, &cwait.wait);
2845		}
2846	} while (unlikely(ret < 0));
2847
2848	/* tell other tasks trying to grab @work to back off */
2849	mark_work_canceling(work);
2850	local_irq_restore(flags);
2851
2852	flush_work(work);
2853	clear_work_data(work);
2854
2855	/*
2856	 * Paired with prepare_to_wait_exclusive() above so that either
2857	 * waitqueue_active() is visible here or !work_is_canceling() is
2858	 * visible there.
2859	 */
2860	smp_mb();
2861	if (waitqueue_active(&cancel_waitq))
2862		__wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
2863
2864	return ret;
2865}
2866
2867/**
2868 * cancel_work_sync - cancel a work and wait for it to finish
2869 * @work: the work to cancel
2870 *
2871 * Cancel @work and wait for its execution to finish.  This function
2872 * can be used even if the work re-queues itself or migrates to
2873 * another workqueue.  On return from this function, @work is
2874 * guaranteed to be not pending or executing on any CPU.
2875 *
2876 * cancel_work_sync(&delayed_work->work) must not be used for
2877 * delayed_work's.  Use cancel_delayed_work_sync() instead.
2878 *
2879 * The caller must ensure that the workqueue on which @work was last
2880 * queued can't be destroyed before this function returns.
2881 *
2882 * Return:
2883 * %true if @work was pending, %false otherwise.
2884 */
2885bool cancel_work_sync(struct work_struct *work)
2886{
2887	return __cancel_work_timer(work, false);
2888}
2889EXPORT_SYMBOL_GPL(cancel_work_sync);
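
/*
 * Illustrative sketch (invented names): cancel_work_sync() is the usual
 * module or device teardown call as it also copes with a work item that
 * re-queues itself from its own callback.
 *
 *	static void my_exit(void)
 *	{
 *		cancel_work_sync(&my_work);
 *		kfree(my_work_ctx);	safe: neither pending nor running
 *	}
 */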
2890
2891/**
2892 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2893 * @dwork: the delayed work to flush
2894 *
2895 * Delayed timer is cancelled and the pending work is queued for
2896 * immediate execution.  Like flush_work(), this function only
2897 * considers the last queueing instance of @dwork.
2898 *
2899 * Return:
2900 * %true if flush_work() waited for the work to finish execution,
2901 * %false if it was already idle.
2902 */
2903bool flush_delayed_work(struct delayed_work *dwork)
2904{
2905	local_irq_disable();
2906	if (del_timer_sync(&dwork->timer))
2907		__queue_work(dwork->cpu, dwork->wq, &dwork->work);
2908	local_irq_enable();
2909	return flush_work(&dwork->work);
2910}
2911EXPORT_SYMBOL(flush_delayed_work);
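
/*
 * Illustrative sketch (invented names): forcing a pending delayed work to
 * run immediately instead of waiting out its timer, e.g. on suspend.
 *
 *	static void my_dwork_fn(struct work_struct *work);
 *	static DECLARE_DELAYED_WORK(my_dwork, my_dwork_fn);
 *
 *	schedule_delayed_work(&my_dwork, msecs_to_jiffies(5000));
 *	...
 *	flush_delayed_work(&my_dwork);	my_dwork_fn() has completed by now
 */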
2912
2913/**
2914 * cancel_delayed_work - cancel a delayed work
2915 * @dwork: delayed_work to cancel
2916 *
2917 * Kill off a pending delayed_work.
2918 *
2919 * Return: %true if @dwork was pending and canceled; %false if it wasn't
2920 * pending.
2921 *
2922 * Note:
2923 * The work callback function may still be running on return, unless
2924 * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
2925 * use cancel_delayed_work_sync() to wait on it.
2926 *
2927 * This function is safe to call from any context including IRQ handler.
2928 */
2929bool cancel_delayed_work(struct delayed_work *dwork)
2930{
2931	unsigned long flags;
2932	int ret;
2933
2934	do {
2935		ret = try_to_grab_pending(&dwork->work, true, &flags);
2936	} while (unlikely(ret == -EAGAIN));
2937
2938	if (unlikely(ret < 0))
2939		return false;
2940
2941	set_work_pool_and_clear_pending(&dwork->work,
2942					get_work_pool_id(&dwork->work));
2943	local_irq_restore(flags);
2944	return ret;
2945}
2946EXPORT_SYMBOL(cancel_delayed_work);
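
/*
 * Illustrative sketch (invented names): because cancel_delayed_work() is
 * safe from atomic context, it can be used to push back a pending timeout
 * handler from e.g. an interrupt handler.
 *
 *	if (cancel_delayed_work(&my_timeout_dwork))
 *		schedule_delayed_work(&my_timeout_dwork, my_new_delay);
 *
 * mod_delayed_work() packages this cancel-and-requeue pattern as a single
 * call; cancel_delayed_work_sync() is needed if the callback must not be
 * running on return.
 */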
2947
2948/**
2949 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2950 * @dwork: the delayed work cancel
2951 *
2952 * This is cancel_work_sync() for delayed works.
2953 *
2954 * Return:
2955 * %true if @dwork was pending, %false otherwise.
2956 */
2957bool cancel_delayed_work_sync(struct delayed_work *dwork)
2958{
2959	return __cancel_work_timer(&dwork->work, true);
2960}
2961EXPORT_SYMBOL(cancel_delayed_work_sync);
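
/*
 * Illustrative sketch (invented names): the delayed-work variant of the
 * teardown pattern, e.g. stopping a polling handler on device removal.
 *
 *	static void my_remove(struct my_dev *dev)
 *	{
 *		cancel_delayed_work_sync(&dev->poll_dwork);
 *		...	the poll handler is neither pending nor running here
 *	}
 */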
2962
2963/**
2964 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2965 * @func: the function to call
2966 *
2967 * schedule_on_each_cpu() executes @func on each online CPU using the
2968 * system workqueue and blocks until all CPUs have completed.
2969 * schedule_on_each_cpu() is very slow.
2970 *
2971 * Return:
2972 * 0 on success, -errno on failure.
2973 */
2974int schedule_on_each_cpu(work_func_t func)
2975{
2976	int cpu;
2977	struct work_struct __percpu *works;
2978
2979	works = alloc_percpu(struct work_struct);
2980	if (!works)
2981		return -ENOMEM;
2982
2983	get_online_cpus();
2984
2985	for_each_online_cpu(cpu) {
2986		struct work_struct *work = per_cpu_ptr(works, cpu);
2987
2988		INIT_WORK(work, func);
2989		schedule_work_on(cpu, work);
2990	}
2991
2992	for_each_online_cpu(cpu)
2993		flush_work(per_cpu_ptr(works, cpu));
2994
2995	put_online_cpus();
2996	free_percpu(works);
2997	return 0;
2998}
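
/*
 * Illustrative sketch (invented function name): schedule_on_each_cpu() fits
 * one-off, sleepable per-CPU maintenance where the callback must run
 * locally on every online CPU, e.g. draining a per-cpu cache.
 *
 *	static void my_drain_local_cache(struct work_struct *unused)
 *	{
 *		...	runs on each online CPU in process context
 *	}
 *
 *	int ret = schedule_on_each_cpu(my_drain_local_cache);
 */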
2999
3000/**
3001 * flush_scheduled_work - ensure that any scheduled work has run to completion.
3002 *
3003 * Forces execution of the kernel-global workqueue and blocks until its
3004 * completion.
3005 *
3006 * Think twice before calling this function!  It's very easy to get into
3007 * trouble if you don't take great care.  Either of the following situations
3008 * will lead to deadlock:
3009 *
3010 *	One of the work items currently on the workqueue needs to acquire
3011 *	a lock held by your code or its caller.
3012 *
3013 *	Your code is running in the context of a work routine.
3014 *
3015 * They will be detected by lockdep when they occur, but the first might not
3016 * occur very often.  It depends on what work items are on the workqueue and
3017 * what locks they need, which you have no control over.
3018 *
3019 * In most situations flushing the entire workqueue is overkill; you merely
3020 * need to know that a particular work item isn't queued and isn't running.
3021 * In such cases you should use cancel_delayed_work_sync() or
3022 * cancel_work_sync() instead.
3023 */
3024void flush_scheduled_work(void)
3025{
3026	flush_workqueue(system_wq);
3027}
3028EXPORT_SYMBOL(flush_scheduled_work);
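
/*
 * Illustrative sketch (invented name): per the warning above, flushing or
 * canceling the specific item is almost always preferable to flushing the
 * whole system workqueue.
 *
 *	cancel_work_sync(&my_work);	instead of flush_scheduled_work()
 */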
3029
3030/**
3031 * execute_in_process_context - reliably execute the routine with user context
3032 * @fn:		the function to execute
3033 * @ew:		guaranteed storage for the execute work structure (must
3034 *		be available when the work executes)
3035 *
3036 * Executes the function immediately if process context is available,
3037 * otherwise schedules the function for delayed execution.
3038 *
3039 * Return:	0 - function was executed
3040 *		1 - function was scheduled for execution
3041 */
3042int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3043{
3044	if (!in_interrupt()) {
3045		fn(&ew->work);
3046		return 0;
3047	}
3048
3049	INIT_WORK(&ew->work, fn);
3050	schedule_work(&ew->work);
3051
3052	return 1;
3053}
3054EXPORT_SYMBOL_GPL(execute_in_process_context);
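
/*
 * Illustrative sketch (invented names): execute_in_process_context() suits
 * cleanup paths that may be entered from either process or interrupt
 * context, e.g. a release callback triggered by an IRQ-time refcount drop.
 * The execute_work storage must stay valid until the work has executed.
 *
 *	static struct execute_work my_ew;
 *
 *	execute_in_process_context(my_release_fn, &my_ew);
 */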
3055
3056/**
3057 * free_workqueue_attrs - free a workqueue_attrs
3058 * @attrs: workqueue_attrs to free
3059 *
3060 * Undo alloc_workqueue_attrs().
3061 */
3062void free_workqueue_attrs(struct workqueue_attrs *attrs)
3063{
3064	if (attrs) {
3065		free_cpumask_var(attrs->cpumask);
3066		kfree(attrs);
3067	}
3068}
3069
3070/**
3071 * alloc_workqueue_attrs - allocate a workqueue_attrs
3072 * @gfp_mask: allocation mask to use
3073 *
3074 * Allocate a new workqueue_attrs, initialize with default settings and
3075 * return it.
3076 *
3077 * Return: The newly allocated workqueue_attrs on success.  %NULL on failure.
3078 */
3079struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3080{
3081	struct workqueue_attrs *attrs;
3082
3083	attrs = kzalloc(sizeof(*attrs), gfp_mask);
3084	if (!attrs)
3085		goto fail;
3086	if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
3087		goto fail;
3088
3089	cpumask_copy(attrs->cpumask, cpu_possible_mask);
3090	return attrs;
3091fail:
3092	free_workqueue_attrs(attrs);
3093	return NULL;
3094}
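
/*
 * Illustrative sketch (invented names, assuming the caller owns an unbound
 * workqueue it may retarget with apply_workqueue_attrs()): the usual
 * alloc/modify/apply/free cycle for workqueue_attrs.
 *
 *	struct workqueue_attrs *attrs = alloc_workqueue_attrs(GFP_KERNEL);
 *
 *	if (attrs) {
 *		attrs->nice = -5;
 *		cpumask_copy(attrs->cpumask, cpumask_of(2));
 *		ret = apply_workqueue_attrs(my_unbound_wq, attrs);
 *		free_workqueue_attrs(attrs);
 *	}
 */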
3095
3096static void copy_workqueue_attrs(struct workqueue_attrs *to,
3097				 const struct workqueue_attrs *from)
3098{
3099	to->nice = from->nice;
3100	cpumask_copy(to->cpumask, from->cpumask);
3101	/*
3102	 * Unlike hash and equality test, this function doesn't ignore
3103	 * ->no_numa as it is used for both pool and wq attrs.  Instead,
3104	 * get_unbound_pool() explicitly clears ->no_numa after copying.
3105	 */
3106	to->no_numa = from->no_numa;
3107}
3108
3109/* hash value of the content of @attr */
3110static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
3111{
3112	u32 hash = 0;
3113
3114	hash = jhash_1word(attrs->nice, hash);
3115	hash = jhash(cpumask_bits(attrs->cpumask),
3116		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
3117	return hash;
3118}
3119
3120/* content equality test */
3121static bool wqattrs_equal(const struct workqueue_attrs *a,
3122			  const struct workqueue_attrs *b)
3123{
3124	if (a->nice != b->nice)
3125		return false;
3126	if (!cpumask_equal(a->cpumask, b->cpumask))
3127		return false;
3128	return true;
3129}
3130
3131/**
3132 * init_worker_pool - initialize a newly zalloc'd worker_pool
3133 * @pool: worker_pool to initialize
3134 *
3135 * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
3136 *
3137 * Return: 0 on success, -errno on failure.  Even on failure, all fields
3138 * inside @pool proper are initialized and put_unbound_pool() can be called
3139 * on @pool safely to release it.
3140 */
3141static int init_worker_pool(struct worker_pool *pool)
3142{
3143	spin_lock_init(&pool->lock);
3144	pool->id = -1;
3145	pool->cpu = -1;
3146	pool->node = NUMA_NO_NODE;
3147	pool->flags |= POOL_DISASSOCIATED;
3148	INIT_LIST_HEAD(&pool->worklist);
3149	INIT_LIST_HEAD(&pool->idle_list);
3150	hash_init(pool->busy_hash);
3151
3152	init_timer_deferrable(&pool->idle_timer);
3153	pool->idle_timer.function = idle_worker_timeout;
3154	pool->idle_timer.data = (unsigned long)pool;
3155
3156	setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3157		    (unsigned long)pool);
3158
3159	mutex_init(&pool->manager_arb);
3160	mutex_init(&pool->attach_mutex);
3161	INIT_LIST_HEAD(&pool->workers);
3162
3163	ida_init(&pool->worker_ida);
3164	INIT_HLIST_NODE(&pool->hash_node);
3165	pool->refcnt = 1;
3166
3167	/* shouldn't fail above this point */
3168	pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
3169	if (!pool->attrs)
3170		return -ENOMEM;
3171	return 0;
3172}
3173
3174static void rcu_free_wq(struct rcu_head *rcu)
3175{
3176	struct workqueue_struct *wq =
3177		container_of(rcu, struct workqueue_struct, rcu);
3178
3179	if (!(wq->flags & WQ_UNBOUND))
3180		free_percpu(wq->cpu_pwqs);
3181	else
3182		free_workqueue_attrs(wq->unbound_attrs);
3183
3184	kfree(wq->rescuer);
3185	kfree(wq);
3186}
3187
3188static void rcu_free_pool(struct rcu_head *rcu)
3189{
3190	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3191
3192	ida_destroy(&pool->worker_ida);
3193	free_workqueue_attrs(pool->attrs);
3194	kfree(pool);
3195}
3196
3197/**
3198 * put_unbound_pool - put a worker_pool
3199 * @pool: worker_pool to put
3200 *
3201 * Put @pool.  If its refcnt reaches zero, it gets destroyed in a sched-RCU
3202 * safe manner.  get_unbound_pool() calls this function on its failure path
3203 * and this function should be able to release pools which went through
3204 * init_worker_pool(), whether successfully or not.
3205 *
3206 * Should be called with wq_pool_mutex held.
3207 */
3208static void put_unbound_pool(struct worker_pool *pool)
3209{
3210	DECLARE_COMPLETION_ONSTACK(detach_completion);
3211	struct worker *worker;
3212
3213	lockdep_assert_held(&wq_pool_mutex);
3214
3215	if (--pool->refcnt)
3216		return;
3217
3218	/* sanity checks */
3219	if (WARN_ON(!(pool->cpu < 0)) ||
3220	    WARN_ON(!list_empty(&pool->worklist)))
3221		return;
3222
3223	/* release id and unhash */
3224	if (pool->id >= 0)
3225		idr_remove(&worker_pool_idr, pool->id);
3226	hash_del(&pool->hash_node);
3227
3228	/*
3229	 * Become the manager and destroy all workers.  Grabbing
3230	 * manager_arb prevents @pool's workers from blocking on
3231	 * attach_mutex.
3232	 */
3233	mutex_lock(&pool->manager_arb);
3234
3235	spin_lock_irq(&pool->lock);
3236	while ((worker = first_idle_worker(pool)))
3237		destroy_worker(worker);
3238	WARN_ON(pool->nr_workers || pool->nr_idle);
3239	spin_unlock_irq(&pool->lock);
3240
3241	mutex_lock(&pool->attach_mutex);
3242	if (!list_empty(&pool->workers))
3243		pool->detach_completion = &detach_completion;
3244	mutex_unlock(&pool->attach_mutex);
3245
3246	if (pool->detach_completion)
3247		wait_for_completion(pool->detach_completion);
3248
3249	mutex_unlock(&pool->manager_arb);
3250
3251	/* shut down the timers */
3252	del_timer_sync(&pool->idle_timer);
3253	del_timer_sync(&pool->mayday_timer);
3254
3255	/* sched-RCU protected to allow dereferences from get_work_pool() */
3256	call_rcu_sched(&pool->rcu, rcu_free_pool);
3257}
3258
3259/**
3260 * get_unbound_pool - get a worker_pool with the specified attributes
3261 * @attrs: the attributes of the worker_pool to get
3262 *
3263 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3264 * reference count and return it.  If there already is a matching
3265 * worker_pool, it will be used; otherwise, this function attempts to
3266 * create a new one.
3267 *
3268 * Should be called with wq_pool_mutex held.
3269 *
3270 * Return: On success, a worker_pool with the same attributes as @attrs.
3271 * On failure, %NULL.
3272 */
3273static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3274{
3275	u32 hash = wqattrs_hash(attrs);
3276	struct worker_pool *pool;
3277	int node;
3278
3279	lockdep_assert_held(&wq_pool_mutex);
3280
3281	/* do we already have a matching pool? */
3282	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3283		if (wqattrs_equal(pool->attrs, attrs)) {
3284			pool->refcnt++;
3285			return pool;
3286		}
3287	}
3288
3289	/* nope, create a new one */
3290	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
3291	if (!pool || init_worker_pool(pool) < 0)
3292		goto fail;
3293
3294	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */
3295	copy_workqueue_attrs(pool->attrs, attrs);
3296
3297	/*
3298	 * no_numa isn't a worker_pool attribute, always clear it.  See
3299	 * 'struct workqueue_attrs' comments for detail.
3300	 */
3301	pool->attrs->no_numa = false;
3302
3303	/* if cpumask is contained inside a NUMA node, we belong to that node */
3304	if (wq_numa_enabled) {
3305		for_each_node(node) {
3306			if (cpumask_subset(pool->attrs->cpumask,
3307					   wq_numa_possible_cpumask[node])) {
3308				pool->node = node;
3309				break;
3310			}
3311		}
3312	}
3313
3314	if (worker_pool_assign_id(pool) < 0)
3315		goto fail;
3316
3317	/* create and start the initial worker */
3318	if (!create_worker(pool))
3319		goto fail;
3320
3321	/* install */
3322	hash_add(unbound_pool_hash, &pool->hash_node, hash);
3323
3324	return pool;
3325fail:
3326	if (pool)
3327		put_unbound_pool(pool);
3328	return NULL;
3329}
3330
3331static void rcu_free_pwq(struct rcu_head *rcu)
3332{
3333	kmem_cache_free(pwq_cache,
3334			container_of(rcu, struct pool_workqueue, rcu));
3335}
3336
3337/*
3338 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
3339 * and needs to be destroyed.
3340 */
3341static void pwq_unbound_release_workfn(struct work_struct *work)
3342{
3343	struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
3344						  unbound_release_work);
3345	struct workqueue_struct *wq = pwq->wq;
3346	struct worker_pool *pool = pwq->pool;
3347	bool is_last;
3348
3349	if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3350		return;
3351
3352	mutex_lock(&wq->mutex);
3353	list_del_rcu(&pwq->pwqs_node);
3354	is_last = list_empty(&wq->pwqs);
3355	mutex_unlock(&wq->mutex);
3356
3357	mutex_lock(&wq_pool_mutex);
3358	put_unbound_pool(pool);
3359	mutex_unlock(&wq_pool_mutex);
3360
3361	call_rcu_sched(&pwq->rcu, rcu_free_pwq);
3362
3363	/*
3364	 * If we're the last pwq going away, @wq is already dead and no one
3365	 * is gonna access it anymore.  Schedule RCU free.
3366	 */
3367	if (is_last)
3368		call_rcu_sched(&wq->rcu, rcu_free_wq);
3369}
3370
3371/**
3372 * pwq_adjust_max_active - update a pwq's max_active to the current setting
3373 * @pwq: target pool_workqueue
3374 *
3375 * If @pwq isn't freezing, set @pwq->max_active to the associated
3376 * workqueue's saved_max_active and activate delayed work items
3377 * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero.
3378 */
3379static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3380{
3381	struct workqueue_struct *wq = pwq->wq;
3382	bool freezable = wq->flags & WQ_FREEZABLE;
3383
3384	/* for @wq->saved_max_active */
3385	lockdep_assert_held(&wq->mutex);
3386
3387	/* fast exit for non-freezable wqs */
3388	if (!freezable && pwq->max_active == wq->saved_max_active)
3389		return;
3390
3391	spin_lock_irq(&pwq->pool->lock);
3392
3393	/*
3394	 * During [un]freezing, the caller is responsible for ensuring that
3395	 * this function is called at least once after @workqueue_freezing
3396	 * is updated and visible.
3397	 */
3398	if (!freezable || !workqueue_freezing) {
3399		pwq->max_active = wq->saved_max_active;
3400
3401		while (!list_empty(&pwq->delayed_works) &&
3402		       pwq->nr_active < pwq->max_active)
3403			pwq_activate_first_delayed(pwq);
3404
3405		/*
3406		 * Need to kick a worker after thawing or when an unbound wq's
3407		 * max_active is bumped.  It's a slow path.  Do it always.
3408		 */
3409		wake_up_worker(pwq->pool);
3410	} else {
3411		pwq->max_active = 0;
3412	}
3413
3414	spin_unlock_irq(&pwq->pool->lock);
3415}
3416
3417/* initialize newly alloced @pwq which is associated with @wq and @pool */
3418static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
3419		     struct worker_pool *pool)
3420{
3421	BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3422
3423	memset(pwq, 0, sizeof(*pwq));
3424
3425	pwq->pool = pool;
3426	pwq->wq = wq;
3427	pwq->flush_color = -1;
3428	pwq->refcnt = 1;
3429	INIT_LIST_HEAD(&pwq->delayed_works);
3430	INIT_LIST_HEAD(&pwq->pwqs_node);
3431	INIT_LIST_HEAD(&pwq->mayday_node);
3432	INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
3433}
3434
3435/* sync @pwq with the current state of its associated wq and link it */
3436static void link_pwq(struct pool_workqueue *pwq)
3437{
3438	struct workqueue_struct *wq = pwq->wq;
3439
3440	lockdep_assert_held(&wq->mutex);
3441
3442	/* may be called multiple times, ignore if already linked */
3443	if (!list_empty(&pwq->pwqs_node))
3444		return;
3445
3446	/* set the matching work_color */
3447	pwq->work_color = wq->work_color;
3448
3449	/* sync max_active to the current setting */
3450	pwq_adjust_max_active(pwq);
3451
3452	/* link in @pwq */
3453	list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
3454}
3455
3456/* obtain a pool matching @attrs and create a pwq associating the pool and @wq */
3457static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
3458					const struct workqueue_attrs *attrs)
3459{
3460	struct worker_pool *pool;
3461	struct pool_workqueue *pwq;
3462
3463	lockdep_assert_held(&wq_pool_mutex);
3464
3465	pool = get_unbound_pool(attrs);
3466	if (!pool)
3467		return NULL;
3468
3469	pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
3470	if (!pwq) {
3471		put_unbound_pool(pool);
3472		return NULL;
3473	}
3474
3475	init_pwq(pwq, wq, pool);
3476	return pwq;
3477}
3478
3479/**
3480 * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
3481 * @attrs: the wq_attrs of interest
3482 * @node: the target NUMA node
3483 * @cpu_going_down: if >= 0, the CPU to consider as offline
3484 * @cpumask: outarg, the resulting cpumask
3485 *
3486 * Calculate the cpumask a workqueue with @attrs should use on @node.  If
3487 * @cpu_going_down is >= 0, that cpu is considered offline during
3488 * calculation.  The result is stored in @cpumask.
3489 *
3490 * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
3491 * enabled and @node has online CPUs requested by @attrs, the returned
3492 * cpumask is the intersection of the possible CPUs of @node and
3493 * @attrs->cpumask.
3494 *
3495 * The caller is responsible for ensuring that the cpumask of @node stays
3496 * stable.
3497 *
3498 * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
3499 * %false if equal.
3500 */
3501static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3502				 int cpu_going_down, cpumask_t *cpumask)
3503{
3504	if (!wq_numa_enabled || attrs->no_numa)
3505		goto use_dfl;
3506
3507	/* does @node have any online CPUs @attrs wants? */
3508	cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
3509	if (cpu_going_down >= 0)
3510		cpumask_clear_cpu(cpu_going_down, cpumask);
3511
3512	if (cpumask_empty(cpumask))
3513		goto use_dfl;
3514
3515	/* yeap, return possible CPUs in @node that @attrs wants */
3516	cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
3517	return !cpumask_equal(cpumask, attrs->cpumask);
3518
3519use_dfl:
3520	cpumask_copy(cpumask, attrs->cpumask);
3521	return false;
3522}
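
/*
 * Worked example (editor's illustration with hypothetical numbers): with
 * NUMA affinity enabled, @attrs->cpumask spanning CPUs 0-7 and node 1
 * owning possible CPUs 4-7, @cpumask is set to 4-7 and %true is returned,
 * so a node-specific pwq is worth creating.  If node 1 had no online CPU
 * in 0-7, or NUMA affinity were disabled, @cpumask would be set to 0-7
 * and %false returned so that the default pwq is used instead.
 */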
3523
3524/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
3525static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3526						   int node,
3527						   struct pool_workqueue *pwq)
3528{
3529	struct pool_workqueue *old_pwq;
3530
3531	lockdep_assert_held(&wq_pool_mutex);
3532	lockdep_assert_held(&wq->mutex);
3533
3534	/* link_pwq() can handle duplicate calls */
3535	link_pwq(pwq);
3536
3537	old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3538	rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
3539	return old_pwq;
3540}
3541
3542/* context to store the prepared attrs & pwqs before applying */
3543struct apply_wqattrs_ctx {
3544	struct workqueue_struct	*wq;		/* target workqueue */
3545	struct workqueue_attrs	*attrs;		/* attrs to apply */
3546	struct pool_workqueue	*dfl_pwq;
3547	struct pool_workqueue	*pwq_tbl[];
3548};
3549
3550/* free the resources after success or abort */
3551static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
3552{
3553	if (ctx) {
3554		int node;
3555
3556		for_each_node(node)
3557			put_pwq_unlocked(ctx->pwq_tbl[node]);
3558		put_pwq_unlocked(ctx->dfl_pwq);
3559
3560		free_workqueue_attrs(ctx->attrs);
3561
3562		kfree(ctx);
3563	}
3564}
3565
3566/* allocate the attrs and pwqs for later installation */
3567static struct apply_wqattrs_ctx *
3568apply_wqattrs_prepare(struct workqueue_struct *wq,
3569		      const struct workqueue_attrs *attrs)
3570{
3571	struct apply_wqattrs_ctx *ctx;
3572	struct workqueue_attrs *new_attrs, *tmp_attrs;
3573	int node;
3574
3575	lockdep_assert_held(&wq_pool_mutex);
3576
3577	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
3578		      GFP_KERNEL);
3579
3580	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3581	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3582	if (!ctx || !new_attrs || !tmp_attrs)
3583		goto out_free;
3584
3585	/* make a copy of @attrs and sanitize it */
3586	copy_workqueue_attrs(new_attrs, attrs);
3587	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
3588
3589	/*
3590	 * We may create multiple pwqs with differing cpumasks.  Make a
3591	 * copy of @new_attrs which will be modified and used to obtain
3592	 * pools.
3593	 */
3594	copy_workqueue_attrs(tmp_attrs, new_attrs);
3595
3596	/*
3597	 * If something goes wrong during CPU up/down, we'll fall back to
3598	 * the default pwq covering the whole @attrs->cpumask.  Always create
3599	 * it even if we don't use it immediately.
3600	 */
3601	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
3602	if (!ctx->dfl_pwq)
3603		goto out_free;
3604
3605	for_each_node(node) {
3606		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
3607			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
3608			if (!ctx->pwq_tbl[node])
3609				goto out_free;
3610		} else {
3611			ctx->dfl_pwq->refcnt++;
3612			ctx->pwq_tbl[node] = ctx->dfl_pwq;
3613		}
3614	}
3615
3616	ctx->attrs = new_attrs;
3617	ctx->wq = wq;
3618	free_workqueue_attrs(tmp_attrs);
3619	return ctx;
3620
3621out_free:
3622	free_workqueue_attrs(tmp_attrs);
3623	free_workqueue_attrs(new_attrs);
3624	apply_wqattrs_cleanup(ctx);
3625	return NULL;
3626}
3627
3628/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
3629static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
3630{
3631	int node;
3632
3633	/* all pwqs have been created successfully, let's install'em */
3634	mutex_lock(&ctx->wq->mutex);
3635
3636	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
3637
3638	/* save the previous pwq and install the new one */
3639	for_each_node(node)
3640		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
3641							  ctx->pwq_tbl[node]);
3642
3643	/* @dfl_pwq might not have been used, ensure it's linked */
3644	link_pwq(ctx->dfl_pwq);
3645	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
3646
3647	mutex_unlock(&ctx->wq->mutex);
3648}
3649
3650/**
3651 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
3652 * @wq: the target workqueue
3653 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
3654 *
3655 * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
3656 * machines, this function maps a separate pwq to each NUMA node with
3657 * possible CPUs in @attrs->cpumask so that work items are affine to the
3658 * NUMA node it was issued on.  Older pwqs are released as in-flight work
3659 * items finish.  Note that a work item which repeatedly requeues itself
3660 * back-to-back will stay on its current pwq.
3661 *
3662 * Performs GFP_KERNEL allocations.
3663 *
3664 * Return: 0 on success and -errno on failure.
3665 */
3666int apply_workqueue_attrs(struct workqueue_struct *wq,
3667			  const struct workqueue_attrs *attrs)
3668{
3669	struct apply_wqattrs_ctx *ctx;
3670	int ret = -ENOMEM;
3671
3672	/* only unbound workqueues can change attributes */
3673	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
3674		return -EINVAL;
3675
3676	/* creating multiple pwqs breaks ordering guarantee */
3677	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3678		return -EINVAL;
3679
3680	/*
3681	 * CPUs should stay stable across pwq creations and installations.
3682	 * Pin CPUs, determine the target cpumask for each node and create
3683	 * pwqs accordingly.
3684	 */
3685	get_online_cpus();
3686	mutex_lock(&wq_pool_mutex);
3687
3688	ctx = apply_wqattrs_prepare(wq, attrs);
3689
3690	/* the ctx has been prepared successfully, let's commit it */
3691	if (ctx) {
3692		apply_wqattrs_commit(ctx);
3693		ret = 0;
3694	}
3695
3696	mutex_unlock(&wq_pool_mutex);
3697	put_online_cpus();
3698
3699	apply_wqattrs_cleanup(ctx);
3700
3701	return ret;
3702}
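
/*
 * Usage sketch (editor's illustration; @my_unbound_wq and the attribute
 * values are hypothetical): restrict an unbound workqueue to the CPUs of
 * node 0 and lower its nice value.
 *
 *	struct workqueue_attrs *attrs;
 *	int ret = -ENOMEM;
 *
 *	attrs = alloc_workqueue_attrs(GFP_KERNEL);
 *	if (attrs) {
 *		attrs->nice = -5;
 *		cpumask_copy(attrs->cpumask, cpumask_of_node(0));
 *		ret = apply_workqueue_attrs(my_unbound_wq, attrs);
 *		free_workqueue_attrs(attrs);
 *	}
 */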
3703
3704/**
3705 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
3706 * @wq: the target workqueue
3707 * @cpu: the CPU coming up or going down
3708 * @online: whether @cpu is coming up or going down
3709 *
3710 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
3711 * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
3712 * @wq accordingly.
3713 *
3714 * If NUMA affinity can't be adjusted due to memory allocation failure, it
3715 * falls back to @wq->dfl_pwq which may not be optimal but is always
3716 * correct.
3717 *
3718 * Note that when the last allowed CPU of a NUMA node goes offline for a
3719 * workqueue with a cpumask spanning multiple nodes, the workers which were
3720 * already executing the work items for the workqueue will lose their CPU
3721 * affinity and may execute on any CPU.  This is similar to how per-cpu
3722 * workqueues behave on CPU_DOWN.  If a workqueue user wants strict
3723 * affinity, it's the user's responsibility to flush the work item from
3724 * CPU_DOWN_PREPARE.
3725 */
3726static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
3727				   bool online)
3728{
3729	int node = cpu_to_node(cpu);
3730	int cpu_off = online ? -1 : cpu;
3731	struct pool_workqueue *old_pwq = NULL, *pwq;
3732	struct workqueue_attrs *target_attrs;
3733	cpumask_t *cpumask;
3734
3735	lockdep_assert_held(&wq_pool_mutex);
3736
3737	if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
3738		return;
3739
3740	/*
3741	 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
3742	 * Let's use a preallocated one.  The following buf is protected by
3743	 * CPU hotplug exclusion.
3744	 */
3745	target_attrs = wq_update_unbound_numa_attrs_buf;
3746	cpumask = target_attrs->cpumask;
3747
3748	mutex_lock(&wq->mutex);
3749	if (wq->unbound_attrs->no_numa)
3750		goto out_unlock;
3751
3752	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3753	pwq = unbound_pwq_by_node(wq, node);
3754
3755	/*
3756	 * Let's determine what needs to be done.  If the target cpumask is
3757	 * different from wq's, we need to compare it to @pwq's and create
3758	 * a new one if they don't match.  If the target cpumask equals
3759	 * wq's, the default pwq should be used.
3760	 */
3761	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
3762		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
3763			goto out_unlock;
3764	} else {
3765		goto use_dfl_pwq;
3766	}
3767
3768	mutex_unlock(&wq->mutex);
3769
3770	/* create a new pwq */
3771	pwq = alloc_unbound_pwq(wq, target_attrs);
3772	if (!pwq) {
3773		pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
3774			wq->name);
3775		mutex_lock(&wq->mutex);
3776		goto use_dfl_pwq;
3777	}
3778
3779	/*
3780	 * Install the new pwq.  As this function is called only from CPU
3781	 * hotplug callbacks and applying a new attrs is wrapped with
3782	 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
3783	 * in between.
3784	 */
3785	mutex_lock(&wq->mutex);
3786	old_pwq = numa_pwq_tbl_install(wq, node, pwq);
3787	goto out_unlock;
3788
3789use_dfl_pwq:
3790	spin_lock_irq(&wq->dfl_pwq->pool->lock);
3791	get_pwq(wq->dfl_pwq);
3792	spin_unlock_irq(&wq->dfl_pwq->pool->lock);
3793	old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
3794out_unlock:
3795	mutex_unlock(&wq->mutex);
3796	put_pwq_unlocked(old_pwq);
3797}
3798
3799static int alloc_and_link_pwqs(struct workqueue_struct *wq)
3800{
3801	bool highpri = wq->flags & WQ_HIGHPRI;
3802	int cpu, ret;
3803
3804	if (!(wq->flags & WQ_UNBOUND)) {
3805		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
3806		if (!wq->cpu_pwqs)
3807			return -ENOMEM;
3808
3809		for_each_possible_cpu(cpu) {
3810			struct pool_workqueue *pwq =
3811				per_cpu_ptr(wq->cpu_pwqs, cpu);
3812			struct worker_pool *cpu_pools =
3813				per_cpu(cpu_worker_pools, cpu);
3814
3815			init_pwq(pwq, wq, &cpu_pools[highpri]);
3816
3817			mutex_lock(&wq->mutex);
3818			link_pwq(pwq);
3819			mutex_unlock(&wq->mutex);
3820		}
3821		return 0;
3822	} else if (wq->flags & __WQ_ORDERED) {
3823		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
3824		/* there should be only a single pwq for the ordering guarantee */
3825		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
3826			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
3827		     "ordering guarantee broken for workqueue %s\n", wq->name);
3828		return ret;
3829	} else {
3830		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
3831	}
3832}
3833
3834static int wq_clamp_max_active(int max_active, unsigned int flags,
3835			       const char *name)
3836{
3837	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
3838
3839	if (max_active < 1 || max_active > lim)
3840		pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
3841			max_active, name, 1, lim);
3842
3843	return clamp_val(max_active, 1, lim);
3844}
3845
3846struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3847					       unsigned int flags,
3848					       int max_active,
3849					       struct lock_class_key *key,
3850					       const char *lock_name, ...)
3851{
3852	size_t tbl_size = 0;
3853	va_list args;
3854	struct workqueue_struct *wq;
3855	struct pool_workqueue *pwq;
3856
3857	/* see the comment above the definition of WQ_POWER_EFFICIENT */
3858	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
3859		flags |= WQ_UNBOUND;
3860
3861	/* allocate wq and format name */
3862	if (flags & WQ_UNBOUND)
3863		tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
3864
3865	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
3866	if (!wq)
3867		return NULL;
3868
3869	if (flags & WQ_UNBOUND) {
3870		wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3871		if (!wq->unbound_attrs)
3872			goto err_free_wq;
3873	}
3874
3875	va_start(args, lock_name);
3876	vsnprintf(wq->name, sizeof(wq->name), fmt, args);
3877	va_end(args);
3878
3879	max_active = max_active ?: WQ_DFL_ACTIVE;
3880	max_active = wq_clamp_max_active(max_active, flags, wq->name);
3881
3882	/* init wq */
3883	wq->flags = flags;
3884	wq->saved_max_active = max_active;
3885	mutex_init(&wq->mutex);
3886	atomic_set(&wq->nr_pwqs_to_flush, 0);
3887	INIT_LIST_HEAD(&wq->pwqs);
3888	INIT_LIST_HEAD(&wq->flusher_queue);
3889	INIT_LIST_HEAD(&wq->flusher_overflow);
3890	INIT_LIST_HEAD(&wq->maydays);
3891
3892	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3893	INIT_LIST_HEAD(&wq->list);
3894
3895	if (alloc_and_link_pwqs(wq) < 0)
3896		goto err_free_wq;
3897
3898	/*
3899	 * Workqueues which may be used during memory reclaim should
3900	 * have a rescuer to guarantee forward progress.
3901	 */
3902	if (flags & WQ_MEM_RECLAIM) {
3903		struct worker *rescuer;
3904
3905		rescuer = alloc_worker(NUMA_NO_NODE);
3906		if (!rescuer)
3907			goto err_destroy;
3908
3909		rescuer->rescue_wq = wq;
3910		rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3911					       wq->name);
3912		if (IS_ERR(rescuer->task)) {
3913			kfree(rescuer);
3914			goto err_destroy;
3915		}
3916
3917		wq->rescuer = rescuer;
3918		rescuer->task->flags |= PF_NO_SETAFFINITY;
3919		wake_up_process(rescuer->task);
3920	}
3921
3922	if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
3923		goto err_destroy;
3924
3925	/*
3926	 * wq_pool_mutex protects global freeze state and workqueues list.
3927	 * Grab it, adjust max_active and add the new @wq to workqueues
3928	 * list.
3929	 */
3930	mutex_lock(&wq_pool_mutex);
3931
3932	mutex_lock(&wq->mutex);
3933	for_each_pwq(pwq, wq)
3934		pwq_adjust_max_active(pwq);
3935	mutex_unlock(&wq->mutex);
3936
3937	list_add_tail_rcu(&wq->list, &workqueues);
3938
3939	mutex_unlock(&wq_pool_mutex);
3940
3941	return wq;
3942
3943err_free_wq:
3944	free_workqueue_attrs(wq->unbound_attrs);
3945	kfree(wq);
3946	return NULL;
3947err_destroy:
3948	destroy_workqueue(wq);
3949	return NULL;
3950}
3951EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
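
/*
 * Usage sketch (editor's illustration; the name, flags and work item are
 * hypothetical): callers normally reach this function through the
 * alloc_workqueue() wrapper from include/linux/workqueue.h.
 *
 *	struct workqueue_struct *wq;
 *
 *	wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
 *	if (!wq)
 *		return -ENOMEM;
 *	queue_work(wq, &my_work);
 */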
3952
3953/**
3954 * destroy_workqueue - safely terminate a workqueue
3955 * @wq: target workqueue
3956 *
3957 * Safely destroy a workqueue. All work currently pending will be done first.
3958 */
3959void destroy_workqueue(struct workqueue_struct *wq)
3960{
3961	struct pool_workqueue *pwq;
3962	int node;
3963
3964	/* drain it before proceeding with destruction */
3965	drain_workqueue(wq);
3966
3967	/* sanity checks */
3968	mutex_lock(&wq->mutex);
3969	for_each_pwq(pwq, wq) {
3970		int i;
3971
3972		for (i = 0; i < WORK_NR_COLORS; i++) {
3973			if (WARN_ON(pwq->nr_in_flight[i])) {
3974				mutex_unlock(&wq->mutex);
3975				return;
3976			}
3977		}
3978
3979		if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
3980		    WARN_ON(pwq->nr_active) ||
3981		    WARN_ON(!list_empty(&pwq->delayed_works))) {
3982			mutex_unlock(&wq->mutex);
3983			return;
3984		}
3985	}
3986	mutex_unlock(&wq->mutex);
3987
3988	/*
3989	 * wq list is used to freeze wq, remove from list after
3990	 * flushing is complete in case freeze races us.
3991	 */
3992	mutex_lock(&wq_pool_mutex);
3993	list_del_rcu(&wq->list);
3994	mutex_unlock(&wq_pool_mutex);
3995
3996	workqueue_sysfs_unregister(wq);
3997
3998	if (wq->rescuer)
3999		kthread_stop(wq->rescuer->task);
4000
4001	if (!(wq->flags & WQ_UNBOUND)) {
4002		/*
4003		 * The base ref is never dropped on per-cpu pwqs.  Directly
4004		 * schedule RCU free.
4005		 */
4006		call_rcu_sched(&wq->rcu, rcu_free_wq);
4007	} else {
4008		/*
4009		 * We're the sole accessor of @wq at this point.  Directly
4010		 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
4011		 * @wq will be freed when the last pwq is released.
4012		 */
4013		for_each_node(node) {
4014			pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
4015			RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
4016			put_pwq_unlocked(pwq);
4017		}
4018
4019		/*
4020		 * Put dfl_pwq.  @wq may be freed any time after dfl_pwq is
4021		 * put.  Don't access it afterwards.
4022		 */
4023		pwq = wq->dfl_pwq;
4024		wq->dfl_pwq = NULL;
4025		put_pwq_unlocked(pwq);
4026	}
4027}
4028EXPORT_SYMBOL_GPL(destroy_workqueue);
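
/*
 * Teardown sketch (editor's illustration; @my_work and @my_wq are
 * hypothetical): callers should stop work items from requeueing
 * themselves before destruction.
 *
 *	cancel_work_sync(&my_work);
 *	destroy_workqueue(my_wq);
 *
 * destroy_workqueue() drains whatever is still queued before freeing @wq.
 */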
4029
4030/**
4031 * workqueue_set_max_active - adjust max_active of a workqueue
4032 * @wq: target workqueue
4033 * @max_active: new max_active value.
4034 *
4035 * Set max_active of @wq to @max_active.
4036 *
4037 * CONTEXT:
4038 * Don't call from IRQ context.
4039 */
4040void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
4041{
4042	struct pool_workqueue *pwq;
4043
4044	/* disallow meddling with max_active for ordered workqueues */
4045	if (WARN_ON(wq->flags & __WQ_ORDERED))
4046		return;
4047
4048	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
4049
4050	mutex_lock(&wq->mutex);
4051
4052	wq->saved_max_active = max_active;
4053
4054	for_each_pwq(pwq, wq)
4055		pwq_adjust_max_active(pwq);
4056
4057	mutex_unlock(&wq->mutex);
4058}
4059EXPORT_SYMBOL_GPL(workqueue_set_max_active);
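
/*
 * Usage sketch (editor's illustration; @my_wq and the limit are
 * hypothetical): throttle a non-ordered workqueue at runtime, e.g. in
 * response to memory pressure or a module parameter.
 *
 *	workqueue_set_max_active(my_wq, 4);
 *
 * Only the number of concurrently active work items per pwq is capped;
 * items already running are not affected.
 */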
4060
4061/**
4062 * current_is_workqueue_rescuer - is %current workqueue rescuer?
4063 *
4064 * Determine whether %current is a workqueue rescuer.  Can be used from
4065 * work functions to determine whether it's being run off the rescuer task.
4066 *
4067 * Return: %true if %current is a workqueue rescuer. %false otherwise.
4068 */
4069bool current_is_workqueue_rescuer(void)
4070{
4071	struct worker *worker = current_wq_worker();
4072
4073	return worker && worker->rescue_wq;
4074}
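
/*
 * Usage sketch (editor's illustration; my_workfn() and process_batches()
 * are hypothetical): a work function may shrink its batch when it is run
 * off the rescuer so that forward progress is made with minimal work.
 *
 *	static void my_workfn(struct work_struct *work)
 *	{
 *		int budget = current_is_workqueue_rescuer() ? 1 : 16;
 *
 *		process_batches(work, budget);
 *	}
 */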
4075
4076/**
4077 * workqueue_congested - test whether a workqueue is congested
4078 * @cpu: CPU in question
4079 * @wq: target workqueue
4080 *
4081 * Test whether @wq's cpu workqueue for @cpu is congested.  There is
4082 * no synchronization around this function and the test result is
4083 * unreliable and only useful as advisory hints or for debugging.
4084 *
4085 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
4086 * Note that both per-cpu and unbound workqueues may be associated with
4087 * multiple pool_workqueues which have separate congested states.  A
4088 * workqueue being congested on one CPU doesn't mean the workqueue is also
4089 * congested on other CPUs / NUMA nodes.
4090 *
4091 * Return:
4092 * %true if congested, %false otherwise.
4093 */
4094bool workqueue_congested(int cpu, struct workqueue_struct *wq)
4095{
4096	struct pool_workqueue *pwq;
4097	bool ret;
4098
4099	rcu_read_lock_sched();
4100
4101	if (cpu == WORK_CPU_UNBOUND)
4102		cpu = smp_processor_id();
4103
4104	if (!(wq->flags & WQ_UNBOUND))
4105		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
4106	else
4107		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
4108
4109	ret = !list_empty(&pwq->delayed_works);
4110	rcu_read_unlock_sched();
4111
4112	return ret;
4113}
4114EXPORT_SYMBOL_GPL(workqueue_congested);
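
/*
 * Usage sketch (editor's illustration; @my_wq and @my_work are
 * hypothetical): a producer may use the advisory result to back off.
 *
 *	if (workqueue_congested(WORK_CPU_UNBOUND, my_wq))
 *		return -EBUSY;
 *	queue_work(my_wq, &my_work);
 *
 * As the check is unsynchronized, it must not be relied upon for
 * correctness.
 */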
4115
4116/**
4117 * work_busy - test whether a work is currently pending or running
4118 * @work: the work to be tested
4119 *
4120 * Test whether @work is currently pending or running.  There is no
4121 * synchronization around this function and the test result is
4122 * unreliable and only useful as advisory hints or for debugging.
4123 *
4124 * Return:
4125 * OR'd bitmask of WORK_BUSY_* bits.
4126 */
4127unsigned int work_busy(struct work_struct *work)
4128{
4129	struct worker_pool *pool;
4130	unsigned long flags;
4131	unsigned int ret = 0;
4132
4133	if (work_pending(work))
4134		ret |= WORK_BUSY_PENDING;
4135
4136	local_irq_save(flags);
4137	pool = get_work_pool(work);
4138	if (pool) {
4139		spin_lock(&pool->lock);
4140		if (find_worker_executing_work(pool, work))
4141			ret |= WORK_BUSY_RUNNING;
4142		spin_unlock(&pool->lock);
4143	}
4144	local_irq_restore(flags);
4145
4146	return ret;
4147}
4148EXPORT_SYMBOL_GPL(work_busy);
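
/*
 * Usage sketch (editor's illustration; @my_work is hypothetical): the
 * result is advisory only, e.g. for a debug dump.
 *
 *	unsigned int busy = work_busy(&my_work);
 *
 *	pr_debug("my_work: pending=%d running=%d\n",
 *		 !!(busy & WORK_BUSY_PENDING), !!(busy & WORK_BUSY_RUNNING));
 */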
4149
4150/**
4151 * set_worker_desc - set description for the current work item
4152 * @fmt: printf-style format string
4153 * @...: arguments for the format string
4154 *
4155 * This function can be called by a running work function to describe what
4156 * the work item is about.  If the worker task gets dumped, this
4157 * information will be printed out together to help debugging.  The
4158 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
4159 */
4160void set_worker_desc(const char *fmt, ...)
4161{
4162	struct worker *worker = current_wq_worker();
4163	va_list args;
4164
4165	if (worker) {
4166		va_start(args, fmt);
4167		vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
4168		va_end(args);
4169		worker->desc_valid = true;
4170	}
4171}
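
/*
 * Usage sketch (editor's illustration; the structure, fields and do_io()
 * are hypothetical): describe which object is being processed so that a
 * worker dump is more informative.
 *
 *	static void my_io_workfn(struct work_struct *work)
 *	{
 *		struct my_dev *dev = container_of(work, struct my_dev, work);
 *
 *		set_worker_desc("my_io %s", dev->name);
 *		do_io(dev);
 *	}
 */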
4172
4173/**
4174 * print_worker_info - print out worker information and description
4175 * @log_lvl: the log level to use when printing
4176 * @task: target task
4177 *
4178 * If @task is a worker and currently executing a work item, print out the
4179 * name of the workqueue being serviced and worker description set with
4180 * set_worker_desc() by the currently executing work item.
4181 *
4182 * This function can be safely called on any task as long as the
4183 * task_struct itself is accessible.  While safe, this function isn't
4184 * synchronized and may print out mixed-up or garbled strings of limited length.
4185 */
4186void print_worker_info(const char *log_lvl, struct task_struct *task)
4187{
4188	work_func_t *fn = NULL;
4189	char name[WQ_NAME_LEN] = { };
4190	char desc[WORKER_DESC_LEN] = { };
4191	struct pool_workqueue *pwq = NULL;
4192	struct workqueue_struct *wq = NULL;
4193	bool desc_valid = false;
4194	struct worker *worker;
4195
4196	if (!(task->flags & PF_WQ_WORKER))
4197		return;
4198
4199	/*
4200	 * This function is called without any synchronization and @task
4201	 * could be in any state.  Be careful with dereferences.
4202	 */
4203	worker = probe_kthread_data(task);
4204
4205	/*
4206	 * Carefully copy the associated workqueue's workfn and name.  Keep
4207	 * the original last '\0' in case the original contains garbage.
4208	 */
4209	probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
4210	probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
4211	probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
4212	probe_kernel_read(name, wq->name, sizeof(name) - 1);
4213
4214	/* copy worker description */
4215	probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
4216	if (desc_valid)
4217		probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4218
4219	if (fn || name[0] || desc[0]) {
4220		printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
4221		if (desc[0])
4222			pr_cont(" (%s)", desc);
4223		pr_cont("\n");
4224	}
4225}
4226
4227static void pr_cont_pool_info(struct worker_pool *pool)
4228{
4229	pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
4230	if (pool->node != NUMA_NO_NODE)
4231		pr_cont(" node=%d", pool->node);
4232	pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
4233}
4234
4235static void pr_cont_work(bool comma, struct work_struct *work)
4236{
4237	if (work->func == wq_barrier_func) {
4238		struct wq_barrier *barr;
4239
4240		barr = container_of(work, struct wq_barrier, work);
4241
4242		pr_cont("%s BAR(%d)", comma ? "," : "",
4243			task_pid_nr(barr->task));
4244	} else {
4245		pr_cont("%s %pf", comma ? "," : "", work->func);
4246	}
4247}
4248
4249static void show_pwq(struct pool_workqueue *pwq)
4250{
4251	struct worker_pool *pool = pwq->pool;
4252	struct work_struct *work;
4253	struct worker *worker;
4254	bool has_in_flight = false, has_pending = false;
4255	int bkt;
4256
4257	pr_info("  pwq %d:", pool->id);
4258	pr_cont_pool_info(pool);
4259
4260	pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
4261		!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
4262
4263	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
4264		if (worker->current_pwq == pwq) {
4265			has_in_flight = true;
4266			break;
4267		}
4268	}
4269	if (has_in_flight) {
4270		bool comma = false;
4271
4272		pr_info("    in-flight:");
4273		hash_for_each(pool->busy_hash, bkt, worker, hentry) {
4274			if (worker->current_pwq != pwq)
4275				continue;
4276
4277			pr_cont("%s %d%s:%pf", comma ? "," : "",
4278				task_pid_nr(worker->task),
4279				worker == pwq->wq->rescuer ? "(RESCUER)" : "",
4280				worker->current_func);
4281			list_for_each_entry(work, &worker->scheduled, entry)
4282				pr_cont_work(false, work);
4283			comma = true;
4284		}
4285		pr_cont("\n");
4286	}
4287
4288	list_for_each_entry(work, &pool->worklist, entry) {
4289		if (get_work_pwq(work) == pwq) {
4290			has_pending = true;
4291			break;
4292		}
4293	}
4294	if (has_pending) {
4295		bool comma = false;
4296
4297		pr_info("    pending:");
4298		list_for_each_entry(work, &pool->worklist, entry) {
4299			if (get_work_pwq(work) != pwq)
4300				continue;
4301
4302			pr_cont_work(comma, work);
4303			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
4304		}
4305		pr_cont("\n");
4306	}
4307
4308	if (!list_empty(&pwq->delayed_works)) {
4309		bool comma = false;
4310
4311		pr_info("    delayed:");
4312		list_for_each_entry(work, &pwq->delayed_works, entry) {
4313			pr_cont_work(comma, work);
4314			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
4315		}
4316		pr_cont("\n");
4317	}
4318}
4319
4320/**
4321 * show_workqueue_state - dump workqueue state
4322 *
4323 * Called from a sysrq handler and prints out all busy workqueues and
4324 * pools.
4325 */
4326void show_workqueue_state(void)
4327{
4328	struct workqueue_struct *wq;
4329	struct worker_pool *pool;
4330	unsigned long flags;
4331	int pi;
4332
4333	rcu_read_lock_sched();
4334
4335	pr_info("Showing busy workqueues and worker pools:\n");
4336
4337	list_for_each_entry_rcu(wq, &workqueues, list) {
4338		struct pool_workqueue *pwq;
4339		bool idle = true;
4340
4341		for_each_pwq(pwq, wq) {
4342			if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
4343				idle = false;
4344				break;
4345			}
4346		}
4347		if (idle)
4348			continue;
4349
4350		pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
4351
4352		for_each_pwq(pwq, wq) {
4353			spin_lock_irqsave(&pwq->pool->lock, flags);
4354			if (pwq->nr_active || !list_empty(&pwq->delayed_works))
4355				show_pwq(pwq);
4356			spin_unlock_irqrestore(&pwq->pool->lock, flags);
4357		}
4358	}
4359
4360	for_each_pool(pool, pi) {
4361		struct worker *worker;
4362		bool first = true;
4363
4364		spin_lock_irqsave(&pool->lock, flags);
4365		if (pool->nr_workers == pool->nr_idle)
4366			goto next_pool;
4367
4368		pr_info("pool %d:", pool->id);
4369		pr_cont_pool_info(pool);
4370		pr_cont(" workers=%d", pool->nr_workers);
4371		if (pool->manager)
4372			pr_cont(" manager: %d",
4373				task_pid_nr(pool->manager->task));
4374		list_for_each_entry(worker, &pool->idle_list, entry) {
4375			pr_cont(" %s%d", first ? "idle: " : "",
4376				task_pid_nr(worker->task));
4377			first = false;
4378		}
4379		pr_cont("\n");
4380	next_pool:
4381		spin_unlock_irqrestore(&pool->lock, flags);
4382	}
4383
4384	rcu_read_unlock_sched();
4385}
4386
4387/*
4388 * CPU hotplug.
4389 *
4390 * There are two challenges in supporting CPU hotplug.  Firstly, there
4391 * are a lot of assumptions on strong associations among work, pwq and
4392 * pool which make migrating pending and scheduled works very
4393 * difficult to implement without impacting hot paths.  Secondly,
4394 * worker pools serve a mix of short, long and very long running work
4395 * items, making blocked draining impractical.
4396 *
4397 * This is solved by allowing the pools to be disassociated from the CPU
4398 * and run as unbound ones, and by reattaching them later if the CPU
4399 * comes back online.
4400 */
4401
4402static void wq_unbind_fn(struct work_struct *work)
4403{
4404	int cpu = smp_processor_id();
4405	struct worker_pool *pool;
4406	struct worker *worker;
4407
4408	for_each_cpu_worker_pool(pool, cpu) {
4409		mutex_lock(&pool->attach_mutex);
4410		spin_lock_irq(&pool->lock);
4411
4412		/*
4413		 * We've blocked all attach/detach operations. Make all workers
4414		 * unbound and set DISASSOCIATED.  Before this, all workers
4415		 * except for the ones which are still executing work items from
4416		 * before the last CPU down must be on the CPU.  After
4417		 * this, they may become diasporas.
4418		 */
4419		for_each_pool_worker(worker, pool)
4420			worker->flags |= WORKER_UNBOUND;
4421
4422		pool->flags |= POOL_DISASSOCIATED;
4423
4424		spin_unlock_irq(&pool->lock);
4425		mutex_unlock(&pool->attach_mutex);
4426
4427		/*
4428		 * Call schedule() so that we cross rq->lock and thus can
4429		 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
4430		 * This is necessary as scheduler callbacks may be invoked
4431		 * from other cpus.
4432		 */
4433		schedule();
4434
4435		/*
4436		 * Sched callbacks are disabled now.  Zap nr_running.
4437		 * After this, nr_running stays zero and need_more_worker()
4438		 * and keep_working() are always true as long as the
4439		 * worklist is not empty.  This pool now behaves as an
4440		 * unbound (in terms of concurrency management) pool which
4441		 * is served by workers tied to the pool.
4442		 */
4443		atomic_set(&pool->nr_running, 0);
4444
4445		/*
4446		 * With concurrency management just turned off, a busy
4447		 * worker blocking could lead to lengthy stalls.  Kick off
4448		 * unbound chain execution of currently pending work items.
4449		 */
4450		spin_lock_irq(&pool->lock);
4451		wake_up_worker(pool);
4452		spin_unlock_irq(&pool->lock);
4453	}
4454}
4455
4456/**
4457 * rebind_workers - rebind all workers of a pool to the associated CPU
4458 * @pool: pool of interest
4459 *
4460 * @pool->cpu is coming online.  Rebind all workers to the CPU.
4461 */
4462static void rebind_workers(struct worker_pool *pool)
4463{
4464	struct worker *worker;
4465
4466	lockdep_assert_held(&pool->attach_mutex);
4467
4468	/*
4469	 * Restore CPU affinity of all workers.  As all idle workers should
4470	 * be on the run-queue of the associated CPU before any local
4471	 * wake-ups for concurrency management happen, restore CPU affinity
4472	 * of all workers first and then clear UNBOUND.  As we're called
4473	 * from CPU_ONLINE, the following shouldn't fail.
4474	 */
4475	for_each_pool_worker(worker, pool)
4476		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4477						  pool->attrs->cpumask) < 0);
4478
4479	spin_lock_irq(&pool->lock);
4480
4481	/*
4482	 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
4483	 * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
4484	 * being reworked and this can go away in time.
4485	 */
4486	if (!(pool->flags & POOL_DISASSOCIATED)) {
4487		spin_unlock_irq(&pool->lock);
4488		return;
4489	}
4490
4491	pool->flags &= ~POOL_DISASSOCIATED;
4492
4493	for_each_pool_worker(worker, pool) {
4494		unsigned int worker_flags = worker->flags;
4495
4496		/*
4497		 * A bound idle worker should actually be on the runqueue
4498		 * of the associated CPU for local wake-ups targeting it to
4499		 * work.  Kick all idle workers so that they migrate to the
4500		 * associated CPU.  Doing this in the same loop as
4501		 * replacing UNBOUND with REBOUND is safe as no worker will
4502		 * be bound before @pool->lock is released.
4503		 */
4504		if (worker_flags & WORKER_IDLE)
4505			wake_up_process(worker->task);
4506
4507		/*
4508		 * We want to clear UNBOUND but can't directly call
4509		 * worker_clr_flags() or adjust nr_running.  Atomically
4510		 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
4511		 * @worker will clear REBOUND using worker_clr_flags() when
4512		 * it initiates the next execution cycle thus restoring
4513		 * concurrency management.  Note that when or whether
4514		 * @worker clears REBOUND doesn't affect correctness.
4515		 *
4516		 * ACCESS_ONCE() is necessary because @worker->flags may be
4517		 * tested without holding any lock in
4518		 * wq_worker_waking_up().  Without it, NOT_RUNNING test may
4519		 * fail incorrectly leading to premature concurrency
4520		 * management operations.
4521		 */
4522		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
4523		worker_flags |= WORKER_REBOUND;
4524		worker_flags &= ~WORKER_UNBOUND;
4525		ACCESS_ONCE(worker->flags) = worker_flags;
4526	}
4527
4528	spin_unlock_irq(&pool->lock);
4529}
4530
4531/**
4532 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
4533 * @pool: unbound pool of interest
4534 * @cpu: the CPU which is coming up
4535 *
4536 * An unbound pool may end up with a cpumask which doesn't have any online
4537 * CPUs.  When a worker of such a pool gets scheduled, the scheduler resets
4538 * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
4539 * online CPU before, cpus_allowed of all its workers should be restored.
4540 */
4541static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4542{
4543	static cpumask_t cpumask;
4544	struct worker *worker;
4545
4546	lockdep_assert_held(&pool->attach_mutex);
4547
4548	/* is @cpu allowed for @pool? */
4549	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4550		return;
4551
4552	/* is @cpu the only online CPU? */
4553	cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4554	if (cpumask_weight(&cpumask) != 1)
4555		return;
4556
4557	/* as we're called from CPU_ONLINE, the following shouldn't fail */
4558	for_each_pool_worker(worker, pool)
4559		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4560						  pool->attrs->cpumask) < 0);
4561}
4562
4563/*
4564 * Workqueues should be brought up before normal priority CPU notifiers.
4565 * This will be registered as a high priority CPU notifier.
4566 */
4567static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4568					       unsigned long action,
4569					       void *hcpu)
4570{
4571	int cpu = (unsigned long)hcpu;
4572	struct worker_pool *pool;
4573	struct workqueue_struct *wq;
4574	int pi;
4575
4576	switch (action & ~CPU_TASKS_FROZEN) {
4577	case CPU_UP_PREPARE:
4578		for_each_cpu_worker_pool(pool, cpu) {
4579			if (pool->nr_workers)
4580				continue;
4581			if (!create_worker(pool))
4582				return NOTIFY_BAD;
4583		}
4584		break;
4585
4586	case CPU_DOWN_FAILED:
4587	case CPU_ONLINE:
4588		mutex_lock(&wq_pool_mutex);
4589
4590		for_each_pool(pool, pi) {
4591			mutex_lock(&pool->attach_mutex);
4592
4593			if (pool->cpu == cpu)
4594				rebind_workers(pool);
4595			else if (pool->cpu < 0)
4596				restore_unbound_workers_cpumask(pool, cpu);
4597
4598			mutex_unlock(&pool->attach_mutex);
4599		}
4600
4601		/* update NUMA affinity of unbound workqueues */
4602		list_for_each_entry(wq, &workqueues, list)
4603			wq_update_unbound_numa(wq, cpu, true);
4604
4605		mutex_unlock(&wq_pool_mutex);
4606		break;
4607	}
4608	return NOTIFY_OK;
4609}
4610
4611/*
4612 * Workqueues should be brought down after normal priority CPU notifiers.
4613 * This will be registered as a low priority CPU notifier.
4614 */
4615static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4616						 unsigned long action,
4617						 void *hcpu)
4618{
4619	int cpu = (unsigned long)hcpu;
4620	struct work_struct unbind_work;
4621	struct workqueue_struct *wq;
4622
4623	switch (action & ~CPU_TASKS_FROZEN) {
4624	case CPU_DOWN_PREPARE:
4625		/* unbinding per-cpu workers should happen on the local CPU */
4626		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
4627		queue_work_on(cpu, system_highpri_wq, &unbind_work);
4628
4629		/* update NUMA affinity of unbound workqueues */
4630		mutex_lock(&wq_pool_mutex);
4631		list_for_each_entry(wq, &workqueues, list)
4632			wq_update_unbound_numa(wq, cpu, false);
4633		mutex_unlock(&wq_pool_mutex);
4634
4635		/* wait for per-cpu unbinding to finish */
4636		flush_work(&unbind_work);
4637		destroy_work_on_stack(&unbind_work);
4638		break;
4639	}
4640	return NOTIFY_OK;
4641}
4642
4643#ifdef CONFIG_SMP
4644
4645struct work_for_cpu {
4646	struct work_struct work;
4647	long (*fn)(void *);
4648	void *arg;
4649	long ret;
4650};
4651
4652static void work_for_cpu_fn(struct work_struct *work)
4653{
4654	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
4655
4656	wfc->ret = wfc->fn(wfc->arg);
4657}
4658
4659/**
4660 * work_on_cpu - run a function in user context on a particular cpu
4661 * @cpu: the cpu to run on
4662 * @fn: the function to run
4663 * @arg: the function arg
4664 *
4665 * It is up to the caller to ensure that the cpu doesn't go offline.
4666 * The caller must not hold any locks which would prevent @fn from completing.
4667 *
4668 * Return: The value @fn returns.
4669 */
4670long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4671{
4672	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
4673
4674	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4675	schedule_work_on(cpu, &wfc.work);
4676	flush_work(&wfc.work);
4677	destroy_work_on_stack(&wfc.work);
4678	return wfc.ret;
4679}
4680EXPORT_SYMBOL_GPL(work_on_cpu);
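
/*
 * Usage sketch (editor's illustration; read_cpu_reg() and @reg_desc are
 * hypothetical): run a function on a specific CPU in process context and
 * collect its return value while keeping the CPU online.
 *
 *	long val;
 *
 *	get_online_cpus();
 *	val = work_on_cpu(2, read_cpu_reg, &reg_desc);
 *	put_online_cpus();
 */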
4681#endif /* CONFIG_SMP */
4682
4683#ifdef CONFIG_FREEZER
4684
4685/**
4686 * freeze_workqueues_begin - begin freezing workqueues
4687 *
4688 * Start freezing workqueues.  After this function returns, all freezable
4689 * workqueues will queue new work items to their delayed_works list instead of
4690 * pool->worklist.
4691 *
4692 * CONTEXT:
4693 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
4694 */
4695void freeze_workqueues_begin(void)
4696{
4697	struct workqueue_struct *wq;
4698	struct pool_workqueue *pwq;
4699
4700	mutex_lock(&wq_pool_mutex);
4701
4702	WARN_ON_ONCE(workqueue_freezing);
4703	workqueue_freezing = true;
4704
4705	list_for_each_entry(wq, &workqueues, list) {
4706		mutex_lock(&wq->mutex);
4707		for_each_pwq(pwq, wq)
4708			pwq_adjust_max_active(pwq);
4709		mutex_unlock(&wq->mutex);
4710	}
4711
4712	mutex_unlock(&wq_pool_mutex);
4713}
4714
4715/**
4716 * freeze_workqueues_busy - are freezable workqueues still busy?
4717 *
4718 * Check whether freezing is complete.  This function must be called
4719 * between freeze_workqueues_begin() and thaw_workqueues().
4720 *
4721 * CONTEXT:
4722 * Grabs and releases wq_pool_mutex.
4723 *
4724 * Return:
4725 * %true if some freezable workqueues are still busy.  %false if freezing
4726 * is complete.
4727 */
4728bool freeze_workqueues_busy(void)
4729{
4730	bool busy = false;
4731	struct workqueue_struct *wq;
4732	struct pool_workqueue *pwq;
4733
4734	mutex_lock(&wq_pool_mutex);
4735
4736	WARN_ON_ONCE(!workqueue_freezing);
4737
4738	list_for_each_entry(wq, &workqueues, list) {
4739		if (!(wq->flags & WQ_FREEZABLE))
4740			continue;
4741		/*
4742		 * nr_active is monotonically decreasing.  It's safe
4743		 * to peek at it without locking.
4744		 */
4745		rcu_read_lock_sched();
4746		for_each_pwq(pwq, wq) {
4747			WARN_ON_ONCE(pwq->nr_active < 0);
4748			if (pwq->nr_active) {
4749				busy = true;
4750				rcu_read_unlock_sched();
4751				goto out_unlock;
4752			}
4753		}
4754		rcu_read_unlock_sched();
4755	}
4756out_unlock:
4757	mutex_unlock(&wq_pool_mutex);
4758	return busy;
4759}
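
/*
 * Usage sketch (editor's illustration of how a freezer user might drive
 * these helpers; real callers such as the PM core add timeout and abort
 * handling):
 *
 *	freeze_workqueues_begin();
 *	while (freeze_workqueues_busy())
 *		msleep(10);
 *
 *	(the frozen system image is handled here)
 *
 *	thaw_workqueues();
 */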
4760
4761/**
4762 * thaw_workqueues - thaw workqueues
4763 *
4764 * Thaw workqueues.  Normal queueing is restored and all collected
4765 * frozen work items are transferred to their respective pool worklists.
4766 *
4767 * CONTEXT:
4768 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
4769 */
4770void thaw_workqueues(void)
4771{
4772	struct workqueue_struct *wq;
4773	struct pool_workqueue *pwq;
4774
4775	mutex_lock(&wq_pool_mutex);
4776
4777	if (!workqueue_freezing)
4778		goto out_unlock;
4779
4780	workqueue_freezing = false;
4781
4782	/* restore max_active and repopulate worklist */
4783	list_for_each_entry(wq, &workqueues, list) {
4784		mutex_lock(&wq->mutex);
4785		for_each_pwq(pwq, wq)
4786			pwq_adjust_max_active(pwq);
4787		mutex_unlock(&wq->mutex);
4788	}
4789
4790out_unlock:
4791	mutex_unlock(&wq_pool_mutex);
4792}
4793#endif /* CONFIG_FREEZER */
4794
4795#ifdef CONFIG_SYSFS
4796/*
4797 * Workqueues with the WQ_SYSFS flag set are visible to userland via
4798 * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
4799 * following attributes.
4800 *
4801 *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
4802 *  max_active	RW int	: maximum number of in-flight work items
4803 *
4804 * Unbound workqueues have the following extra attributes.
4805 *
4806 *  id		RO int	: the associated pool ID
4807 *  nice	RW int	: nice value of the workers
4808 *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
4809 */
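
/*
 * Usage sketch (editor's illustration; the workqueue name is
 * hypothetical): a driver that wants these knobs exposed passes WQ_SYSFS
 * at creation time,
 *
 *	wq = alloc_workqueue("my_unbound_wq", WQ_UNBOUND | WQ_SYSFS, 0);
 *
 * after which the attributes appear under
 * /sys/bus/workqueue/devices/my_unbound_wq/.
 */
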
4810struct wq_device {
4811	struct workqueue_struct		*wq;
4812	struct device			dev;
4813};
4814
4815static struct workqueue_struct *dev_to_wq(struct device *dev)
4816{
4817	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
4818
4819	return wq_dev->wq;
4820}
4821
4822static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
4823			    char *buf)
4824{
4825	struct workqueue_struct *wq = dev_to_wq(dev);
4826
4827	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
4828}
4829static DEVICE_ATTR_RO(per_cpu);
4830
4831static ssize_t max_active_show(struct device *dev,
4832			       struct device_attribute *attr, char *buf)
4833{
4834	struct workqueue_struct *wq = dev_to_wq(dev);
4835
4836	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
4837}
4838
4839static ssize_t max_active_store(struct device *dev,
4840				struct device_attribute *attr, const char *buf,
4841				size_t count)
4842{
4843	struct workqueue_struct *wq = dev_to_wq(dev);
4844	int val;
4845
4846	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
4847		return -EINVAL;
4848
4849	workqueue_set_max_active(wq, val);
4850	return count;
4851}
4852static DEVICE_ATTR_RW(max_active);
4853
4854static struct attribute *wq_sysfs_attrs[] = {
4855	&dev_attr_per_cpu.attr,
4856	&dev_attr_max_active.attr,
4857	NULL,
4858};
4859ATTRIBUTE_GROUPS(wq_sysfs);
4860
4861static ssize_t wq_pool_ids_show(struct device *dev,
4862				struct device_attribute *attr, char *buf)
4863{
4864	struct workqueue_struct *wq = dev_to_wq(dev);
4865	const char *delim = "";
4866	int node, written = 0;
4867
4868	rcu_read_lock_sched();
4869	for_each_node(node) {
4870		written += scnprintf(buf + written, PAGE_SIZE - written,
4871				     "%s%d:%d", delim, node,
4872				     unbound_pwq_by_node(wq, node)->pool->id);
4873		delim = " ";
4874	}
4875	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
4876	rcu_read_unlock_sched();
4877
4878	return written;
4879}
4880
4881static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
4882			    char *buf)
4883{
4884	struct workqueue_struct *wq = dev_to_wq(dev);
4885	int written;
4886
4887	mutex_lock(&wq->mutex);
4888	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
4889	mutex_unlock(&wq->mutex);
4890
4891	return written;
4892}
4893
4894/* prepare workqueue_attrs for sysfs store operations */
4895static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
4896{
4897	struct workqueue_attrs *attrs;
4898
4899	attrs = alloc_workqueue_attrs(GFP_KERNEL);
4900	if (!attrs)
4901		return NULL;
4902
4903	mutex_lock(&wq->mutex);
4904	copy_workqueue_attrs(attrs, wq->unbound_attrs);
4905	mutex_unlock(&wq->mutex);
4906	return attrs;
4907}
4908
4909static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
4910			     const char *buf, size_t count)
4911{
4912	struct workqueue_struct *wq = dev_to_wq(dev);
4913	struct workqueue_attrs *attrs;
4914	int ret;
4915
4916	attrs = wq_sysfs_prep_attrs(wq);
4917	if (!attrs)
4918		return -ENOMEM;
4919
4920	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
4921	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
4922		ret = apply_workqueue_attrs(wq, attrs);
4923	else
4924		ret = -EINVAL;
4925
4926	free_workqueue_attrs(attrs);
4927	return ret ?: count;
4928}
4929
4930static ssize_t wq_cpumask_show(struct device *dev,
4931			       struct device_attribute *attr, char *buf)
4932{
4933	struct workqueue_struct *wq = dev_to_wq(dev);
4934	int written;
4935
4936	mutex_lock(&wq->mutex);
4937	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
4938			    cpumask_pr_args(wq->unbound_attrs->cpumask));
4939	mutex_unlock(&wq->mutex);
4940	return written;
4941}
4942
4943static ssize_t wq_cpumask_store(struct device *dev,
4944				struct device_attribute *attr,
4945				const char *buf, size_t count)
4946{
4947	struct workqueue_struct *wq = dev_to_wq(dev);
4948	struct workqueue_attrs *attrs;
4949	int ret;
4950
4951	attrs = wq_sysfs_prep_attrs(wq);
4952	if (!attrs)
4953		return -ENOMEM;
4954
4955	ret = cpumask_parse(buf, attrs->cpumask);
4956	if (!ret)
4957		ret = apply_workqueue_attrs(wq, attrs);
4958
4959	free_workqueue_attrs(attrs);
4960	return ret ?: count;
4961}
4962
4963static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
4964			    char *buf)
4965{
4966	struct workqueue_struct *wq = dev_to_wq(dev);
4967	int written;
4968
4969	mutex_lock(&wq->mutex);
4970	written = scnprintf(buf, PAGE_SIZE, "%d\n",
4971			    !wq->unbound_attrs->no_numa);
4972	mutex_unlock(&wq->mutex);
4973
4974	return written;
4975}
4976
4977static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
4978			     const char *buf, size_t count)
4979{
4980	struct workqueue_struct *wq = dev_to_wq(dev);
4981	struct workqueue_attrs *attrs;
4982	int v, ret;
4983
4984	attrs = wq_sysfs_prep_attrs(wq);
4985	if (!attrs)
4986		return -ENOMEM;
4987
4988	ret = -EINVAL;
4989	if (sscanf(buf, "%d", &v) == 1) {
4990		attrs->no_numa = !v;
4991		ret = apply_workqueue_attrs(wq, attrs);
4992	}
4993
4994	free_workqueue_attrs(attrs);
4995	return ret ?: count;
4996}
4997
4998static struct device_attribute wq_sysfs_unbound_attrs[] = {
4999	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
5000	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
5001	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
5002	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
5003	__ATTR_NULL,
5004};
5005
5006static struct bus_type wq_subsys = {
5007	.name				= "workqueue",
5008	.dev_groups			= wq_sysfs_groups,
5009};
5010
5011static int __init wq_sysfs_init(void)
5012{
5013	return subsys_virtual_register(&wq_subsys, NULL);
5014}
5015core_initcall(wq_sysfs_init);
5016
5017static void wq_device_release(struct device *dev)
5018{
5019	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
5020
5021	kfree(wq_dev);
5022}
5023
5024/**
5025 * workqueue_sysfs_register - make a workqueue visible in sysfs
5026 * @wq: the workqueue to register
5027 *
5028 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
5029 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
5030 * which is the preferred method.
5031 *
5032 * A workqueue user should use this function directly iff it wants to apply
5033 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
5034 * apply_workqueue_attrs() may race against userland updating the
5035 * attributes.
5036 *
5037 * Return: 0 on success, -errno on failure.
5038 */
5039int workqueue_sysfs_register(struct workqueue_struct *wq)
5040{
5041	struct wq_device *wq_dev;
5042	int ret;
5043
5044	/*
5045	 * Adjusting max_active or creating new pwqs by applying
5046	 * attributes breaks ordering guarantee.  Disallow exposing ordered
5047	 * workqueues.
5048	 */
5049	if (WARN_ON(wq->flags & __WQ_ORDERED))
5050		return -EINVAL;
5051
5052	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
5053	if (!wq_dev)
5054		return -ENOMEM;
5055
5056	wq_dev->wq = wq;
5057	wq_dev->dev.bus = &wq_subsys;
5058	wq_dev->dev.init_name = wq->name;
5059	wq_dev->dev.release = wq_device_release;
5060
5061	/*
5062	 * unbound_attrs are created separately.  Suppress uevent until
5063	 * everything is ready.
5064	 */
5065	dev_set_uevent_suppress(&wq_dev->dev, true);
5066
5067	ret = device_register(&wq_dev->dev);
5068	if (ret) {
5069		kfree(wq_dev);
5070		wq->wq_dev = NULL;
5071		return ret;
5072	}
5073
5074	if (wq->flags & WQ_UNBOUND) {
5075		struct device_attribute *attr;
5076
5077		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
5078			ret = device_create_file(&wq_dev->dev, attr);
5079			if (ret) {
5080				device_unregister(&wq_dev->dev);
5081				wq->wq_dev = NULL;
5082				return ret;
5083			}
5084		}
5085	}
5086
5087	dev_set_uevent_suppress(&wq_dev->dev, false);
5088	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
5089	return 0;
5090}
5091
5092/**
5093 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
5094 * @wq: the workqueue to unregister
5095 *
5096 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
5097 */
5098static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
5099{
5100	struct wq_device *wq_dev = wq->wq_dev;
5101
5102	if (!wq->wq_dev)
5103		return;
5104
5105	wq->wq_dev = NULL;
5106	device_unregister(&wq_dev->dev);
5107}
5108#else	/* CONFIG_SYSFS */
5109static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
5110#endif	/* CONFIG_SYSFS */
5111
5112static void __init wq_numa_init(void)
5113{
5114	cpumask_var_t *tbl;
5115	int node, cpu;
5116
5117	if (num_possible_nodes() <= 1)
5118		return;
5119
5120	if (wq_disable_numa) {
5121		pr_info("workqueue: NUMA affinity support disabled\n");
5122		return;
5123	}
5124
5125	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
5126	BUG_ON(!wq_update_unbound_numa_attrs_buf);
5127
5128	/*
5129	 * We want per-node masks of possible CPUs, which aren't readily
5130	 * available.  Build them from cpu_to_node() which should have been
5131	 * fully initialized by now.
5132	 */
5133	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
5134	BUG_ON(!tbl);
5135
5136	for_each_node(node)
5137		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
5138				node_online(node) ? node : NUMA_NO_NODE));
5139
5140	for_each_possible_cpu(cpu) {
5141		node = cpu_to_node(cpu);
5142		if (WARN_ON(node == NUMA_NO_NODE)) {
5143			pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
5144			/* happens iff arch is bonkers, let's just proceed */
5145			return;
5146		}
5147		cpumask_set_cpu(cpu, tbl[node]);
5148	}
5149
5150	wq_numa_possible_cpumask = tbl;
5151	wq_numa_enabled = true;
5152}
5153
5154static int __init init_workqueues(void)
5155{
5156	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5157	int i, cpu;
5158
5159	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5160
5161	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5162
5163	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
5164	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
5165
5166	wq_numa_init();
5167
5168	/* initialize CPU pools */
5169	for_each_possible_cpu(cpu) {
5170		struct worker_pool *pool;
5171
5172		i = 0;
5173		for_each_cpu_worker_pool(pool, cpu) {
5174			BUG_ON(init_worker_pool(pool));
5175			pool->cpu = cpu;
5176			cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
5177			pool->attrs->nice = std_nice[i++];
5178			pool->node = cpu_to_node(cpu);
5179
5180			/* alloc pool ID */
5181			mutex_lock(&wq_pool_mutex);
5182			BUG_ON(worker_pool_assign_id(pool));
5183			mutex_unlock(&wq_pool_mutex);
5184		}
5185	}
5186
5187	/* create the initial worker */
5188	for_each_online_cpu(cpu) {
5189		struct worker_pool *pool;
5190
5191		for_each_cpu_worker_pool(pool, cpu) {
5192			pool->flags &= ~POOL_DISASSOCIATED;
5193			BUG_ON(!create_worker(pool));
5194		}
5195	}
5196
5197	/* create default unbound and ordered wq attrs */
5198	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5199		struct workqueue_attrs *attrs;
5200
5201		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5202		attrs->nice = std_nice[i];
5203		unbound_std_wq_attrs[i] = attrs;
5204
5205		/*
5206		 * An ordered wq should have only one pwq as ordering is
5207		 * guaranteed by max_active which is enforced by pwqs.
5208		 * Turn off NUMA so that dfl_pwq is used for all nodes.
5209		 */
5210		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
5211		attrs->nice = std_nice[i];
5212		attrs->no_numa = true;
5213		ordered_wq_attrs[i] = attrs;
5214	}
5215
5216	system_wq = alloc_workqueue("events", 0, 0);
5217	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
5218	system_long_wq = alloc_workqueue("events_long", 0, 0);
5219	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
5220					    WQ_UNBOUND_MAX_ACTIVE);
5221	system_freezable_wq = alloc_workqueue("events_freezable",
5222					      WQ_FREEZABLE, 0);
5223	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
5224					      WQ_POWER_EFFICIENT, 0);
5225	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
5226					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
5227					      0);
5228	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
5229	       !system_unbound_wq || !system_freezable_wq ||
5230	       !system_power_efficient_wq ||
5231	       !system_freezable_power_efficient_wq);
5232	return 0;
5233}
5234early_initcall(init_workqueues);
5235