/*
 * kernel/stop_machine.c
 *
 * Copyright (C) 2008, 2005	IBM Corporation.
 * Copyright (C) 2008, 2005	Rusty Russell rusty@rustcorp.com.au
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2 and any later version.
 */
#include <linux/completion.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/stop_machine.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/lglock.h>

/*
 * Structure to determine completion condition and record errors.  May
 * be shared by works on different cpus.
 */
struct cpu_stop_done {
	atomic_t		nr_todo;	/* nr left to execute */
	bool			executed;	/* actually executed? */
	int			ret;		/* collected return value */
	struct completion	completion;	/* fired if nr_todo reaches 0 */
};

/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
	spinlock_t		lock;
	bool			enabled;	/* is this stopper enabled? */
	struct list_head	works;		/* list of pending works */
};

static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;

/*
 * Avoids a race between stop_two_cpus and global stop_cpus, where
 * the stoppers could get queued up in reverse order, leading to
 * system deadlock. Using an lglock means stop_two_cpus remains
 * relatively cheap.
 */
DEFINE_STATIC_LGLOCK(stop_cpus_lock);

static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
	memset(done, 0, sizeof(*done));
	atomic_set(&done->nr_todo, nr_todo);
	init_completion(&done->completion);
}

/* signal completion unless @done is NULL */
static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
{
	if (done) {
		if (executed)
			done->executed = true;
		if (atomic_dec_and_test(&done->nr_todo))
			complete(&done->completion);
	}
}

/* queue @work to @stopper.  if offline, @work is completed immediately */
static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
	unsigned long flags;

	spin_lock_irqsave(&stopper->lock, flags);

	if (stopper->enabled) {
		list_add_tail(&work->list, &stopper->works);
		wake_up_process(p);
	} else
		cpu_stop_signal_done(work->done, false);

	spin_unlock_irqrestore(&stopper->lock, flags);
}

/**
 * stop_one_cpu - stop a cpu
 * @cpu: cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on @cpu.  @fn is run in a process context with
 * the highest priority preempting any task on the cpu and
 * monopolizing it.  This function returns after the execution is
 * complete.
 *
 * This function doesn't guarantee @cpu stays online till @fn
 * completes.  If @cpu goes down in the middle, execution may happen
 * partially or fully on different cpus.  @fn should either be ready
 * for that or the caller should ensure that @cpu stays online until
 * this function completes.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
 * otherwise, the return value of @fn.
 */
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };

	cpu_stop_init_done(&done, 1);
	cpu_stop_queue_work(cpu, &work);
	wait_for_completion(&done.completion);
	return done.executed ? done.ret : -ENOENT;
}
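
/*
 * Illustrative sketch, not part of the original file: a minimal caller of
 * stop_one_cpu().  The callback and variable names below are made up for
 * documentation; any function matching cpu_stop_fn_t (int (*)(void *))
 * works.  The callback runs in the stopper task on the target cpu with
 * preemption disabled, so it must not sleep.
 *
 *	static int example_set_flag(void *arg)
 *	{
 *		*(bool *)arg = true;	// executes on the target cpu
 *		return 0;
 *	}
 *
 *	bool flag = false;
 *	int ret = stop_one_cpu(1, example_set_flag, &flag);
 *	// ret == 0 and flag == true if cpu 1 was online; ret == -ENOENT
 *	// and flag == false if the work was completed without running.
 */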

/* This controls the threads on each CPU. */
enum multi_stop_state {
	/* Dummy starting state for thread. */
	MULTI_STOP_NONE,
	/* Awaiting everyone to be scheduled. */
	MULTI_STOP_PREPARE,
	/* Disable interrupts. */
	MULTI_STOP_DISABLE_IRQ,
	/* Run the function */
	MULTI_STOP_RUN,
	/* Exit */
	MULTI_STOP_EXIT,
};

struct multi_stop_data {
	int			(*fn)(void *);
	void			*data;
	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
	unsigned int		num_threads;
	const struct cpumask	*active_cpus;

	enum multi_stop_state	state;
	atomic_t		thread_ack;
};

static void set_state(struct multi_stop_data *msdata,
		      enum multi_stop_state newstate)
{
	/* Reset ack counter. */
	atomic_set(&msdata->thread_ack, msdata->num_threads);
	smp_wmb();
	msdata->state = newstate;
}

/* Last one to ack a state moves to the next state. */
static void ack_state(struct multi_stop_data *msdata)
{
	if (atomic_dec_and_test(&msdata->thread_ack))
		set_state(msdata, msdata->state + 1);
}
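
/*
 * Illustrative walk-through, not part of the original file: with
 * num_threads == 2, both participating threads spin in multi_cpu_stop()
 * and advance through the states in lockstep.  Each thread acks a state
 * once; the last ack (thread_ack hitting 0) bumps ->state, which the
 * other thread then observes in its cpu_relax() poll loop:
 *
 *	set_state(msdata, MULTI_STOP_PREPARE);	// thread_ack = 2
 *	    cpu A acks PREPARE			// thread_ack = 1
 *	    cpu B acks PREPARE			// thread_ack = 0 -> DISABLE_IRQ
 *	    both disable irqs and ack		// -> MULTI_STOP_RUN
 *	    the active cpu(s) run msdata->fn(),
 *	    everyone acks			// -> MULTI_STOP_EXIT
 *	    both threads leave the loop and restore their irq flags
 */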

/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
	struct multi_stop_data *msdata = data;
	enum multi_stop_state curstate = MULTI_STOP_NONE;
	int cpu = smp_processor_id(), err = 0;
	unsigned long flags;
	bool is_active;

	/*
	 * When called from stop_machine_from_inactive_cpu(), irq might
	 * already be disabled.  Save the state and restore it on exit.
	 */
	local_save_flags(flags);

	if (!msdata->active_cpus)
		is_active = cpu == cpumask_first(cpu_online_mask);
	else
		is_active = cpumask_test_cpu(cpu, msdata->active_cpus);

	/* Simple state machine */
	do {
		/* Chill out and ensure we re-read multi_stop_state. */
		cpu_relax();
		if (msdata->state != curstate) {
			curstate = msdata->state;
			switch (curstate) {
			case MULTI_STOP_DISABLE_IRQ:
				local_irq_disable();
				hard_irq_disable();
				break;
			case MULTI_STOP_RUN:
				if (is_active)
					err = msdata->fn(msdata->data);
				break;
			default:
				break;
			}
			ack_state(msdata);
		}
	} while (curstate != MULTI_STOP_EXIT);

	local_irq_restore(flags);
	return err;
}

struct irq_cpu_stop_queue_work_info {
	int cpu1;
	int cpu2;
	struct cpu_stop_work *work1;
	struct cpu_stop_work *work2;
};

/*
 * This function is always run with irqs and preemption disabled.
 * This guarantees that both work1 and work2 get queued, before
 * our local migrate thread gets the chance to preempt us.
 */
static void irq_cpu_stop_queue_work(void *arg)
{
	struct irq_cpu_stop_queue_work_info *info = arg;
	cpu_stop_queue_work(info->cpu1, info->work1);
	cpu_stop_queue_work(info->cpu2, info->work2);
}

/**
 * stop_two_cpus - stops two cpus
 * @cpu1: the cpu to stop
 * @cpu2: the other cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Stops both @cpu1 and @cpu2 and runs @fn on one of them (@cpu1).
 *
 * Returns when both stops have completed.
 */
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work1, work2;
	struct irq_cpu_stop_queue_work_info call_args;
	struct multi_stop_data msdata;

	preempt_disable();
	msdata = (struct multi_stop_data){
		.fn = fn,
		.data = arg,
		.num_threads = 2,
		.active_cpus = cpumask_of(cpu1),
	};

	work1 = work2 = (struct cpu_stop_work){
		.fn = multi_cpu_stop,
		.arg = &msdata,
		.done = &done
	};

	call_args = (struct irq_cpu_stop_queue_work_info){
		.cpu1 = cpu1,
		.cpu2 = cpu2,
		.work1 = &work1,
		.work2 = &work2,
	};

	cpu_stop_init_done(&done, 2);
	set_state(&msdata, MULTI_STOP_PREPARE);

	/*
	 * If we observe both CPUs active we know _cpu_down() cannot yet have
	 * queued its stop_machine works and therefore ours will get executed
	 * first.  Or it's not either one of our CPUs that's getting unplugged,
	 * in which case we don't care.
	 *
	 * This relies on the stopper workqueues being FIFO.
	 */
	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
		preempt_enable();
		return -ENOENT;
	}

	lg_local_lock(&stop_cpus_lock);
	/*
	 * Queuing needs to be done by the lowest numbered CPU, to ensure
	 * that works are always queued in the same order on every CPU.
	 * This prevents deadlocks.
	 */
	smp_call_function_single(min(cpu1, cpu2),
				 &irq_cpu_stop_queue_work,
				 &call_args, 1);
	lg_local_unlock(&stop_cpus_lock);
	preempt_enable();

	wait_for_completion(&done.completion);

	return done.executed ? done.ret : -ENOENT;
}
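
/*
 * Illustrative sketch, not part of the original file: how a caller might
 * use stop_two_cpus() to perform an operation while two specific cpus are
 * both held in their stoppers (the scheduler does something similar when
 * swapping tasks between runqueues).  The callback, helper and argument
 * names below are hypothetical.
 *
 *	static int example_pair_op(void *arg)
 *	{
 *		// runs on @cpu1 only, with irqs disabled on both cpus
 *		struct example_pair *pair = arg;
 *		return do_pair_exchange(pair);	// hypothetical helper
 *	}
 *
 *	ret = stop_two_cpus(src_cpu, dst_cpu, example_pair_op, &pair);
 *	// -ENOENT if either cpu was not active at the time of the call.
 */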

/**
 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
 * @cpu: cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 * @work_buf: pointer to cpu_stop_work structure
 *
 * Similar to stop_one_cpu() but doesn't wait for completion.  The
 * caller is responsible for ensuring @work_buf is currently unused
 * and will remain untouched until stopper starts executing @fn.
 *
 * CONTEXT:
 * Don't care.
 */
void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
			struct cpu_stop_work *work_buf)
{
	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
	cpu_stop_queue_work(cpu, work_buf);
}
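
/*
 * Illustrative sketch, not part of the original file: the nowait variant
 * fits callers that already hold locks or run with irqs disabled and so
 * cannot block (the scheduler's active load balancing follows this
 * pattern).  The names below are hypothetical; the key point is that
 * @work_buf must outlive the request, so it is typically embedded in a
 * longer-lived structure rather than placed on the stack.
 *
 *	struct example_state {
 *		struct cpu_stop_work	balance_work;	// stays valid
 *		int			flag;
 *	};
 *
 *	// from a context that must not sleep:
 *	state->flag = 1;
 *	stop_one_cpu_nowait(busiest_cpu, example_balance_cb, state,
 *			    &state->balance_work);
 *	// no completion is reported; example_balance_cb() clears ->flag
 *	// itself when it has run (it never runs if the cpu was offline).
 */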

/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);

static void queue_stop_cpus_work(const struct cpumask *cpumask,
				 cpu_stop_fn_t fn, void *arg,
				 struct cpu_stop_done *done)
{
	struct cpu_stop_work *work;
	unsigned int cpu;

	/* initialize works and done */
	for_each_cpu(cpu, cpumask) {
		work = &per_cpu(stop_cpus_work, cpu);
		work->fn = fn;
		work->arg = arg;
		work->done = done;
	}

	/*
	 * Disable preemption while queueing to avoid getting
	 * preempted by a stopper that might wait for other stoppers
	 * to enter @fn, which can lead to deadlock.
	 */
	lg_global_lock(&stop_cpus_lock);
	for_each_cpu(cpu, cpumask)
		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
	lg_global_unlock(&stop_cpus_lock);
}

static int __stop_cpus(const struct cpumask *cpumask,
		       cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;

	cpu_stop_init_done(&done, cpumask_weight(cpumask));
	queue_stop_cpus_work(cpumask, fn, arg, &done);
	wait_for_completion(&done.completion);
	return done.executed ? done.ret : -ENOENT;
}

/**
 * stop_cpus - stop multiple cpus
 * @cpumask: cpus to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on online cpus in @cpumask.  On each target cpu,
 * @fn is run in a process context with the highest priority
 * preempting any task on the cpu and monopolizing it.  This function
 * returns after all executions are complete.
 *
 * This function doesn't guarantee the cpus in @cpumask stay online
 * till @fn completes.  If some cpus go down in the middle, execution
 * on the cpu may happen partially or fully on different cpus.  @fn
 * should either be ready for that or the caller should ensure that
 * the cpus stay online until this function completes.
 *
 * All stop_cpus() calls are serialized making it safe for @fn to wait
 * for all cpus to start executing it.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
 * @cpumask were offline; otherwise, 0 if all executions of @fn
 * returned 0, any non zero return value if any returned non zero.
 */
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
	int ret;

	/* static works are used, process one request at a time */
	mutex_lock(&stop_cpus_mutex);
	ret = __stop_cpus(cpumask, fn, arg);
	mutex_unlock(&stop_cpus_mutex);
	return ret;
}
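
/*
 * Illustrative sketch, not part of the original file: a caller that wants
 * every online cpu to run a short, non-sleeping callback.  The callback
 * name is hypothetical.  Because all stop_cpus() calls are serialized by
 * stop_cpus_mutex, it is safe for the callback to spin waiting for its
 * peers to reach it.
 *
 *	static int example_sync_op(void *arg)
 *	{
 *		// runs on every online cpu, at the highest priority
 *		atomic_inc((atomic_t *)arg);
 *		return 0;
 *	}
 *
 *	atomic_t count = ATOMIC_INIT(0);
 *	ret = stop_cpus(cpu_online_mask, example_sync_op, &count);
 *	// ret == 0 on success; count ends up at the number of cpus that
 *	// actually executed the callback.
 */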

/**
 * try_stop_cpus - try to stop multiple cpus
 * @cpumask: cpus to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Identical to stop_cpus() except that it fails with -EAGAIN if
 * someone else is already using the facility.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
 * @fn(@arg) was not executed at all because all cpus in @cpumask were
 * offline; otherwise, 0 if all executions of @fn returned 0, any non
 * zero return value if any returned non zero.
 */
int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
	int ret;

	/* static works are used, process one request at a time */
	if (!mutex_trylock(&stop_cpus_mutex))
		return -EAGAIN;
	ret = __stop_cpus(cpumask, fn, arg);
	mutex_unlock(&stop_cpus_mutex);
	return ret;
}
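
/*
 * Illustrative sketch, not part of the original file: the non-blocking
 * variant suits opportunistic callers that can retry or fall back when
 * somebody else is already stopping cpus (expedited RCU grace periods
 * have used this pattern).  example_sync_op() is the hypothetical
 * callback from the stop_cpus() sketch above.
 *
 *	for (;;) {
 *		ret = try_stop_cpus(cpu_online_mask, example_sync_op, NULL);
 *		if (ret != -EAGAIN)
 *			break;		// executed, or -ENOENT if all offline
 *		// facility busy: back off, or fall back to a slower path
 *		cond_resched();
 *	}
 */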

static int cpu_stop_should_run(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	unsigned long flags;
	int run;

	spin_lock_irqsave(&stopper->lock, flags);
	run = !list_empty(&stopper->works);
	spin_unlock_irqrestore(&stopper->lock, flags);
	return run;
}

static void cpu_stopper_thread(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	struct cpu_stop_work *work;
	int ret;

repeat:
	work = NULL;
	spin_lock_irq(&stopper->lock);
	if (!list_empty(&stopper->works)) {
		work = list_first_entry(&stopper->works,
					struct cpu_stop_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_irq(&stopper->lock);

	if (work) {
		cpu_stop_fn_t fn = work->fn;
		void *arg = work->arg;
		struct cpu_stop_done *done = work->done;
		char ksym_buf[KSYM_NAME_LEN] __maybe_unused;

		/* cpu stop callbacks are not allowed to sleep */
		preempt_disable();

		ret = fn(arg);
		if (ret)
			done->ret = ret;

		/* restore preemption and check it's still balanced */
		preempt_enable();
		WARN_ONCE(preempt_count(),
			  "cpu_stop: %s(%p) leaked preempt count\n",
			  kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
					  ksym_buf), arg);

		cpu_stop_signal_done(done, true);
		goto repeat;
	}
}

extern void sched_set_stop_task(int cpu, struct task_struct *stop);

static void cpu_stop_create(unsigned int cpu)
{
	sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
}

static void cpu_stop_park(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	struct cpu_stop_work *work;
	unsigned long flags;

	/* drain remaining works */
	spin_lock_irqsave(&stopper->lock, flags);
	list_for_each_entry(work, &stopper->works, list)
		cpu_stop_signal_done(work->done, false);
	stopper->enabled = false;
	spin_unlock_irqrestore(&stopper->lock, flags);
}

static void cpu_stop_unpark(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	spin_lock_irq(&stopper->lock);
	stopper->enabled = true;
	spin_unlock_irq(&stopper->lock);
}

static struct smp_hotplug_thread cpu_stop_threads = {
	.store			= &cpu_stopper_task,
	.thread_should_run	= cpu_stop_should_run,
	.thread_fn		= cpu_stopper_thread,
	.thread_comm		= "migration/%u",
	.create			= cpu_stop_create,
	.setup			= cpu_stop_unpark,
	.park			= cpu_stop_park,
	.pre_unpark		= cpu_stop_unpark,
	.selfparking		= true,
};

static int __init cpu_stop_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

		spin_lock_init(&stopper->lock);
		INIT_LIST_HEAD(&stopper->works);
	}

	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
	stop_machine_initialized = true;
	return 0;
}
early_initcall(cpu_stop_init);

#ifdef CONFIG_STOP_MACHINE

int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
	struct multi_stop_data msdata = {
		.fn = fn,
		.data = data,
		.num_threads = num_online_cpus(),
		.active_cpus = cpus,
	};

	if (!stop_machine_initialized) {
		/*
		 * Handle the case where stop_machine() is called
		 * early in boot, before the stopper threads have
		 * been set up by cpu_stop_init().
		 */
		unsigned long flags;
		int ret;

		WARN_ON_ONCE(msdata.num_threads != 1);

		local_irq_save(flags);
		hard_irq_disable();
		ret = (*fn)(data);
		local_irq_restore(flags);

		return ret;
	}

	/* Set the initial state and stop all online cpus. */
	set_state(&msdata, MULTI_STOP_PREPARE);
	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}

int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
	int ret;

	/* No CPUs can come up or down during this. */
	get_online_cpus();
	ret = __stop_machine(fn, data, cpus);
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
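
/*
 * Illustrative sketch, not part of the original file: stop_machine() is
 * the heavy hammer for rare, global state changes that nothing may
 * observe half-done (cpu hotplug and some code-patching paths use it).
 * The callback, config and global names below are hypothetical.
 *
 *	static int example_switch_mode(void *arg)
 *	{
 *		// every other online cpu spins with irqs off while this runs
 *		struct example_cfg *cfg = arg;
 *		example_global_mode = cfg->new_mode;	// hypothetical global
 *		return 0;
 *	}
 *
 *	ret = stop_machine(example_switch_mode, &cfg, NULL);
 *	// with @cpus == NULL the callback runs on the first online cpu
 *	// only; pass a cpumask to run it on specific cpus instead.
 */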

/**
 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
 * @fn: the function to run
 * @data: the data ptr for the @fn()
 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
 *
 * This is identical to stop_machine() but can be called from a CPU which
 * is not active.  The local CPU is in the process of hotplug (so no other
 * CPU hotplug can start), is not marked active, and doesn't have enough
 * context to sleep.
 *
 * This function provides stop_machine() functionality in that state by
 * busy-waiting for synchronization and executing @fn directly on the
 * local CPU.
 *
 * CONTEXT:
 * Local CPU is inactive.  Temporarily stops all active CPUs.
 *
 * RETURNS:
 * 0 if all executions of @fn returned 0, any non zero return value if any
 * returned non zero.
 */
int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
				  const struct cpumask *cpus)
{
	struct multi_stop_data msdata = { .fn = fn, .data = data,
					    .active_cpus = cpus };
	struct cpu_stop_done done;
	int ret;

	/* Local CPU must be inactive and CPU hotplug in progress. */
	BUG_ON(cpu_active(raw_smp_processor_id()));
	msdata.num_threads = num_active_cpus() + 1;	/* +1 for local */

	/* No proper task established and can't sleep - busy wait for lock. */
	while (!mutex_trylock(&stop_cpus_mutex))
		cpu_relax();

	/* Schedule work on other CPUs and execute directly for local CPU */
	set_state(&msdata, MULTI_STOP_PREPARE);
	cpu_stop_init_done(&done, num_active_cpus());
	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
			     &done);
	ret = multi_cpu_stop(&msdata);

	/* Busy wait for completion. */
	while (!completion_done(&done.completion))
		cpu_relax();

	mutex_unlock(&stop_cpus_mutex);
	return ret ?: done.ret;
}
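
/*
 * Illustrative sketch, not part of the original file: this variant exists
 * for code that runs on a cpu being brought up, before that cpu is marked
 * active (e.g. the x86 MTRR rendezvous).  The callback and argument names
 * below are hypothetical.
 *
 *	// running on the hotplugged cpu; irqs may be off, must not sleep
 *	ret = stop_machine_from_inactive_cpu(example_apply_regs, &regs, NULL);
 *	// the active cpus are caught in multi_cpu_stop() via their stopper
 *	// threads, while the local cpu joins the same state machine by
 *	// calling multi_cpu_stop() directly instead of sleeping.
 */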

#endif	/* CONFIG_STOP_MACHINE */