2#include <linux/sched.h>
3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
6#include <linux/mutex.h>
7#include <linux/spinlock.h>
8#include <linux/stop_machine.h>
9#include <linux/irq_work.h>
10#include <linux/tick.h>
11#include <linux/slab.h>
12
13#include "cpupri.h"
14#include "cpudeadline.h"
15#include "cpuacct.h"
16
17struct rq;
18struct cpuidle_state;
19
20/* task_struct::on_rq states: */
21#define TASK_ON_RQ_QUEUED	1
22#define TASK_ON_RQ_MIGRATING	2
23
24extern __read_mostly int scheduler_running;
25
26extern unsigned long calc_load_update;
27extern atomic_long_t calc_load_tasks;
28
29extern long calc_load_fold_active(struct rq *this_rq);
30extern void update_cpu_load_active(struct rq *this_rq);
31
32/*
33 * Helpers for converting nanosecond timing to jiffy resolution
34 */
35#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
36
37/*
38 * Increase resolution of nice-level calculations for 64-bit architectures.
39 * The extra resolution improves shares distribution and load balancing of
40 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
41 * hierarchies, especially on larger systems. This is not a user-visible change
42 * and does not change the user-interface for setting shares/weights.
43 *
44 * We increase resolution only if we have enough bits to allow this increased
45 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
46 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
47 * increased costs.
48 */
49#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
50# define SCHED_LOAD_RESOLUTION	10
51# define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
52# define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
53#else
54# define SCHED_LOAD_RESOLUTION	0
55# define scale_load(w)		(w)
56# define scale_load_down(w)	(w)
57#endif
58
59#define SCHED_LOAD_SHIFT	(10 + SCHED_LOAD_RESOLUTION)
60#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
61
62#define NICE_0_LOAD		SCHED_LOAD_SCALE
63#define NICE_0_SHIFT		SCHED_LOAD_SHIFT
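
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * interface): user-visible weights (e.g. a task group's shares) are stored
 * internally via scale_load() and converted back with scale_load_down(),
 * so the round trip below is expected to return @user_weight unchanged
 * regardless of SCHED_LOAD_RESOLUTION.
 */
static inline unsigned long scale_load_roundtrip(unsigned long user_weight)
{
	return scale_load_down(scale_load(user_weight));
}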
64
65/*
66 * Single value that decides SCHED_DEADLINE internal math precision.
67 * 10 -> just above 1us
68 * 9  -> just above 0.5us
69 */
70#define DL_SCALE (10)
71
72/*
73 * These are the 'tuning knobs' of the scheduler:
74 */
75
/*
 * Single value that denotes runtime == period, i.e., unlimited time.
 */
79#define RUNTIME_INF	((u64)~0ULL)
80
81static inline int fair_policy(int policy)
82{
83	return policy == SCHED_NORMAL || policy == SCHED_BATCH;
84}
85
86static inline int rt_policy(int policy)
87{
88	return policy == SCHED_FIFO || policy == SCHED_RR;
89}
90
91static inline int dl_policy(int policy)
92{
93	return policy == SCHED_DEADLINE;
94}
95
96static inline int task_has_rt_policy(struct task_struct *p)
97{
98	return rt_policy(p->policy);
99}
100
101static inline int task_has_dl_policy(struct task_struct *p)
102{
103	return dl_policy(p->policy);
104}
105
106static inline bool dl_time_before(u64 a, u64 b)
107{
108	return (s64)(a - b) < 0;
109}
110
111/*
112 * Tells if entity @a should preempt entity @b.
113 */
114static inline bool
115dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
116{
117	return dl_time_before(a->deadline, b->deadline);
118}
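
/*
 * Illustrative sketch (the helper below is hypothetical): because
 * dl_time_before() compares via a signed difference, it stays correct even
 * if the u64 clock wraps.  For example, with a = 5 and b = ULLONG_MAX - 10,
 * (s64)(b - a) is a small negative number, so b is still considered the
 * earlier deadline despite being numerically larger.
 */
static inline u64 dl_earlier_deadline(u64 a, u64 b)
{
	return dl_time_before(a, b) ? a : b;
}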
119
120/*
121 * This is the priority-queue data structure of the RT scheduling class:
122 */
123struct rt_prio_array {
124	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
125	struct list_head queue[MAX_RT_PRIO];
126};
127
128struct rt_bandwidth {
129	/* nests inside the rq lock: */
130	raw_spinlock_t		rt_runtime_lock;
131	ktime_t			rt_period;
132	u64			rt_runtime;
133	struct hrtimer		rt_period_timer;
134};
135
136void __dl_clear_params(struct task_struct *p);
137
/*
 * To keep the bandwidth of -deadline tasks and groups under control
 * we need some place where we:
 *  - store the maximum -deadline bandwidth of the system (the group);
 *  - cache the fraction of that bandwidth that is currently allocated.
 *
 * This is all done in the data structure below. It is similar to the
 * one used for RT-throttling (rt_bandwidth), with the main difference
 * that, since here we are only interested in admission control, we
 * do not decrease any runtime while the group "executes", nor do we
 * need a timer to replenish it.
 *
 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 * meaning that:
 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 *  - dl_total_bw array contains, in the i-th element, the currently
 *    allocated bandwidth on the i-th CPU.
 * Moreover, groups consume bandwidth on each CPU, while tasks only
 * consume bandwidth on the CPU they're running on.
 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
 * that will be shown the next time the proc or cgroup controls are
 * read. It can in turn be changed by writing to its own control.
 */
162struct dl_bandwidth {
163	raw_spinlock_t dl_runtime_lock;
164	u64 dl_runtime;
165	u64 dl_period;
166};
167
168static inline int dl_bandwidth_enabled(void)
169{
170	return sysctl_sched_rt_runtime >= 0;
171}
172
173extern struct dl_bw *dl_bw_of(int i);
174
175struct dl_bw {
176	raw_spinlock_t lock;
177	u64 bw, total_bw;
178};
179
180static inline
181void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
182{
183	dl_b->total_bw -= tsk_bw;
184}
185
186static inline
187void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
188{
189	dl_b->total_bw += tsk_bw;
190}
191
192static inline
193bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
194{
195	return dl_b->bw != -1 &&
196	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
197}
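
/*
 * Illustrative sketch (hypothetical helper): the admission-control pattern
 * the helpers above are meant for.  @tsk_bw is the task's bandwidth in the
 * same fixed-point unit as dl_b->bw; the new task is accepted, and its
 * share accounted, only if the root domain's @cpus CPUs can still
 * accommodate it.
 */
static inline bool dl_try_admit(struct dl_bw *dl_b, int cpus, u64 tsk_bw)
{
	bool admitted;

	raw_spin_lock(&dl_b->lock);
	admitted = !__dl_overflow(dl_b, cpus, 0, tsk_bw);
	if (admitted)
		__dl_add(dl_b, tsk_bw);
	raw_spin_unlock(&dl_b->lock);

	return admitted;
}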
198
199extern struct mutex sched_domains_mutex;
200
201#ifdef CONFIG_CGROUP_SCHED
202
203#include <linux/cgroup.h>
204
205struct cfs_rq;
206struct rt_rq;
207
208extern struct list_head task_groups;
209
210struct cfs_bandwidth {
211#ifdef CONFIG_CFS_BANDWIDTH
212	raw_spinlock_t lock;
213	ktime_t period;
214	u64 quota, runtime;
215	s64 hierarchical_quota;
216	u64 runtime_expires;
217
218	int idle, timer_active;
219	struct hrtimer period_timer, slack_timer;
220	struct list_head throttled_cfs_rq;
221
222	/* statistics */
223	int nr_periods, nr_throttled;
224	u64 throttled_time;
225#endif
226};
227
228/* task group related information */
229struct task_group {
230	struct cgroup_subsys_state css;
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233	/* schedulable entities of this group on each cpu */
234	struct sched_entity **se;
235	/* runqueue "owned" by this group on each cpu */
236	struct cfs_rq **cfs_rq;
237	unsigned long shares;
238
239#ifdef	CONFIG_SMP
240	atomic_long_t load_avg;
241	atomic_t runnable_avg;
242#endif
243#endif
244
245#ifdef CONFIG_RT_GROUP_SCHED
246	struct sched_rt_entity **rt_se;
247	struct rt_rq **rt_rq;
248
249	struct rt_bandwidth rt_bandwidth;
250#endif
251
252	struct rcu_head rcu;
253	struct list_head list;
254
255	struct task_group *parent;
256	struct list_head siblings;
257	struct list_head children;
258
259#ifdef CONFIG_SCHED_AUTOGROUP
260	struct autogroup *autogroup;
261#endif
262
263	struct cfs_bandwidth cfs_bandwidth;
264};
265
266#ifdef CONFIG_FAIR_GROUP_SCHED
267#define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
268
/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on that cfs_rq, so the weight of an entity should not be too
 * large, and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
277#define MIN_SHARES	(1UL <<  1)
278#define MAX_SHARES	(1UL << 18)
279#endif
280
281typedef int (*tg_visitor)(struct task_group *, void *);
282
283extern int walk_tg_tree_from(struct task_group *from,
284			     tg_visitor down, tg_visitor up, void *data);
285
286/*
287 * Iterate the full tree, calling @down when first entering a node and @up when
288 * leaving it for the final time.
289 *
290 * Caller must hold rcu_lock or sufficient equivalent.
291 */
292static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
293{
294	return walk_tg_tree_from(&root_task_group, down, up, data);
295}
296
297extern int tg_nop(struct task_group *tg, void *data);
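
/*
 * Illustrative sketch (hypothetical helpers): a typical walk_tg_tree()
 * user inspects every group in the hierarchy, passing tg_nop for the
 * direction it does not care about and holding rcu_read_lock() across
 * the walk, as required above.
 */
static inline int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;
}

static inline int nr_task_groups_sketch(void)
{
	int count = 0;

	rcu_read_lock();
	walk_tg_tree(tg_count_one, tg_nop, &count);
	rcu_read_unlock();

	return count;
}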
298
299extern void free_fair_sched_group(struct task_group *tg);
300extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
301extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
302extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
303			struct sched_entity *se, int cpu,
304			struct sched_entity *parent);
305extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
307
308extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
309extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
310extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
311
312extern void free_rt_sched_group(struct task_group *tg);
313extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
314extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
315		struct sched_rt_entity *rt_se, int cpu,
316		struct sched_rt_entity *parent);
317
318extern struct task_group *sched_create_group(struct task_group *parent);
319extern void sched_online_group(struct task_group *tg,
320			       struct task_group *parent);
321extern void sched_destroy_group(struct task_group *tg);
322extern void sched_offline_group(struct task_group *tg);
323
324extern void sched_move_task(struct task_struct *tsk);
325
326#ifdef CONFIG_FAIR_GROUP_SCHED
327extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
328#endif
329
330#else /* CONFIG_CGROUP_SCHED */
331
332struct cfs_bandwidth { };
333
334#endif	/* CONFIG_CGROUP_SCHED */
335
336/* CFS-related fields in a runqueue */
337struct cfs_rq {
338	struct load_weight load;
339	unsigned int nr_running, h_nr_running;
340
341	u64 exec_clock;
342	u64 min_vruntime;
343#ifndef CONFIG_64BIT
344	u64 min_vruntime_copy;
345#endif
346
347	struct rb_root tasks_timeline;
348	struct rb_node *rb_leftmost;
349
	/*
	 * 'curr' points to the currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none is currently running).
	 */
354	struct sched_entity *curr, *next, *last, *skip;
355
356#ifdef	CONFIG_SCHED_DEBUG
357	unsigned int nr_spread_over;
358#endif
359
360#ifdef CONFIG_SMP
	/*
	 * CFS Load tracking
	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
	 * This allows for the description of both thread and group usage (in
	 * the FAIR_GROUP_SCHED case).
	 * runnable_load_avg is the sum of the load_avg_contrib of the
	 * sched_entities on the rq.
	 * blocked_load_avg is similar to runnable_load_avg, except that it is
	 * summed over the blocked sched_entities on the rq.
	 * utilization_load_avg is the sum of the average running time of the
	 * sched_entities on the rq.
	 */
373	unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
374	atomic64_t decay_counter;
375	u64 last_decay;
376	atomic_long_t removed_load;
377
378#ifdef CONFIG_FAIR_GROUP_SCHED
379	/* Required to track per-cpu representation of a task_group */
380	u32 tg_runnable_contrib;
381	unsigned long tg_load_contrib;
382
383	/*
384	 *   h_load = weight * f(tg)
385	 *
386	 * Where f(tg) is the recursive weight fraction assigned to
387	 * this group.
388	 */
389	unsigned long h_load;
390	u64 last_h_load_update;
391	struct sched_entity *h_load_next;
392#endif /* CONFIG_FAIR_GROUP_SCHED */
393#endif /* CONFIG_SMP */
394
395#ifdef CONFIG_FAIR_GROUP_SCHED
396	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
397
	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf cfs_rqs hold other, higher schedulable
	 * entities (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together the list of leaf cfs_rqs on a cpu.
	 * This list is used during load balance.
	 */
406	int on_list;
407	struct list_head leaf_cfs_rq_list;
408	struct task_group *tg;	/* group that "owns" this runqueue */
409
410#ifdef CONFIG_CFS_BANDWIDTH
411	int runtime_enabled;
412	u64 runtime_expires;
413	s64 runtime_remaining;
414
415	u64 throttled_clock, throttled_clock_task;
416	u64 throttled_clock_task_time;
417	int throttled, throttle_count;
418	struct list_head throttled_list;
419#endif /* CONFIG_CFS_BANDWIDTH */
420#endif /* CONFIG_FAIR_GROUP_SCHED */
421};
422
423static inline int rt_bandwidth_enabled(void)
424{
425	return sysctl_sched_rt_runtime >= 0;
426}
427
428/* RT IPI pull logic requires IRQ_WORK */
429#ifdef CONFIG_IRQ_WORK
430# define HAVE_RT_PUSH_IPI
431#endif
432
433/* Real-Time classes' related field in a runqueue: */
434struct rt_rq {
435	struct rt_prio_array active;
436	unsigned int rt_nr_running;
437#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
438	struct {
439		int curr; /* highest queued rt task prio */
440#ifdef CONFIG_SMP
441		int next; /* next highest */
442#endif
443	} highest_prio;
444#endif
445#ifdef CONFIG_SMP
446	unsigned long rt_nr_migratory;
447	unsigned long rt_nr_total;
448	int overloaded;
449	struct plist_head pushable_tasks;
450#ifdef HAVE_RT_PUSH_IPI
451	int push_flags;
452	int push_cpu;
453	struct irq_work push_work;
454	raw_spinlock_t push_lock;
455#endif
456#endif /* CONFIG_SMP */
457	int rt_queued;
458
459	int rt_throttled;
460	u64 rt_time;
461	u64 rt_runtime;
462	/* Nests inside the rq lock: */
463	raw_spinlock_t rt_runtime_lock;
464
465#ifdef CONFIG_RT_GROUP_SCHED
466	unsigned long rt_nr_boosted;
467
468	struct rq *rq;
469	struct task_group *tg;
470#endif
471};
472
473/* Deadline class' related fields in a runqueue */
474struct dl_rq {
475	/* runqueue is an rbtree, ordered by deadline */
476	struct rb_root rb_root;
477	struct rb_node *rb_leftmost;
478
479	unsigned long dl_nr_running;
480
481#ifdef CONFIG_SMP
	/*
	 * Deadline values of the currently executing and the
	 * earliest ready task on this rq. Caching these facilitates
	 * the decision whether or not a ready but not running task
	 * should migrate somewhere else.
	 */
488	struct {
489		u64 curr;
490		u64 next;
491	} earliest_dl;
492
493	unsigned long dl_nr_migratory;
494	int overloaded;
495
496	/*
497	 * Tasks on this rq that can be pushed away. They are kept in
498	 * an rb-tree, ordered by tasks' deadlines, with caching
499	 * of the leftmost (earliest deadline) element.
500	 */
501	struct rb_root pushable_dl_tasks_root;
502	struct rb_node *pushable_dl_tasks_leftmost;
503#else
504	struct dl_bw dl_bw;
505#endif
506};
507
508#ifdef CONFIG_SMP
509
510/*
511 * We add the notion of a root-domain which will be used to define per-domain
512 * variables. Each exclusive cpuset essentially defines an island domain by
513 * fully partitioning the member cpus from any other cpuset. Whenever a new
514 * exclusive cpuset is created, we also create and attach a new root-domain
515 * object.
516 *
517 */
518struct root_domain {
519	atomic_t refcount;
520	atomic_t rto_count;
521	struct rcu_head rcu;
522	cpumask_var_t span;
523	cpumask_var_t online;
524
525	/* Indicate more than one runnable task for any CPU */
526	bool overload;
527
528	/*
529	 * The bit corresponding to a CPU gets set here if such CPU has more
530	 * than one runnable -deadline task (as it is below for RT tasks).
531	 */
532	cpumask_var_t dlo_mask;
533	atomic_t dlo_count;
534	struct dl_bw dl_bw;
535	struct cpudl cpudl;
536
537	/*
538	 * The "RT overload" flag: it gets set if a CPU has more than
539	 * one runnable RT task.
540	 */
541	cpumask_var_t rto_mask;
542	struct cpupri cpupri;
543};
544
545extern struct root_domain def_root_domain;
546
547#endif /* CONFIG_SMP */
548
/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: code that wants to lock multiple runqueues (such as
 * the load balancing or the thread migration code) must acquire the
 * locks in ascending &runqueue address order.
 */
556struct rq {
557	/* runqueue lock: */
558	raw_spinlock_t lock;
559
560	/*
561	 * nr_running and cpu_load should be in the same cacheline because
562	 * remote CPUs use both these fields when doing load calculation.
563	 */
564	unsigned int nr_running;
565#ifdef CONFIG_NUMA_BALANCING
566	unsigned int nr_numa_running;
567	unsigned int nr_preferred_running;
568#endif
569	#define CPU_LOAD_IDX_MAX 5
570	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
571	unsigned long last_load_update_tick;
572#ifdef CONFIG_NO_HZ_COMMON
573	u64 nohz_stamp;
574	unsigned long nohz_flags;
575#endif
576#ifdef CONFIG_NO_HZ_FULL
577	unsigned long last_sched_tick;
578#endif
579	/* capture load from *all* tasks on this cpu: */
580	struct load_weight load;
581	unsigned long nr_load_updates;
582	u64 nr_switches;
583
584	struct cfs_rq cfs;
585	struct rt_rq rt;
586	struct dl_rq dl;
587
588#ifdef CONFIG_FAIR_GROUP_SCHED
589	/* list of leaf cfs_rq on this cpu: */
590	struct list_head leaf_cfs_rq_list;
591
592	struct sched_avg avg;
593#endif /* CONFIG_FAIR_GROUP_SCHED */
594
595	/*
596	 * This is part of a global counter where only the total sum
597	 * over all CPUs matters. A task can increase this counter on
598	 * one CPU and if it got migrated afterwards it may decrease
599	 * it on another CPU. Always updated under the runqueue lock:
600	 */
601	unsigned long nr_uninterruptible;
602
603	struct task_struct *curr, *idle, *stop;
604	unsigned long next_balance;
605	struct mm_struct *prev_mm;
606
607	unsigned int clock_skip_update;
608	u64 clock;
609	u64 clock_task;
610
611	atomic_t nr_iowait;
612
613#ifdef CONFIG_SMP
614	struct root_domain *rd;
615	struct sched_domain *sd;
616
617	unsigned long cpu_capacity;
618	unsigned long cpu_capacity_orig;
619
620	unsigned char idle_balance;
621	/* For active balancing */
622	int post_schedule;
623	int active_balance;
624	int push_cpu;
625	struct cpu_stop_work active_balance_work;
626	/* cpu of this runqueue: */
627	int cpu;
628	int online;
629
630	struct list_head cfs_tasks;
631
632	u64 rt_avg;
633	u64 age_stamp;
634	u64 idle_stamp;
635	u64 avg_idle;
636
637	/* This is used to determine avg_idle's max value */
638	u64 max_idle_balance_cost;
639#endif
640
641#ifdef CONFIG_IRQ_TIME_ACCOUNTING
642	u64 prev_irq_time;
643#endif
644#ifdef CONFIG_PARAVIRT
645	u64 prev_steal_time;
646#endif
647#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
648	u64 prev_steal_time_rq;
649#endif
650
651	/* calc_load related fields */
652	unsigned long calc_load_update;
653	long calc_load_active;
654
655#ifdef CONFIG_SCHED_HRTICK
656#ifdef CONFIG_SMP
657	int hrtick_csd_pending;
658	struct call_single_data hrtick_csd;
659#endif
660	struct hrtimer hrtick_timer;
661#endif
662
663#ifdef CONFIG_SCHEDSTATS
664	/* latency stats */
665	struct sched_info rq_sched_info;
666	unsigned long long rq_cpu_time;
667	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
668
669	/* sys_sched_yield() stats */
670	unsigned int yld_count;
671
672	/* schedule() stats */
673	unsigned int sched_count;
674	unsigned int sched_goidle;
675
676	/* try_to_wake_up() stats */
677	unsigned int ttwu_count;
678	unsigned int ttwu_local;
679#endif
680
681#ifdef CONFIG_SMP
682	struct llist_head wake_list;
683#endif
684
685#ifdef CONFIG_CPU_IDLE
686	/* Must be inspected within a rcu lock section */
687	struct cpuidle_state *idle_state;
688#endif
689};
690
691static inline int cpu_of(struct rq *rq)
692{
693#ifdef CONFIG_SMP
694	return rq->cpu;
695#else
696	return 0;
697#endif
698}
699
700DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
701
702#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
703#define this_rq()		this_cpu_ptr(&runqueues)
704#define task_rq(p)		cpu_rq(task_cpu(p))
705#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
706#define raw_rq()		raw_cpu_ptr(&runqueues)
707
708static inline u64 __rq_clock_broken(struct rq *rq)
709{
710	return ACCESS_ONCE(rq->clock);
711}
712
713static inline u64 rq_clock(struct rq *rq)
714{
715	lockdep_assert_held(&rq->lock);
716	return rq->clock;
717}
718
719static inline u64 rq_clock_task(struct rq *rq)
720{
721	lockdep_assert_held(&rq->lock);
722	return rq->clock_task;
723}
724
725#define RQCF_REQ_SKIP	0x01
726#define RQCF_ACT_SKIP	0x02
727
728static inline void rq_clock_skip_update(struct rq *rq, bool skip)
729{
730	lockdep_assert_held(&rq->lock);
731	if (skip)
732		rq->clock_skip_update |= RQCF_REQ_SKIP;
733	else
734		rq->clock_skip_update &= ~RQCF_REQ_SKIP;
735}
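
/*
 * Illustrative sketch (hypothetical helper): a skip requested with
 * rq_clock_skip_update(rq, true) only takes effect once the scheduler core
 * promotes RQCF_REQ_SKIP to RQCF_ACT_SKIP, at which point
 * update_rq_clock() returns early instead of reading the clock again.
 */
static inline bool rq_clock_update_skipped_sketch(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	return rq->clock_skip_update & RQCF_ACT_SKIP;
}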
736
737#ifdef CONFIG_NUMA
738enum numa_topology_type {
739	NUMA_DIRECT,
740	NUMA_GLUELESS_MESH,
741	NUMA_BACKPLANE,
742};
743extern enum numa_topology_type sched_numa_topology_type;
744extern int sched_max_numa_distance;
745extern bool find_numa_distance(int distance);
746#endif
747
748#ifdef CONFIG_NUMA_BALANCING
749/* The regions in numa_faults array from task_struct */
750enum numa_faults_stats {
751	NUMA_MEM = 0,
752	NUMA_CPU,
753	NUMA_MEMBUF,
754	NUMA_CPUBUF
755};
756extern void sched_setnuma(struct task_struct *p, int node);
757extern int migrate_task_to(struct task_struct *p, int cpu);
758extern int migrate_swap(struct task_struct *, struct task_struct *);
759#endif /* CONFIG_NUMA_BALANCING */
760
761#ifdef CONFIG_SMP
762
763extern void sched_ttwu_pending(void);
764
765#define rcu_dereference_check_sched_domain(p) \
766	rcu_dereference_check((p), \
767			      lockdep_is_held(&sched_domains_mutex))
768
769/*
770 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
771 * See detach_destroy_domains: synchronize_sched for details.
772 *
773 * The domain tree of any CPU may only be accessed from within
774 * preempt-disabled sections.
775 */
776#define for_each_domain(cpu, __sd) \
777	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
778			__sd; __sd = __sd->parent)
779
780#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
781
782/**
783 * highest_flag_domain - Return highest sched_domain containing flag.
784 * @cpu:	The cpu whose highest level of sched domain is to
785 *		be returned.
786 * @flag:	The flag to check for the highest sched_domain
787 *		for the given cpu.
788 *
789 * Returns the highest sched_domain of a cpu which contains the given flag.
790 */
791static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
792{
793	struct sched_domain *sd, *hsd = NULL;
794
795	for_each_domain(cpu, sd) {
796		if (!(sd->flags & flag))
797			break;
798		hsd = sd;
799	}
800
801	return hsd;
802}
803
804static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
805{
806	struct sched_domain *sd;
807
808	for_each_domain(cpu, sd) {
809		if (sd->flags & flag)
810			break;
811	}
812
813	return sd;
814}
815
816DECLARE_PER_CPU(struct sched_domain *, sd_llc);
817DECLARE_PER_CPU(int, sd_llc_size);
818DECLARE_PER_CPU(int, sd_llc_id);
819DECLARE_PER_CPU(struct sched_domain *, sd_numa);
820DECLARE_PER_CPU(struct sched_domain *, sd_busy);
821DECLARE_PER_CPU(struct sched_domain *, sd_asym);
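
/*
 * Illustrative sketch (hypothetical helper): the per-cpu sd_llc_id above
 * makes "do these two CPUs share a last-level cache?" a single compare,
 * which is roughly how cpus_share_cache() is implemented in core.c.
 */
static inline bool cpus_share_llc_sketch(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}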
822
823struct sched_group_capacity {
824	atomic_t ref;
825	/*
826	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
827	 * for a single CPU.
828	 */
829	unsigned int capacity;
830	unsigned long next_update;
831	int imbalance; /* XXX unrelated to capacity but shared group state */
832	/*
833	 * Number of busy cpus in this group.
834	 */
835	atomic_t nr_busy_cpus;
836
837	unsigned long cpumask[0]; /* iteration mask */
838};
839
840struct sched_group {
841	struct sched_group *next;	/* Must be a circular list */
842	atomic_t ref;
843
844	unsigned int group_weight;
845	struct sched_group_capacity *sgc;
846
847	/*
848	 * The CPUs this group covers.
849	 *
850	 * NOTE: this field is variable length. (Allocated dynamically
851	 * by attaching extra space to the end of the structure,
852	 * depending on how many CPUs the kernel has booted up with)
853	 */
854	unsigned long cpumask[0];
855};
856
857static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
858{
859	return to_cpumask(sg->cpumask);
860}
861
862/*
863 * cpumask masking which cpus in the group are allowed to iterate up the domain
864 * tree.
865 */
866static inline struct cpumask *sched_group_mask(struct sched_group *sg)
867{
868	return to_cpumask(sg->sgc->cpumask);
869}
870
871/**
872 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
873 * @group: The group whose first cpu is to be returned.
874 */
875static inline unsigned int group_first_cpu(struct sched_group *group)
876{
877	return cpumask_first(sched_group_cpus(group));
878}
879
880extern int group_balance_cpu(struct sched_group *sg);
881
882#else
883
884static inline void sched_ttwu_pending(void) { }
885
886#endif /* CONFIG_SMP */
887
888#include "stats.h"
889#include "auto_group.h"
890
891#ifdef CONFIG_CGROUP_SCHED
892
/*
 * Return the group to which this task belongs.
 *
 * We cannot use task_css() and friends because the cgroup subsystem
 * changes that value before the cgroup_subsys::attach() method is called,
 * therefore we cannot pin it and might observe the wrong value.
 *
 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
 * core changes this before calling sched_move_task().
 *
 * Instead we use a 'copy' which is updated from sched_move_task() while
 * holding both task_struct::pi_lock and rq::lock.
 */
906static inline struct task_group *task_group(struct task_struct *p)
907{
908	return p->sched_task_group;
909}
910
911/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
912static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
913{
914#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
915	struct task_group *tg = task_group(p);
916#endif
917
918#ifdef CONFIG_FAIR_GROUP_SCHED
919	p->se.cfs_rq = tg->cfs_rq[cpu];
920	p->se.parent = tg->se[cpu];
921#endif
922
923#ifdef CONFIG_RT_GROUP_SCHED
924	p->rt.rt_rq  = tg->rt_rq[cpu];
925	p->rt.parent = tg->rt_se[cpu];
926#endif
927}
928
929#else /* CONFIG_CGROUP_SCHED */
930
931static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
932static inline struct task_group *task_group(struct task_struct *p)
933{
934	return NULL;
935}
936
937#endif /* CONFIG_CGROUP_SCHED */
938
939static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
940{
941	set_task_rq(p, cpu);
942#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates
	 * of per-task data have been completed by this moment.
	 */
948	smp_wmb();
949	task_thread_info(p)->cpu = cpu;
950	p->wake_cpu = cpu;
951#endif
952}
953
954/*
955 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
956 */
957#ifdef CONFIG_SCHED_DEBUG
958# include <linux/static_key.h>
959# define const_debug __read_mostly
960#else
961# define const_debug const
962#endif
963
964extern const_debug unsigned int sysctl_sched_features;
965
966#define SCHED_FEAT(name, enabled)	\
967	__SCHED_FEAT_##name ,
968
969enum {
970#include "features.h"
971	__SCHED_FEAT_NR,
972};
973
974#undef SCHED_FEAT
975
976#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
977#define SCHED_FEAT(name, enabled)					\
978static __always_inline bool static_branch_##name(struct static_key *key) \
979{									\
980	return static_key_##enabled(key);				\
981}
982
983#include "features.h"
984
985#undef SCHED_FEAT
986
987extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
988#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
989#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
990#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
991#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
992
993#ifdef CONFIG_NUMA_BALANCING
994#define sched_feat_numa(x) sched_feat(x)
995#ifdef CONFIG_SCHED_DEBUG
996#define numabalancing_enabled sched_feat_numa(NUMA)
997#else
998extern bool numabalancing_enabled;
999#endif /* CONFIG_SCHED_DEBUG */
1000#else
1001#define sched_feat_numa(x) (0)
1002#define numabalancing_enabled (0)
1003#endif /* CONFIG_NUMA_BALANCING */
1004
1005static inline u64 global_rt_period(void)
1006{
1007	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1008}
1009
1010static inline u64 global_rt_runtime(void)
1011{
1012	if (sysctl_sched_rt_runtime < 0)
1013		return RUNTIME_INF;
1014
1015	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1016}
1017
1018static inline int task_current(struct rq *rq, struct task_struct *p)
1019{
1020	return rq->curr == p;
1021}
1022
1023static inline int task_running(struct rq *rq, struct task_struct *p)
1024{
1025#ifdef CONFIG_SMP
1026	return p->on_cpu;
1027#else
1028	return task_current(rq, p);
1029#endif
1030}
1031
1032static inline int task_on_rq_queued(struct task_struct *p)
1033{
1034	return p->on_rq == TASK_ON_RQ_QUEUED;
1035}
1036
1037static inline int task_on_rq_migrating(struct task_struct *p)
1038{
1039	return p->on_rq == TASK_ON_RQ_MIGRATING;
1040}
1041
1042#ifndef prepare_arch_switch
1043# define prepare_arch_switch(next)	do { } while (0)
1044#endif
1045#ifndef finish_arch_switch
1046# define finish_arch_switch(prev)	do { } while (0)
1047#endif
1048#ifndef finish_arch_post_lock_switch
1049# define finish_arch_post_lock_switch()	do { } while (0)
1050#endif
1051
1052static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1053{
1054#ifdef CONFIG_SMP
1055	/*
1056	 * We can optimise this out completely for !SMP, because the
1057	 * SMP rebalancing from interrupt is the only thing that cares
1058	 * here.
1059	 */
1060	next->on_cpu = 1;
1061#endif
1062}
1063
1064static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1065{
1066#ifdef CONFIG_SMP
1067	/*
1068	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1069	 * We must ensure this doesn't happen until the switch is completely
1070	 * finished.
1071	 *
1072	 * Pairs with the control dependency and rmb in try_to_wake_up().
1073	 */
1074	smp_store_release(&prev->on_cpu, 0);
1075#endif
1076#ifdef CONFIG_DEBUG_SPINLOCK
1077	/* this is a valid case when another task releases the spinlock */
1078	rq->lock.owner = current;
1079#endif
1080	/*
1081	 * If we are tracking spinlock dependencies then we have to
1082	 * fix up the runqueue lock - which gets 'carried over' from
1083	 * prev into current:
1084	 */
1085	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1086
1087	raw_spin_unlock_irq(&rq->lock);
1088}
1089
1090/*
1091 * wake flags
1092 */
1093#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
1094#define WF_FORK		0x02		/* child wakeup after fork */
1095#define WF_MIGRATED	0x4		/* internal use, task got migrated */
1096
1097/*
1098 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1099 * of tasks with abnormal "nice" values across CPUs the contribution that
1100 * each task makes to its run queue's load is weighted according to its
1101 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1102 * scaled version of the new time slice allocation that they receive on time
1103 * slice expiry etc.
1104 */
1105
1106#define WEIGHT_IDLEPRIO                3
1107#define WMULT_IDLEPRIO         1431655765
1108
1109/*
1110 * Nice levels are multiplicative, with a gentle 10% change for every
1111 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1112 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1113 * that remained on nice 0.
1114 *
1115 * The "10% effect" is relative and cumulative: from _any_ nice level,
1116 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1117 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1118 * If a task goes up by ~10% and another task goes down by ~10% then
1119 * the relative distance between them is ~25%.)
1120 */
1121static const int prio_to_weight[40] = {
1122 /* -20 */     88761,     71755,     56483,     46273,     36291,
1123 /* -15 */     29154,     23254,     18705,     14949,     11916,
1124 /* -10 */      9548,      7620,      6100,      4904,      3906,
1125 /*  -5 */      3121,      2501,      1991,      1586,      1277,
1126 /*   0 */      1024,       820,       655,       526,       423,
1127 /*   5 */       335,       272,       215,       172,       137,
1128 /*  10 */       110,        87,        70,        56,        45,
1129 /*  15 */        36,        29,        23,        18,        15,
1130};
1131
/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetic by turning divisions
 * into multiplications:
 */
1139static const u32 prio_to_wmult[40] = {
1140 /* -20 */     48388,     59856,     76040,     92818,    118348,
1141 /* -15 */    147320,    184698,    229616,    287308,    360437,
1142 /* -10 */    449829,    563644,    704093,    875809,   1099582,
1143 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
1144 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
1145 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
1146 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
1147 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1148};
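
/*
 * Illustrative sketch (hypothetical helper, a simplified take on what the
 * fair class's __calc_delta() does with these tables): dividing by a
 * task's weight is turned into a multiply by the precalculated 2^32/weight
 * value followed by a 32-bit shift, then scaled back up by the nice-0
 * weight.  The real code is more careful about intermediate precision and
 * overflow.
 */
static inline u64 nice_scaled_delta_sketch(u64 delta_exec, int nice)
{
	int idx = nice + 20;	/* nice -20..19 maps to table index 0..39 */

	return NICE_0_LOAD * ((delta_exec * prio_to_wmult[idx]) >> 32);
}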
1149
1150#define ENQUEUE_WAKEUP		1
1151#define ENQUEUE_HEAD		2
1152#ifdef CONFIG_SMP
1153#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
1154#else
1155#define ENQUEUE_WAKING		0
1156#endif
1157#define ENQUEUE_REPLENISH	8
1158
1159#define DEQUEUE_SLEEP		1
1160
1161#define RETRY_TASK		((void *)-1UL)
1162
1163struct sched_class {
1164	const struct sched_class *next;
1165
1166	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1167	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1168	void (*yield_task) (struct rq *rq);
1169	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1170
1171	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1172
	/*
	 * The pick_next_task() method that returns the next task is
	 * responsible for calling put_prev_task() on the @prev task, or for
	 * doing something equivalent.
	 *
	 * May return RETRY_TASK when it finds that a higher-prio class has
	 * runnable tasks.
	 */
1181	struct task_struct * (*pick_next_task) (struct rq *rq,
1182						struct task_struct *prev);
1183	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1184
1185#ifdef CONFIG_SMP
1186	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1187	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1188
1189	void (*post_schedule) (struct rq *this_rq);
1190	void (*task_waking) (struct task_struct *task);
1191	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1192
1193	void (*set_cpus_allowed)(struct task_struct *p,
1194				 const struct cpumask *newmask);
1195
1196	void (*rq_online)(struct rq *rq);
1197	void (*rq_offline)(struct rq *rq);
1198#endif
1199
1200	void (*set_curr_task) (struct rq *rq);
1201	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1202	void (*task_fork) (struct task_struct *p);
1203	void (*task_dead) (struct task_struct *p);
1204
	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
	 * cannot assume the switched_from/switched_to pair is serialized by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
1210	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1211	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1212	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1213			     int oldprio);
1214
1215	unsigned int (*get_rr_interval) (struct rq *rq,
1216					 struct task_struct *task);
1217
1218	void (*update_curr) (struct rq *rq);
1219
1220#ifdef CONFIG_FAIR_GROUP_SCHED
1221	void (*task_move_group) (struct task_struct *p, int on_rq);
1222#endif
1223};
1224
1225static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1226{
1227	prev->sched_class->put_prev_task(rq, prev);
1228}
1229
1230#define sched_class_highest (&stop_sched_class)
1231#define for_each_class(class) \
1232   for (class = sched_class_highest; class; class = class->next)
1233
1234extern const struct sched_class stop_sched_class;
1235extern const struct sched_class dl_sched_class;
1236extern const struct sched_class rt_sched_class;
1237extern const struct sched_class fair_sched_class;
1238extern const struct sched_class idle_sched_class;
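
/*
 * Illustrative sketch (simplified from the core pick_next_task() loop,
 * function name hypothetical): the classes are consulted in priority order
 * via for_each_class(), and a RETRY_TASK return means a higher-priority
 * class gained runnable tasks while rq->lock was dropped, so the walk must
 * start over.
 */
static inline struct task_struct *pick_next_task_sketch(struct rq *rq,
							struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			if (unlikely(p == RETRY_TASK))
				goto again;
			return p;
		}
	}

	/* The idle class always returns a task, so we should not get here. */
	return NULL;
}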
1239
1240
1241#ifdef CONFIG_SMP
1242
1243extern void update_group_capacity(struct sched_domain *sd, int cpu);
1244
1245extern void trigger_load_balance(struct rq *rq);
1246
1247extern void idle_enter_fair(struct rq *this_rq);
1248extern void idle_exit_fair(struct rq *this_rq);
1249
1250#else
1251
1252static inline void idle_enter_fair(struct rq *rq) { }
1253static inline void idle_exit_fair(struct rq *rq) { }
1254
1255#endif
1256
1257#ifdef CONFIG_CPU_IDLE
1258static inline void idle_set_state(struct rq *rq,
1259				  struct cpuidle_state *idle_state)
1260{
1261	rq->idle_state = idle_state;
1262}
1263
1264static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1265{
1266	WARN_ON(!rcu_read_lock_held());
1267	return rq->idle_state;
1268}
1269#else
1270static inline void idle_set_state(struct rq *rq,
1271				  struct cpuidle_state *idle_state)
1272{
1273}
1274
1275static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1276{
1277	return NULL;
1278}
1279#endif
1280
1281extern void sysrq_sched_debug_show(void);
1282extern void sched_init_granularity(void);
1283extern void update_max_interval(void);
1284
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
1289
1290extern void resched_curr(struct rq *rq);
1291extern void resched_cpu(int cpu);
1292
1293extern struct rt_bandwidth def_rt_bandwidth;
1294extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1295
1296extern struct dl_bandwidth def_dl_bandwidth;
1297extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1298extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1299
1300unsigned long to_ratio(u64 period, u64 runtime);
1301
1302extern void update_idle_cpu_load(struct rq *this_rq);
1303
1304extern void init_task_runnable_average(struct task_struct *p);
1305
1306static inline void add_nr_running(struct rq *rq, unsigned count)
1307{
1308	unsigned prev_nr = rq->nr_running;
1309
1310	rq->nr_running = prev_nr + count;
1311
1312	if (prev_nr < 2 && rq->nr_running >= 2) {
1313#ifdef CONFIG_SMP
1314		if (!rq->rd->overload)
1315			rq->rd->overload = true;
1316#endif
1317
1318#ifdef CONFIG_NO_HZ_FULL
1319		if (tick_nohz_full_cpu(rq->cpu)) {
1320			/*
1321			 * Tick is needed if more than one task runs on a CPU.
1322			 * Send the target an IPI to kick it out of nohz mode.
1323			 *
1324			 * We assume that IPI implies full memory barrier and the
1325			 * new value of rq->nr_running is visible on reception
1326			 * from the target.
1327			 */
1328			tick_nohz_full_kick_cpu(rq->cpu);
1329		}
1330#endif
1331	}
1332}
1333
1334static inline void sub_nr_running(struct rq *rq, unsigned count)
1335{
1336	rq->nr_running -= count;
1337}
1338
1339static inline void rq_last_tick_reset(struct rq *rq)
1340{
1341#ifdef CONFIG_NO_HZ_FULL
1342	rq->last_sched_tick = jiffies;
1343#endif
1344}
1345
1346extern void update_rq_clock(struct rq *rq);
1347
1348extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
1349extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1350
1351extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1352
1353extern const_debug unsigned int sysctl_sched_time_avg;
1354extern const_debug unsigned int sysctl_sched_nr_migrate;
1355extern const_debug unsigned int sysctl_sched_migration_cost;
1356
1357static inline u64 sched_avg_period(void)
1358{
1359	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1360}
1361
1362#ifdef CONFIG_SCHED_HRTICK
1363
1364/*
1365 * Use hrtick when:
1366 *  - enabled by features
1367 *  - hrtimer is actually high res
1368 */
1369static inline int hrtick_enabled(struct rq *rq)
1370{
1371	if (!sched_feat(HRTICK))
1372		return 0;
1373	if (!cpu_active(cpu_of(rq)))
1374		return 0;
1375	return hrtimer_is_hres_active(&rq->hrtick_timer);
1376}
1377
1378void hrtick_start(struct rq *rq, u64 delay);
1379
1380#else
1381
1382static inline int hrtick_enabled(struct rq *rq)
1383{
1384	return 0;
1385}
1386
1387#endif /* CONFIG_SCHED_HRTICK */
1388
1389#ifdef CONFIG_SMP
1390extern void sched_avg_update(struct rq *rq);
1391
1392#ifndef arch_scale_freq_capacity
1393static __always_inline
1394unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1395{
1396	return SCHED_CAPACITY_SCALE;
1397}
1398#endif
1399
1400static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1401{
1402	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
1403	sched_avg_update(rq);
1404}
1405#else
1406static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1407static inline void sched_avg_update(struct rq *rq) { }
1408#endif
1409
1410extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
1411
1412/*
1413 * __task_rq_lock - lock the rq @p resides on.
1414 */
1415static inline struct rq *__task_rq_lock(struct task_struct *p)
1416	__acquires(rq->lock)
1417{
1418	struct rq *rq;
1419
1420	lockdep_assert_held(&p->pi_lock);
1421
1422	for (;;) {
1423		rq = task_rq(p);
1424		raw_spin_lock(&rq->lock);
1425		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1426			return rq;
1427		raw_spin_unlock(&rq->lock);
1428
1429		while (unlikely(task_on_rq_migrating(p)))
1430			cpu_relax();
1431	}
1432}
1433
1434/*
1435 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
1436 */
1437static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1438	__acquires(p->pi_lock)
1439	__acquires(rq->lock)
1440{
1441	struct rq *rq;
1442
1443	for (;;) {
1444		raw_spin_lock_irqsave(&p->pi_lock, *flags);
1445		rq = task_rq(p);
1446		raw_spin_lock(&rq->lock);
1447		/*
1448		 *	move_queued_task()		task_rq_lock()
1449		 *
1450		 *	ACQUIRE (rq->lock)
1451		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
1452		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
1453		 *	[S] ->cpu = new_cpu		[L] task_rq()
1454		 *					[L] ->on_rq
1455		 *	RELEASE (rq->lock)
1456		 *
1457		 * If we observe the old cpu in task_rq_lock, the acquire of
1458		 * the old rq->lock will fully serialize against the stores.
1459		 *
1460		 * If we observe the new cpu in task_rq_lock, the acquire will
1461		 * pair with the WMB to ensure we must then also see migrating.
1462		 */
1463		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1464			return rq;
1465		raw_spin_unlock(&rq->lock);
1466		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1467
1468		while (unlikely(task_on_rq_migrating(p)))
1469			cpu_relax();
1470	}
1471}
1472
1473static inline void __task_rq_unlock(struct rq *rq)
1474	__releases(rq->lock)
1475{
1476	raw_spin_unlock(&rq->lock);
1477}
1478
1479static inline void
1480task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
1481	__releases(rq->lock)
1482	__releases(p->pi_lock)
1483{
1484	raw_spin_unlock(&rq->lock);
1485	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1486}
1487
1488#ifdef CONFIG_SMP
1489#ifdef CONFIG_PREEMPT
1490
1491static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1492
1493/*
1494 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1495 * way at the expense of forcing extra atomic operations in all
1496 * invocations.  This assures that the double_lock is acquired using the
1497 * same underlying policy as the spinlock_t on this architecture, which
1498 * reduces latency compared to the unfair variant below.  However, it
1499 * also adds more overhead and therefore may reduce throughput.
1500 */
1501static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1502	__releases(this_rq->lock)
1503	__acquires(busiest->lock)
1504	__acquires(this_rq->lock)
1505{
1506	raw_spin_unlock(&this_rq->lock);
1507	double_rq_lock(this_rq, busiest);
1508
1509	return 1;
1510}
1511
1512#else
1513/*
1514 * Unfair double_lock_balance: Optimizes throughput at the expense of
1515 * latency by eliminating extra atomic operations when the locks are
1516 * already in proper order on entry.  This favors lower cpu-ids and will
1517 * grant the double lock to lower cpus over higher ids under contention,
1518 * regardless of entry order into the function.
1519 */
1520static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1521	__releases(this_rq->lock)
1522	__acquires(busiest->lock)
1523	__acquires(this_rq->lock)
1524{
1525	int ret = 0;
1526
1527	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1528		if (busiest < this_rq) {
1529			raw_spin_unlock(&this_rq->lock);
1530			raw_spin_lock(&busiest->lock);
1531			raw_spin_lock_nested(&this_rq->lock,
1532					      SINGLE_DEPTH_NESTING);
1533			ret = 1;
1534		} else
1535			raw_spin_lock_nested(&busiest->lock,
1536					      SINGLE_DEPTH_NESTING);
1537	}
1538	return ret;
1539}
1540
1541#endif /* CONFIG_PREEMPT */
1542
1543/*
1544 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1545 */
1546static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1547{
1548	if (unlikely(!irqs_disabled())) {
		/* printk() doesn't work well under rq->lock */
1550		raw_spin_unlock(&this_rq->lock);
1551		BUG_ON(1);
1552	}
1553
1554	return _double_lock_balance(this_rq, busiest);
1555}
1556
1557static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1558	__releases(busiest->lock)
1559{
1560	raw_spin_unlock(&busiest->lock);
1561	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1562}
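
/*
 * Illustrative sketch (hypothetical helper): the usual calling pattern.
 * A non-zero return from double_lock_balance() means this_rq->lock was
 * dropped while busiest->lock was taken, so anything read under it before
 * the call must be revalidated.
 */
static inline void pull_from_busiest_sketch(struct rq *this_rq, struct rq *busiest)
{
	lockdep_assert_held(&this_rq->lock);

	if (double_lock_balance(this_rq, busiest)) {
		/* this_rq->lock was released and re-acquired: re-check state */
	}

	/* Both locks are held here: tasks may safely be moved between rqs. */

	double_unlock_balance(this_rq, busiest);
}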
1563
1564static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1565{
1566	if (l1 > l2)
1567		swap(l1, l2);
1568
1569	spin_lock(l1);
1570	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1571}
1572
1573static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
1574{
1575	if (l1 > l2)
1576		swap(l1, l2);
1577
1578	spin_lock_irq(l1);
1579	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1580}
1581
1582static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1583{
1584	if (l1 > l2)
1585		swap(l1, l2);
1586
1587	raw_spin_lock(l1);
1588	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1589}
1590
1591/*
1592 * double_rq_lock - safely lock two runqueues
1593 *
1594 * Note this does not disable interrupts like task_rq_lock,
1595 * you need to do so manually before calling.
1596 */
1597static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1598	__acquires(rq1->lock)
1599	__acquires(rq2->lock)
1600{
1601	BUG_ON(!irqs_disabled());
1602	if (rq1 == rq2) {
1603		raw_spin_lock(&rq1->lock);
1604		__acquire(rq2->lock);	/* Fake it out ;) */
1605	} else {
1606		if (rq1 < rq2) {
1607			raw_spin_lock(&rq1->lock);
1608			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1609		} else {
1610			raw_spin_lock(&rq2->lock);
1611			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1612		}
1613	}
1614}
1615
1616/*
1617 * double_rq_unlock - safely unlock two runqueues
1618 *
1619 * Note this does not restore interrupts like task_rq_unlock,
1620 * you need to do so manually after calling.
1621 */
1622static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1623	__releases(rq1->lock)
1624	__releases(rq2->lock)
1625{
1626	raw_spin_unlock(&rq1->lock);
1627	if (rq1 != rq2)
1628		raw_spin_unlock(&rq2->lock);
1629	else
1630		__release(rq2->lock);
1631}
1632
1633#else /* CONFIG_SMP */
1634
1635/*
1636 * double_rq_lock - safely lock two runqueues
1637 *
1638 * Note this does not disable interrupts like task_rq_lock,
1639 * you need to do so manually before calling.
1640 */
1641static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1642	__acquires(rq1->lock)
1643	__acquires(rq2->lock)
1644{
1645	BUG_ON(!irqs_disabled());
1646	BUG_ON(rq1 != rq2);
1647	raw_spin_lock(&rq1->lock);
1648	__acquire(rq2->lock);	/* Fake it out ;) */
1649}
1650
1651/*
1652 * double_rq_unlock - safely unlock two runqueues
1653 *
1654 * Note this does not restore interrupts like task_rq_unlock,
1655 * you need to do so manually after calling.
1656 */
1657static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1658	__releases(rq1->lock)
1659	__releases(rq2->lock)
1660{
1661	BUG_ON(rq1 != rq2);
1662	raw_spin_unlock(&rq1->lock);
1663	__release(rq2->lock);
1664}
1665
1666#endif
1667
1668extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1669extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1670extern void print_cfs_stats(struct seq_file *m, int cpu);
1671extern void print_rt_stats(struct seq_file *m, int cpu);
1672extern void print_dl_stats(struct seq_file *m, int cpu);
1673
1674extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1675extern void init_rt_rq(struct rt_rq *rt_rq);
1676extern void init_dl_rq(struct dl_rq *dl_rq);
1677
1678extern void cfs_bandwidth_usage_inc(void);
1679extern void cfs_bandwidth_usage_dec(void);
1680
1681#ifdef CONFIG_NO_HZ_COMMON
1682enum rq_nohz_flag_bits {
1683	NOHZ_TICK_STOPPED,
1684	NOHZ_BALANCE_KICK,
1685};
1686
1687#define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
1688#endif
1689
1690#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1691
1692DECLARE_PER_CPU(u64, cpu_hardirq_time);
1693DECLARE_PER_CPU(u64, cpu_softirq_time);
1694
1695#ifndef CONFIG_64BIT
1696DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1697
1698static inline void irq_time_write_begin(void)
1699{
1700	__this_cpu_inc(irq_time_seq.sequence);
1701	smp_wmb();
1702}
1703
1704static inline void irq_time_write_end(void)
1705{
1706	smp_wmb();
1707	__this_cpu_inc(irq_time_seq.sequence);
1708}
1709
1710static inline u64 irq_time_read(int cpu)
1711{
1712	u64 irq_time;
1713	unsigned seq;
1714
1715	do {
1716		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1717		irq_time = per_cpu(cpu_softirq_time, cpu) +
1718			   per_cpu(cpu_hardirq_time, cpu);
1719	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1720
1721	return irq_time;
1722}
1723#else /* CONFIG_64BIT */
1724static inline void irq_time_write_begin(void)
1725{
1726}
1727
1728static inline void irq_time_write_end(void)
1729{
1730}
1731
1732static inline u64 irq_time_read(int cpu)
1733{
1734	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1735}
1736#endif /* CONFIG_64BIT */
1737#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
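
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Illustrative sketch (hypothetical helper): writers bracket their update
 * of the per-cpu irq time with irq_time_write_begin()/irq_time_write_end()
 * so that irq_time_read() on 32-bit, which cannot load a u64 atomically,
 * can retry through the seqcount until it sees a consistent value.
 */
static inline void account_hardirq_time_sketch(u64 delta)
{
	irq_time_write_begin();
	__this_cpu_add(cpu_hardirq_time, delta);
	irq_time_write_end();
}
#endif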
1738