/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *	     1. Better handle wakeups from external interrupts. Currently a
 *		fixed compensation is added to the clamping duration when an
 *		excessive number of wakeups is observed during idle time. The
 *		reason is that for external interrupts which need no ack,
 *		clamping down the cpu in non-irq context does not reduce the
 *		irq rate. In the majority of cases clamping down the cpu does
 *		help reduce irqs as well, so we should be able to
 *		differentiate the two cases and give a quantitative solution
 *		for the irqs that we can control, perhaps based on
 *		get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other hw blocks.
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping thread
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
		goto exit;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
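/*
 * The parameter can also be adjusted at runtime through sysfs, for example
 * (any value within the recommended 6-25 ms range):
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 */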

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period completes without extra wakeups.
				    * once that counter reaches a given level,
				    * the compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeups from idle,
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
		goto exit_win;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

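/*
 * Find the deepest MWAIT C-state/sub-state advertised by CPUID leaf 5 and
 * store the corresponding MWAIT hint in target_mwait, to be passed later to
 * mwait_idle_with_hints() during idle injection.
 */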
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

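/*
 * Sum the residency counters of all package C-states supported on this CPU.
 * Counters whose MSR cannot be read are marked and skipped on subsequent
 * calls.
 */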
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

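/*
 * Pick the injected-idle compensation (in percentage points) for the given
 * target ratio: once enough confidence has been accumulated, average the
 * calibrated steady_comp values of the ratio and its neighbors. When the
 * reduce_irq flag is set, fall back to a simple penalty of doubling the
 * injection. The result is capped so that ratio + compensation stays below
 * MAX_TARGET_RATIO.
 */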
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

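/*
 * Runtime calibration: when the last window was quiet (not too many extra
 * wakeups) and the achieved ratio fell short of the target by a small,
 * plausible delta, fold that delta into steady_comp as a running average and
 * bump the confidence counter for this target ratio.
 */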
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * do not adjust the compensation once the confidence level has been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period; in that case we cannot trust the data for
	 * compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

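/*
 * Evaluate the result of the last control window: derive the achieved package
 * C-state ratio from the residency counter delta versus the TSC delta, update
 * the compensation data, and flag excessive external wakeups via reduce_irq.
 * Returns true if the achieved ratio is already at or above target + guard,
 * in which case the next injection round can be skipped.
 */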
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set the flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

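/*
 * Per-CPU idle injection thread, running as a SCHED_FIFO kthread while
 * clamping is enabled. Each iteration it sleeps to align with the injection
 * interval derived from the target ratio, then enters MWAIT (with interrupts
 * enabled as break events) until duration_jiffies have elapsed; a timer
 * guarantees the wakeup. The elected control cpu additionally evaluates the
 * achieved ratio once per window and updates the control parameters.
 */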
static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure user selected ratio does not take effect until
		 * the next round. adjust target_ratio if user has changed
		 * target such that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may have different ability to enter package level
		 * c-states, thus we need to compensate the injected idle ratio
		 * to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only elected controlling cpu can collect stats and update
		 * control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop tick sched during idle time, interrupts are still
		 * allowed. thus jiffies are updated properly.
		 */
		preempt_disable();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

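/*
 * Start clamping: make sure package C-state counters are available, elect the
 * control cpu (preferring the BSP), kick off the 1 Hz status polling and
 * spawn one bound kidle_inject thread per online cpu, with cpu hotplug
 * blocked while the threads are created.
 */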
static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* check if pkg cstate counter is completely 0, abort in this case */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give the per cpu clamping
	 * threads some time to exit; any thread still alive afterwards is
	 * stopped explicitly below.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

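/*
 * CPU hotplug notifier: while clamping is active, create and bind a new
 * injection thread when a cpu comes online, stop the thread of a cpu that
 * went dead, and hand over the control cpu role if the current control cpu
 * disappears (preferring the BSP when it returns).
 */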
static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (!clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

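/*
 * Thermal cooling device callbacks: the cooling state is the idle injection
 * ratio in percent. While clamping is active, get_cur_state reports the
 * measured package C-state residency ratio rather than the requested target.
 */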
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};


/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{ X86_VENDOR_INTEL, 6, 0x4c},
	{ X86_VENDOR_INTEL, 6, 0x4d},
	{ X86_VENDOR_INTEL, 6, 0x4f},
	{ X86_VENDOR_INTEL, 6, 0x56},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int __init powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");