/*
 * Architecture specific (i386/x86_64) functions for kexec based crash dumps.
 *
 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
 *
 * Copyright (C) IBM Corporation, 2004. All rights reserved.
 * Copyright (C) Red Hat Inc., 2014. All rights reserved.
 * Authors:
 *      Vivek Goyal <vgoyal@redhat.com>
 *
 */

#define pr_fmt(fmt)	"kexec: " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>

/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN   4096

/* This primarily represents number of split ranges due to exclusion */
#define CRASH_MAX_RANGES	16

struct crash_mem_range {
	u64 start, end;
};

struct crash_mem {
	unsigned int nr_ranges;
	struct crash_mem_range ranges[CRASH_MAX_RANGES];
};

/* Misc data about ram ranges needed to prepare elf headers */
struct crash_elf_data {
	struct kimage *image;
	/*
	 * Total number of ram ranges we have after various adjustments for
	 * GART, crash reserved region etc.
	 */
	unsigned int max_nr_ranges;
	unsigned long gart_start, gart_end;

	/* Pointer to elf header */
	void *ehdr;
	/* Pointer to next phdr */
	void *bufp;
	struct crash_mem mem;
};

/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
	struct boot_params *params;
	/* Type of memory */
	unsigned int type;
};

int in_crash_kexec;

/*
 * This is used to VMCLEAR all VMCSs loaded on the
 * processor. When the kvm_intel module is loaded, the
 * callback function pointer is assigned.
 *
 * Protected by RCU.
 */
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
unsigned long crash_zero_bytes;

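/* Invoke the registered VMCLEAR callback on this cpu, if one is set. */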
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
	crash_vmclear_fn *do_vmclear_operation = NULL;

	rcu_read_lock();
	do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
	if (do_vmclear_operation)
		do_vmclear_operation();
	rcu_read_unlock();
}

#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)

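/*
 * Runs on each remote cpu in NMI context during a crash: save that cpu's
 * registers, VMCLEAR any loaded VMCSs, disable VMX/SVM and the local APIC.
 */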
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
	struct pt_regs fixed_regs;

	if (!user_mode(regs)) {
		crash_fixup_ss_esp(&fixed_regs, regs);
		regs = &fixed_regs;
	}
#endif
	crash_save_cpu(regs, cpu);

	/*
	 * VMCLEAR VMCSs loaded on all cpus if needed.
	 */
	cpu_crash_vmclear_loaded_vmcss();

	/* Disable VMX or SVM if needed.
	 *
	 * We need to disable virtualization on all CPUs.
	 * Having VMX or SVM enabled on any CPU may break rebooting
	 * after the kdump kernel has finished its task.
	 */
	cpu_emergency_vmxoff();
	cpu_emergency_svm_disable();

	disable_local_APIC();
}

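/* Halt all other cpus via NMI so the crash kernel starts on a quiesced system. */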
static void kdump_nmi_shootdown_cpus(void)
{
	in_crash_kexec = 1;
	nmi_shootdown_cpus(kdump_nmi_callback);

	disable_local_APIC();
}

#else
static void kdump_nmi_shootdown_cpus(void)
{
	/* There are no cpus to shootdown */
}
#endif

void native_machine_crash_shutdown(struct pt_regs *regs)
{
	/* This function is only called after the system
	 * has panicked or is otherwise in a critical state.
	 * The minimum amount of code to allow a kexec'd kernel
	 * to run successfully needs to happen here.
	 *
	 * In practice this means shooting down the other cpus in
	 * an SMP system.
	 */
	/* The kernel is broken so disable interrupts */
	local_irq_disable();

	kdump_nmi_shootdown_cpus();

	/*
	 * VMCLEAR VMCSs loaded on this cpu if needed.
	 */
	cpu_crash_vmclear_loaded_vmcss();

	/* Booting kdump kernel with VMX or SVM enabled won't work,
	 * because (among other limitations) we can't disable paging
	 * with the virt flags.
	 */
	cpu_emergency_vmxoff();
	cpu_emergency_svm_disable();

#ifdef CONFIG_X86_IO_APIC
	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
	ioapic_zap_locks();
	disable_IO_APIC();
#endif
	lapic_shutdown();
#ifdef CONFIG_HPET_TIMER
	hpet_disable();
#endif
	crash_save_cpu(regs, safe_smp_processor_id());
}

#ifdef CONFIG_KEXEC_FILE
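/* Callback for walk_system_ram_res(): count System RAM ranges. */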
static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
{
	unsigned int *nr_ranges = arg;

	(*nr_ranges)++;
	return 0;
}

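/* Callback for walk_iomem_res(): record the GART aperture range, if present. */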
static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
{
	struct crash_elf_data *ced = arg;

	ced->gart_start = start;
	ced->gart_end = end;

	/* Not expecting more than 1 gart aperture */
	return 1;
}


/* Gather all the required information to prepare elf headers for ram regions */
static void fill_up_crash_elf_data(struct crash_elf_data *ced,
				   struct kimage *image)
{
	unsigned int nr_ranges = 0;

	ced->image = image;

	walk_system_ram_res(0, -1, &nr_ranges,
				get_nr_ram_ranges_callback);

	ced->max_nr_ranges = nr_ranges;

	/*
	 * We don't create ELF headers for the GART aperture, as an attempt
	 * to dump this memory in the second kernel leads to a hang/crash.
	 * If a GART aperture is present, that region needs to be excluded,
	 * which may require an extra phdr.
	 */
	walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
				ced, get_gart_ranges_callback);

	/*
	 * If we have a GART region, excluding it could split a memory
	 * range, resulting in an extra header. Account for that.
	 */
	if (ced->gart_end)
		ced->max_nr_ranges++;

	/* Exclusion of crash region could split memory ranges */
	ced->max_nr_ranges++;

	/* If crashk_low_res is not 0, another range split possible */
	if (crashk_low_res.end)
		ced->max_nr_ranges++;
}

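/*
 * Remove the region [mstart, mend] from the ranges tracked in @mem: a range
 * that is completely covered is dropped, a partially covered range is
 * truncated, and a range that strictly contains [mstart, mend] is split in
 * two. For example, excluding [0x2000, 0x2fff] from [0x1000, 0x4fff] leaves
 * [0x1000, 0x1fff] and [0x3000, 0x4fff]. Returns -ENOMEM when there is no
 * slot left in the fixed-size array for the split range.
 */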
static int exclude_mem_range(struct crash_mem *mem,
		unsigned long long mstart, unsigned long long mend)
{
	int i, j;
	unsigned long long start, end;
	struct crash_mem_range temp_range = {0, 0};

	for (i = 0; i < mem->nr_ranges; i++) {
		start = mem->ranges[i].start;
		end = mem->ranges[i].end;

		if (mstart > end || mend < start)
			continue;

		/* Truncate any area outside of range */
		if (mstart < start)
			mstart = start;
		if (mend > end)
			mend = end;

		/* Found completely overlapping range */
		if (mstart == start && mend == end) {
			mem->ranges[i].start = 0;
			mem->ranges[i].end = 0;
			if (i < mem->nr_ranges - 1) {
				/* Shift rest of the ranges to left */
				for (j = i; j < mem->nr_ranges - 1; j++) {
					mem->ranges[j].start =
						mem->ranges[j+1].start;
					mem->ranges[j].end =
							mem->ranges[j+1].end;
				}
			}
			mem->nr_ranges--;
			return 0;
		}

		if (mstart > start && mend < end) {
			/* Split original range */
			mem->ranges[i].end = mstart - 1;
			temp_range.start = mend + 1;
			temp_range.end = end;
		} else if (mstart != start)
			mem->ranges[i].end = mstart - 1;
		else
			mem->ranges[i].start = mend + 1;
		break;
	}

	/* If a split happened, add the new range to the array */
	if (!temp_range.end)
		return 0;

	/* Split happened */
	if (i == CRASH_MAX_RANGES - 1) {
		pr_err("Too many crash ranges after split\n");
		return -ENOMEM;
	}

	/* Location where new range should go */
	j = i + 1;
	if (j < mem->nr_ranges) {
		/* Move over all ranges one slot towards the end */
		for (i = mem->nr_ranges - 1; i >= j; i--)
			mem->ranges[i + 1] = mem->ranges[i];
	}

	mem->ranges[j].start = temp_range.start;
	mem->ranges[j].end = temp_range.end;
	mem->nr_ranges++;
	return 0;
}

/*
 * Look for any unwanted ranges between mstart and mend and remove them. This
 * may split ranges; the resulting ranges are stored in the ced->mem.ranges[]
 * array.
 */
static int elf_header_exclude_ranges(struct crash_elf_data *ced,
		unsigned long long mstart, unsigned long long mend)
{
	struct crash_mem *cmem = &ced->mem;
	int ret = 0;

	memset(cmem->ranges, 0, sizeof(cmem->ranges));

	cmem->ranges[0].start = mstart;
	cmem->ranges[0].end = mend;
	cmem->nr_ranges = 1;

	/* Exclude crashkernel region */
	ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
	if (ret)
		return ret;

	if (crashk_low_res.end) {
		ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
		if (ret)
			return ret;
	}

	/* Exclude GART region */
	if (ced->gart_end) {
		ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
		if (ret)
			return ret;
	}

	return ret;
}

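/*
 * Callback for walk_system_ram_res(): emit one PT_LOAD phdr for each System
 * RAM chunk that remains after the crashkernel and GART regions are excluded.
 */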
static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
{
	struct crash_elf_data *ced = arg;
	Elf64_Ehdr *ehdr;
	Elf64_Phdr *phdr;
	unsigned long mstart, mend;
	struct kimage *image = ced->image;
	struct crash_mem *cmem;
	int ret, i;

	ehdr = ced->ehdr;

	/* Exclude unwanted mem ranges */
	ret = elf_header_exclude_ranges(ced, start, end);
	if (ret)
		return ret;

	/* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
	cmem = &ced->mem;

	for (i = 0; i < cmem->nr_ranges; i++) {
		mstart = cmem->ranges[i].start;
		mend = cmem->ranges[i].end;

		phdr = ced->bufp;
		ced->bufp += sizeof(Elf64_Phdr);

		phdr->p_type = PT_LOAD;
		phdr->p_flags = PF_R|PF_W|PF_X;
		phdr->p_offset  = mstart;

		/*
		 * If a range matches backup region, adjust offset to backup
		 * segment.
		 */
		if (mstart == image->arch.backup_src_start &&
		    (mend - mstart + 1) == image->arch.backup_src_sz)
			phdr->p_offset = image->arch.backup_load_addr;

		phdr->p_paddr = mstart;
		phdr->p_vaddr = (unsigned long long) __va(mstart);
		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
		phdr->p_align = 0;
		ehdr->e_phnum++;
		pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
			ehdr->e_phnum, phdr->p_offset);
	}

	return ret;
}

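/*
 * Allocate and fill the ELF64 core headers: the ELF header itself, one
 * PT_NOTE phdr per present cpu, a PT_NOTE phdr for vmcoreinfo, a PT_LOAD
 * phdr for the kernel text mapping (on x86_64) and PT_LOAD phdrs for
 * System RAM.
 */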
static int prepare_elf64_headers(struct crash_elf_data *ced,
		void **addr, unsigned long *sz)
{
	Elf64_Ehdr *ehdr;
	Elf64_Phdr *phdr;
	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
	unsigned char *buf, *bufp;
	unsigned int cpu;
	unsigned long long notes_addr;
	int ret;

	/* extra phdr for vmcoreinfo elf note */
	nr_phdr = nr_cpus + 1;
	nr_phdr += ced->max_nr_ranges;

	/*
	 * kexec-tools creates an extra PT_LOAD phdr for the kernel text
	 * mapping area on x86_64 (ffffffff80000000 - ffffffffa0000000),
	 * presumably because tools like gdb require it. As a result the
	 * same physical memory is mapped by two elf headers: one with
	 * kernel text virtual addresses and one with __va(physical)
	 * addresses.
	 */

	nr_phdr++;
	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);

	buf = vzalloc(elf_sz);
	if (!buf)
		return -ENOMEM;

	bufp = buf;
	ehdr = (Elf64_Ehdr *)bufp;
	bufp += sizeof(Elf64_Ehdr);
	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
	ehdr->e_type = ET_CORE;
	ehdr->e_machine = ELF_ARCH;
	ehdr->e_version = EV_CURRENT;
	ehdr->e_phoff = sizeof(Elf64_Ehdr);
	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
	ehdr->e_phentsize = sizeof(Elf64_Phdr);

	/* Prepare one phdr of type PT_NOTE for each present cpu */
	for_each_present_cpu(cpu) {
		phdr = (Elf64_Phdr *)bufp;
		bufp += sizeof(Elf64_Phdr);
		phdr->p_type = PT_NOTE;
		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
		phdr->p_offset = phdr->p_paddr = notes_addr;
		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
		(ehdr->e_phnum)++;
	}

	/* Prepare one PT_NOTE header for vmcoreinfo */
	phdr = (Elf64_Phdr *)bufp;
	bufp += sizeof(Elf64_Phdr);
	phdr->p_type = PT_NOTE;
	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
	(ehdr->e_phnum)++;

#ifdef CONFIG_X86_64
	/* Prepare PT_LOAD type program header for kernel text region */
	phdr = (Elf64_Phdr *)bufp;
	bufp += sizeof(Elf64_Phdr);
	phdr->p_type = PT_LOAD;
	phdr->p_flags = PF_R|PF_W|PF_X;
	phdr->p_vaddr = (Elf64_Addr)_text;
	phdr->p_filesz = phdr->p_memsz = _end - _text;
	phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
	(ehdr->e_phnum)++;
#endif

	/* Prepare PT_LOAD headers for system ram chunks. */
	ced->ehdr = ehdr;
	ced->bufp = bufp;
	ret = walk_system_ram_res(0, -1, ced,
			prepare_elf64_ram_headers_callback);
	if (ret < 0)
		return ret;

	*addr = buf;
	*sz = elf_sz;
	return 0;
}

/* Prepare elf headers. Return addr and size */
static int prepare_elf_headers(struct kimage *image, void **addr,
					unsigned long *sz)
{
	struct crash_elf_data *ced;
	int ret;

	ced = kzalloc(sizeof(*ced), GFP_KERNEL);
	if (!ced)
		return -ENOMEM;

	fill_up_crash_elf_data(ced, image);

	/* By default prepare 64bit headers */
	ret = prepare_elf64_headers(ced, addr, sz);
	kfree(ced);
	return ret;
}

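/* Append one e820 entry to the boot_params handed to the crash kernel. */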
static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
{
	unsigned int nr_e820_entries;

	nr_e820_entries = params->e820_entries;
	if (nr_e820_entries >= E820MAX)
		return 1;

	memcpy(&params->e820_map[nr_e820_entries], entry,
			sizeof(struct e820entry));
	params->e820_entries++;
	return 0;
}

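/*
 * Callback for walk_iomem_res(): add the resource range to the crash
 * kernel's e820 map with the type requested in struct crash_memmap_data.
 */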
static int memmap_entry_callback(u64 start, u64 end, void *arg)
{
	struct crash_memmap_data *cmd = arg;
	struct boot_params *params = cmd->params;
	struct e820entry ei;

	ei.addr = start;
	ei.size = end - start + 1;
	ei.type = cmd->type;
	add_e820_entry(params, &ei);

	return 0;
}

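/* Carve the backup region and the elf header segment out of [mstart, mend]. */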
static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
				 unsigned long long mstart,
				 unsigned long long mend)
{
	unsigned long start, end;
	int ret = 0;

	cmem->ranges[0].start = mstart;
	cmem->ranges[0].end = mend;
	cmem->nr_ranges = 1;

	/* Exclude Backup region */
	start = image->arch.backup_load_addr;
	end = start + image->arch.backup_src_sz - 1;
	ret = exclude_mem_range(cmem, start, end);
	if (ret)
		return ret;

	/* Exclude elf header region */
	start = image->arch.elf_load_addr;
	end = start + image->arch.elf_headers_sz - 1;
	return exclude_mem_range(cmem, start, end);
}

/* Prepare memory map for crash dump kernel */
int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
{
	int i, ret = 0;
	unsigned long flags;
	struct e820entry ei;
	struct crash_memmap_data cmd;
	struct crash_mem *cmem;

	cmem = vzalloc(sizeof(struct crash_mem));
	if (!cmem)
		return -ENOMEM;

	memset(&cmd, 0, sizeof(struct crash_memmap_data));
	cmd.params = params;

	/* Add first 640K segment */
	ei.addr = image->arch.backup_src_start;
	ei.size = image->arch.backup_src_sz;
	ei.type = E820_RAM;
	add_e820_entry(params, &ei);

	/* Add ACPI tables */
	cmd.type = E820_ACPI;
	flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
		       memmap_entry_callback);

	/* Add ACPI Non-volatile Storage */
	cmd.type = E820_NVS;
	walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
			memmap_entry_callback);

	/* Add crashk_low_res region */
	if (crashk_low_res.end) {
		ei.addr = crashk_low_res.start;
		ei.size = crashk_low_res.end - crashk_low_res.start + 1;
		ei.type = E820_RAM;
		add_e820_entry(params, &ei);
	}

	/* Exclude some ranges from crashk_res and add rest to memmap */
	ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
						crashk_res.end);
	if (ret)
		goto out;

	for (i = 0; i < cmem->nr_ranges; i++) {
		ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;

		/* If entry is less than a page, skip it */
		if (ei.size < PAGE_SIZE)
			continue;
		ei.addr = cmem->ranges[i].start;
		ei.type = E820_RAM;
		add_e820_entry(params, &ei);
	}

out:
	vfree(cmem);
	return ret;
}

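/*
 * Callback for walk_system_ram_res(): record the first System RAM range
 * found as the backup source region.
 */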
static int determine_backup_region(u64 start, u64 end, void *arg)
{
	struct kimage *image = arg;

	image->arch.backup_src_start = start;
	image->arch.backup_src_sz = end - start + 1;

	/* Expecting only one range for backup region */
	return 1;
}

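/*
 * Load the extra segments needed for a crash dump: a backup segment for the
 * first 640K of RAM and the elf core headers describing cpu notes and memory.
 */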
int crash_load_segments(struct kimage *image)
{
	unsigned long src_start, src_sz, elf_sz;
	void *elf_addr;
	int ret;

	/*
	 * Determine and load a segment for the backup area. The first 640K
	 * of RAM is the backup source.
	 */

	ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
				image, determine_backup_region);

	/* Zero or positive return values are ok */
	if (ret < 0)
		return ret;

	src_start = image->arch.backup_src_start;
	src_sz = image->arch.backup_src_sz;

	/* Add backup segment. */
	if (src_sz) {
		/*
		 * Ideally there is no source for the backup segment; it is
		 * copied over in purgatory after the crash. Just add a
		 * zero-filled segment for now so that the checksum logic
		 * works fine.
		 */
		ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
				       sizeof(crash_zero_bytes), src_sz,
				       PAGE_SIZE, 0, -1, 0,
				       &image->arch.backup_load_addr);
		if (ret)
			return ret;
		pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
			 image->arch.backup_load_addr, src_start, src_sz);
	}

	/* Prepare elf headers and add a segment */
	ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
	if (ret)
		return ret;

	image->arch.elf_headers = elf_addr;
	image->arch.elf_headers_sz = elf_sz;

	ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
			ELF_CORE_HEADER_ALIGN, 0, -1, 0,
			&image->arch.elf_load_addr);
	if (ret) {
		vfree((void *)image->arch.elf_headers);
		return ret;
	}
	pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
		 image->arch.elf_load_addr, elf_sz, elf_sz);

	return ret;
}
#endif /* CONFIG_KEXEC_FILE */