Commit | Line | Data |
---|---|---|
5033cba0 | 1 | /* |
835c34a1 | 2 | * handle transition of Linux booting another kernel |
5033cba0 EB |
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
4 | * | |
5 | * This source code is licensed under the GNU General Public License, | |
6 | * Version 2. See the file COPYING for more details. | |
7 | */ | |
8 | ||
9 | #include <linux/mm.h> | |
10 | #include <linux/kexec.h> | |
11 | #include <linux/delay.h> | |
fd59d231 | 12 | #include <linux/numa.h> |
f43fdad8 | 13 | #include <linux/ftrace.h> |
3122c331 | 14 | #include <linux/suspend.h> |
92be3d6b | 15 | #include <linux/gfp.h> |
fef3a7a1 | 16 | #include <linux/io.h> |
f43fdad8 | 17 | |
5033cba0 EB |
18 | #include <asm/pgtable.h> |
19 | #include <asm/pgalloc.h> | |
20 | #include <asm/tlbflush.h> | |
21 | #include <asm/mmu_context.h> | |
5033cba0 | 22 | #include <asm/apic.h> |
8643e28d | 23 | #include <asm/io_apic.h> |
5033cba0 | 24 | #include <asm/cpufeature.h> |
e7b47cca | 25 | #include <asm/desc.h> |
d1163651 | 26 | #include <asm/set_memory.h> |
17f557e5 | 27 | #include <asm/debugreg.h> |
5033cba0 | 28 | |
5033cba0 EB |
29 | static void set_gdt(void *newgdt, __u16 limit) |
30 | { | |
6b68f01b | 31 | struct desc_ptr curgdt; |
5033cba0 EB |
32 | |
33 | /* ia32 supports unaligned loads & stores */ | |
e7b47cca EB |
34 | curgdt.size = limit; |
35 | curgdt.address = (unsigned long)newgdt; | |
5033cba0 | 36 | |
f2ab4461 | 37 | load_gdt(&curgdt); |
378fc6ee | 38 | } |
5033cba0 EB |
39 | |
/*
 * Reload the segment registers: a far jump through __KERNEL_CS,
 * followed by writing __KERNEL_DS into %ds, %es, %fs, %gs and %ss.
 */
static void load_segments(void)
{
/* Two-level expansion so the selector macro is expanded before stringifying. */
#define KEXEC_SEL_STR_1(sel)	#sel
#define KEXEC_SEL_STR(sel)	KEXEC_SEL_STR_1(sel)

	__asm__ __volatile__ (
		"\tljmp $"KEXEC_SEL_STR(__KERNEL_CS)",$1f\n"
		"\t1:\n"
		"\tmovl $"KEXEC_SEL_STR(__KERNEL_DS)",%%eax\n"
		"\tmovl %%eax,%%ds\n"
		"\tmovl %%eax,%%es\n"
		"\tmovl %%eax,%%fs\n"
		"\tmovl %%eax,%%gs\n"
		"\tmovl %%eax,%%ss\n"
		: /* no outputs */
		: /* no inputs */
		: "eax", "memory");

#undef KEXEC_SEL_STR
#undef KEXEC_SEL_STR_1
}
58 | ||
92be3d6b HY |
59 | static void machine_kexec_free_page_tables(struct kimage *image) |
60 | { | |
61 | free_page((unsigned long)image->arch.pgd); | |
62 | #ifdef CONFIG_X86_PAE | |
63 | free_page((unsigned long)image->arch.pmd0); | |
64 | free_page((unsigned long)image->arch.pmd1); | |
65 | #endif | |
66 | free_page((unsigned long)image->arch.pte0); | |
67 | free_page((unsigned long)image->arch.pte1); | |
68 | } | |
69 | ||
70 | static int machine_kexec_alloc_page_tables(struct kimage *image) | |
71 | { | |
72 | image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); | |
73 | #ifdef CONFIG_X86_PAE | |
74 | image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); | |
75 | image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); | |
76 | #endif | |
77 | image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); | |
78 | image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); | |
79 | if (!image->arch.pgd || | |
80 | #ifdef CONFIG_X86_PAE | |
81 | !image->arch.pmd0 || !image->arch.pmd1 || | |
82 | #endif | |
83 | !image->arch.pte0 || !image->arch.pte1) { | |
84 | machine_kexec_free_page_tables(image); | |
85 | return -ENOMEM; | |
86 | } | |
87 | return 0; | |
88 | } | |
89 | ||
9868ee63 HY |
90 | static void machine_kexec_page_table_set_one( |
91 | pgd_t *pgd, pmd_t *pmd, pte_t *pte, | |
92 | unsigned long vaddr, unsigned long paddr) | |
93 | { | |
7f689041 | 94 | p4d_t *p4d; |
9868ee63 HY |
95 | pud_t *pud; |
96 | ||
97 | pgd += pgd_index(vaddr); | |
98 | #ifdef CONFIG_X86_PAE | |
99 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) | |
100 | set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); | |
101 | #endif | |
7f689041 KS |
102 | p4d = p4d_offset(pgd, vaddr); |
103 | pud = pud_offset(p4d, vaddr); | |
9868ee63 HY |
104 | pmd = pmd_offset(pud, vaddr); |
105 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) | |
106 | set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); | |
107 | pte = pte_offset_kernel(pmd, vaddr); | |
108 | set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); | |
109 | } | |
110 | ||
111 | static void machine_kexec_prepare_page_tables(struct kimage *image) | |
112 | { | |
113 | void *control_page; | |
fc6fcdfb | 114 | pmd_t *pmd = NULL; |
9868ee63 HY |
115 | |
116 | control_page = page_address(image->control_code_page); | |
117 | #ifdef CONFIG_X86_PAE | |
118 | pmd = image->arch.pmd0; | |
119 | #endif | |
120 | machine_kexec_page_table_set_one( | |
121 | image->arch.pgd, pmd, image->arch.pte0, | |
122 | (unsigned long)control_page, __pa(control_page)); | |
123 | #ifdef CONFIG_X86_PAE | |
124 | pmd = image->arch.pmd1; | |
125 | #endif | |
126 | machine_kexec_page_table_set_one( | |
127 | image->arch.pgd, pmd, image->arch.pte1, | |
128 | __pa(control_page), __pa(control_page)); | |
129 | } | |
130 | ||
5033cba0 EB |
131 | /* |
132 | * A architecture hook called to validate the | |
133 | * proposed image and prepare the control pages | |
163f6876 | 134 | * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE |
5033cba0 EB |
135 | * have been allocated, but the segments have yet |
136 | * been copied into the kernel. | |
137 | * | |
138 | * Do what every setup is needed on image and the | |
139 | * reboot code buffer to allow us to avoid allocations | |
140 | * later. | |
141 | * | |
92be3d6b HY |
142 | * - Make control page executable. |
143 | * - Allocate page tables | |
9868ee63 | 144 | * - Setup page tables |
5033cba0 EB |
145 | */ |
146 | int machine_kexec_prepare(struct kimage *image) | |
147 | { | |
9868ee63 HY |
148 | int error; |
149 | ||
583140af | 150 | set_pages_x(image->control_code_page, 1); |
9868ee63 HY |
151 | error = machine_kexec_alloc_page_tables(image); |
152 | if (error) | |
153 | return error; | |
154 | machine_kexec_prepare_page_tables(image); | |
155 | return 0; | |
5033cba0 EB |
156 | } |
157 | ||
158 | /* | |
159 | * Undo anything leftover by machine_kexec_prepare | |
160 | * when an image is freed. | |
161 | */ | |
162 | void machine_kexec_cleanup(struct kimage *image) | |
163 | { | |
583140af | 164 | set_pages_nx(image->control_code_page, 1); |
92be3d6b | 165 | machine_kexec_free_page_tables(image); |
5033cba0 EB |
166 | } |
167 | ||
168 | /* | |
169 | * Do not allocate memory (or fail in any way) in machine_kexec(). | |
170 | * We are past the point of no return, committed to rebooting now. | |
171 | */ | |
3ab83521 | 172 | void machine_kexec(struct kimage *image) |
5033cba0 | 173 | { |
3566561b MD |
174 | unsigned long page_list[PAGES_NR]; |
175 | void *control_page; | |
3122c331 | 176 | int save_ftrace_enabled; |
3ab83521 HY |
177 | asmlinkage unsigned long |
178 | (*relocate_kernel_ptr)(unsigned long indirection_page, | |
179 | unsigned long control_page, | |
180 | unsigned long start_address, | |
181 | unsigned int has_pae, | |
182 | unsigned int preserve_context); | |
5033cba0 | 183 | |
3122c331 | 184 | #ifdef CONFIG_KEXEC_JUMP |
6407df5c | 185 | if (image->preserve_context) |
3122c331 HY |
186 | save_processor_state(); |
187 | #endif | |
188 | ||
189 | save_ftrace_enabled = __ftrace_enabled_save(); | |
f43fdad8 | 190 | |
5033cba0 EB |
191 | /* Interrupts aren't acceptable while we reboot */ |
192 | local_irq_disable(); | |
17f557e5 | 193 | hw_breakpoint_disable(); |
5033cba0 | 194 | |
89081d17 HY |
195 | if (image->preserve_context) { |
196 | #ifdef CONFIG_X86_IO_APIC | |
fef3a7a1 HY |
197 | /* |
198 | * We need to put APICs in legacy mode so that we can | |
89081d17 HY |
199 | * get timer interrupts in second kernel. kexec/kdump |
200 | * paths already have calls to disable_IO_APIC() in | |
201 | * one form or other. kexec jump path also need | |
202 | * one. | |
203 | */ | |
204 | disable_IO_APIC(); | |
205 | #endif | |
206 | } | |
207 | ||
3566561b | 208 | control_page = page_address(image->control_code_page); |
fb45daa6 | 209 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
3566561b | 210 | |
3ab83521 | 211 | relocate_kernel_ptr = control_page; |
3566561b | 212 | page_list[PA_CONTROL_PAGE] = __pa(control_page); |
3ab83521 | 213 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
92be3d6b | 214 | page_list[PA_PGD] = __pa(image->arch.pgd); |
e7706fc6 KO |
215 | |
216 | if (image->type == KEXEC_TYPE_DEFAULT) | |
217 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) | |
218 | << PAGE_SHIFT); | |
5033cba0 | 219 | |
fef3a7a1 HY |
220 | /* |
221 | * The segment registers are funny things, they have both a | |
2a8a3d5b EB |
222 | * visible and an invisible part. Whenever the visible part is |
223 | * set to a specific selector, the invisible part is loaded | |
224 | * with from a table in memory. At no other time is the | |
225 | * descriptor table in memory accessed. | |
5033cba0 EB |
226 | * |
227 | * I take advantage of this here by force loading the | |
228 | * segments, before I zap the gdt with an invalid value. | |
229 | */ | |
230 | load_segments(); | |
fef3a7a1 HY |
231 | /* |
232 | * The gdt & idt are now invalid. | |
5033cba0 EB |
233 | * If you want to load them you must set up your own idt & gdt. |
234 | */ | |
fef3a7a1 | 235 | set_gdt(phys_to_virt(0), 0); |
e802a51e | 236 | idt_invalidate(phys_to_virt(0)); |
5033cba0 EB |
237 | |
238 | /* now call it */ | |
3ab83521 HY |
239 | image->start = relocate_kernel_ptr((unsigned long)image->head, |
240 | (unsigned long)page_list, | |
c8128cce DH |
241 | image->start, |
242 | boot_cpu_has(X86_FEATURE_PAE), | |
3ab83521 | 243 | image->preserve_context); |
3122c331 HY |
244 | |
245 | #ifdef CONFIG_KEXEC_JUMP | |
6407df5c | 246 | if (image->preserve_context) |
3122c331 HY |
247 | restore_processor_state(); |
248 | #endif | |
249 | ||
250 | __ftrace_enabled_restore(save_ftrace_enabled); | |
5033cba0 | 251 | } |
1a3f239d | 252 | |
fd59d231 KO |
/*
 * Record the architecture-specific vmcoreinfo entries: the node_data
 * symbol and its length (NUMA builds) and the X86_PAE config flag
 * (PAE builds).
 */
void arch_crash_save_vmcoreinfo(void)
{
#ifdef CONFIG_NUMA
	VMCOREINFO_SYMBOL(node_data);
	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif /* CONFIG_NUMA */

#ifdef CONFIG_X86_PAE
	VMCOREINFO_CONFIG(X86_PAE);
#endif /* CONFIG_X86_PAE */
}
263 |