Commit | Line | Data |
---|---|---|
5234f5eb | 1 | /* |
835c34a1 | 2 | * handle transition of Linux booting another kernel |
5234f5eb EB |
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
4 | * | |
5 | * This source code is licensed under the GNU General Public License, | |
6 | * Version 2. See the file COPYING for more details. | |
7 | */ | |
8 | ||
9 | #include <linux/mm.h> | |
10 | #include <linux/kexec.h> | |
5234f5eb | 11 | #include <linux/string.h> |
5a0e3ad6 | 12 | #include <linux/gfp.h> |
5234f5eb | 13 | #include <linux/reboot.h> |
fd59d231 | 14 | #include <linux/numa.h> |
f43fdad8 | 15 | #include <linux/ftrace.h> |
fef3a7a1 | 16 | #include <linux/io.h> |
fee7b0d8 | 17 | #include <linux/suspend.h> |
f43fdad8 | 18 | |
9ebdc79f | 19 | #include <asm/init.h> |
5234f5eb | 20 | #include <asm/pgtable.h> |
5234f5eb EB |
21 | #include <asm/tlbflush.h> |
22 | #include <asm/mmu_context.h> | |
17f557e5 | 23 | #include <asm/debugreg.h> |
8bf27556 | 24 | |
f5deb796 HY |
25 | static void free_transition_pgtable(struct kimage *image) |
26 | { | |
27 | free_page((unsigned long)image->arch.pud); | |
28 | free_page((unsigned long)image->arch.pmd); | |
29 | free_page((unsigned long)image->arch.pte); | |
30 | } | |
31 | ||
32 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) | |
33 | { | |
34 | pud_t *pud; | |
35 | pmd_t *pmd; | |
36 | pte_t *pte; | |
37 | unsigned long vaddr, paddr; | |
38 | int result = -ENOMEM; | |
39 | ||
40 | vaddr = (unsigned long)relocate_kernel; | |
41 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); | |
42 | pgd += pgd_index(vaddr); | |
43 | if (!pgd_present(*pgd)) { | |
44 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); | |
45 | if (!pud) | |
46 | goto err; | |
47 | image->arch.pud = pud; | |
48 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | |
49 | } | |
50 | pud = pud_offset(pgd, vaddr); | |
51 | if (!pud_present(*pud)) { | |
52 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); | |
53 | if (!pmd) | |
54 | goto err; | |
55 | image->arch.pmd = pmd; | |
56 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | |
57 | } | |
58 | pmd = pmd_offset(pud, vaddr); | |
59 | if (!pmd_present(*pmd)) { | |
60 | pte = (pte_t *)get_zeroed_page(GFP_KERNEL); | |
61 | if (!pte) | |
62 | goto err; | |
63 | image->arch.pte = pte; | |
64 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); | |
65 | } | |
66 | pte = pte_offset_kernel(pmd, vaddr); | |
67 | set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); | |
68 | return 0; | |
69 | err: | |
70 | free_transition_pgtable(image); | |
71 | return result; | |
72 | } | |
73 | ||
9ebdc79f YL |
74 | static void *alloc_pgt_page(void *data) |
75 | { | |
76 | struct kimage *image = (struct kimage *)data; | |
77 | struct page *page; | |
78 | void *p = NULL; | |
79 | ||
80 | page = kimage_alloc_control_pages(image, 0); | |
81 | if (page) { | |
82 | p = page_address(page); | |
83 | clear_page(p); | |
84 | } | |
85 | ||
86 | return p; | |
87 | } | |
88 | ||
5234f5eb EB |
89 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) |
90 | { | |
9ebdc79f YL |
91 | struct x86_mapping_info info = { |
92 | .alloc_pgt_page = alloc_pgt_page, | |
93 | .context = image, | |
94 | .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, | |
95 | }; | |
084d1283 | 96 | unsigned long mstart, mend; |
8bf27556 | 97 | pgd_t *level4p; |
f5deb796 | 98 | int result; |
084d1283 YL |
99 | int i; |
100 | ||
8bf27556 | 101 | level4p = (pgd_t *)__va(start_pgtable); |
9ebdc79f | 102 | clear_page(level4p); |
0e691cf8 YL |
103 | for (i = 0; i < nr_pfn_mapped; i++) { |
104 | mstart = pfn_mapped[i].start << PAGE_SHIFT; | |
105 | mend = pfn_mapped[i].end << PAGE_SHIFT; | |
106 | ||
107 | result = kernel_ident_mapping_init(&info, | |
108 | level4p, mstart, mend); | |
109 | if (result) | |
110 | return result; | |
111 | } | |
084d1283 | 112 | |
53594547 | 113 | /* |
084d1283 YL |
114 | * segments's mem ranges could be outside 0 ~ max_pfn, |
115 | * for example when jump back to original kernel from kexeced kernel. | |
116 | * or first kernel is booted with user mem map, and second kernel | |
117 | * could be loaded out of that range. | |
53594547 | 118 | */ |
084d1283 YL |
119 | for (i = 0; i < image->nr_segments; i++) { |
120 | mstart = image->segment[i].mem; | |
121 | mend = mstart + image->segment[i].memsz; | |
122 | ||
9ebdc79f YL |
123 | result = kernel_ident_mapping_init(&info, |
124 | level4p, mstart, mend); | |
084d1283 YL |
125 | |
126 | if (result) | |
127 | return result; | |
128 | } | |
129 | ||
f5deb796 | 130 | return init_transition_pgtable(image, level4p); |
5234f5eb EB |
131 | } |
132 | ||
133 | static void set_idt(void *newidt, u16 limit) | |
134 | { | |
36c4fd23 | 135 | struct desc_ptr curidt; |
5234f5eb EB |
136 | |
137 | /* x86-64 supports unaliged loads & stores */ | |
36c4fd23 EB |
138 | curidt.size = limit; |
139 | curidt.address = (unsigned long)newidt; | |
5234f5eb EB |
140 | |
141 | __asm__ __volatile__ ( | |
36c4fd23 EB |
142 | "lidtq %0\n" |
143 | : : "m" (curidt) | |
5234f5eb EB |
144 | ); |
145 | }; | |
146 | ||
147 | ||
148 | static void set_gdt(void *newgdt, u16 limit) | |
149 | { | |
36c4fd23 | 150 | struct desc_ptr curgdt; |
5234f5eb EB |
151 | |
152 | /* x86-64 supports unaligned loads & stores */ | |
36c4fd23 EB |
153 | curgdt.size = limit; |
154 | curgdt.address = (unsigned long)newgdt; | |
5234f5eb EB |
155 | |
156 | __asm__ __volatile__ ( | |
36c4fd23 EB |
157 | "lgdtq %0\n" |
158 | : : "m" (curgdt) | |
5234f5eb EB |
159 | ); |
160 | }; | |
161 | ||
162 | static void load_segments(void) | |
163 | { | |
164 | __asm__ __volatile__ ( | |
36c4fd23 EB |
165 | "\tmovl %0,%%ds\n" |
166 | "\tmovl %0,%%es\n" | |
167 | "\tmovl %0,%%ss\n" | |
168 | "\tmovl %0,%%fs\n" | |
169 | "\tmovl %0,%%gs\n" | |
2ec5e3a8 | 170 | : : "a" (__KERNEL_DS) : "memory" |
5234f5eb | 171 | ); |
5234f5eb EB |
172 | } |
173 | ||
5234f5eb EB |
174 | int machine_kexec_prepare(struct kimage *image) |
175 | { | |
4bfaaef0 | 176 | unsigned long start_pgtable; |
5234f5eb EB |
177 | int result; |
178 | ||
179 | /* Calculate the offsets */ | |
72414d3f | 180 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; |
5234f5eb EB |
181 | |
182 | /* Setup the identity mapped 64bit page table */ | |
183 | result = init_pgtable(image, start_pgtable); | |
72414d3f | 184 | if (result) |
5234f5eb | 185 | return result; |
5234f5eb | 186 | |
5234f5eb EB |
187 | return 0; |
188 | } | |
189 | ||
/*
 * Undo machine_kexec_prepare() for @image: release the transition
 * page-table pages recorded in image->arch.
 */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
194 | ||
195 | /* | |
196 | * Do not allocate memory (or fail in any way) in machine_kexec(). | |
197 | * We are past the point of no return, committed to rebooting now. | |
198 | */ | |
3ab83521 | 199 | void machine_kexec(struct kimage *image) |
5234f5eb | 200 | { |
4bfaaef0 MD |
201 | unsigned long page_list[PAGES_NR]; |
202 | void *control_page; | |
fee7b0d8 | 203 | int save_ftrace_enabled; |
5234f5eb | 204 | |
fee7b0d8 | 205 | #ifdef CONFIG_KEXEC_JUMP |
6407df5c | 206 | if (image->preserve_context) |
fee7b0d8 HY |
207 | save_processor_state(); |
208 | #endif | |
209 | ||
210 | save_ftrace_enabled = __ftrace_enabled_save(); | |
f43fdad8 | 211 | |
5234f5eb EB |
212 | /* Interrupts aren't acceptable while we reboot */ |
213 | local_irq_disable(); | |
17f557e5 | 214 | hw_breakpoint_disable(); |
5234f5eb | 215 | |
fee7b0d8 HY |
216 | if (image->preserve_context) { |
217 | #ifdef CONFIG_X86_IO_APIC | |
218 | /* | |
219 | * We need to put APICs in legacy mode so that we can | |
220 | * get timer interrupts in second kernel. kexec/kdump | |
221 | * paths already have calls to disable_IO_APIC() in | |
222 | * one form or other. kexec jump path also need | |
223 | * one. | |
224 | */ | |
225 | disable_IO_APIC(); | |
226 | #endif | |
227 | } | |
228 | ||
4bfaaef0 | 229 | control_page = page_address(image->control_code_page) + PAGE_SIZE; |
fee7b0d8 | 230 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
4bfaaef0 | 231 | |
e3ebadd9 | 232 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); |
fee7b0d8 | 233 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
4bfaaef0 MD |
234 | page_list[PA_TABLE_PAGE] = |
235 | (unsigned long)__pa(page_address(image->control_code_page)); | |
5234f5eb | 236 | |
fee7b0d8 HY |
237 | if (image->type == KEXEC_TYPE_DEFAULT) |
238 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) | |
239 | << PAGE_SHIFT); | |
240 | ||
fef3a7a1 HY |
241 | /* |
242 | * The segment registers are funny things, they have both a | |
2a8a3d5b EB |
243 | * visible and an invisible part. Whenever the visible part is |
244 | * set to a specific selector, the invisible part is loaded | |
245 | * with from a table in memory. At no other time is the | |
246 | * descriptor table in memory accessed. | |
5234f5eb EB |
247 | * |
248 | * I take advantage of this here by force loading the | |
249 | * segments, before I zap the gdt with an invalid value. | |
250 | */ | |
251 | load_segments(); | |
fef3a7a1 HY |
252 | /* |
253 | * The gdt & idt are now invalid. | |
5234f5eb EB |
254 | * If you want to load them you must set up your own idt & gdt. |
255 | */ | |
fef3a7a1 HY |
256 | set_gdt(phys_to_virt(0), 0); |
257 | set_idt(phys_to_virt(0), 0); | |
4bfaaef0 | 258 | |
5234f5eb | 259 | /* now call it */ |
fee7b0d8 HY |
260 | image->start = relocate_kernel((unsigned long)image->head, |
261 | (unsigned long)page_list, | |
262 | image->start, | |
263 | image->preserve_context); | |
264 | ||
265 | #ifdef CONFIG_KEXEC_JUMP | |
6407df5c | 266 | if (image->preserve_context) |
fee7b0d8 HY |
267 | restore_processor_state(); |
268 | #endif | |
269 | ||
270 | __ftrace_enabled_restore(save_ftrace_enabled); | |
5234f5eb | 271 | } |
2c8c0e6b | 272 | |
fd59d231 KO |
273 | void arch_crash_save_vmcoreinfo(void) |
274 | { | |
629c8b4c | 275 | VMCOREINFO_SYMBOL(phys_base); |
69243f91 | 276 | VMCOREINFO_SYMBOL(init_level4_pgt); |
92df5c3e KO |
277 | |
278 | #ifdef CONFIG_NUMA | |
279 | VMCOREINFO_SYMBOL(node_data); | |
280 | VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); | |
281 | #endif | |
fd59d231 KO |
282 | } |
283 |