[PATCH] kdump: Use real pt_regs from exception
kernel/kexec.c (linux-2.6-block.git)
/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

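/*
 * crashk_res describes the memory reserved for the crash (capture)
 * kernel; architecture code fills in .start and .end at boot, typically
 * from the crashkernel= command line option.
 */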
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

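/*
 * Decide whether a dying task should trigger a crash kexec: oopses in
 * interrupt context, in the idle task (pid 0) or in init (pid 1), and
 * any oops with panic_on_oops set are considered fatal enough to dump.
 */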
int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses. On processors
 * where you can disable the MMU this is trivial, and easy. For
 * others it is still a simple predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place. This means I can only support memory whose
 * physical address can fit in an unsigned long. In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size is given
 * by KEXEC_CONTROL_CODE_SIZE. In the best case only a single page of
 * memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages. As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it). The end product of this is that most of the
 * physical address space, and most of RAM, can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address, used for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
	unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
	unsigned int gfp_mask, unsigned long dest);

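/*
 * Common allocation and validation for both normal and crash images:
 * copy the segment descriptors in from user space and sanity check
 * their destination ranges before any image pages are allocated.
 */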
static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
	unsigned long nr_segments, struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kmalloc(sizeof(*image), GFP_KERNEL);
	if (!image) {
		goto out;
	}
	memset(image, 0, sizeof(*image));
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result) {
		result = -EFAULT;
		goto out;
	}

	/*
	 * Verify we have good destination addresses. The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM. This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned. Too many
	 * special cases crop up when we don't do this. The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stomps on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;
		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes do not exceed our memory
	 * sizes. This should always be the case, and it is
	 * easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0) {
		*rimage = image;
	} else {
		kfree(image);
	}
	return result;
}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
	unsigned long nr_segments, struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result) {
		goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
		get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0) {
		*rimage = image;
	} else {
		kfree(image);
	}
	return result;
}

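/*
 * Crash images differ from normal ones in two ways: every address,
 * including the entry point and the control pages, must fall inside
 * the reserved crashk_res window, and the special crash control page
 * allocator is enabled by seeding image->control_page.
 */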
static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
	unsigned long nr_segments, struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result) {
		goto out;
	}

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses. Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM. But crash kernels are preloaded into a
	 * reserved area of RAM. We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
		get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0) {
		*rimage = image;
	} else {
		kfree(image);
	}
	return result;
}

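/*
 * Report whether [start, end) intersects any segment's destination
 * range; used to keep control and source pages out of memory the new
 * kernel will eventually be copied into.
 */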
static int kimage_is_destination_range(struct kimage *image,
	unsigned long start, unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;
		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend)) {
			return 1;
		}
	}
	return 0;
}

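/*
 * kimage_alloc_pages()/kimage_free_pages() wrap the page allocator so
 * that every kexec page is marked PageReserved while we hold it; the
 * allocation order is stashed in page->private so the matching free
 * knows how much to release.
 */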
static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		pages->private = order;
		count = 1 << order;
		for (i = 0; i < count; i++) {
			SetPageReserved(pages + i);
		}
	}
	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page->private;
	count = 1 << order;
	for (i = 0; i < count; i++) {
		ClearPageReserved(page + i);
	}
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);

		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(
	struct kimage *image, unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;
		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address. Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(
	struct kimage *image, unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel. All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;
		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
			break;
		}
		if (hole_end > crashk_res.end) {
			break;
		}
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;
			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages) {
		image->control_page = hole_end;
	}
	return pages;
}

struct page *kimage_alloc_control_pages(struct kimage *image,
	unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}
	return pages;
}

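/*
 * Append one kimage_entry_t to the flattened page list the relocation
 * stub consumes. When the current indirection page fills up, a fresh
 * page is allocated, chained in with IND_INDIRECTION, and the walk
 * continues there; the list is always kept zero-terminated.
 */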
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0) {
		image->entry++;
	}
	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page) {
			return -ENOMEM;
		}
		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry =
			ind_page + ((PAGE_SIZE / sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;
	return 0;
}

static int kimage_set_destination(struct kimage *image,
	unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0) {
		image->destination = destination;
	}
	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0) {
		image->destination += PAGE_SIZE;
	}
	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0) {
		image->entry++;
	}
	*image->entry = IND_DONE;
	return 0;
}

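/*
 * Walk every entry in the image's page list, transparently following
 * IND_INDIRECTION links to the next indirection page and stopping at
 * IND_DONE.
 */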
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

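/*
 * Tear down a loaded (or partially loaded) image: free every source
 * page recorded in the entry list, then the indirection pages that held
 * the list itself, and finally the control pages and the kimage
 * structure. Each indirection page is freed one step late because the
 * walk still needs it to find the following entries.
 */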
static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;
	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION) {
				kimage_free_entry(ind);
			}
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE) {
			kimage_free_entry(entry);
		}
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION) {
		kimage_free_entry(ind);
	}

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}

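/*
 * Search the entry list for the source entry whose destination is
 * 'page' and return a pointer to it, or NULL when nothing has claimed
 * that destination yet. Destinations are tracked implicitly: each
 * IND_SOURCE entry lands one page after the previous one, counting from
 * the last IND_DESTINATION entry.
 */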
static kimage_entry_t *kimage_dst_used(struct kimage *image,
	unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION) {
			destination = entry & PAGE_MASK;
		}
		else if (entry & IND_SOURCE) {
			if (page == destination) {
				return ptr;
			}
			destination += PAGE_SIZE;
		}
	}
	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
	unsigned int gfp_mask, unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but it makes the
	 * proof that no problems will occur trivial, and the
	 * implementation simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time. If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page) {
			return NULL;
		}
		/* If the page cannot be used, file it away */
		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page. And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}
	return page;
}

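/*
 * Load one segment of a normal (non-crash) image: for every page of the
 * destination range allocate a staging page with kimage_alloc_page(),
 * record it in the entry list, and fill it from the user buffer,
 * zero-padding the tail once bufsz is exhausted.
 */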
static int kimage_load_normal_segment(struct kimage *image,
	struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0) {
		goto out;
	}
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
		if (result < 0) {
			goto out;
		}
		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes) {
			mchunk = mbytes;
		}
		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
	struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes) {
			mchunk = mbytes;
		}
		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
	struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}
	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMA and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number and
 *   then copies the image to its final destination, jumping into
 *   the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need that
 * to happen you must do it yourself.
 */
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home grown binary mutex.
 * Nothing can wait, so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

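/*
 * Illustrative only (hypothetical values): user space has no libc
 * wrapper, so loaders such as kexec-tools issue the raw syscall, e.g.
 *
 *	struct kexec_segment seg = { buf, bufsz, mem, memsz };
 *	syscall(__NR_kexec_load, entry, 1, &seg, KEXEC_ARCH_DEFAULT);
 *
 * Loading a crash kernel is the same call with KEXEC_ON_CRASH or'd
 * into the flags, which routes the request through kimage_crash_alloc().
 */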
asmlinkage long sys_kexec_load(unsigned long entry,
	unsigned long nr_segments, struct kexec_segment __user *segments,
	unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags.
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
	    ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) {
		return -EINVAL;
	}

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked) {
		return -EBUSY;
	}
	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH) {
		dest_image = &kexec_crash_image;
	}
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0) {
			result = kimage_normal_alloc(&image, entry, nr_segments, segments);
		}
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry, nr_segments, segments);
		}
		if (result) {
			goto out;
		}
		result = machine_kexec_prepare(image);
		if (result) {
			goto out;
		}
		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result) {
				goto out;
			}
		}
		result = kimage_terminate(image);
		if (result) {
			goto out;
		}
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);
	return result;
}

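/*
 * 32-bit compatibility entry point: convert each compat_kexec_segment
 * into a native kexec_segment in a scratch area on the user stack and
 * then hand off to sys_kexec_load().
 */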
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
	unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
		return -EINVAL;
	}

	if (nr_segments > KEXEC_SEGMENT_MAX) {
		return -EINVAL;
	}

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result) {
			return -EFAULT;
		}

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result) {
			return -EFAULT;
		}
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

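/*
 * Entered on a crash with the pt_regs captured at the failing exception
 * (the point of this patch). If a crash image is loaded and the kexec
 * lock is not already held, shut the machine down with
 * machine_crash_shutdown() and jump into the capture kernel; otherwise
 * return and let the normal panic path continue.
 */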
void crash_kexec(struct pt_regs *regs)
{
	struct kimage *image;
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient. But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		image = xchg(&kexec_crash_image, NULL);
		if (image) {
			machine_crash_shutdown(regs);
			machine_kexec(image);
		}
		xchg(&kexec_lock, 0);
	}
}