kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/capability.h>
  10 #include <linux/mm.h>
  11 #include <linux/file.h>
  12 #include <linux/slab.h>
  13 #include <linux/fs.h>
  14 #include <linux/kexec.h>
  15 #include <linux/spinlock.h>
  16 #include <linux/list.h>
  17 #include <linux/highmem.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/reboot.h>
  20 #include <linux/ioport.h>
  21 #include <linux/hardirq.h>
  22 #include <linux/elf.h>
  23 #include <linux/elfcore.h>
  24 #include <linux/utsrelease.h>
  25 #include <linux/utsname.h>
  26 #include <linux/numa.h>
  27
  28 #include <asm/page.h>
  29 #include <asm/uaccess.h>
  30 #include <asm/io.h>
  31 #include <asm/system.h>
  32 #include <asm/sections.h>
  33
  34 /* Per cpu memory for storing cpu states in case of system crash. */
  35 note_buf_t* crash_notes;
  36
  37 /* vmcoreinfo stuff */
  38 unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
  39 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
  40 size_t vmcoreinfo_size;
  41 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
  42
  43 /* Location of the reserved area for the crash kernel */
  44 struct resource crashk_res = {
  45         .name  = "Crash kernel",
  46         .start = 0,
  47         .end   = 0,
  48         .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  49 };
  50
  51 int kexec_should_crash(struct task_struct *p)
  52 {
  53         if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
  54                 return 1;
  55         return 0;
  56 }
  57
  58 /*
  59  * When kexec transitions to the new kernel there is a one-to-one
  60  * mapping between physical and virtual addresses.  On processors
  61  * where you can disable the MMU this is trivial, and easy.  For
  62  * others it is still a simple predictable page table to setup.
  63  *
  64  * In that environment kexec copies the new kernel to its final
  65  * resting place.  This means I can only support memory whose
  66  * physical address can fit in an unsigned long.  In particular
  67  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  68  * If the assembly stub has more restrictive requirements
  69  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  70  * defined more restrictively in <asm/kexec.h>.
  71  *
  72  * The code for the transition from the current kernel to the
   73  * new kernel is placed in the control_code_buffer, whose size
  74  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  75  * page of memory is necessary, but some architectures require more.
  76  * Because this memory must be identity mapped in the transition from
  77  * virtual to physical addresses it must live in the range
  78  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  79  * modifiable.
  80  *
  81  * The assembly stub in the control code buffer is passed a linked list
  82  * of descriptor pages detailing the source pages of the new kernel,
  83  * and the destination addresses of those source pages.  As this data
  84  * structure is not used in the context of the current OS, it must
  85  * be self-contained.
  86  *
  87  * The code has been made to work with highmem pages and will use a
  88  * destination page in its final resting place (if it happens
  89  * to allocate it).  The end product of this is that most of the
  90  * physical address space, and most of RAM can be used.
  91  *
  92  * Future directions include:
  93  *  - allocating a page table with the control code buffer identity
  94  *    mapped, to simplify machine_kexec and make kexec_on_panic more
  95  *    reliable.
  96  */
  97
  98 /*
  99  * KIMAGE_NO_DEST is an impossible destination address..., for
 100  * allocating pages whose destination address we do not care about.
 101  */
 102 #define KIMAGE_NO_DEST (-1UL)
 103
 104 static int kimage_is_destination_range(struct kimage *image,
 105                                        unsigned long start, unsigned long end);
 106 static struct page *kimage_alloc_page(struct kimage *image,
 107                                        gfp_t gfp_mask,
 108                                        unsigned long dest);
 109
 110 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 111                             unsigned long nr_segments,
 112                             struct kexec_segment __user *segments)
 113 {
 114         size_t segment_bytes;
 115         struct kimage *image;
 116         unsigned long i;
 117         int result;
 118
 119         /* Allocate a controlling structure */
 120         result = -ENOMEM;
 121         image = kzalloc(sizeof(*image), GFP_KERNEL);
 122         if (!image)
 123                 goto out;
 124
 125         image->head = 0;
 126         image->entry = &image->head;
 127         image->last_entry = &image->head;
 128         image->control_page = ~0; /* By default this does not apply */
 129         image->start = entry;
 130         image->type = KEXEC_TYPE_DEFAULT;
 131
 132         /* Initialize the list of control pages */
 133         INIT_LIST_HEAD(&image->control_pages);
 134
 135         /* Initialize the list of destination pages */
 136         INIT_LIST_HEAD(&image->dest_pages);
 137
 138         /* Initialize the list of unuseable pages */
 139         INIT_LIST_HEAD(&image->unuseable_pages);
 140
 141         /* Read in the segments */
 142         image->nr_segments = nr_segments;
 143         segment_bytes = nr_segments * sizeof(*segments);
 144         result = copy_from_user(image->segment, segments, segment_bytes);
 145         if (result)
 146                 goto out;
 147
 148         /*
 149          * Verify we have good destination addresses.  The caller is
 150          * responsible for making certain we don't attempt to load
 151          * the new image into invalid or reserved areas of RAM.  This
 152          * just verifies it is an address we can use.
 153          *
 154          * Since the kernel does everything in page size chunks ensure
 155          * the destination addreses are page aligned.  Too many
 156          * special cases crop of when we don't do this.  The most
 157          * insidious is getting overlapping destination addresses
 158          * simply because addresses are changed to page size
 159          * granularity.
 160          */
 161         result = -EADDRNOTAVAIL;
 162         for (i = 0; i < nr_segments; i++) {
 163                 unsigned long mstart, mend;
 164
 165                 mstart = image->segment[i].mem;
 166                 mend   = mstart + image->segment[i].memsz;
 167                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 168                         goto out;
 169                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 170                         goto out;
 171         }
 172
 173         /* Verify our destination addresses do not overlap.
 174          * If we alloed overlapping destination addresses
 175          * through very weird things can happen with no
 176          * easy explanation as one segment stops on another.
 177          */
 178         result = -EINVAL;
 179         for (i = 0; i < nr_segments; i++) {
 180                 unsigned long mstart, mend;
 181                 unsigned long j;
 182
 183                 mstart = image->segment[i].mem;
 184                 mend   = mstart + image->segment[i].memsz;
 185                 for (j = 0; j < i; j++) {
 186                         unsigned long pstart, pend;
 187                         pstart = image->segment[j].mem;
 188                         pend   = pstart + image->segment[j].memsz;
 189                         /* Do the segments overlap ? */
 190                         if ((mend > pstart) && (mstart < pend))
 191                                 goto out;
 192                 }
 193         }
 194
 195         /* Ensure our buffer sizes are strictly less than
 196          * our memory sizes.  This should always be the case,
 197          * and it is easier to check up front than to be surprised
 198          * later on.
 199          */
 200         result = -EINVAL;
 201         for (i = 0; i < nr_segments; i++) {
 202                 if (image->segment[i].bufsz > image->segment[i].memsz)
 203                         goto out;
 204         }
 205
 206         result = 0;
 207 out:
 208         if (result == 0)
 209                 *rimage = image;
 210         else
 211                 kfree(image);
 212
 213         return result;
 214
 215 }
 216
 217 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 218                                 unsigned long nr_segments,
 219                                 struct kexec_segment __user *segments)
 220 {
 221         int result;
 222         struct kimage *image;
 223
 224         /* Allocate and initialize a controlling structure */
 225         image = NULL;
 226         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 227         if (result)
 228                 goto out;
 229
 230         *rimage = image;
 231
 232         /*
 233          * Find a location for the control code buffer, and add it
 234          * the vector of segments so that it's pages will also be
 235          * counted as destination pages.
 236          */
 237         result = -ENOMEM;
 238         image->control_code_page = kimage_alloc_control_pages(image,
 239                                            get_order(KEXEC_CONTROL_CODE_SIZE));
 240         if (!image->control_code_page) {
 241                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 242                 goto out;
 243         }
 244
 245         result = 0;
 246  out:
 247         if (result == 0)
 248                 *rimage = image;
 249         else
 250                 kfree(image);
 251
 252         return result;
 253 }
 254
 255 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 256                                 unsigned long nr_segments,
 257                                 struct kexec_segment __user *segments)
 258 {
 259         int result;
 260         struct kimage *image;
 261         unsigned long i;
 262
 263         image = NULL;
 264         /* Verify we have a valid entry point */
 265         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 266                 result = -EADDRNOTAVAIL;
 267                 goto out;
 268         }
 269
 270         /* Allocate and initialize a controlling structure */
 271         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 272         if (result)
 273                 goto out;
 274
 275         /* Enable the special crash kernel control page
 276          * allocation policy.
 277          */
 278         image->control_page = crashk_res.start;
 279         image->type = KEXEC_TYPE_CRASH;
 280
 281         /*
 282          * Verify we have good destination addresses.  Normally
 283          * the caller is responsible for making certain we don't
 284          * attempt to load the new image into invalid or reserved
 285          * areas of RAM.  But crash kernels are preloaded into a
 286          * reserved area of ram.  We must ensure the addresses
 287          * are in the reserved area otherwise preloading the
 288          * kernel could corrupt things.
 289          */
 290         result = -EADDRNOTAVAIL;
 291         for (i = 0; i < nr_segments; i++) {
 292                 unsigned long mstart, mend;
 293
 294                 mstart = image->segment[i].mem;
 295                 mend = mstart + image->segment[i].memsz - 1;
 296                 /* Ensure we are within the crash kernel limits */
 297                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 298                         goto out;
 299         }
 300
 301         /*
 302          * Find a location for the control code buffer, and add
 303          * the vector of segments so that it's pages will also be
 304          * counted as destination pages.
 305          */
 306         result = -ENOMEM;
 307         image->control_code_page = kimage_alloc_control_pages(image,
 308                                            get_order(KEXEC_CONTROL_CODE_SIZE));
 309         if (!image->control_code_page) {
 310                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 311                 goto out;
 312         }
 313
 314         result = 0;
 315 out:
 316         if (result == 0)
 317                 *rimage = image;
 318         else
 319                 kfree(image);
 320
 321         return result;
 322 }
 323
 324 static int kimage_is_destination_range(struct kimage *image,
 325                                         unsigned long start,
 326                                         unsigned long end)
 327 {
 328         unsigned long i;
 329
 330         for (i = 0; i < image->nr_segments; i++) {
 331                 unsigned long mstart, mend;
 332
 333                 mstart = image->segment[i].mem;
 334                 mend = mstart + image->segment[i].memsz;
 335                 if ((end > mstart) && (start < mend))
 336                         return 1;
 337         }
 338
 339         return 0;
 340 }
 341
 342 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 343 {
 344         struct page *pages;
 345
 346         pages = alloc_pages(gfp_mask, order);
 347         if (pages) {
 348                 unsigned int count, i;
 349                 pages->mapping = NULL;
 350                 set_page_private(pages, order);
 351                 count = 1 << order;
 352                 for (i = 0; i < count; i++)
 353                         SetPageReserved(pages + i);
 354         }
 355
 356         return pages;
 357 }
 358
 359 static void kimage_free_pages(struct page *page)
 360 {
 361         unsigned int order, count, i;
 362
 363         order = page_private(page);
 364         count = 1 << order;
 365         for (i = 0; i < count; i++)
 366                 ClearPageReserved(page + i);
 367         __free_pages(page, order);
 368 }
 369
 370 static void kimage_free_page_list(struct list_head *list)
 371 {
 372         struct list_head *pos, *next;
 373
 374         list_for_each_safe(pos, next, list) {
 375                 struct page *page;
 376
 377                 page = list_entry(pos, struct page, lru);
 378                 list_del(&page->lru);
 379                 kimage_free_pages(page);
 380         }
 381 }
 382
 383 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 384                                                         unsigned int order)
 385 {
 386         /* Control pages are special, they are the intermediaries
 387          * that are needed while we copy the rest of the pages
 388          * to their final resting place.  As such they must
 389          * not conflict with either the destination addresses
 390          * or memory the kernel is already using.
 391          *
 392          * The only case where we really need more than one of
 393          * these are for architectures where we cannot disable
 394          * the MMU and must instead generate an identity mapped
 395          * page table for all of the memory.
 396          *
 397          * At worst this runs in O(N) of the image size.
 398          */
 399         struct list_head extra_pages;
 400         struct page *pages;
 401         unsigned int count;
 402
 403         count = 1 << order;
 404         INIT_LIST_HEAD(&extra_pages);
 405
 406         /* Loop while I can allocate a page and the page allocated
 407          * is a destination page.
 408          */
 409         do {
 410                 unsigned long pfn, epfn, addr, eaddr;
 411
 412                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 413                 if (!pages)
 414                         break;
 415                 pfn   = page_to_pfn(pages);
 416                 epfn  = pfn + count;
 417                 addr  = pfn << PAGE_SHIFT;
 418                 eaddr = epfn << PAGE_SHIFT;
 419                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 420                               kimage_is_destination_range(image, addr, eaddr)) {
 421                         list_add(&pages->lru, &extra_pages);
 422                         pages = NULL;
 423                 }
 424         } while (!pages);
 425
 426         if (pages) {
 427                 /* Remember the allocated page... */
 428                 list_add(&pages->lru, &image->control_pages);
 429
 430                 /* Because the page is already in it's destination
 431                  * location we will never allocate another page at
 432                  * that address.  Therefore kimage_alloc_pages
 433                  * will not return it (again) and we don't need
 434                  * to give it an entry in image->segment[].
 435                  */
 436         }
 437         /* Deal with the destination pages I have inadvertently allocated.
 438          *
 439          * Ideally I would convert multi-page allocations into single
 440          * page allocations, and add everyting to image->dest_pages.
 441          *
 442          * For now it is simpler to just free the pages.
 443          */
 444         kimage_free_page_list(&extra_pages);
 445
 446         return pages;
 447 }
 448
 449 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 450                                                       unsigned int order)
 451 {
 452         /* Control pages are special, they are the intermediaries
 453          * that are needed while we copy the rest of the pages
 454          * to their final resting place.  As such they must
 455          * not conflict with either the destination addresses
 456          * or memory the kernel is already using.
 457          *
 458          * Control pages are also the only pags we must allocate
 459          * when loading a crash kernel.  All of the other pages
 460          * are specified by the segments and we just memcpy
 461          * into them directly.
 462          *
 463          * The only case where we really need more than one of
 464          * these are for architectures where we cannot disable
 465          * the MMU and must instead generate an identity mapped
 466          * page table for all of the memory.
 467          *
 468          * Given the low demand this implements a very simple
 469          * allocator that finds the first hole of the appropriate
 470          * size in the reserved memory region, and allocates all
 471          * of the memory up to and including the hole.
 472          */
 473         unsigned long hole_start, hole_end, size;
 474         struct page *pages;
 475
 476         pages = NULL;
 477         size = (1 << order) << PAGE_SHIFT;
 478         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 479         hole_end   = hole_start + size - 1;
 480         while (hole_end <= crashk_res.end) {
 481                 unsigned long i;
 482
 483                 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 484                         break;
 485                 if (hole_end > crashk_res.end)
 486                         break;
 487                 /* See if I overlap any of the segments */
 488                 for (i = 0; i < image->nr_segments; i++) {
 489                         unsigned long mstart, mend;
 490
 491                         mstart = image->segment[i].mem;
 492                         mend   = mstart + image->segment[i].memsz - 1;
 493                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 494                                 /* Advance the hole to the end of the segment */
 495                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 496                                 hole_end   = hole_start + size - 1;
 497                                 break;
 498                         }
 499                 }
 500                 /* If I don't overlap any segments I have found my hole! */
 501                 if (i == image->nr_segments) {
 502                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 503                         break;
 504                 }
 505         }
 506         if (pages)
 507                 image->control_page = hole_end;
 508
 509         return pages;
 510 }
 511
 512
 513 struct page *kimage_alloc_control_pages(struct kimage *image,
 514                                          unsigned int order)
 515 {
 516         struct page *pages = NULL;
 517
 518         switch (image->type) {
 519         case KEXEC_TYPE_DEFAULT:
 520                 pages = kimage_alloc_normal_control_pages(image, order);
 521                 break;
 522         case KEXEC_TYPE_CRASH:
 523                 pages = kimage_alloc_crash_control_pages(image, order);
 524                 break;
 525         }
 526
 527         return pages;
 528 }
 529
 530 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 531 {
 532         if (*image->entry != 0)
 533                 image->entry++;
 534
 535         if (image->entry == image->last_entry) {
 536                 kimage_entry_t *ind_page;
 537                 struct page *page;
 538
 539                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 540                 if (!page)
 541                         return -ENOMEM;
 542
 543                 ind_page = page_address(page);
 544                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 545                 image->entry = ind_page;
 546                 image->last_entry = ind_page +
 547                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 548         }
 549         *image->entry = entry;
 550         image->entry++;
 551         *image->entry = 0;
 552
 553         return 0;
 554 }
 555
 556 static int kimage_set_destination(struct kimage *image,
 557                                    unsigned long destination)
 558 {
 559         int result;
 560
 561         destination &= PAGE_MASK;
 562         result = kimage_add_entry(image, destination | IND_DESTINATION);
 563         if (result == 0)
 564                 image->destination = destination;
 565
 566         return result;
 567 }
 568
 569
 570 static int kimage_add_page(struct kimage *image, unsigned long page)
 571 {
 572         int result;
 573
 574         page &= PAGE_MASK;
 575         result = kimage_add_entry(image, page | IND_SOURCE);
 576         if (result == 0)
 577                 image->destination += PAGE_SIZE;
 578
 579         return result;
 580 }
 581
 582
 583 static void kimage_free_extra_pages(struct kimage *image)
 584 {
 585         /* Walk through and free any extra destination pages I may have */
 586         kimage_free_page_list(&image->dest_pages);
 587
 588         /* Walk through and free any unuseable pages I have cached */
 589         kimage_free_page_list(&image->unuseable_pages);
 590
 591 }
 592 static void kimage_terminate(struct kimage *image)
 593 {
 594         if (*image->entry != 0)
 595                 image->entry++;
 596
 597         *image->entry = IND_DONE;
 598 }
 599
 600 #define for_each_kimage_entry(image, ptr, entry) \
 601         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 602                 ptr = (entry & IND_INDIRECTION)? \
 603                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 604
 605 static void kimage_free_entry(kimage_entry_t entry)
 606 {
 607         struct page *page;
 608
 609         page = pfn_to_page(entry >> PAGE_SHIFT);
 610         kimage_free_pages(page);
 611 }
 612
 613 static void kimage_free(struct kimage *image)
 614 {
 615         kimage_entry_t *ptr, entry;
 616         kimage_entry_t ind = 0;
 617
 618         if (!image)
 619                 return;
 620
 621         kimage_free_extra_pages(image);
 622         for_each_kimage_entry(image, ptr, entry) {
 623                 if (entry & IND_INDIRECTION) {
 624                         /* Free the previous indirection page */
 625                         if (ind & IND_INDIRECTION)
 626                                 kimage_free_entry(ind);
 627                         /* Save this indirection page until we are
 628                          * done with it.
 629                          */
 630                         ind = entry;
 631                 }
 632                 else if (entry & IND_SOURCE)
 633                         kimage_free_entry(entry);
 634         }
 635         /* Free the final indirection page */
 636         if (ind & IND_INDIRECTION)
 637                 kimage_free_entry(ind);
 638
 639         /* Handle any machine specific cleanup */
 640         machine_kexec_cleanup(image);
 641
 642         /* Free the kexec control pages... */
 643         kimage_free_page_list(&image->control_pages);
 644         kfree(image);
 645 }
 646
 647 static kimage_entry_t *kimage_dst_used(struct kimage *image,
 648                                         unsigned long page)
 649 {
 650         kimage_entry_t *ptr, entry;
 651         unsigned long destination = 0;
 652
 653         for_each_kimage_entry(image, ptr, entry) {
 654                 if (entry & IND_DESTINATION)
 655                         destination = entry & PAGE_MASK;
 656                 else if (entry & IND_SOURCE) {
 657                         if (page == destination)
 658                                 return ptr;
 659                         destination += PAGE_SIZE;
 660                 }
 661         }
 662
 663         return NULL;
 664 }
 665
 666 static struct page *kimage_alloc_page(struct kimage *image,
 667                                         gfp_t gfp_mask,
 668                                         unsigned long destination)
 669 {
 670         /*
 671          * Here we implement safeguards to ensure that a source page
 672          * is not copied to its destination page before the data on
 673          * the destination page is no longer useful.
 674          *
 675          * To do this we maintain the invariant that a source page is
 676          * either its own destination page, or it is not a
 677          * destination page at all.
 678          *
 679          * That is slightly stronger than required, but the proof
 680          * that no problems will not occur is trivial, and the
 681          * implementation is simply to verify.
 682          *
 683          * When allocating all pages normally this algorithm will run
 684          * in O(N) time, but in the worst case it will run in O(N^2)
 685          * time.   If the runtime is a problem the data structures can
 686          * be fixed.
 687          */
 688         struct page *page;
 689         unsigned long addr;
 690
 691         /*
 692          * Walk through the list of destination pages, and see if I
 693          * have a match.
 694          */
 695         list_for_each_entry(page, &image->dest_pages, lru) {
 696                 addr = page_to_pfn(page) << PAGE_SHIFT;
 697                 if (addr == destination) {
 698                         list_del(&page->lru);
 699                         return page;
 700                 }
 701         }
 702         page = NULL;
 703         while (1) {
 704                 kimage_entry_t *old;
 705
 706                 /* Allocate a page, if we run out of memory give up */
 707                 page = kimage_alloc_pages(gfp_mask, 0);
 708                 if (!page)
 709                         return NULL;
 710                 /* If the page cannot be used file it away */
 711                 if (page_to_pfn(page) >
 712                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 713                         list_add(&page->lru, &image->unuseable_pages);
 714                         continue;
 715                 }
 716                 addr = page_to_pfn(page) << PAGE_SHIFT;
 717
 718                 /* If it is the destination page we want use it */
 719                 if (addr == destination)
 720                         break;
 721
 722                 /* If the page is not a destination page use it */
 723                 if (!kimage_is_destination_range(image, addr,
 724                                                   addr + PAGE_SIZE))
 725                         break;
 726
 727                 /*
 728                  * I know that the page is someones destination page.
 729                  * See if there is already a source page for this
 730                  * destination page.  And if so swap the source pages.
 731                  */
 732                 old = kimage_dst_used(image, addr);
 733                 if (old) {
 734                         /* If so move it */
 735                         unsigned long old_addr;
 736                         struct page *old_page;
 737
 738                         old_addr = *old & PAGE_MASK;
 739                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 740                         copy_highpage(page, old_page);
 741                         *old = addr | (*old & ~PAGE_MASK);
 742
 743                         /* The old page I have found cannot be a
 744                          * destination page, so return it.
 745                          */
 746                         addr = old_addr;
 747                         page = old_page;
 748                         break;
 749                 }
 750                 else {
 751                         /* Place the page on the destination list I
 752                          * will use it later.
 753                          */
 754                         list_add(&page->lru, &image->dest_pages);
 755                 }
 756         }
 757
 758         return page;
 759 }
 760
 761 static int kimage_load_normal_segment(struct kimage *image,
 762                                          struct kexec_segment *segment)
 763 {
 764         unsigned long maddr;
 765         unsigned long ubytes, mbytes;
 766         int result;
 767         unsigned char __user *buf;
 768
 769         result = 0;
 770         buf = segment->buf;
 771         ubytes = segment->bufsz;
 772         mbytes = segment->memsz;
 773         maddr = segment->mem;
 774
 775         result = kimage_set_destination(image, maddr);
 776         if (result < 0)
 777                 goto out;
 778
 779         while (mbytes) {
 780                 struct page *page;
 781                 char *ptr;
 782                 size_t uchunk, mchunk;
 783
 784                 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 785                 if (!page) {
 786                         result  = -ENOMEM;
 787                         goto out;
 788                 }
 789                 result = kimage_add_page(image, page_to_pfn(page)
 790                                                                 << PAGE_SHIFT);
 791                 if (result < 0)
 792                         goto out;
 793
 794                 ptr = kmap(page);
 795                 /* Start with a clear page */
 796                 memset(ptr, 0, PAGE_SIZE);
 797                 ptr += maddr & ~PAGE_MASK;
 798                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 799                 if (mchunk > mbytes)
 800                         mchunk = mbytes;
 801
 802                 uchunk = mchunk;
 803                 if (uchunk > ubytes)
 804                         uchunk = ubytes;
 805
 806                 result = copy_from_user(ptr, buf, uchunk);
 807                 kunmap(page);
 808                 if (result) {
 809                         result = (result < 0) ? result : -EIO;
 810                         goto out;
 811                 }
 812                 ubytes -= uchunk;
 813                 maddr  += mchunk;
 814                 buf    += mchunk;
 815                 mbytes -= mchunk;
 816         }
 817 out:
 818         return result;
 819 }
 820
 821 static int kimage_load_crash_segment(struct kimage *image,
 822                                         struct kexec_segment *segment)
 823 {
 824         /* For crash dumps kernels we simply copy the data from
 825          * user space to it's destination.
 826          * We do things a page at a time for the sake of kmap.
 827          */
 828         unsigned long maddr;
 829         unsigned long ubytes, mbytes;
 830         int result;
 831         unsigned char __user *buf;
 832
 833         result = 0;
 834         buf = segment->buf;
 835         ubytes = segment->bufsz;
 836         mbytes = segment->memsz;
 837         maddr = segment->mem;
 838         while (mbytes) {
 839                 struct page *page;
 840                 char *ptr;
 841                 size_t uchunk, mchunk;
 842
 843                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 844                 if (!page) {
 845                         result  = -ENOMEM;
 846                         goto out;
 847                 }
 848                 ptr = kmap(page);
 849                 ptr += maddr & ~PAGE_MASK;
 850                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 851                 if (mchunk > mbytes)
 852                         mchunk = mbytes;
 853
 854                 uchunk = mchunk;
 855                 if (uchunk > ubytes) {
 856                         uchunk = ubytes;
 857                         /* Zero the trailing part of the page */
 858                         memset(ptr + uchunk, 0, mchunk - uchunk);
 859                 }
 860                 result = copy_from_user(ptr, buf, uchunk);
 861                 kexec_flush_icache_page(page);
 862                 kunmap(page);
 863                 if (result) {
 864                         result = (result < 0) ? result : -EIO;
 865                         goto out;
 866                 }
 867                 ubytes -= uchunk;
 868                 maddr  += mchunk;
 869                 buf    += mchunk;
 870                 mbytes -= mchunk;
 871         }
 872 out:
 873         return result;
 874 }
 875
 876 static int kimage_load_segment(struct kimage *image,
 877                                 struct kexec_segment *segment)
 878 {
 879         int result = -ENOMEM;
 880
 881         switch (image->type) {
 882         case KEXEC_TYPE_DEFAULT:
 883                 result = kimage_load_normal_segment(image, segment);
 884                 break;
 885         case KEXEC_TYPE_CRASH:
 886                 result = kimage_load_crash_segment(image, segment);
 887                 break;
 888         }
 889
 890         return result;
 891 }
 892
 893 /*
 894  * Exec Kernel system call: for obvious reasons only root may call it.
 895  *
 896  * This call breaks up into three pieces.
 897  * - A generic part which loads the new kernel from the current
 898  *   address space, and very carefully places the data in the
 899  *   allocated pages.
 900  *
 901  * - A generic part that interacts with the kernel and tells all of
 902  *   the devices to shut down.  Preventing on-going dmas, and placing
 903  *   the devices in a consistent state so a later kernel can
 904  *   reinitialize them.
 905  *
 906  * - A machine specific part that includes the syscall number
 907  *   and then copies the image to its final destination.  And
 908  *   jumps into the image at entry.
 909  *
 910  * kexec does not sync, or unmount filesystems so if you need
 911  * that to happen you need to do that yourself.
 912  */
 913 struct kimage *kexec_image;	/* installed by sys_kexec_load() without KEXEC_ON_CRASH */
 914 struct kimage *kexec_crash_image;	/* installed with KEXEC_ON_CRASH; used by crash_kexec() */
 915 /*
 916  * A home grown binary mutex.
 917  * Nothing can wait so this mutex is safe to use
 918  * in interrupt context :)
      *
      * The value is 0 when free and 1 when held.  It is taken with
      * xchg(&kexec_lock, 1), which fails (returns non-zero) instead of
      * blocking, and released with xchg(&kexec_lock, 0).
 919  */
 920 static int kexec_lock;
 921
 922 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 923                                 struct kexec_segment __user *segments,
 924                                 unsigned long flags)
 925 {
 926         struct kimage **dest_image, *image;
 927         int locked;
 928         int result;
 929
 930         /* We only trust the superuser with rebooting the system. */
 931         if (!capable(CAP_SYS_BOOT))
 932                 return -EPERM;
 933
 934         /*
 935          * Verify we have a legal set of flags
 936          * This leaves us room for future extensions.
 937          */
 938         if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 939                 return -EINVAL;
 940
 941         /* Verify we are on the appropriate architecture */
 942         if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 943                 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 944                 return -EINVAL;
 945
 946         /* Put an artificial cap on the number
 947          * of segments passed to kexec_load.
 948          */
 949         if (nr_segments > KEXEC_SEGMENT_MAX)
 950                 return -EINVAL;
 951
 952         image = NULL;
 953         result = 0;
 954
 955         /* Because we write directly to the reserved memory
 956          * region when loading crash kernels we need a mutex here to
 957          * prevent multiple crash  kernels from attempting to load
 958          * simultaneously, and to prevent a crash kernel from loading
 959          * over the top of a in use crash kernel.
 960          *
 961          * KISS: always take the mutex.
 962          */
 963         locked = xchg(&kexec_lock, 1);
 964         if (locked)
 965                 return -EBUSY;
 966
 967         dest_image = &kexec_image;
 968         if (flags & KEXEC_ON_CRASH)
 969                 dest_image = &kexec_crash_image;
 970         if (nr_segments > 0) {
 971                 unsigned long i;
 972
 973                 /* Loading another kernel to reboot into */
 974                 if ((flags & KEXEC_ON_CRASH) == 0)
 975                         result = kimage_normal_alloc(&image, entry,
 976                                                         nr_segments, segments);
 977                 /* Loading another kernel to switch to if this one crashes */
 978                 else if (flags & KEXEC_ON_CRASH) {
 979                         /* Free any current crash dump kernel before
 980                          * we corrupt it.
 981                          */
 982                         kimage_free(xchg(&kexec_crash_image, NULL));
 983                         result = kimage_crash_alloc(&image, entry,
 984                                                      nr_segments, segments);
 985                 }
 986                 if (result)
 987                         goto out;
 988
 989                 result = machine_kexec_prepare(image);
 990                 if (result)
 991                         goto out;
 992
 993                 for (i = 0; i < nr_segments; i++) {
 994                         result = kimage_load_segment(image, &image->segment[i]);
 995                         if (result)
 996                                 goto out;
 997                 }
 998                 kimage_terminate(image);
 999         }
1000         /* Install the new kernel, and  Uninstall the old */
1001         image = xchg(dest_image, image);
1002
1003 out:
1004         locked = xchg(&kexec_lock, 0); /* Release the mutex */
1005         BUG_ON(!locked);
1006         kimage_free(image);
1007
1008         return result;
1009 }
1010
1011 #ifdef CONFIG_COMPAT
1012 asmlinkage long compat_sys_kexec_load(unsigned long entry,
1013                                 unsigned long nr_segments,
1014                                 struct compat_kexec_segment __user *segments,
1015                                 unsigned long flags)
1016 {
1017         struct compat_kexec_segment in;
1018         struct kexec_segment out, __user *ksegments;
1019         unsigned long i, result;
1020
1021         /* Don't allow clients that don't understand the native
1022          * architecture to do anything.
1023          */
1024         if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1025                 return -EINVAL;
1026
1027         if (nr_segments > KEXEC_SEGMENT_MAX)
1028                 return -EINVAL;
1029
1030         ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1031         for (i=0; i < nr_segments; i++) {
1032                 result = copy_from_user(&in, &segments[i], sizeof(in));
1033                 if (result)
1034                         return -EFAULT;
1035
1036                 out.buf   = compat_ptr(in.buf);
1037                 out.bufsz = in.bufsz;
1038                 out.mem   = in.mem;
1039                 out.memsz = in.memsz;
1040
1041                 result = copy_to_user(&ksegments[i], &out, sizeof(out));
1042                 if (result)
1043                         return -EFAULT;
1044         }
1045
1046         return sys_kexec_load(entry, nr_segments, ksegments, flags);
1047 }
1048 #endif
1049
1050 void crash_kexec(struct pt_regs *regs)
1051 {
1052         int locked;
1053
1054
1055         /* Take the kexec_lock here to prevent sys_kexec_load
1056          * running on one cpu from replacing the crash kernel
1057          * we are using after a panic on a different cpu.
1058          *
1059          * If the crash kernel was not located in a fixed area
1060          * of memory the xchg(&kexec_crash_image) would be
1061          * sufficient.  But since I reuse the memory...
1062          */
1063         locked = xchg(&kexec_lock, 1);
1064         if (!locked) {
1065                 if (kexec_crash_image) {
1066                         struct pt_regs fixed_regs;
1067                         crash_setup_regs(&fixed_regs, regs);
1068                         crash_save_vmcoreinfo();
1069                         machine_crash_shutdown(&fixed_regs);
1070                         machine_kexec(kexec_crash_image);
1071                 }
1072                 locked = xchg(&kexec_lock, 0);
1073                 BUG_ON(!locked);
1074         }
1075 }
1076
1077 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1078                             size_t data_len)
1079 {
1080         struct elf_note note;
1081
1082         note.n_namesz = strlen(name) + 1;
1083         note.n_descsz = data_len;
1084         note.n_type   = type;
1085         memcpy(buf, &note, sizeof(note));
1086         buf += (sizeof(note) + 3)/4;
1087         memcpy(buf, name, note.n_namesz);
1088         buf += (note.n_namesz + 3)/4;
1089         memcpy(buf, data, note.n_descsz);
1090         buf += (note.n_descsz + 3)/4;
1091
1092         return buf;
1093 }
1094
1095 static void final_note(u32 *buf)
1096 {
1097         struct elf_note note;
1098
1099         note.n_namesz = 0;
1100         note.n_descsz = 0;
1101         note.n_type   = 0;
1102         memcpy(buf, &note, sizeof(note));
1103 }
1104
1105 void crash_save_cpu(struct pt_regs *regs, int cpu)
1106 {
1107         struct elf_prstatus prstatus;
1108         u32 *buf;
1109
1110         if ((cpu < 0) || (cpu >= NR_CPUS))
1111                 return;
1112
1113         /* Using ELF notes here is opportunistic.
1114          * I need a well defined structure format
1115          * for the data I pass, and I need tags
1116          * on the data to indicate what information I have
1117          * squirrelled away.  ELF notes happen to provide
1118          * all of that, so there is no need to invent something new.
1119          */
1120         buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1121         if (!buf)
1122                 return;
1123         memset(&prstatus, 0, sizeof(prstatus));
1124         prstatus.pr_pid = current->pid;
1125         elf_core_copy_regs(&prstatus.pr_reg, regs);
1126         buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1127                               &prstatus, sizeof(prstatus));
1128         final_note(buf);
1129 }
1130
1131 static int __init crash_notes_memory_init(void)
1132 {
1133         /* Allocate memory for saving cpu registers. */
1134         crash_notes = alloc_percpu(note_buf_t);
1135         if (!crash_notes) {
1136                 printk("Kexec: Memory allocation for saving cpu register"
1137                 " states failed\n");
1138                 return -ENOMEM;
1139         }
1140         return 0;
1141 }
1142 module_init(crash_notes_memory_init)
1143
1144
1145 /*
1146  * parsing the "crashkernel" commandline
1147  *
1148  * this code is intended to be called from architecture specific code
1149  */
1150
1151
1152 /*
1153  * This function parses command lines in the format
1154  *
1155  *   crashkernel=ramsize-range:size[,...][@offset]
1156  *
1157  * The function returns 0 on success and -EINVAL on failure.
1158  */
1159 static int __init parse_crashkernel_mem(char                    *cmdline,
1160                                         unsigned long long      system_ram,
1161                                         unsigned long long      *crash_size,
1162                                         unsigned long long      *crash_base)
1163 {
1164         char *cur = cmdline, *tmp;
1165
1166         /* for each entry of the comma-separated list */
1167         do {
1168                 unsigned long long start, end = ULLONG_MAX, size;
1169
1170                 /* get the start of the range */
1171                 start = memparse(cur, &tmp);
1172                 if (cur == tmp) {
1173                         pr_warning("crashkernel: Memory value expected\n");
1174                         return -EINVAL;
1175                 }
1176                 cur = tmp;
1177                 if (*cur != '-') {
1178                         pr_warning("crashkernel: '-' expected\n");
1179                         return -EINVAL;
1180                 }
1181                 cur++;
1182
1183                 /* if no ':' is here, than we read the end */
1184                 if (*cur != ':') {
1185                         end = memparse(cur, &tmp);
1186                         if (cur == tmp) {
1187                                 pr_warning("crashkernel: Memory "
1188                                                 "value expected\n");
1189                                 return -EINVAL;
1190                         }
1191                         cur = tmp;
1192                         if (end <= start) {
1193                                 pr_warning("crashkernel: end <= start\n");
1194                                 return -EINVAL;
1195                         }
1196                 }
1197
1198                 if (*cur != ':') {
1199                         pr_warning("crashkernel: ':' expected\n");
1200                         return -EINVAL;
1201                 }
1202                 cur++;
1203
1204                 size = memparse(cur, &tmp);
1205                 if (cur == tmp) {
1206                         pr_warning("Memory value expected\n");
1207                         return -EINVAL;
1208                 }
1209                 cur = tmp;
1210                 if (size >= system_ram) {
1211                         pr_warning("crashkernel: invalid size\n");
1212                         return -EINVAL;
1213                 }
1214
1215                 /* match ? */
1216                 if (system_ram >= start && system_ram < end) {
1217                         *crash_size = size;
1218                         break;
1219                 }
1220         } while (*cur++ == ',');
1221
1222         if (*crash_size > 0) {
1223                 while (*cur != ' ' && *cur != '@')
1224                         cur++;
1225                 if (*cur == '@') {
1226                         cur++;
1227                         *crash_base = memparse(cur, &tmp);
1228                         if (cur == tmp) {
1229                                 pr_warning("Memory value expected "
1230                                                 "after '@'\n");
1231                                 return -EINVAL;
1232                         }
1233                 }
1234         }
1235
1236         return 0;
1237 }
1238
1239 /*
1240  * That function parses "simple" (old) crashkernel command lines like
1241  *
1242  *      crashkernel=size[@offset]
1243  *
1244  * It returns 0 on success and -EINVAL on failure.
1245  */
1246 static int __init parse_crashkernel_simple(char                 *cmdline,
1247                                            unsigned long long   *crash_size,
1248                                            unsigned long long   *crash_base)
1249 {
1250         char *cur = cmdline;
1251
1252         *crash_size = memparse(cmdline, &cur);
1253         if (cmdline == cur) {
1254                 pr_warning("crashkernel: memory value expected\n");
1255                 return -EINVAL;
1256         }
1257
1258         if (*cur == '@')
1259                 *crash_base = memparse(cur+1, &cur);
1260
1261         return 0;
1262 }
1263
1264 /*
1265  * That function is the entry point for command line parsing and should be
1266  * called from the arch-specific code.
1267  */
1268 int __init parse_crashkernel(char                *cmdline,
1269                              unsigned long long system_ram,
1270                              unsigned long long *crash_size,
1271                              unsigned long long *crash_base)
1272 {
1273         char    *p = cmdline, *ck_cmdline = NULL;
1274         char    *first_colon, *first_space;
1275
1276         BUG_ON(!crash_size || !crash_base);
1277         *crash_size = 0;
1278         *crash_base = 0;
1279
1280         /* find crashkernel and use the last one if there are more */
1281         p = strstr(p, "crashkernel=");
1282         while (p) {
1283                 ck_cmdline = p;
1284                 p = strstr(p+1, "crashkernel=");
1285         }
1286
1287         if (!ck_cmdline)
1288                 return -EINVAL;
1289
1290         ck_cmdline += 12; /* strlen("crashkernel=") */
1291
1292         /*
1293          * if the commandline contains a ':', then that's the extended
1294          * syntax -- if not, it must be the classic syntax
1295          */
1296         first_colon = strchr(ck_cmdline, ':');
1297         first_space = strchr(ck_cmdline, ' ');
1298         if (first_colon && (!first_space || first_colon < first_space))
1299                 return parse_crashkernel_mem(ck_cmdline, system_ram,
1300                                 crash_size, crash_base);
1301         else
1302                 return parse_crashkernel_simple(ck_cmdline, crash_size,
1303                                 crash_base);
1304
1305         return 0;
1306 }
1307
1308
1309
1310 void crash_save_vmcoreinfo(void)
1311 {
1312         u32 *buf;
1313
1314         if (!vmcoreinfo_size)
1315                 return;
1316
1317         vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1318
1319         buf = (u32 *)vmcoreinfo_note;
1320
1321         buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1322                               vmcoreinfo_size);
1323
1324         final_note(buf);
1325 }
1326
1327 void vmcoreinfo_append_str(const char *fmt, ...)
1328 {
1329         va_list args;
1330         char buf[0x50];
1331         int r;
1332
1333         va_start(args, fmt);
1334         r = vsnprintf(buf, sizeof(buf), fmt, args);
1335         va_end(args);
1336
1337         if (r + vmcoreinfo_size > vmcoreinfo_max_size)
1338                 r = vmcoreinfo_max_size - vmcoreinfo_size;
1339
1340         memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1341
1342         vmcoreinfo_size += r;
1343 }
1344
/*
 * Weak default of the architecture hook that is called at the end of
 * crash_save_vmcoreinfo_init().  It does nothing; architecture code
 * may override it.
 */
void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
{
        /* intentionally empty */
}
1351
1352 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1353 {
1354         return __pa((unsigned long)(char *)&vmcoreinfo_note);
1355 }
1356
     /*
      * Record the kernel facts exported as vmcoreinfo: OS release, page
      * size, and a fixed list of symbols, structure sizes, member offsets,
      * array lengths and enum values.  The architecture hook
      * arch_crash_save_vmcoreinfo() runs last.  Always returns 0.
      *
      * NOTE(review): the VMCOREINFO_* macros are defined outside this
      * file; presumably each one appends a line of text through
      * vmcoreinfo_append_str(), in which case the order of the statements
      * below is the order of the resulting text - confirm before
      * reordering anything.
      */
1357 static int __init crash_save_vmcoreinfo_init(void)
1358 {
1359         VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1360         VMCOREINFO_PAGESIZE(PAGE_SIZE);
1361
             /* symbols */
1362         VMCOREINFO_SYMBOL(init_uts_ns);
1363         VMCOREINFO_SYMBOL(node_online_map);
1364         VMCOREINFO_SYMBOL(swapper_pg_dir);
1365         VMCOREINFO_SYMBOL(_stext);
1366
             /* only recorded when CONFIG_NEED_MULTIPLE_NODES is not set */
1367 #ifndef CONFIG_NEED_MULTIPLE_NODES
1368         VMCOREINFO_SYMBOL(mem_map);
1369         VMCOREINFO_SYMBOL(contig_page_data);
1370 #endif
             /* only recorded with CONFIG_SPARSEMEM */
1371 #ifdef CONFIG_SPARSEMEM
1372         VMCOREINFO_SYMBOL(mem_section);
1373         VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1374         VMCOREINFO_STRUCT_SIZE(mem_section);
1375         VMCOREINFO_OFFSET(mem_section, section_mem_map);
1376 #endif
             /* structure and type sizes */
1377         VMCOREINFO_STRUCT_SIZE(page);
1378         VMCOREINFO_STRUCT_SIZE(pglist_data);
1379         VMCOREINFO_STRUCT_SIZE(zone);
1380         VMCOREINFO_STRUCT_SIZE(free_area);
1381         VMCOREINFO_STRUCT_SIZE(list_head);
1382         VMCOREINFO_SIZE(nodemask_t);
             /* member offsets */
1383         VMCOREINFO_OFFSET(page, flags);
1384         VMCOREINFO_OFFSET(page, _count);
1385         VMCOREINFO_OFFSET(page, mapping);
1386         VMCOREINFO_OFFSET(page, lru);
1387         VMCOREINFO_OFFSET(pglist_data, node_zones);
1388         VMCOREINFO_OFFSET(pglist_data, nr_zones);
1389 #ifdef CONFIG_FLAT_NODE_MEM_MAP
1390         VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1391 #endif
1392         VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1393         VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1394         VMCOREINFO_OFFSET(pglist_data, node_id);
1395         VMCOREINFO_OFFSET(zone, free_area);
1396         VMCOREINFO_OFFSET(zone, vm_stat);
1397         VMCOREINFO_OFFSET(zone, spanned_pages);
1398         VMCOREINFO_OFFSET(free_area, free_list);
1399         VMCOREINFO_OFFSET(list_head, next);
1400         VMCOREINFO_OFFSET(list_head, prev);
             /* array lengths and enum values */
1401         VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1402         VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1403         VMCOREINFO_NUMBER(NR_FREE_PAGES);
1404         VMCOREINFO_NUMBER(PG_lru);
1405         VMCOREINFO_NUMBER(PG_private);
1406         VMCOREINFO_NUMBER(PG_swapcache);
1407
             /* let the architecture add its own entries */
1408         arch_crash_save_vmcoreinfo();
1409
1410         return 0;
1411 }
1412
1413 module_init(crash_save_vmcoreinfo_init)