mm/vmalloc.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  linux/mm/vmalloc.c
   4  *
   5  *  Copyright (C) 1993  Linus Torvalds
   6  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   7  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
   8  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
   9  *  Numa awareness, Christoph Lameter, SGI, June 2005
  10  */
  11
  12 #include <linux/vmalloc.h>
  13 #include <linux/mm.h>
  14 #include <linux/module.h>
  15 #include <linux/highmem.h>
  16 #include <linux/sched/signal.h>
  17 #include <linux/slab.h>
  18 #include <linux/spinlock.h>
  19 #include <linux/interrupt.h>
  20 #include <linux/proc_fs.h>
  21 #include <linux/seq_file.h>
  22 #include <linux/set_memory.h>
  23 #include <linux/debugobjects.h>
  24 #include <linux/kallsyms.h>
  25 #include <linux/list.h>
  26 #include <linux/notifier.h>
  27 #include <linux/rbtree.h>
  28 #include <linux/xarray.h>
  29 #include <linux/rcupdate.h>
  30 #include <linux/pfn.h>
  31 #include <linux/kmemleak.h>
  32 #include <linux/atomic.h>
  33 #include <linux/compiler.h>
  34 #include <linux/llist.h>
  35 #include <linux/bitops.h>
  36 #include <linux/rbtree_augmented.h>
  37 #include <linux/overflow.h>
  38
  39 #include <linux/uaccess.h>
  40 #include <asm/tlbflush.h>
  41 #include <asm/shmparam.h>
  42
  43 #include "internal.h"
  44 #include "pgalloc-track.h"
  45
  46 bool is_vmalloc_addr(const void *x)
  47 {
  48         unsigned long addr = (unsigned long)x;
  49
  50         return addr >= VMALLOC_START && addr < VMALLOC_END;
  51 }
  52 EXPORT_SYMBOL(is_vmalloc_addr);
  53
  54 struct vfree_deferred {
  55         struct llist_head list;
  56         struct work_struct wq;
  57 };
  58 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
  59
  60 static void __vunmap(const void *, int);
  61
  62 static void free_work(struct work_struct *w)
  63 {
  64         struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
  65         struct llist_node *t, *llnode;
  66
  67         llist_for_each_safe(llnode, t, llist_del_all(&p->list))
  68                 __vunmap((void *)llnode, 1);
  69 }
  70
  71 /*** Page table manipulation functions ***/
  72
  73 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
  74                              pgtbl_mod_mask *mask)
  75 {
  76         pte_t *pte;
  77
  78         pte = pte_offset_kernel(pmd, addr);
  79         do {
  80                 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
  81                 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
  82         } while (pte++, addr += PAGE_SIZE, addr != end);
  83         *mask |= PGTBL_PTE_MODIFIED;
  84 }
  85
  86 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  87                              pgtbl_mod_mask *mask)
  88 {
  89         pmd_t *pmd;
  90         unsigned long next;
  91         int cleared;
  92
  93         pmd = pmd_offset(pud, addr);
  94         do {
  95                 next = pmd_addr_end(addr, end);
  96
  97                 cleared = pmd_clear_huge(pmd);
  98                 if (cleared || pmd_bad(*pmd))
  99                         *mask |= PGTBL_PMD_MODIFIED;
 100
 101                 if (cleared)
 102                         continue;
 103                 if (pmd_none_or_clear_bad(pmd))
 104                         continue;
 105                 vunmap_pte_range(pmd, addr, next, mask);
 106         } while (pmd++, addr = next, addr != end);
 107 }
 108
 109 static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 110                              pgtbl_mod_mask *mask)
 111 {
 112         pud_t *pud;
 113         unsigned long next;
 114         int cleared;
 115
 116         pud = pud_offset(p4d, addr);
 117         do {
 118                 next = pud_addr_end(addr, end);
 119
 120                 cleared = pud_clear_huge(pud);
 121                 if (cleared || pud_bad(*pud))
 122                         *mask |= PGTBL_PUD_MODIFIED;
 123
 124                 if (cleared)
 125                         continue;
 126                 if (pud_none_or_clear_bad(pud))
 127                         continue;
 128                 vunmap_pmd_range(pud, addr, next, mask);
 129         } while (pud++, addr = next, addr != end);
 130 }
 131
 132 static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 133                              pgtbl_mod_mask *mask)
 134 {
 135         p4d_t *p4d;
 136         unsigned long next;
 137         int cleared;
 138
 139         p4d = p4d_offset(pgd, addr);
 140         do {
 141                 next = p4d_addr_end(addr, end);
 142
 143                 cleared = p4d_clear_huge(p4d);
 144                 if (cleared || p4d_bad(*p4d))
 145                         *mask |= PGTBL_P4D_MODIFIED;
 146
 147                 if (cleared)
 148                         continue;
 149                 if (p4d_none_or_clear_bad(p4d))
 150                         continue;
 151                 vunmap_pud_range(p4d, addr, next, mask);
 152         } while (p4d++, addr = next, addr != end);
 153 }
 154
 155 /**
 156  * unmap_kernel_range_noflush - unmap kernel VM area
 157  * @start: start of the VM area to unmap
 158  * @size: size of the VM area to unmap
 159  *
 160  * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify
 161  * should have been allocated using get_vm_area() and its friends.
 162  *
 163  * NOTE:
 164  * This function does NOT do any cache flushing.  The caller is responsible
 165  * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
 166  * function and flush_tlb_kernel_range() after.
 167  */
 168 void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
 169 {
 170         unsigned long end = start + size;
 171         unsigned long next;
 172         pgd_t *pgd;
 173         unsigned long addr = start;
 174         pgtbl_mod_mask mask = 0;
 175
 176         BUG_ON(addr >= end);
 177         start = addr;
 178         pgd = pgd_offset_k(addr);
 179         do {
 180                 next = pgd_addr_end(addr, end);
 181                 if (pgd_bad(*pgd))
 182                         mask |= PGTBL_PGD_MODIFIED;
 183                 if (pgd_none_or_clear_bad(pgd))
 184                         continue;
 185                 vunmap_p4d_range(pgd, addr, next, &mask);
 186         } while (pgd++, addr = next, addr != end);
 187
 188         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
 189                 arch_sync_kernel_mappings(start, end);
 190 }
 191
 192 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 193                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 194                 pgtbl_mod_mask *mask)
 195 {
 196         pte_t *pte;
 197
 198         /*
 199          * nr is a running index into the array which helps higher level
 200          * callers keep track of where we're up to.
 201          */
 202
 203         pte = pte_alloc_kernel_track(pmd, addr, mask);
 204         if (!pte)
 205                 return -ENOMEM;
 206         do {
 207                 struct page *page = pages[*nr];
 208
 209                 if (WARN_ON(!pte_none(*pte)))
 210                         return -EBUSY;
 211                 if (WARN_ON(!page))
 212                         return -ENOMEM;
 213                 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
 214                 (*nr)++;
 215         } while (pte++, addr += PAGE_SIZE, addr != end);
 216         *mask |= PGTBL_PTE_MODIFIED;
 217         return 0;
 218 }
 219
 220 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 221                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 222                 pgtbl_mod_mask *mask)
 223 {
 224         pmd_t *pmd;
 225         unsigned long next;
 226
 227         pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
 228         if (!pmd)
 229                 return -ENOMEM;
 230         do {
 231                 next = pmd_addr_end(addr, end);
 232                 if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
 233                         return -ENOMEM;
 234         } while (pmd++, addr = next, addr != end);
 235         return 0;
 236 }
 237
 238 static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
 239                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 240                 pgtbl_mod_mask *mask)
 241 {
 242         pud_t *pud;
 243         unsigned long next;
 244
 245         pud = pud_alloc_track(&init_mm, p4d, addr, mask);
 246         if (!pud)
 247                 return -ENOMEM;
 248         do {
 249                 next = pud_addr_end(addr, end);
 250                 if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
 251                         return -ENOMEM;
 252         } while (pud++, addr = next, addr != end);
 253         return 0;
 254 }
 255
 256 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
 257                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 258                 pgtbl_mod_mask *mask)
 259 {
 260         p4d_t *p4d;
 261         unsigned long next;
 262
 263         p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
 264         if (!p4d)
 265                 return -ENOMEM;
 266         do {
 267                 next = p4d_addr_end(addr, end);
 268                 if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
 269                         return -ENOMEM;
 270         } while (p4d++, addr = next, addr != end);
 271         return 0;
 272 }
 273
 274 /**
 275  * map_kernel_range_noflush - map kernel VM area with the specified pages
 276  * @addr: start of the VM area to map
 277  * @size: size of the VM area to map
 278  * @prot: page protection flags to use
 279  * @pages: pages to map
 280  *
 281  * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
 282  * have been allocated using get_vm_area() and its friends.
 283  *
 284  * NOTE:
 285  * This function does NOT do any cache flushing.  The caller is responsible for
 286  * calling flush_cache_vmap() on to-be-mapped areas before calling this
 287  * function.
 288  *
 289  * RETURNS:
 290  * 0 on success, -errno on failure.
 291  */
 292 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 293                              pgprot_t prot, struct page **pages)
 294 {
 295         unsigned long start = addr;
 296         unsigned long end = addr + size;
 297         unsigned long next;
 298         pgd_t *pgd;
 299         int err = 0;
 300         int nr = 0;
 301         pgtbl_mod_mask mask = 0;
 302
 303         BUG_ON(addr >= end);
 304         pgd = pgd_offset_k(addr);
 305         do {
 306                 next = pgd_addr_end(addr, end);
 307                 if (pgd_bad(*pgd))
 308                         mask |= PGTBL_PGD_MODIFIED;
 309                 err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
 310                 if (err)
 311                         return err;
 312         } while (pgd++, addr = next, addr != end);
 313
 314         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
 315                 arch_sync_kernel_mappings(start, end);
 316
 317         return 0;
 318 }
 319
 320 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
 321                 struct page **pages)
 322 {
 323         int ret;
 324
 325         ret = map_kernel_range_noflush(start, size, prot, pages);
 326         flush_cache_vmap(start, start + size);
 327         return ret;
 328 }
 329
 330 int is_vmalloc_or_module_addr(const void *x)
 331 {
 332         /*
 333          * ARM, x86-64 and sparc64 put modules in a special place,
 334          * and fall back on vmalloc() if that fails. Others
 335          * just put it in the vmalloc space.
 336          */
 337 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
 338         unsigned long addr = (unsigned long)x;
 339         if (addr >= MODULES_VADDR && addr < MODULES_END)
 340                 return 1;
 341 #endif
 342         return is_vmalloc_addr(x);
 343 }
 344
 345 /*
 346  * Walk a vmap address to the struct page it maps.
 347  */
 348 struct page *vmalloc_to_page(const void *vmalloc_addr)
 349 {
 350         unsigned long addr = (unsigned long) vmalloc_addr;
 351         struct page *page = NULL;
 352         pgd_t *pgd = pgd_offset_k(addr);
 353         p4d_t *p4d;
 354         pud_t *pud;
 355         pmd_t *pmd;
 356         pte_t *ptep, pte;
 357
 358         /*
 359          * XXX we might need to change this if we add VIRTUAL_BUG_ON for
 360          * architectures that do not vmalloc module space
 361          */
 362         VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
 363
 364         if (pgd_none(*pgd))
 365                 return NULL;
 366         p4d = p4d_offset(pgd, addr);
 367         if (p4d_none(*p4d))
 368                 return NULL;
 369         pud = pud_offset(p4d, addr);
 370
 371         /*
 372          * Don't dereference bad PUD or PMD (below) entries. This will also
 373          * identify huge mappings, which we may encounter on architectures
 374          * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
 375          * identified as vmalloc addresses by is_vmalloc_addr(), but are
 376          * not [unambiguously] associated with a struct page, so there is
 377          * no correct value to return for them.
 378          */
 379         WARN_ON_ONCE(pud_bad(*pud));
 380         if (pud_none(*pud) || pud_bad(*pud))
 381                 return NULL;
 382         pmd = pmd_offset(pud, addr);
 383         WARN_ON_ONCE(pmd_bad(*pmd));
 384         if (pmd_none(*pmd) || pmd_bad(*pmd))
 385                 return NULL;
 386
 387         ptep = pte_offset_map(pmd, addr);
 388         pte = *ptep;
 389         if (pte_present(pte))
 390                 page = pte_page(pte);
 391         pte_unmap(ptep);
 392         return page;
 393 }
 394 EXPORT_SYMBOL(vmalloc_to_page);
 395
 396 /*
 397  * Map a vmalloc()-space virtual address to the physical page frame number.
 398  */
 399 unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 400 {
 401         return page_to_pfn(vmalloc_to_page(vmalloc_addr));
 402 }
 403 EXPORT_SYMBOL(vmalloc_to_pfn);
 404
 405
 406 /*** Global kva allocator ***/
 407
 408 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
 409 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
 410
 411
 412 static DEFINE_SPINLOCK(vmap_area_lock);
 413 static DEFINE_SPINLOCK(free_vmap_area_lock);
 414 /* Export for kexec only */
 415 LIST_HEAD(vmap_area_list);
 416 static LLIST_HEAD(vmap_purge_list);
 417 static struct rb_root vmap_area_root = RB_ROOT;
 418 static bool vmap_initialized __read_mostly;
 419
 420 /*
 421  * This kmem_cache is used for vmap_area objects. Instead of
 422  * allocating from slab we reuse an object from this cache to
 423  * make things faster. Especially in "no edge" splitting of
 424  * free block.
 425  */
 426 static struct kmem_cache *vmap_area_cachep;
 427
 428 /*
 429  * This linked list is used in pair with free_vmap_area_root.
 430  * It gives O(1) access to prev/next to perform fast coalescing.
 431  */
 432 static LIST_HEAD(free_vmap_area_list);
 433
 434 /*
 435  * This augment red-black tree represents the free vmap space.
 436  * All vmap_area objects in this tree are sorted by va->va_start
 437  * address. It is used for allocation and merging when a vmap
 438  * object is released.
 439  *
 440  * Each vmap_area node contains a maximum available free block
 441  * of its sub-tree, right or left. Therefore it is possible to
 442  * find a lowest match of free area.
 443  */
 444 static struct rb_root free_vmap_area_root = RB_ROOT;
 445
 446 /*
 447  * Preload a CPU with one object for "no edge" split case. The
 448  * aim is to get rid of allocations from the atomic context, thus
 449  * to use more permissive allocation masks.
 450  */
 451 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 452
 453 static __always_inline unsigned long
 454 va_size(struct vmap_area *va)
 455 {
 456         return (va->va_end - va->va_start);
 457 }
 458
 459 static __always_inline unsigned long
 460 get_subtree_max_size(struct rb_node *node)
 461 {
 462         struct vmap_area *va;
 463
 464         va = rb_entry_safe(node, struct vmap_area, rb_node);
 465         return va ? va->subtree_max_size : 0;
 466 }
 467
 468 /*
 469  * Gets called when remove the node and rotate.
 470  */
 471 static __always_inline unsigned long
 472 compute_subtree_max_size(struct vmap_area *va)
 473 {
 474         return max3(va_size(va),
 475                 get_subtree_max_size(va->rb_node.rb_left),
 476                 get_subtree_max_size(va->rb_node.rb_right));
 477 }
 478
 479 RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
 480         struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
 481
 482 static void purge_vmap_area_lazy(void);
 483 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 484 static unsigned long lazy_max_pages(void);
 485
 486 static atomic_long_t nr_vmalloc_pages;
 487
 488 unsigned long vmalloc_nr_pages(void)
 489 {
 490         return atomic_long_read(&nr_vmalloc_pages);
 491 }
 492
 493 static struct vmap_area *__find_vmap_area(unsigned long addr)
 494 {
 495         struct rb_node *n = vmap_area_root.rb_node;
 496
 497         while (n) {
 498                 struct vmap_area *va;
 499
 500                 va = rb_entry(n, struct vmap_area, rb_node);
 501                 if (addr < va->va_start)
 502                         n = n->rb_left;
 503                 else if (addr >= va->va_end)
 504                         n = n->rb_right;
 505                 else
 506                         return va;
 507         }
 508
 509         return NULL;
 510 }
 511
 512 /*
 513  * This function returns back addresses of parent node
 514  * and its left or right link for further processing.
 515  */
 516 static __always_inline struct rb_node **
 517 find_va_links(struct vmap_area *va,
 518         struct rb_root *root, struct rb_node *from,
 519         struct rb_node **parent)
 520 {
 521         struct vmap_area *tmp_va;
 522         struct rb_node **link;
 523
 524         if (root) {
 525                 link = &root->rb_node;
 526                 if (unlikely(!*link)) {
 527                         *parent = NULL;
 528                         return link;
 529                 }
 530         } else {
 531                 link = &from;
 532         }
 533
 534         /*
 535          * Go to the bottom of the tree. When we hit the last point
 536          * we end up with parent rb_node and correct direction, i name
 537          * it link, where the new va->rb_node will be attached to.
 538          */
 539         do {
 540                 tmp_va = rb_entry(*link, struct vmap_area, rb_node);
 541
 542                 /*
 543                  * During the traversal we also do some sanity check.
 544                  * Trigger the BUG() if there are sides(left/right)
 545                  * or full overlaps.
 546                  */
 547                 if (va->va_start < tmp_va->va_end &&
 548                                 va->va_end <= tmp_va->va_start)
 549                         link = &(*link)->rb_left;
 550                 else if (va->va_end > tmp_va->va_start &&
 551                                 va->va_start >= tmp_va->va_end)
 552                         link = &(*link)->rb_right;
 553                 else
 554                         BUG();
 555         } while (*link);
 556
 557         *parent = &tmp_va->rb_node;
 558         return link;
 559 }
 560
 561 static __always_inline struct list_head *
 562 get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
 563 {
 564         struct list_head *list;
 565
 566         if (unlikely(!parent))
 567                 /*
 568                  * The red-black tree where we try to find VA neighbors
 569                  * before merging or inserting is empty, i.e. it means
 570                  * there is no free vmap space. Normally it does not
 571                  * happen but we handle this case anyway.
 572                  */
 573                 return NULL;
 574
 575         list = &rb_entry(parent, struct vmap_area, rb_node)->list;
 576         return (&parent->rb_right == link ? list->next : list);
 577 }
 578
 579 static __always_inline void
 580 link_va(struct vmap_area *va, struct rb_root *root,
 581         struct rb_node *parent, struct rb_node **link, struct list_head *head)
 582 {
 583         /*
 584          * VA is still not in the list, but we can
 585          * identify its future previous list_head node.
 586          */
 587         if (likely(parent)) {
 588                 head = &rb_entry(parent, struct vmap_area, rb_node)->list;
 589                 if (&parent->rb_right != link)
 590                         head = head->prev;
 591         }
 592
 593         /* Insert to the rb-tree */
 594         rb_link_node(&va->rb_node, parent, link);
 595         if (root == &free_vmap_area_root) {
 596                 /*
 597                  * Some explanation here. Just perform simple insertion
 598                  * to the tree. We do not set va->subtree_max_size to
 599                  * its current size before calling rb_insert_augmented().
 600                  * It is because of we populate the tree from the bottom
 601                  * to parent levels when the node _is_ in the tree.
 602                  *
 603                  * Therefore we set subtree_max_size to zero after insertion,
 604                  * to let __augment_tree_propagate_from() puts everything to
 605                  * the correct order later on.
 606                  */
 607                 rb_insert_augmented(&va->rb_node,
 608                         root, &free_vmap_area_rb_augment_cb);
 609                 va->subtree_max_size = 0;
 610         } else {
 611                 rb_insert_color(&va->rb_node, root);
 612         }
 613
 614         /* Address-sort this list */
 615         list_add(&va->list, head);
 616 }
 617
 618 static __always_inline void
 619 unlink_va(struct vmap_area *va, struct rb_root *root)
 620 {
 621         if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 622                 return;
 623
 624         if (root == &free_vmap_area_root)
 625                 rb_erase_augmented(&va->rb_node,
 626                         root, &free_vmap_area_rb_augment_cb);
 627         else
 628                 rb_erase(&va->rb_node, root);
 629
 630         list_del(&va->list);
 631         RB_CLEAR_NODE(&va->rb_node);
 632 }
 633
 634 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 635 static void
 636 augment_tree_propagate_check(void)
 637 {
 638         struct vmap_area *va;
 639         unsigned long computed_size;
 640
 641         list_for_each_entry(va, &free_vmap_area_list, list) {
 642                 computed_size = compute_subtree_max_size(va);
 643                 if (computed_size != va->subtree_max_size)
 644                         pr_emerg("tree is corrupted: %lu, %lu\n",
 645                                 va_size(va), va->subtree_max_size);
 646         }
 647 }
 648 #endif
 649
 650 /*
 651  * This function populates subtree_max_size from bottom to upper
 652  * levels starting from VA point. The propagation must be done
 653  * when VA size is modified by changing its va_start/va_end. Or
 654  * in case of newly inserting of VA to the tree.
 655  *
 656  * It means that __augment_tree_propagate_from() must be called:
 657  * - After VA has been inserted to the tree(free path);
 658  * - After VA has been shrunk(allocation path);
 659  * - After VA has been increased(merging path).
 660  *
 661  * Please note that, it does not mean that upper parent nodes
 662  * and their subtree_max_size are recalculated all the time up
 663  * to the root node.
 664  *
 665  *       4--8
 666  *        /\
 667  *       /  \
 668  *      /    \
 669  *    2--2  8--8
 670  *
 671  * For example if we modify the node 4, shrinking it to 2, then
 672  * no any modification is required. If we shrink the node 2 to 1
 673  * its subtree_max_size is updated only, and set to 1. If we shrink
 674  * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 675  * node becomes 4--6.
 676  */
 677 static __always_inline void
 678 augment_tree_propagate_from(struct vmap_area *va)
 679 {
 680         /*
 681          * Populate the tree from bottom towards the root until
 682          * the calculated maximum available size of checked node
 683          * is equal to its current one.
 684          */
 685         free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
 686
 687 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 688         augment_tree_propagate_check();
 689 #endif
 690 }
 691
 692 static void
 693 insert_vmap_area(struct vmap_area *va,
 694         struct rb_root *root, struct list_head *head)
 695 {
 696         struct rb_node **link;
 697         struct rb_node *parent;
 698
 699         link = find_va_links(va, root, NULL, &parent);
 700         link_va(va, root, parent, link, head);
 701 }
 702
 703 static void
 704 insert_vmap_area_augment(struct vmap_area *va,
 705         struct rb_node *from, struct rb_root *root,
 706         struct list_head *head)
 707 {
 708         struct rb_node **link;
 709         struct rb_node *parent;
 710
 711         if (from)
 712                 link = find_va_links(va, NULL, from, &parent);
 713         else
 714                 link = find_va_links(va, root, NULL, &parent);
 715
 716         link_va(va, root, parent, link, head);
 717         augment_tree_propagate_from(va);
 718 }
 719
 720 /*
 721  * Merge de-allocated chunk of VA memory with previous
 722  * and next free blocks. If coalesce is not done a new
 723  * free area is inserted. If VA has been merged, it is
 724  * freed.
 725  */
 726 static __always_inline struct vmap_area *
 727 merge_or_add_vmap_area(struct vmap_area *va,
 728         struct rb_root *root, struct list_head *head)
 729 {
 730         struct vmap_area *sibling;
 731         struct list_head *next;
 732         struct rb_node **link;
 733         struct rb_node *parent;
 734         bool merged = false;
 735
 736         /*
 737          * Find a place in the tree where VA potentially will be
 738          * inserted, unless it is merged with its sibling/siblings.
 739          */
 740         link = find_va_links(va, root, NULL, &parent);
 741
 742         /*
 743          * Get next node of VA to check if merging can be done.
 744          */
 745         next = get_va_next_sibling(parent, link);
 746         if (unlikely(next == NULL))
 747                 goto insert;
 748
 749         /*
 750          * start            end
 751          * |                |
 752          * |<------VA------>|<-----Next----->|
 753          *                  |                |
 754          *                  start            end
 755          */
 756         if (next != head) {
 757                 sibling = list_entry(next, struct vmap_area, list);
 758                 if (sibling->va_start == va->va_end) {
 759                         sibling->va_start = va->va_start;
 760
 761                         /* Free vmap_area object. */
 762                         kmem_cache_free(vmap_area_cachep, va);
 763
 764                         /* Point to the new merged area. */
 765                         va = sibling;
 766                         merged = true;
 767                 }
 768         }
 769
 770         /*
 771          * start            end
 772          * |                |
 773          * |<-----Prev----->|<------VA------>|
 774          *                  |                |
 775          *                  start            end
 776          */
 777         if (next->prev != head) {
 778                 sibling = list_entry(next->prev, struct vmap_area, list);
 779                 if (sibling->va_end == va->va_start) {
 780                         /*
 781                          * If both neighbors are coalesced, it is important
 782                          * to unlink the "next" node first, followed by merging
 783                          * with "previous" one. Otherwise the tree might not be
 784                          * fully populated if a sibling's augmented value is
 785                          * "normalized" because of rotation operations.
 786                          */
 787                         if (merged)
 788                                 unlink_va(va, root);
 789
 790                         sibling->va_end = va->va_end;
 791
 792                         /* Free vmap_area object. */
 793                         kmem_cache_free(vmap_area_cachep, va);
 794
 795                         /* Point to the new merged area. */
 796                         va = sibling;
 797                         merged = true;
 798                 }
 799         }
 800
 801 insert:
 802         if (!merged)
 803                 link_va(va, root, parent, link, head);
 804
 805         /*
 806          * Last step is to check and update the tree.
 807          */
 808         augment_tree_propagate_from(va);
 809         return va;
 810 }
 811
 812 static __always_inline bool
 813 is_within_this_va(struct vmap_area *va, unsigned long size,
 814         unsigned long align, unsigned long vstart)
 815 {
 816         unsigned long nva_start_addr;
 817
 818         if (va->va_start > vstart)
 819                 nva_start_addr = ALIGN(va->va_start, align);
 820         else
 821                 nva_start_addr = ALIGN(vstart, align);
 822
 823         /* Can be overflowed due to big size or alignment. */
 824         if (nva_start_addr + size < nva_start_addr ||
 825                         nva_start_addr < vstart)
 826                 return false;
 827
 828         return (nva_start_addr + size <= va->va_end);
 829 }
 830
 831 /*
 832  * Find the first free block(lowest start address) in the tree,
 833  * that will accomplish the request corresponding to passing
 834  * parameters.
 835  */
 836 static __always_inline struct vmap_area *
 837 find_vmap_lowest_match(unsigned long size,
 838         unsigned long align, unsigned long vstart)
 839 {
 840         struct vmap_area *va;
 841         struct rb_node *node;
 842         unsigned long length;
 843
 844         /* Start from the root. */
 845         node = free_vmap_area_root.rb_node;
 846
 847         /* Adjust the search size for alignment overhead. */
 848         length = size + align - 1;
 849
 850         while (node) {
 851                 va = rb_entry(node, struct vmap_area, rb_node);
 852
 853                 if (get_subtree_max_size(node->rb_left) >= length &&
 854                                 vstart < va->va_start) {
 855                         node = node->rb_left;
 856                 } else {
 857                         if (is_within_this_va(va, size, align, vstart))
 858                                 return va;
 859
 860                         /*
 861                          * Does not make sense to go deeper towards the right
 862                          * sub-tree if it does not have a free block that is
 863                          * equal or bigger to the requested search length.
 864                          */
 865                         if (get_subtree_max_size(node->rb_right) >= length) {
 866                                 node = node->rb_right;
 867                                 continue;
 868                         }
 869
 870                         /*
 871                          * OK. We roll back and find the first right sub-tree,
 872                          * that will satisfy the search criteria. It can happen
 873                          * only once due to "vstart" restriction.
 874                          */
 875                         while ((node = rb_parent(node))) {
 876                                 va = rb_entry(node, struct vmap_area, rb_node);
 877                                 if (is_within_this_va(va, size, align, vstart))
 878                                         return va;
 879
 880                                 if (get_subtree_max_size(node->rb_right) >= length &&
 881                                                 vstart <= va->va_start) {
 882                                         node = node->rb_right;
 883                                         break;
 884                                 }
 885                         }
 886                 }
 887         }
 888
 889         return NULL;
 890 }
 891
 892 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
 893 #include <linux/random.h>
 894
 895 static struct vmap_area *
 896 find_vmap_lowest_linear_match(unsigned long size,
 897         unsigned long align, unsigned long vstart)
 898 {
 899         struct vmap_area *va;
 900
 901         list_for_each_entry(va, &free_vmap_area_list, list) {
 902                 if (!is_within_this_va(va, size, align, vstart))
 903                         continue;
 904
 905                 return va;
 906         }
 907
 908         return NULL;
 909 }
 910
 911 static void
 912 find_vmap_lowest_match_check(unsigned long size)
 913 {
 914         struct vmap_area *va_1, *va_2;
 915         unsigned long vstart;
 916         unsigned int rnd;
 917
 918         get_random_bytes(&rnd, sizeof(rnd));
 919         vstart = VMALLOC_START + rnd;
 920
 921         va_1 = find_vmap_lowest_match(size, 1, vstart);
 922         va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
 923
 924         if (va_1 != va_2)
 925                 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
 926                         va_1, va_2, vstart);
 927 }
 928 #endif
 929
 930 enum fit_type {
 931         NOTHING_FIT = 0,
 932         FL_FIT_TYPE = 1,        /* full fit */
 933         LE_FIT_TYPE = 2,        /* left edge fit */
 934         RE_FIT_TYPE = 3,        /* right edge fit */
 935         NE_FIT_TYPE = 4         /* no edge fit */
 936 };
 937
 938 static __always_inline enum fit_type
 939 classify_va_fit_type(struct vmap_area *va,
 940         unsigned long nva_start_addr, unsigned long size)
 941 {
 942         enum fit_type type;
 943
 944         /* Check if it is within VA. */
 945         if (nva_start_addr < va->va_start ||
 946                         nva_start_addr + size > va->va_end)
 947                 return NOTHING_FIT;
 948
 949         /* Now classify. */
 950         if (va->va_start == nva_start_addr) {
 951                 if (va->va_end == nva_start_addr + size)
 952                         type = FL_FIT_TYPE;
 953                 else
 954                         type = LE_FIT_TYPE;
 955         } else if (va->va_end == nva_start_addr + size) {
 956                 type = RE_FIT_TYPE;
 957         } else {
 958                 type = NE_FIT_TYPE;
 959         }
 960
 961         return type;
 962 }
 963
 964 static __always_inline int
 965 adjust_va_to_fit_type(struct vmap_area *va,
 966         unsigned long nva_start_addr, unsigned long size,
 967         enum fit_type type)
 968 {
 969         struct vmap_area *lva = NULL;
 970
 971         if (type == FL_FIT_TYPE) {
 972                 /*
 973                  * No need to split VA, it fully fits.
 974                  *
 975                  * |               |
 976                  * V      NVA      V
 977                  * |---------------|
 978                  */
 979                 unlink_va(va, &free_vmap_area_root);
 980                 kmem_cache_free(vmap_area_cachep, va);
 981         } else if (type == LE_FIT_TYPE) {
 982                 /*
 983                  * Split left edge of fit VA.
 984                  *
 985                  * |       |
 986                  * V  NVA  V   R
 987                  * |-------|-------|
 988                  */
 989                 va->va_start += size;
 990         } else if (type == RE_FIT_TYPE) {
 991                 /*
 992                  * Split right edge of fit VA.
 993                  *
 994                  *         |       |
 995                  *     L   V  NVA  V
 996                  * |-------|-------|
 997                  */
 998                 va->va_end = nva_start_addr;
 999         } else if (type == NE_FIT_TYPE) {
1000                 /*
1001                  * Split no edge of fit VA.
1002                  *
1003                  *     |       |
1004                  *   L V  NVA  V R
1005                  * |---|-------|---|
1006                  */
1007                 lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1008                 if (unlikely(!lva)) {
1009                         /*
1010                          * For percpu allocator we do not do any pre-allocation
1011                          * and leave it as it is. The reason is it most likely
1012                          * never ends up with NE_FIT_TYPE splitting. In case of
1013                          * percpu allocations offsets and sizes are aligned to
1014                          * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1015                          * are its main fitting cases.
1016                          *
1017                          * There are a few exceptions though, as an example it is
1018                          * a first allocation (early boot up) when we have "one"
1019                          * big free space that has to be split.
1020                          *
1021                          * Also we can hit this path in case of regular "vmap"
1022                          * allocations, if "this" current CPU was not preloaded.
1023                          * See the comment in alloc_vmap_area() why. If so, then
1024                          * GFP_NOWAIT is used instead to get an extra object for
1025                          * split purpose. That is rare and most time does not
1026                          * occur.
1027                          *
1028                          * What happens if an allocation gets failed. Basically,
1029                          * an "overflow" path is triggered to purge lazily freed
1030                          * areas to free some memory, then, the "retry" path is
1031                          * triggered to repeat one more time. See more details
1032                          * in alloc_vmap_area() function.
1033                          */
1034                         lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1035                         if (!lva)
1036                                 return -1;
1037                 }
1038
1039                 /*
1040                  * Build the remainder.
1041                  */
1042                 lva->va_start = va->va_start;
1043                 lva->va_end = nva_start_addr;
1044
1045                 /*
1046                  * Shrink this VA to remaining size.
1047                  */
1048                 va->va_start = nva_start_addr + size;
1049         } else {
1050                 return -1;
1051         }
1052
1053         if (type != FL_FIT_TYPE) {
1054                 augment_tree_propagate_from(va);
1055
1056                 if (lva)        /* type == NE_FIT_TYPE */
1057                         insert_vmap_area_augment(lva, &va->rb_node,
1058                                 &free_vmap_area_root, &free_vmap_area_list);
1059         }
1060
1061         return 0;
1062 }
1063
1064 /*
1065  * Returns a start address of the newly allocated area, if success.
1066  * Otherwise a vend is returned that indicates failure.
1067  */
1068 static __always_inline unsigned long
1069 __alloc_vmap_area(unsigned long size, unsigned long align,
1070         unsigned long vstart, unsigned long vend)
1071 {
1072         unsigned long nva_start_addr;
1073         struct vmap_area *va;
1074         enum fit_type type;
1075         int ret;
1076
1077         va = find_vmap_lowest_match(size, align, vstart);
1078         if (unlikely(!va))
1079                 return vend;
1080
1081         if (va->va_start > vstart)
1082                 nva_start_addr = ALIGN(va->va_start, align);
1083         else
1084                 nva_start_addr = ALIGN(vstart, align);
1085
1086         /* Check the "vend" restriction. */
1087         if (nva_start_addr + size > vend)
1088                 return vend;
1089
1090         /* Classify what we have found. */
1091         type = classify_va_fit_type(va, nva_start_addr, size);
1092         if (WARN_ON_ONCE(type == NOTHING_FIT))
1093                 return vend;
1094
1095         /* Update the free vmap_area. */
1096         ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
1097         if (ret)
1098                 return vend;
1099
1100 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1101         find_vmap_lowest_match_check(size);
1102 #endif
1103
1104         return nva_start_addr;
1105 }
1106
1107 /*
1108  * Free a region of KVA allocated by alloc_vmap_area
1109  */
1110 static void free_vmap_area(struct vmap_area *va)
1111 {
1112         /*
1113          * Remove from the busy tree/list.
1114          */
1115         spin_lock(&vmap_area_lock);
1116         unlink_va(va, &vmap_area_root);
1117         spin_unlock(&vmap_area_lock);
1118
1119         /*
1120          * Insert/Merge it back to the free tree/list.
1121          */
1122         spin_lock(&free_vmap_area_lock);
1123         merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
1124         spin_unlock(&free_vmap_area_lock);
1125 }
1126
1127 /*
1128  * Allocate a region of KVA of the specified size and alignment, within the
1129  * vstart and vend.
1130  */
1131 static struct vmap_area *alloc_vmap_area(unsigned long size,
1132                                 unsigned long align,
1133                                 unsigned long vstart, unsigned long vend,
1134                                 int node, gfp_t gfp_mask)
1135 {
1136         struct vmap_area *va, *pva;
1137         unsigned long addr;
1138         int purged = 0;
1139         int ret;
1140
1141         BUG_ON(!size);
1142         BUG_ON(offset_in_page(size));
1143         BUG_ON(!is_power_of_2(align));
1144
1145         if (unlikely(!vmap_initialized))
1146                 return ERR_PTR(-EBUSY);
1147
1148         might_sleep();
1149         gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1150
1151         va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1152         if (unlikely(!va))
1153                 return ERR_PTR(-ENOMEM);
1154
1155         /*
1156          * Only scan the relevant parts containing pointers to other objects
1157          * to avoid false negatives.
1158          */
1159         kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
1160
1161 retry:
1162         /*
1163          * Preload this CPU with one extra vmap_area object. It is used
1164          * when fit type of free area is NE_FIT_TYPE. Please note, it
1165          * does not guarantee that an allocation occurs on a CPU that
1166          * is preloaded, instead we minimize the case when it is not.
1167          * It can happen because of cpu migration, because there is a
1168          * race until the below spinlock is taken.
1169          *
1170          * The preload is done in non-atomic context, thus it allows us
1171          * to use more permissive allocation masks to be more stable under
1172          * low memory condition and high memory pressure. In rare case,
1173          * if not preloaded, GFP_NOWAIT is used.
1174          *
1175          * Set "pva" to NULL here, because of "retry" path.
1176          */
1177         pva = NULL;
1178
1179         if (!this_cpu_read(ne_fit_preload_node))
1180                 /*
1181                  * Even if it fails we do not really care about that.
1182                  * Just proceed as it is. If needed "overflow" path
1183                  * will refill the cache we allocate from.
1184                  */
1185                 pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1186
1187         spin_lock(&free_vmap_area_lock);
1188
1189         if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
1190                 kmem_cache_free(vmap_area_cachep, pva);
1191
1192         /*
1193          * If an allocation fails, the "vend" address is
1194          * returned. Therefore trigger the overflow path.
1195          */
1196         addr = __alloc_vmap_area(size, align, vstart, vend);
1197         spin_unlock(&free_vmap_area_lock);
1198
1199         if (unlikely(addr == vend))
1200                 goto overflow;
1201
1202         va->va_start = addr;
1203         va->va_end = addr + size;
1204         va->vm = NULL;
1205
1206
1207         spin_lock(&vmap_area_lock);
1208         insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
1209         spin_unlock(&vmap_area_lock);
1210
1211         BUG_ON(!IS_ALIGNED(va->va_start, align));
1212         BUG_ON(va->va_start < vstart);
1213         BUG_ON(va->va_end > vend);
1214
1215         ret = kasan_populate_vmalloc(addr, size);
1216         if (ret) {
1217                 free_vmap_area(va);
1218                 return ERR_PTR(ret);
1219         }
1220
1221         return va;
1222
1223 overflow:
1224         if (!purged) {
1225                 purge_vmap_area_lazy();
1226                 purged = 1;
1227                 goto retry;
1228         }
1229
1230         if (gfpflags_allow_blocking(gfp_mask)) {
1231                 unsigned long freed = 0;
1232                 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
1233                 if (freed > 0) {
1234                         purged = 0;
1235                         goto retry;
1236                 }
1237         }
1238
1239         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
1240                 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
1241                         size);
1242
1243         kmem_cache_free(vmap_area_cachep, va);
1244         return ERR_PTR(-EBUSY);
1245 }
1246
1247 int register_vmap_purge_notifier(struct notifier_block *nb)
1248 {
1249         return blocking_notifier_chain_register(&vmap_notify_list, nb);
1250 }
1251 EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
1252
1253 int unregister_vmap_purge_notifier(struct notifier_block *nb)
1254 {
1255         return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
1256 }
1257 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
1258
1259 /*
1260  * lazy_max_pages is the maximum amount of virtual address space we gather up
1261  * before attempting to purge with a TLB flush.
1262  *
1263  * There is a tradeoff here: a larger number will cover more kernel page tables
1264  * and take slightly longer to purge, but it will linearly reduce the number of
1265  * global TLB flushes that must be performed. It would seem natural to scale
1266  * this number up linearly with the number of CPUs (because vmapping activity
1267  * could also scale linearly with the number of CPUs), however it is likely
1268  * that in practice, workloads might be constrained in other ways that mean
1269  * vmap activity will not scale linearly with CPUs. Also, I want to be
1270  * conservative and not introduce a big latency on huge systems, so go with
1271  * a less aggressive log scale. It will still be an improvement over the old
1272  * code, and it will be simple to change the scale factor if we find that it
1273  * becomes a problem on bigger systems.
1274  */
1275 static unsigned long lazy_max_pages(void)
1276 {
1277         unsigned int log;
1278
1279         log = fls(num_online_cpus());
1280
1281         return log * (32UL * 1024 * 1024 / PAGE_SIZE);
1282 }
1283
1284 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
1285
1286 /*
1287  * Serialize vmap purging.  There is no actual criticial section protected
1288  * by this look, but we want to avoid concurrent calls for performance
1289  * reasons and to make the pcpu_get_vm_areas more deterministic.
1290  */
1291 static DEFINE_MUTEX(vmap_purge_lock);
1292
1293 /* for per-CPU blocks */
1294 static void purge_fragmented_blocks_allcpus(void);
1295
1296 /*
1297  * called before a call to iounmap() if the caller wants vm_area_struct's
1298  * immediately freed.
1299  */
1300 void set_iounmap_nonlazy(void)
1301 {
1302         atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
1303 }
1304
1305 /*
1306  * Purges all lazily-freed vmap areas.
1307  */
1308 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1309 {
1310         unsigned long resched_threshold;
1311         struct llist_node *valist;
1312         struct vmap_area *va;
1313         struct vmap_area *n_va;
1314
1315         lockdep_assert_held(&vmap_purge_lock);
1316
1317         valist = llist_del_all(&vmap_purge_list);
1318         if (unlikely(valist == NULL))
1319                 return false;
1320
1321         /*
1322          * TODO: to calculate a flush range without looping.
1323          * The list can be up to lazy_max_pages() elements.
1324          */
1325         llist_for_each_entry(va, valist, purge_list) {
1326                 if (va->va_start < start)
1327                         start = va->va_start;
1328                 if (va->va_end > end)
1329                         end = va->va_end;
1330         }
1331
1332         flush_tlb_kernel_range(start, end);
1333         resched_threshold = lazy_max_pages() << 1;
1334
1335         spin_lock(&free_vmap_area_lock);
1336         llist_for_each_entry_safe(va, n_va, valist, purge_list) {
1337                 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1338                 unsigned long orig_start = va->va_start;
1339                 unsigned long orig_end = va->va_end;
1340
1341                 /*
1342                  * Finally insert or merge lazily-freed area. It is
1343                  * detached and there is no need to "unlink" it from
1344                  * anything.
1345                  */
1346                 va = merge_or_add_vmap_area(va, &free_vmap_area_root,
1347                                             &free_vmap_area_list);
1348
1349                 if (is_vmalloc_or_module_addr((void *)orig_start))
1350                         kasan_release_vmalloc(orig_start, orig_end,
1351                                               va->va_start, va->va_end);
1352
1353                 atomic_long_sub(nr, &vmap_lazy_nr);
1354
1355                 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1356                         cond_resched_lock(&free_vmap_area_lock);
1357         }
1358         spin_unlock(&free_vmap_area_lock);
1359         return true;
1360 }
1361
1362 /*
1363  * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
1364  * is already purging.
1365  */
1366 static void try_purge_vmap_area_lazy(void)
1367 {
1368         if (mutex_trylock(&vmap_purge_lock)) {
1369                 __purge_vmap_area_lazy(ULONG_MAX, 0);
1370                 mutex_unlock(&vmap_purge_lock);
1371         }
1372 }
1373
1374 /*
1375  * Kick off a purge of the outstanding lazy areas.
1376  */
1377 static void purge_vmap_area_lazy(void)
1378 {
1379         mutex_lock(&vmap_purge_lock);
1380         purge_fragmented_blocks_allcpus();
1381         __purge_vmap_area_lazy(ULONG_MAX, 0);
1382         mutex_unlock(&vmap_purge_lock);
1383 }
1384
1385 /*
1386  * Free a vmap area, caller ensuring that the area has been unmapped
1387  * and flush_cache_vunmap had been called for the correct range
1388  * previously.
1389  */
1390 static void free_vmap_area_noflush(struct vmap_area *va)
1391 {
1392         unsigned long nr_lazy;
1393
1394         spin_lock(&vmap_area_lock);
1395         unlink_va(va, &vmap_area_root);
1396         spin_unlock(&vmap_area_lock);
1397
1398         nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1399                                 PAGE_SHIFT, &vmap_lazy_nr);
1400
1401         /* After this point, we may free va at any time */
1402         llist_add(&va->purge_list, &vmap_purge_list);
1403
1404         if (unlikely(nr_lazy > lazy_max_pages()))
1405                 try_purge_vmap_area_lazy();
1406 }
1407
1408 /*
1409  * Free and unmap a vmap area
1410  */
1411 static void free_unmap_vmap_area(struct vmap_area *va)
1412 {
1413         flush_cache_vunmap(va->va_start, va->va_end);
1414         unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
1415         if (debug_pagealloc_enabled_static())
1416                 flush_tlb_kernel_range(va->va_start, va->va_end);
1417
1418         free_vmap_area_noflush(va);
1419 }
1420
1421 static struct vmap_area *find_vmap_area(unsigned long addr)
1422 {
1423         struct vmap_area *va;
1424
1425         spin_lock(&vmap_area_lock);
1426         va = __find_vmap_area(addr);
1427         spin_unlock(&vmap_area_lock);
1428
1429         return va;
1430 }
1431
1432 /*** Per cpu kva allocator ***/
1433
1434 /*
1435  * vmap space is limited especially on 32 bit architectures. Ensure there is
1436  * room for at least 16 percpu vmap blocks per CPU.
1437  */
1438 /*
1439  * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
1440  * to #define VMALLOC_SPACE             (VMALLOC_END-VMALLOC_START). Guess
1441  * instead (we just need a rough idea)
1442  */
1443 #if BITS_PER_LONG == 32
1444 #define VMALLOC_SPACE           (128UL*1024*1024)
1445 #else
1446 #define VMALLOC_SPACE           (128UL*1024*1024*1024)
1447 #endif
1448
1449 #define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
1450 #define VMAP_MAX_ALLOC          BITS_PER_LONG   /* 256K with 4K pages */
1451 #define VMAP_BBMAP_BITS_MAX     1024    /* 4MB with 4K pages */
1452 #define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
1453 #define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
1454 #define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
1455 #define VMAP_BBMAP_BITS         \
1456                 VMAP_MIN(VMAP_BBMAP_BITS_MAX,   \
1457                 VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
1458                         VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
1459
1460 #define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)
1461
1462 struct vmap_block_queue {
1463         spinlock_t lock;
1464         struct list_head free;
1465 };
1466
1467 struct vmap_block {
1468         spinlock_t lock;
1469         struct vmap_area *va;
1470         unsigned long free, dirty;
1471         unsigned long dirty_min, dirty_max; /*< dirty range */
1472         struct list_head free_list;
1473         struct rcu_head rcu_head;
1474         struct list_head purge;
1475 };
1476
1477 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
1478 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1479
1480 /*
1481  * XArray of vmap blocks, indexed by address, to quickly find a vmap block
1482  * in the free path. Could get rid of this if we change the API to return a
1483  * "cookie" from alloc, to be passed to free. But no big deal yet.
1484  */
1485 static DEFINE_XARRAY(vmap_blocks);
1486
1487 /*
1488  * We should probably have a fallback mechanism to allocate virtual memory
1489  * out of partially filled vmap blocks. However vmap block sizing should be
1490  * fairly reasonable according to the vmalloc size, so it shouldn't be a
1491  * big problem.
1492  */
1493
1494 static unsigned long addr_to_vb_idx(unsigned long addr)
1495 {
1496         addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
1497         addr /= VMAP_BLOCK_SIZE;
1498         return addr;
1499 }
1500
1501 static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
1502 {
1503         unsigned long addr;
1504
1505         addr = va_start + (pages_off << PAGE_SHIFT);
1506         BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
1507         return (void *)addr;
1508 }
1509
1510 /**
1511  * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
1512  *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
1513  * @order:    how many 2^order pages should be occupied in newly allocated block
1514  * @gfp_mask: flags for the page level allocator
1515  *
1516  * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
1517  */
1518 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
1519 {
1520         struct vmap_block_queue *vbq;
1521         struct vmap_block *vb;
1522         struct vmap_area *va;
1523         unsigned long vb_idx;
1524         int node, err;
1525         void *vaddr;
1526
1527         node = numa_node_id();
1528
1529         vb = kmalloc_node(sizeof(struct vmap_block),
1530                         gfp_mask & GFP_RECLAIM_MASK, node);
1531         if (unlikely(!vb))
1532                 return ERR_PTR(-ENOMEM);
1533
1534         va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
1535                                         VMALLOC_START, VMALLOC_END,
1536                                         node, gfp_mask);
1537         if (IS_ERR(va)) {
1538                 kfree(vb);
1539                 return ERR_CAST(va);
1540         }
1541
1542         vaddr = vmap_block_vaddr(va->va_start, 0);
1543         spin_lock_init(&vb->lock);
1544         vb->va = va;
1545         /* At least something should be left free */
1546         BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
1547         vb->free = VMAP_BBMAP_BITS - (1UL << order);
1548         vb->dirty = 0;
1549         vb->dirty_min = VMAP_BBMAP_BITS;
1550         vb->dirty_max = 0;
1551         INIT_LIST_HEAD(&vb->free_list);
1552
1553         vb_idx = addr_to_vb_idx(va->va_start);
1554         err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
1555         if (err) {
1556                 kfree(vb);
1557                 free_vmap_area(va);
1558                 return ERR_PTR(err);
1559         }
1560
1561         vbq = &get_cpu_var(vmap_block_queue);
1562         spin_lock(&vbq->lock);
1563         list_add_tail_rcu(&vb->free_list, &vbq->free);
1564         spin_unlock(&vbq->lock);
1565         put_cpu_var(vmap_block_queue);
1566
1567         return vaddr;
1568 }
1569
1570 static void free_vmap_block(struct vmap_block *vb)
1571 {
1572         struct vmap_block *tmp;
1573
1574         tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
1575         BUG_ON(tmp != vb);
1576
1577         free_vmap_area_noflush(vb->va);
1578         kfree_rcu(vb, rcu_head);
1579 }
1580
1581 static void purge_fragmented_blocks(int cpu)
1582 {
1583         LIST_HEAD(purge);
1584         struct vmap_block *vb;
1585         struct vmap_block *n_vb;
1586         struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1587
1588         rcu_read_lock();
1589         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1590
1591                 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
1592                         continue;
1593
1594                 spin_lock(&vb->lock);
1595                 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
1596                         vb->free = 0; /* prevent further allocs after releasing lock */
1597                         vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
1598                         vb->dirty_min = 0;
1599                         vb->dirty_max = VMAP_BBMAP_BITS;
1600                         spin_lock(&vbq->lock);
1601                         list_del_rcu(&vb->free_list);
1602                         spin_unlock(&vbq->lock);
1603                         spin_unlock(&vb->lock);
1604                         list_add_tail(&vb->purge, &purge);
1605                 } else
1606                         spin_unlock(&vb->lock);
1607         }
1608         rcu_read_unlock();
1609
1610         list_for_each_entry_safe(vb, n_vb, &purge, purge) {
1611                 list_del(&vb->purge);
1612                 free_vmap_block(vb);
1613         }
1614 }
1615
1616 static void purge_fragmented_blocks_allcpus(void)
1617 {
1618         int cpu;
1619
1620         for_each_possible_cpu(cpu)
1621                 purge_fragmented_blocks(cpu);
1622 }
1623
1624 static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
1625 {
1626         struct vmap_block_queue *vbq;
1627         struct vmap_block *vb;
1628         void *vaddr = NULL;
1629         unsigned int order;
1630
1631         BUG_ON(offset_in_page(size));
1632         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
1633         if (WARN_ON(size == 0)) {
1634                 /*
1635                  * Allocating 0 bytes isn't what caller wants since
1636                  * get_order(0) returns funny result. Just warn and terminate
1637                  * early.
1638                  */
1639                 return NULL;
1640         }
1641         order = get_order(size);
1642
1643         rcu_read_lock();
1644         vbq = &get_cpu_var(vmap_block_queue);
1645         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1646                 unsigned long pages_off;
1647
1648                 spin_lock(&vb->lock);
1649                 if (vb->free < (1UL << order)) {
1650                         spin_unlock(&vb->lock);
1651                         continue;
1652                 }
1653
1654                 pages_off = VMAP_BBMAP_BITS - vb->free;
1655                 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
1656                 vb->free -= 1UL << order;
1657                 if (vb->free == 0) {
1658                         spin_lock(&vbq->lock);
1659                         list_del_rcu(&vb->free_list);
1660                         spin_unlock(&vbq->lock);
1661                 }
1662
1663                 spin_unlock(&vb->lock);
1664                 break;
1665         }
1666
1667         put_cpu_var(vmap_block_queue);
1668         rcu_read_unlock();
1669
1670         /* Allocate new block if nothing was found */
1671         if (!vaddr)
1672                 vaddr = new_vmap_block(order, gfp_mask);
1673
1674         return vaddr;
1675 }
1676
1677 static void vb_free(unsigned long addr, unsigned long size)
1678 {
1679         unsigned long offset;
1680         unsigned int order;
1681         struct vmap_block *vb;
1682
1683         BUG_ON(offset_in_page(size));
1684         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
1685
1686         flush_cache_vunmap(addr, addr + size);
1687
1688         order = get_order(size);
1689         offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
1690         vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
1691
1692         unmap_kernel_range_noflush(addr, size);
1693
1694         if (debug_pagealloc_enabled_static())
1695                 flush_tlb_kernel_range(addr, addr + size);
1696
1697         spin_lock(&vb->lock);
1698
1699         /* Expand dirty range */
1700         vb->dirty_min = min(vb->dirty_min, offset);
1701         vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
1702
1703         vb->dirty += 1UL << order;
1704         if (vb->dirty == VMAP_BBMAP_BITS) {
1705                 BUG_ON(vb->free);
1706                 spin_unlock(&vb->lock);
1707                 free_vmap_block(vb);
1708         } else
1709                 spin_unlock(&vb->lock);
1710 }
1711
1712 static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
1713 {
1714         int cpu;
1715
1716         if (unlikely(!vmap_initialized))
1717                 return;
1718
1719         might_sleep();
1720
1721         for_each_possible_cpu(cpu) {
1722                 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1723                 struct vmap_block *vb;
1724
1725                 rcu_read_lock();
1726                 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1727                         spin_lock(&vb->lock);
1728                         if (vb->dirty) {
1729                                 unsigned long va_start = vb->va->va_start;
1730                                 unsigned long s, e;
1731
1732                                 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1733                                 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1734
1735                                 start = min(s, start);
1736                                 end   = max(e, end);
1737
1738                                 flush = 1;
1739                         }
1740                         spin_unlock(&vb->lock);
1741                 }
1742                 rcu_read_unlock();
1743         }
1744
1745         mutex_lock(&vmap_purge_lock);
1746         purge_fragmented_blocks_allcpus();
1747         if (!__purge_vmap_area_lazy(start, end) && flush)
1748                 flush_tlb_kernel_range(start, end);
1749         mutex_unlock(&vmap_purge_lock);
1750 }
1751
1752 /**
1753  * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1754  *
1755  * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1756  * to amortize TLB flushing overheads. What this means is that any page you
1757  * have now, may, in a former life, have been mapped into kernel virtual
1758  * address by the vmap layer and so there might be some CPUs with TLB entries
1759  * still referencing that page (additional to the regular 1:1 kernel mapping).
1760  *
1761  * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1762  * be sure that none of the pages we have control over will have any aliases
1763  * from the vmap layer.
1764  */
1765 void vm_unmap_aliases(void)
1766 {
1767         unsigned long start = ULONG_MAX, end = 0;
1768         int flush = 0;
1769
1770         _vm_unmap_aliases(start, end, flush);
1771 }
1772 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
1773
1774 /**
1775  * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
1776  * @mem: the pointer returned by vm_map_ram
1777  * @count: the count passed to that vm_map_ram call (cannot unmap partial)
1778  */
1779 void vm_unmap_ram(const void *mem, unsigned int count)
1780 {
1781         unsigned long size = (unsigned long)count << PAGE_SHIFT;
1782         unsigned long addr = (unsigned long)mem;
1783         struct vmap_area *va;
1784
1785         might_sleep();
1786         BUG_ON(!addr);
1787         BUG_ON(addr < VMALLOC_START);
1788         BUG_ON(addr > VMALLOC_END);
1789         BUG_ON(!PAGE_ALIGNED(addr));
1790
1791         kasan_poison_vmalloc(mem, size);
1792
1793         if (likely(count <= VMAP_MAX_ALLOC)) {
1794                 debug_check_no_locks_freed(mem, size);
1795                 vb_free(addr, size);
1796                 return;
1797         }
1798
1799         va = find_vmap_area(addr);
1800         BUG_ON(!va);
1801         debug_check_no_locks_freed((void *)va->va_start,
1802                                     (va->va_end - va->va_start));
1803         free_unmap_vmap_area(va);
1804 }
1805 EXPORT_SYMBOL(vm_unmap_ram);
1806
1807 /**
1808  * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
1809  * @pages: an array of pointers to the pages to be mapped
1810  * @count: number of pages
1811  * @node: prefer to allocate data structures on this node
1812  *
1813  * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
1814  * faster than vmap so it's good.  But if you mix long-life and short-life
1815  * objects with vm_map_ram(), it could consume lots of address space through
1816  * fragmentation (especially on a 32bit machine).  You could see failures in
1817  * the end.  Please use this function for short-lived objects.
1818  *
1819  * Returns: a pointer to the address that has been mapped, or %NULL on failure
1820  */
1821 void *vm_map_ram(struct page **pages, unsigned int count, int node)
1822 {
1823         unsigned long size = (unsigned long)count << PAGE_SHIFT;
1824         unsigned long addr;
1825         void *mem;
1826
1827         if (likely(count <= VMAP_MAX_ALLOC)) {
1828                 mem = vb_alloc(size, GFP_KERNEL);
1829                 if (IS_ERR(mem))
1830                         return NULL;
1831                 addr = (unsigned long)mem;
1832         } else {
1833                 struct vmap_area *va;
1834                 va = alloc_vmap_area(size, PAGE_SIZE,
1835                                 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
1836                 if (IS_ERR(va))
1837                         return NULL;
1838
1839                 addr = va->va_start;
1840                 mem = (void *)addr;
1841         }
1842
1843         kasan_unpoison_vmalloc(mem, size);
1844
1845         if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
1846                 vm_unmap_ram(mem, count);
1847                 return NULL;
1848         }
1849         return mem;
1850 }
1851 EXPORT_SYMBOL(vm_map_ram);
1852
1853 static struct vm_struct *vmlist __initdata;
1854
1855 /**
1856  * vm_area_add_early - add vmap area early during boot
1857  * @vm: vm_struct to add
1858  *
1859  * This function is used to add fixed kernel vm area to vmlist before
1860  * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
1861  * should contain proper values and the other fields should be zero.
1862  *
1863  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1864  */
1865 void __init vm_area_add_early(struct vm_struct *vm)
1866 {
1867         struct vm_struct *tmp, **p;
1868
1869         BUG_ON(vmap_initialized);
1870         for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1871                 if (tmp->addr >= vm->addr) {
1872                         BUG_ON(tmp->addr < vm->addr + vm->size);
1873                         break;
1874                 } else
1875                         BUG_ON(tmp->addr + tmp->size > vm->addr);
1876         }
1877         vm->next = *p;
1878         *p = vm;
1879 }
1880
1881 /**
1882  * vm_area_register_early - register vmap area early during boot
1883  * @vm: vm_struct to register
1884  * @align: requested alignment
1885  *
1886  * This function is used to register kernel vm area before
1887  * vmalloc_init() is called.  @vm->size and @vm->flags should contain
1888  * proper values on entry and other fields should be zero.  On return,
1889  * vm->addr contains the allocated address.
1890  *
1891  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1892  */
1893 void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1894 {
1895         static size_t vm_init_off __initdata;
1896         unsigned long addr;
1897
1898         addr = ALIGN(VMALLOC_START + vm_init_off, align);
1899         vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1900
1901         vm->addr = (void *)addr;
1902
1903         vm_area_add_early(vm);
1904 }
1905
1906 static void vmap_init_free_space(void)
1907 {
1908         unsigned long vmap_start = 1;
1909         const unsigned long vmap_end = ULONG_MAX;
1910         struct vmap_area *busy, *free;
1911
1912         /*
1913          *     B     F     B     B     B     F
1914          * -|-----|.....|-----|-----|-----|.....|-
1915          *  |           The KVA space           |
1916          *  |<--------------------------------->|
1917          */
1918         list_for_each_entry(busy, &vmap_area_list, list) {
1919                 if (busy->va_start - vmap_start > 0) {
1920                         free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1921                         if (!WARN_ON_ONCE(!free)) {
1922                                 free->va_start = vmap_start;
1923                                 free->va_end = busy->va_start;
1924
1925                                 insert_vmap_area_augment(free, NULL,
1926                                         &free_vmap_area_root,
1927                                                 &free_vmap_area_list);
1928                         }
1929                 }
1930
1931                 vmap_start = busy->va_end;
1932         }
1933
1934         if (vmap_end - vmap_start > 0) {
1935                 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1936                 if (!WARN_ON_ONCE(!free)) {
1937                         free->va_start = vmap_start;
1938                         free->va_end = vmap_end;
1939
1940                         insert_vmap_area_augment(free, NULL,
1941                                 &free_vmap_area_root,
1942                                         &free_vmap_area_list);
1943                 }
1944         }
1945 }
1946
1947 void __init vmalloc_init(void)
1948 {
1949         struct vmap_area *va;
1950         struct vm_struct *tmp;
1951         int i;
1952
1953         /*
1954          * Create the cache for vmap_area objects.
1955          */
1956         vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
1957
1958         for_each_possible_cpu(i) {
1959                 struct vmap_block_queue *vbq;
1960                 struct vfree_deferred *p;
1961
1962                 vbq = &per_cpu(vmap_block_queue, i);
1963                 spin_lock_init(&vbq->lock);
1964                 INIT_LIST_HEAD(&vbq->free);
1965                 p = &per_cpu(vfree_deferred, i);
1966                 init_llist_head(&p->list);
1967                 INIT_WORK(&p->wq, free_work);
1968         }
1969
1970         /* Import existing vmlist entries. */
1971         for (tmp = vmlist; tmp; tmp = tmp->next) {
1972                 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1973                 if (WARN_ON_ONCE(!va))
1974                         continue;
1975
1976                 va->va_start = (unsigned long)tmp->addr;
1977                 va->va_end = va->va_start + tmp->size;
1978                 va->vm = tmp;
1979                 insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
1980         }
1981
1982         /*
1983          * Now we can initialize a free vmap space.
1984          */
1985         vmap_init_free_space();
1986         vmap_initialized = true;
1987 }
1988
1989 /**
1990  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
1991  * @addr: start of the VM area to unmap
1992  * @size: size of the VM area to unmap
1993  *
1994  * Similar to unmap_kernel_range_noflush() but flushes vcache before
1995  * the unmapping and tlb after.
1996  */
1997 void unmap_kernel_range(unsigned long addr, unsigned long size)
1998 {
1999         unsigned long end = addr + size;
2000
2001         flush_cache_vunmap(addr, end);
2002         unmap_kernel_range_noflush(addr, size);
2003         flush_tlb_kernel_range(addr, end);
2004 }
2005
2006 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2007         struct vmap_area *va, unsigned long flags, const void *caller)
2008 {
2009         vm->flags = flags;
2010         vm->addr = (void *)va->va_start;
2011         vm->size = va->va_end - va->va_start;
2012         vm->caller = caller;
2013         va->vm = vm;
2014 }
2015
2016 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2017                               unsigned long flags, const void *caller)
2018 {
2019         spin_lock(&vmap_area_lock);
2020         setup_vmalloc_vm_locked(vm, va, flags, caller);
2021         spin_unlock(&vmap_area_lock);
2022 }
2023
2024 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2025 {
2026         /*
2027          * Before removing VM_UNINITIALIZED,
2028          * we should make sure that vm has proper values.
2029          * Pair with smp_rmb() in show_numa_info().
2030          */
2031         smp_wmb();
2032         vm->flags &= ~VM_UNINITIALIZED;
2033 }
2034
2035 static struct vm_struct *__get_vm_area_node(unsigned long size,
2036                 unsigned long align, unsigned long flags, unsigned long start,
2037                 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
2038 {
2039         struct vmap_area *va;
2040         struct vm_struct *area;
2041         unsigned long requested_size = size;
2042
2043         BUG_ON(in_interrupt());
2044         size = PAGE_ALIGN(size);
2045         if (unlikely(!size))
2046                 return NULL;
2047
2048         if (flags & VM_IOREMAP)
2049                 align = 1ul << clamp_t(int, get_count_order_long(size),
2050                                        PAGE_SHIFT, IOREMAP_MAX_ORDER);
2051
2052         area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
2053         if (unlikely(!area))
2054                 return NULL;
2055
2056         if (!(flags & VM_NO_GUARD))
2057                 size += PAGE_SIZE;
2058
2059         va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
2060         if (IS_ERR(va)) {
2061                 kfree(area);
2062                 return NULL;
2063         }
2064
2065         kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
2066
2067         setup_vmalloc_vm(area, va, flags, caller);
2068
2069         return area;
2070 }
2071
2072 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2073                                        unsigned long start, unsigned long end,
2074                                        const void *caller)
2075 {
2076         return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
2077                                   GFP_KERNEL, caller);
2078 }
2079
2080 /**
2081  * get_vm_area - reserve a contiguous kernel virtual area
2082  * @size:        size of the area
2083  * @flags:       %VM_IOREMAP for I/O mappings or VM_ALLOC
2084  *
2085  * Search an area of @size in the kernel virtual mapping area,
2086  * and reserved it for out purposes.  Returns the area descriptor
2087  * on success or %NULL on failure.
2088  *
2089  * Return: the area descriptor on success or %NULL on failure.
2090  */
2091 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2092 {
2093         return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
2094                                   NUMA_NO_NODE, GFP_KERNEL,
2095                                   __builtin_return_address(0));
2096 }
2097
2098 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
2099                                 const void *caller)
2100 {
2101         return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
2102                                   NUMA_NO_NODE, GFP_KERNEL, caller);
2103 }
2104
2105 /**
2106  * find_vm_area - find a continuous kernel virtual area
2107  * @addr:         base address
2108  *
2109  * Search for the kernel VM area starting at @addr, and return it.
2110  * It is up to the caller to do all required locking to keep the returned
2111  * pointer valid.
2112  *
2113  * Return: pointer to the found area or %NULL on faulure
2114  */
2115 struct vm_struct *find_vm_area(const void *addr)
2116 {
2117         struct vmap_area *va;
2118
2119         va = find_vmap_area((unsigned long)addr);
2120         if (!va)
2121                 return NULL;
2122
2123         return va->vm;
2124 }
2125
2126 /**
2127  * remove_vm_area - find and remove a continuous kernel virtual area
2128  * @addr:           base address
2129  *
2130  * Search for the kernel VM area starting at @addr, and remove it.
2131  * This function returns the found VM area, but using it is NOT safe
2132  * on SMP machines, except for its size or flags.
2133  *
2134  * Return: pointer to the found area or %NULL on faulure
2135  */
2136 struct vm_struct *remove_vm_area(const void *addr)
2137 {
2138         struct vmap_area *va;
2139
2140         might_sleep();
2141
2142         spin_lock(&vmap_area_lock);
2143         va = __find_vmap_area((unsigned long)addr);
2144         if (va && va->vm) {
2145                 struct vm_struct *vm = va->vm;
2146
2147                 va->vm = NULL;
2148                 spin_unlock(&vmap_area_lock);
2149
2150                 kasan_free_shadow(vm);
2151                 free_unmap_vmap_area(va);
2152
2153                 return vm;
2154         }
2155
2156         spin_unlock(&vmap_area_lock);
2157         return NULL;
2158 }
2159
2160 static inline void set_area_direct_map(const struct vm_struct *area,
2161                                        int (*set_direct_map)(struct page *page))
2162 {
2163         int i;
2164
2165         for (i = 0; i < area->nr_pages; i++)
2166                 if (page_address(area->pages[i]))
2167                         set_direct_map(area->pages[i]);
2168 }
2169
2170 /* Handle removing and resetting vm mappings related to the vm_struct. */
2171 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
2172 {
2173         unsigned long start = ULONG_MAX, end = 0;
2174         int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
2175         int flush_dmap = 0;
2176         int i;
2177
2178         remove_vm_area(area->addr);
2179
2180         /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
2181         if (!flush_reset)
2182                 return;
2183
2184         /*
2185          * If not deallocating pages, just do the flush of the VM area and
2186          * return.
2187          */
2188         if (!deallocate_pages) {
2189                 vm_unmap_aliases();
2190                 return;
2191         }
2192
2193         /*
2194          * If execution gets here, flush the vm mapping and reset the direct
2195          * map. Find the start and end range of the direct mappings to make sure
2196          * the vm_unmap_aliases() flush includes the direct map.
2197          */
2198         for (i = 0; i < area->nr_pages; i++) {
2199                 unsigned long addr = (unsigned long)page_address(area->pages[i]);
2200                 if (addr) {
2201                         start = min(addr, start);
2202                         end = max(addr + PAGE_SIZE, end);
2203                         flush_dmap = 1;
2204                 }
2205         }
2206
2207         /*
2208          * Set direct map to something invalid so that it won't be cached if
2209          * there are any accesses after the TLB flush, then flush the TLB and
2210          * reset the direct map permissions to the default.
2211          */
2212         set_area_direct_map(area, set_direct_map_invalid_noflush);
2213         _vm_unmap_aliases(start, end, flush_dmap);
2214         set_area_direct_map(area, set_direct_map_default_noflush);
2215 }
2216
2217 static void __vunmap(const void *addr, int deallocate_pages)
2218 {
2219         struct vm_struct *area;
2220
2221         if (!addr)
2222                 return;
2223
2224         if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2225                         addr))
2226                 return;
2227
2228         area = find_vm_area(addr);
2229         if (unlikely(!area)) {
2230                 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2231                                 addr);
2232                 return;
2233         }
2234
2235         debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
2236         debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
2237
2238         kasan_poison_vmalloc(area->addr, area->size);
2239
2240         vm_remove_mappings(area, deallocate_pages);
2241
2242         if (deallocate_pages) {
2243                 int i;
2244
2245                 for (i = 0; i < area->nr_pages; i++) {
2246                         struct page *page = area->pages[i];
2247
2248                         BUG_ON(!page);
2249                         __free_pages(page, 0);
2250                 }
2251                 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
2252
2253                 kvfree(area->pages);
2254         }
2255
2256         kfree(area);
2257         return;
2258 }
2259
2260 static inline void __vfree_deferred(const void *addr)
2261 {
2262         /*
2263          * Use raw_cpu_ptr() because this can be called from preemptible
2264          * context. Preemption is absolutely fine here, because the llist_add()
2265          * implementation is lockless, so it works even if we are adding to
2266          * another cpu's list. schedule_work() should be fine with this too.
2267          */
2268         struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2269
2270         if (llist_add((struct llist_node *)addr, &p->list))
2271                 schedule_work(&p->wq);
2272 }
2273
2274 /**
2275  * vfree_atomic - release memory allocated by vmalloc()
2276  * @addr:         memory base address
2277  *
2278  * This one is just like vfree() but can be called in any atomic context
2279  * except NMIs.
2280  */
2281 void vfree_atomic(const void *addr)
2282 {
2283         BUG_ON(in_nmi());
2284
2285         kmemleak_free(addr);
2286
2287         if (!addr)
2288                 return;
2289         __vfree_deferred(addr);
2290 }
2291
2292 static void __vfree(const void *addr)
2293 {
2294         if (unlikely(in_interrupt()))
2295                 __vfree_deferred(addr);
2296         else
2297                 __vunmap(addr, 1);
2298 }
2299
2300 /**
2301  * vfree - release memory allocated by vmalloc()
2302  * @addr:  memory base address
2303  *
2304  * Free the virtually continuous memory area starting at @addr, as
2305  * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
2306  * NULL, no operation is performed.
2307  *
2308  * Must not be called in NMI context (strictly speaking, only if we don't
2309  * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2310  * conventions for vfree() arch-depenedent would be a really bad idea)
2311  *
2312  * May sleep if called *not* from interrupt context.
2313  *
2314  * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
2315  */
2316 void vfree(const void *addr)
2317 {
2318         BUG_ON(in_nmi());
2319
2320         kmemleak_free(addr);
2321
2322         might_sleep_if(!in_interrupt());
2323
2324         if (!addr)
2325                 return;
2326
2327         __vfree(addr);
2328 }
2329 EXPORT_SYMBOL(vfree);
2330
2331 /**
2332  * vunmap - release virtual mapping obtained by vmap()
2333  * @addr:   memory base address
2334  *
2335  * Free the virtually contiguous memory area starting at @addr,
2336  * which was created from the page array passed to vmap().
2337  *
2338  * Must not be called in interrupt context.
2339  */
2340 void vunmap(const void *addr)
2341 {
2342         BUG_ON(in_interrupt());
2343         might_sleep();
2344         if (addr)
2345                 __vunmap(addr, 0);
2346 }
2347 EXPORT_SYMBOL(vunmap);
2348
2349 /**
2350  * vmap - map an array of pages into virtually contiguous space
2351  * @pages: array of page pointers
2352  * @count: number of pages to map
2353  * @flags: vm_area->flags
2354  * @prot: page protection for the mapping
2355  *
2356  * Maps @count pages from @pages into contiguous kernel virtual
2357  * space.
2358  *
2359  * Return: the address of the area or %NULL on failure
2360  */
2361 void *vmap(struct page **pages, unsigned int count,
2362            unsigned long flags, pgprot_t prot)
2363 {
2364         struct vm_struct *area;
2365         unsigned long size;             /* In bytes */
2366
2367         might_sleep();
2368
2369         if (count > totalram_pages())
2370                 return NULL;
2371
2372         size = (unsigned long)count << PAGE_SHIFT;
2373         area = get_vm_area_caller(size, flags, __builtin_return_address(0));
2374         if (!area)
2375                 return NULL;
2376
2377         if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
2378                         pages) < 0) {
2379                 vunmap(area->addr);
2380                 return NULL;
2381         }
2382
2383         return area->addr;
2384 }
2385 EXPORT_SYMBOL(vmap);
2386
2387 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
2388                                  pgprot_t prot, int node)
2389 {
2390         struct page **pages;
2391         unsigned int nr_pages, array_size, i;
2392         const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
2393         const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
2394         const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
2395                                         0 :
2396                                         __GFP_HIGHMEM;
2397
2398         nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
2399         array_size = (nr_pages * sizeof(struct page *));
2400
2401         /* Please note that the recursion is strictly bounded. */
2402         if (array_size > PAGE_SIZE) {
2403                 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
2404                                 node, area->caller);
2405         } else {
2406                 pages = kmalloc_node(array_size, nested_gfp, node);
2407         }
2408
2409         if (!pages) {
2410                 remove_vm_area(area->addr);
2411                 kfree(area);
2412                 return NULL;
2413         }
2414
2415         area->pages = pages;
2416         area->nr_pages = nr_pages;
2417
2418         for (i = 0; i < area->nr_pages; i++) {
2419                 struct page *page;
2420
2421                 if (node == NUMA_NO_NODE)
2422                         page = alloc_page(alloc_mask|highmem_mask);
2423                 else
2424                         page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
2425
2426                 if (unlikely(!page)) {
2427                         /* Successfully allocated i pages, free them in __vunmap() */
2428                         area->nr_pages = i;
2429                         atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
2430                         goto fail;
2431                 }
2432                 area->pages[i] = page;
2433                 if (gfpflags_allow_blocking(gfp_mask))
2434                         cond_resched();
2435         }
2436         atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
2437
2438         if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
2439                         prot, pages) < 0)
2440                 goto fail;
2441
2442         return area->addr;
2443
2444 fail:
2445         warn_alloc(gfp_mask, NULL,
2446                           "vmalloc: allocation failure, allocated %ld of %ld bytes",
2447                           (area->nr_pages*PAGE_SIZE), area->size);
2448         __vfree(area->addr);
2449         return NULL;
2450 }
2451
2452 /**
2453  * __vmalloc_node_range - allocate virtually contiguous memory
2454  * @size:                 allocation size
2455  * @align:                desired alignment
2456  * @start:                vm area range start
2457  * @end:                  vm area range end
2458  * @gfp_mask:             flags for the page level allocator
2459  * @prot:                 protection mask for the allocated pages
2460  * @vm_flags:             additional vm area flags (e.g. %VM_NO_GUARD)
2461  * @node:                 node to use for allocation or NUMA_NO_NODE
2462  * @caller:               caller's return address
2463  *
2464  * Allocate enough pages to cover @size from the page level
2465  * allocator with @gfp_mask flags.  Map them into contiguous
2466  * kernel virtual space, using a pagetable protection of @prot.
2467  *
2468  * Return: the address of the area or %NULL on failure
2469  */
2470 void *__vmalloc_node_range(unsigned long size, unsigned long align,
2471                         unsigned long start, unsigned long end, gfp_t gfp_mask,
2472                         pgprot_t prot, unsigned long vm_flags, int node,
2473                         const void *caller)
2474 {
2475         struct vm_struct *area;
2476         void *addr;
2477         unsigned long real_size = size;
2478
2479         size = PAGE_ALIGN(size);
2480         if (!size || (size >> PAGE_SHIFT) > totalram_pages())
2481                 goto fail;
2482
2483         area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
2484                                 vm_flags, start, end, node, gfp_mask, caller);
2485         if (!area)
2486                 goto fail;
2487
2488         addr = __vmalloc_area_node(area, gfp_mask, prot, node);
2489         if (!addr)
2490                 return NULL;
2491
2492         /*
2493          * In this function, newly allocated vm_struct has VM_UNINITIALIZED
2494          * flag. It means that vm_struct is not fully initialized.
2495          * Now, it is fully initialized, so remove this flag here.
2496          */
2497         clear_vm_uninitialized_flag(area);
2498
2499         kmemleak_vmalloc(area, size, gfp_mask);
2500
2501         return addr;
2502
2503 fail:
2504         warn_alloc(gfp_mask, NULL,
2505                           "vmalloc: allocation failure: %lu bytes", real_size);
2506         return NULL;
2507 }
2508
2509 /**
2510  * __vmalloc_node - allocate virtually contiguous memory
2511  * @size:           allocation size
2512  * @align:          desired alignment
2513  * @gfp_mask:       flags for the page level allocator
2514  * @node:           node to use for allocation or NUMA_NO_NODE
2515  * @caller:         caller's return address
2516  *
2517  * Allocate enough pages to cover @size from the page level allocator with
2518  * @gfp_mask flags.  Map them into contiguous kernel virtual space.
2519  *
2520  * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
2521  * and __GFP_NOFAIL are not supported
2522  *
2523  * Any use of gfp flags outside of GFP_KERNEL should be consulted
2524  * with mm people.
2525  *
2526  * Return: pointer to the allocated memory or %NULL on error
2527  */
2528 void *__vmalloc_node(unsigned long size, unsigned long align,
2529                             gfp_t gfp_mask, int node, const void *caller)
2530 {
2531         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
2532                                 gfp_mask, PAGE_KERNEL, 0, node, caller);
2533 }
2534 /*
2535  * This is only for performance analysis of vmalloc and stress purpose.
2536  * It is required by vmalloc test module, therefore do not use it other
2537  * than that.
2538  */
2539 #ifdef CONFIG_TEST_VMALLOC_MODULE
2540 EXPORT_SYMBOL_GPL(__vmalloc_node);
2541 #endif
2542
2543 void *__vmalloc(unsigned long size, gfp_t gfp_mask)
2544 {
2545         return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
2546                                 __builtin_return_address(0));
2547 }
2548 EXPORT_SYMBOL(__vmalloc);
2549
2550 /**
2551  * vmalloc - allocate virtually contiguous memory
2552  * @size:    allocation size
2553  *
2554  * Allocate enough pages to cover @size from the page level
2555  * allocator and map them into contiguous kernel virtual space.
2556  *
2557  * For tight control over page level allocator and protection flags
2558  * use __vmalloc() instead.
2559  *
2560  * Return: pointer to the allocated memory or %NULL on error
2561  */
2562 void *vmalloc(unsigned long size)
2563 {
2564         return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
2565                                 __builtin_return_address(0));
2566 }
2567 EXPORT_SYMBOL(vmalloc);
2568
2569 /**
2570  * vzalloc - allocate virtually contiguous memory with zero fill
2571  * @size:    allocation size
2572  *
2573  * Allocate enough pages to cover @size from the page level
2574  * allocator and map them into contiguous kernel virtual space.
2575  * The memory allocated is set to zero.
2576  *
2577  * For tight control over page level allocator and protection flags
2578  * use __vmalloc() instead.
2579  *
2580  * Return: pointer to the allocated memory or %NULL on error
2581  */
2582 void *vzalloc(unsigned long size)
2583 {
2584         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
2585                                 __builtin_return_address(0));
2586 }
2587 EXPORT_SYMBOL(vzalloc);
2588
2589 /**
2590  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
2591  * @size: allocation size
2592  *
2593  * The resulting memory area is zeroed so it can be mapped to userspace
2594  * without leaking data.
2595  *
2596  * Return: pointer to the allocated memory or %NULL on error
2597  */
2598 void *vmalloc_user(unsigned long size)
2599 {
2600         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
2601                                     GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
2602                                     VM_USERMAP, NUMA_NO_NODE,
2603                                     __builtin_return_address(0));
2604 }
2605 EXPORT_SYMBOL(vmalloc_user);
2606
2607 /**
2608  * vmalloc_node - allocate memory on a specific node
2609  * @size:         allocation size
2610  * @node:         numa node
2611  *
2612  * Allocate enough pages to cover @size from the page level
2613  * allocator and map them into contiguous kernel virtual space.
2614  *
2615  * For tight control over page level allocator and protection flags
2616  * use __vmalloc() instead.
2617  *
2618  * Return: pointer to the allocated memory or %NULL on error
2619  */
2620 void *vmalloc_node(unsigned long size, int node)
2621 {
2622         return __vmalloc_node(size, 1, GFP_KERNEL, node,
2623                         __builtin_return_address(0));
2624 }
2625 EXPORT_SYMBOL(vmalloc_node);
2626
2627 /**
2628  * vzalloc_node - allocate memory on a specific node with zero fill
2629  * @size:       allocation size
2630  * @node:       numa node
2631  *
2632  * Allocate enough pages to cover @size from the page level
2633  * allocator and map them into contiguous kernel virtual space.
2634  * The memory allocated is set to zero.
2635  *
2636  * Return: pointer to the allocated memory or %NULL on error
2637  */
2638 void *vzalloc_node(unsigned long size, int node)
2639 {
2640         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
2641                                 __builtin_return_address(0));
2642 }
2643 EXPORT_SYMBOL(vzalloc_node);
2644
2645 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
2646 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
2647 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
2648 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
2649 #else
2650 /*
2651  * 64b systems should always have either DMA or DMA32 zones. For others
2652  * GFP_DMA32 should do the right thing and use the normal zone.
2653  */
2654 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
2655 #endif
2656
2657 /**
2658  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
2659  * @size:       allocation size
2660  *
2661  * Allocate enough 32bit PA addressable pages to cover @size from the
2662  * page level allocator and map them into contiguous kernel virtual space.
2663  *
2664  * Return: pointer to the allocated memory or %NULL on error
2665  */
2666 void *vmalloc_32(unsigned long size)
2667 {
2668         return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
2669                         __builtin_return_address(0));
2670 }
2671 EXPORT_SYMBOL(vmalloc_32);
2672
2673 /**
2674  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
2675  * @size:            allocation size
2676  *
2677  * The resulting memory area is 32bit addressable and zeroed so it can be
2678  * mapped to userspace without leaking data.
2679  *
2680  * Return: pointer to the allocated memory or %NULL on error
2681  */
2682 void *vmalloc_32_user(unsigned long size)
2683 {
2684         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
2685                                     GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
2686                                     VM_USERMAP, NUMA_NO_NODE,
2687                                     __builtin_return_address(0));
2688 }
2689 EXPORT_SYMBOL(vmalloc_32_user);
2690
2691 /*
2692  * small helper routine , copy contents to buf from addr.
2693  * If the page is not present, fill zero.
2694  */
2695
2696 static int aligned_vread(char *buf, char *addr, unsigned long count)
2697 {
2698         struct page *p;
2699         int copied = 0;
2700
2701         while (count) {
2702                 unsigned long offset, length;
2703
2704                 offset = offset_in_page(addr);
2705                 length = PAGE_SIZE - offset;
2706                 if (length > count)
2707                         length = count;
2708                 p = vmalloc_to_page(addr);
2709                 /*
2710                  * To do safe access to this _mapped_ area, we need
2711                  * lock. But adding lock here means that we need to add
2712                  * overhead of vmalloc()/vfree() calles for this _debug_
2713                  * interface, rarely used. Instead of that, we'll use
2714                  * kmap() and get small overhead in this access function.
2715                  */
2716                 if (p) {
2717                         /*
2718                          * we can expect USER0 is not used (see vread/vwrite's
2719                          * function description)
2720                          */
2721                         void *map = kmap_atomic(p);
2722                         memcpy(buf, map + offset, length);
2723                         kunmap_atomic(map);
2724                 } else
2725                         memset(buf, 0, length);
2726
2727                 addr += length;
2728                 buf += length;
2729                 copied += length;
2730                 count -= length;
2731         }
2732         return copied;
2733 }
2734
2735 static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2736 {
2737         struct page *p;
2738         int copied = 0;
2739
2740         while (count) {
2741                 unsigned long offset, length;
2742
2743                 offset = offset_in_page(addr);
2744                 length = PAGE_SIZE - offset;
2745                 if (length > count)
2746                         length = count;
2747                 p = vmalloc_to_page(addr);
2748                 /*
2749                  * To do safe access to this _mapped_ area, we need
2750                  * lock. But adding lock here means that we need to add
2751                  * overhead of vmalloc()/vfree() calles for this _debug_
2752                  * interface, rarely used. Instead of that, we'll use
2753                  * kmap() and get small overhead in this access function.
2754                  */
2755                 if (p) {
2756                         /*
2757                          * we can expect USER0 is not used (see vread/vwrite's
2758                          * function description)
2759                          */
2760                         void *map = kmap_atomic(p);
2761                         memcpy(map + offset, buf, length);
2762                         kunmap_atomic(map);
2763                 }
2764                 addr += length;
2765                 buf += length;
2766                 copied += length;
2767                 count -= length;
2768         }
2769         return copied;
2770 }
2771
2772 /**
2773  * vread() - read vmalloc area in a safe way.
2774  * @buf:     buffer for reading data
2775  * @addr:    vm address.
2776  * @count:   number of bytes to be read.
2777  *
2778  * This function checks that addr is a valid vmalloc'ed area, and
2779  * copy data from that area to a given buffer. If the given memory range
2780  * of [addr...addr+count) includes some valid address, data is copied to
2781  * proper area of @buf. If there are memory holes, they'll be zero-filled.
2782  * IOREMAP area is treated as memory hole and no copy is done.
2783  *
2784  * If [addr...addr+count) doesn't includes any intersects with alive
2785  * vm_struct area, returns 0. @buf should be kernel's buffer.
2786  *
2787  * Note: In usual ops, vread() is never necessary because the caller
2788  * should know vmalloc() area is valid and can use memcpy().
2789  * This is for routines which have to access vmalloc area without
2790  * any information, as /dev/kmem.
2791  *
2792  * Return: number of bytes for which addr and buf should be increased
2793  * (same number as @count) or %0 if [addr...addr+count) doesn't
2794  * include any intersection with valid vmalloc area
2795  */
2796 long vread(char *buf, char *addr, unsigned long count)
2797 {
2798         struct vmap_area *va;
2799         struct vm_struct *vm;
2800         char *vaddr, *buf_start = buf;
2801         unsigned long buflen = count;
2802         unsigned long n;
2803
2804         /* Don't allow overflow */
2805         if ((unsigned long) addr + count < count)
2806                 count = -(unsigned long) addr;
2807
2808         spin_lock(&vmap_area_lock);
2809         list_for_each_entry(va, &vmap_area_list, list) {
2810                 if (!count)
2811                         break;
2812
2813                 if (!va->vm)
2814                         continue;
2815
2816                 vm = va->vm;
2817                 vaddr = (char *) vm->addr;
2818                 if (addr >= vaddr + get_vm_area_size(vm))
2819                         continue;
2820                 while (addr < vaddr) {
2821                         if (count == 0)
2822                                 goto finished;
2823                         *buf = '\0';
2824                         buf++;
2825                         addr++;
2826                         count--;
2827                 }
2828                 n = vaddr + get_vm_area_size(vm) - addr;
2829                 if (n > count)
2830                         n = count;
2831                 if (!(vm->flags & VM_IOREMAP))
2832                         aligned_vread(buf, addr, n);
2833                 else /* IOREMAP area is treated as memory hole */
2834                         memset(buf, 0, n);
2835                 buf += n;
2836                 addr += n;
2837                 count -= n;
2838         }
2839 finished:
2840         spin_unlock(&vmap_area_lock);
2841
2842         if (buf == buf_start)
2843                 return 0;
2844         /* zero-fill memory holes */
2845         if (buf != buf_start + buflen)
2846                 memset(buf, 0, buflen - (buf - buf_start));
2847
2848         return buflen;
2849 }
2850
2851 /**
2852  * vwrite() - write vmalloc area in a safe way.
2853  * @buf:      buffer for source data
2854  * @addr:     vm address.
2855  * @count:    number of bytes to be read.
2856  *
2857  * This function checks that addr is a valid vmalloc'ed area, and
2858  * copy data from a buffer to the given addr. If specified range of
2859  * [addr...addr+count) includes some valid address, data is copied from
2860  * proper area of @buf. If there are memory holes, no copy to hole.
2861  * IOREMAP area is treated as memory hole and no copy is done.
2862  *
2863  * If [addr...addr+count) doesn't includes any intersects with alive
2864  * vm_struct area, returns 0. @buf should be kernel's buffer.
2865  *
2866  * Note: In usual ops, vwrite() is never necessary because the caller
2867  * should know vmalloc() area is valid and can use memcpy().
2868  * This is for routines which have to access vmalloc area without
2869  * any information, as /dev/kmem.
2870  *
2871  * Return: number of bytes for which addr and buf should be
2872  * increased (same number as @count) or %0 if [addr...addr+count)
2873  * doesn't include any intersection with valid vmalloc area
2874  */
2875 long vwrite(char *buf, char *addr, unsigned long count)
2876 {
2877         struct vmap_area *va;
2878         struct vm_struct *vm;
2879         char *vaddr;
2880         unsigned long n, buflen;
2881         int copied = 0;
2882
2883         /* Don't allow overflow */
2884         if ((unsigned long) addr + count < count)
2885                 count = -(unsigned long) addr;
2886         buflen = count;
2887
2888         spin_lock(&vmap_area_lock);
2889         list_for_each_entry(va, &vmap_area_list, list) {
2890                 if (!count)
2891                         break;
2892
2893                 if (!va->vm)
2894                         continue;
2895
2896                 vm = va->vm;
2897                 vaddr = (char *) vm->addr;
2898                 if (addr >= vaddr + get_vm_area_size(vm))
2899                         continue;
2900                 while (addr < vaddr) {
2901                         if (count == 0)
2902                                 goto finished;
2903                         buf++;
2904                         addr++;
2905                         count--;
2906                 }
2907                 n = vaddr + get_vm_area_size(vm) - addr;
2908                 if (n > count)
2909                         n = count;
2910                 if (!(vm->flags & VM_IOREMAP)) {
2911                         aligned_vwrite(buf, addr, n);
2912                         copied++;
2913                 }
2914                 buf += n;
2915                 addr += n;
2916                 count -= n;
2917         }
2918 finished:
2919         spin_unlock(&vmap_area_lock);
2920         if (!copied)
2921                 return 0;
2922         return buflen;
2923 }
2924
2925 /**
2926  * remap_vmalloc_range_partial - map vmalloc pages to userspace
2927  * @vma:                vma to cover
2928  * @uaddr:              target user address to start at
2929  * @kaddr:              virtual address of vmalloc kernel memory
2930  * @pgoff:              offset from @kaddr to start at
2931  * @size:               size of map area
2932  *
2933  * Returns:     0 for success, -Exxx on failure
2934  *
2935  * This function checks that @kaddr is a valid vmalloc'ed area,
2936  * and that it is big enough to cover the range starting at
2937  * @uaddr in @vma. Will return failure if that criteria isn't
2938  * met.
2939  *
2940  * Similar to remap_pfn_range() (see mm/memory.c)
2941  */
2942 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2943                                 void *kaddr, unsigned long pgoff,
2944                                 unsigned long size)
2945 {
2946         struct vm_struct *area;
2947         unsigned long off;
2948         unsigned long end_index;
2949
2950         if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
2951                 return -EINVAL;
2952
2953         size = PAGE_ALIGN(size);
2954
2955         if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2956                 return -EINVAL;
2957
2958         area = find_vm_area(kaddr);
2959         if (!area)
2960                 return -EINVAL;
2961
2962         if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
2963                 return -EINVAL;
2964
2965         if (check_add_overflow(size, off, &end_index) ||
2966             end_index > get_vm_area_size(area))
2967                 return -EINVAL;
2968         kaddr += off;
2969
2970         do {
2971                 struct page *page = vmalloc_to_page(kaddr);
2972                 int ret;
2973
2974                 ret = vm_insert_page(vma, uaddr, page);
2975                 if (ret)
2976                         return ret;
2977
2978                 uaddr += PAGE_SIZE;
2979                 kaddr += PAGE_SIZE;
2980                 size -= PAGE_SIZE;
2981         } while (size > 0);
2982
2983         vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2984
2985         return 0;
2986 }
2987 EXPORT_SYMBOL(remap_vmalloc_range_partial);
2988
2989 /**
2990  * remap_vmalloc_range - map vmalloc pages to userspace
2991  * @vma:                vma to cover (map full range of vma)
2992  * @addr:               vmalloc memory
2993  * @pgoff:              number of pages into addr before first page to map
2994  *
2995  * Returns:     0 for success, -Exxx on failure
2996  *
2997  * This function checks that addr is a valid vmalloc'ed area, and
2998  * that it is big enough to cover the vma. Will return failure if
2999  * that criteria isn't met.
3000  *
3001  * Similar to remap_pfn_range() (see mm/memory.c)
3002  */
3003 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3004                                                 unsigned long pgoff)
3005 {
3006         return remap_vmalloc_range_partial(vma, vma->vm_start,
3007                                            addr, pgoff,
3008                                            vma->vm_end - vma->vm_start);
3009 }
3010 EXPORT_SYMBOL(remap_vmalloc_range);
3011
3012 static int f(pte_t *pte, unsigned long addr, void *data)
3013 {
3014         pte_t ***p = data;
3015
3016         if (p) {
3017                 *(*p) = pte;
3018                 (*p)++;
3019         }
3020         return 0;
3021 }
3022
3023 /**
3024  * alloc_vm_area - allocate a range of kernel address space
3025  * @size:          size of the area
3026  * @ptes:          returns the PTEs for the address space
3027  *
3028  * Returns:     NULL on failure, vm_struct on success
3029  *
3030  * This function reserves a range of kernel address space, and
3031  * allocates pagetables to map that range.  No actual mappings
3032  * are created.
3033  *
3034  * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
3035  * allocated for the VM area are returned.
3036  */
3037 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
3038 {
3039         struct vm_struct *area;
3040
3041         area = get_vm_area_caller(size, VM_IOREMAP,
3042                                 __builtin_return_address(0));
3043         if (area == NULL)
3044                 return NULL;
3045
3046         /*
3047          * This ensures that page tables are constructed for this region
3048          * of kernel virtual address space and mapped into init_mm.
3049          */
3050         if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
3051                                 size, f, ptes ? &ptes : NULL)) {
3052                 free_vm_area(area);
3053                 return NULL;
3054         }
3055
3056         return area;
3057 }
3058 EXPORT_SYMBOL_GPL(alloc_vm_area);
3059
3060 void free_vm_area(struct vm_struct *area)
3061 {
3062         struct vm_struct *ret;
3063         ret = remove_vm_area(area->addr);
3064         BUG_ON(ret != area);
3065         kfree(area);
3066 }
3067 EXPORT_SYMBOL_GPL(free_vm_area);
3068
3069 #ifdef CONFIG_SMP
3070 static struct vmap_area *node_to_va(struct rb_node *n)
3071 {
3072         return rb_entry_safe(n, struct vmap_area, rb_node);
3073 }
3074
3075 /**
3076  * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3077  * @addr: target address
3078  *
3079  * Returns: vmap_area if it is found. If there is no such area
3080  *   the first highest(reverse order) vmap_area is returned
3081  *   i.e. va->va_start < addr && va->va_end < addr or NULL
3082  *   if there are no any areas before @addr.
3083  */
3084 static struct vmap_area *
3085 pvm_find_va_enclose_addr(unsigned long addr)
3086 {
3087         struct vmap_area *va, *tmp;
3088         struct rb_node *n;
3089
3090         n = free_vmap_area_root.rb_node;
3091         va = NULL;
3092
3093         while (n) {
3094                 tmp = rb_entry(n, struct vmap_area, rb_node);
3095                 if (tmp->va_start <= addr) {
3096                         va = tmp;
3097                         if (tmp->va_end >= addr)
3098                                 break;
3099
3100                         n = n->rb_right;
3101                 } else {
3102                         n = n->rb_left;
3103                 }
3104         }
3105
3106         return va;
3107 }
3108
3109 /**
3110  * pvm_determine_end_from_reverse - find the highest aligned address
3111  * of free block below VMALLOC_END
3112  * @va:
3113  *   in - the VA we start the search(reverse order);
3114  *   out - the VA with the highest aligned end address.
3115  *
3116  * Returns: determined end address within vmap_area
3117  */
3118 static unsigned long
3119 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
3120 {
3121         unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3122         unsigned long addr;
3123
3124         if (likely(*va)) {
3125                 list_for_each_entry_from_reverse((*va),
3126                                 &free_vmap_area_list, list) {
3127                         addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3128                         if ((*va)->va_start < addr)
3129                                 return addr;
3130                 }
3131         }
3132
3133         return 0;
3134 }
3135
3136 /**
3137  * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
3138  * @offsets: array containing offset of each area
3139  * @sizes: array containing size of each area
3140  * @nr_vms: the number of areas to allocate
3141  * @align: alignment, all entries in @offsets and @sizes must be aligned to this
3142  *
3143  * Returns: kmalloc'd vm_struct pointer array pointing to allocated
3144  *          vm_structs on success, %NULL on failure
3145  *
3146  * Percpu allocator wants to use congruent vm areas so that it can
3147  * maintain the offsets among percpu areas.  This function allocates
3148  * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
3149  * be scattered pretty far, distance between two areas easily going up
3150  * to gigabytes.  To avoid interacting with regular vmallocs, these
3151  * areas are allocated from top.
3152  *
3153  * Despite its complicated look, this allocator is rather simple. It
3154  * does everything top-down and scans free blocks from the end looking
3155  * for matching base. While scanning, if any of the areas do not fit the
3156  * base address is pulled down to fit the area. Scanning is repeated till
3157  * all the areas fit and then all necessary data structures are inserted
3158  * and the result is returned.
3159  */
3160 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
3161                                      const size_t *sizes, int nr_vms,
3162                                      size_t align)
3163 {
3164         const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
3165         const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3166         struct vmap_area **vas, *va;
3167         struct vm_struct **vms;
3168         int area, area2, last_area, term_area;
3169         unsigned long base, start, size, end, last_end, orig_start, orig_end;
3170         bool purged = false;
3171         enum fit_type type;
3172
3173         /* verify parameters and allocate data structures */
3174         BUG_ON(offset_in_page(align) || !is_power_of_2(align));
3175         for (last_area = 0, area = 0; area < nr_vms; area++) {
3176                 start = offsets[area];
3177                 end = start + sizes[area];
3178
3179                 /* is everything aligned properly? */
3180                 BUG_ON(!IS_ALIGNED(offsets[area], align));
3181                 BUG_ON(!IS_ALIGNED(sizes[area], align));
3182
3183                 /* detect the area with the highest address */
3184                 if (start > offsets[last_area])
3185                         last_area = area;
3186
3187                 for (area2 = area + 1; area2 < nr_vms; area2++) {
3188                         unsigned long start2 = offsets[area2];
3189                         unsigned long end2 = start2 + sizes[area2];
3190
3191                         BUG_ON(start2 < end && start < end2);
3192                 }
3193         }
3194         last_end = offsets[last_area] + sizes[last_area];
3195
3196         if (vmalloc_end - vmalloc_start < last_end) {
3197                 WARN_ON(true);
3198                 return NULL;
3199         }
3200
3201         vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
3202         vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
3203         if (!vas || !vms)
3204                 goto err_free2;
3205
3206         for (area = 0; area < nr_vms; area++) {
3207                 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
3208                 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
3209                 if (!vas[area] || !vms[area])
3210                         goto err_free;
3211         }
3212 retry:
3213         spin_lock(&free_vmap_area_lock);
3214
3215         /* start scanning - we scan from the top, begin with the last area */
3216         area = term_area = last_area;
3217         start = offsets[area];
3218         end = start + sizes[area];
3219
3220         va = pvm_find_va_enclose_addr(vmalloc_end);
3221         base = pvm_determine_end_from_reverse(&va, align) - end;
3222
3223         while (true) {
3224                 /*
3225                  * base might have underflowed, add last_end before
3226                  * comparing.
3227                  */
3228                 if (base + last_end < vmalloc_start + last_end)
3229                         goto overflow;
3230
3231                 /*
3232                  * Fitting base has not been found.
3233                  */
3234                 if (va == NULL)
3235                         goto overflow;
3236
3237                 /*
3238                  * If required width exceeds current VA block, move
3239                  * base downwards and then recheck.
3240                  */
3241                 if (base + end > va->va_end) {
3242                         base = pvm_determine_end_from_reverse(&va, align) - end;
3243                         term_area = area;
3244                         continue;
3245                 }
3246
3247                 /*
3248                  * If this VA does not fit, move base downwards and recheck.
3249                  */
3250                 if (base + start < va->va_start) {
3251                         va = node_to_va(rb_prev(&va->rb_node));
3252                         base = pvm_determine_end_from_reverse(&va, align) - end;
3253                         term_area = area;
3254                         continue;
3255                 }
3256
3257                 /*
3258                  * This area fits, move on to the previous one.  If
3259                  * the previous one is the terminal one, we're done.
3260                  */
3261                 area = (area + nr_vms - 1) % nr_vms;
3262                 if (area == term_area)
3263                         break;
3264
3265                 start = offsets[area];
3266                 end = start + sizes[area];
3267                 va = pvm_find_va_enclose_addr(base + end);
3268         }
3269
3270         /* we've found a fitting base, insert all va's */
3271         for (area = 0; area < nr_vms; area++) {
3272                 int ret;
3273
3274                 start = base + offsets[area];
3275                 size = sizes[area];
3276
3277                 va = pvm_find_va_enclose_addr(start);
3278                 if (WARN_ON_ONCE(va == NULL))
3279                         /* It is a BUG(), but trigger recovery instead. */
3280                         goto recovery;
3281
3282                 type = classify_va_fit_type(va, start, size);
3283                 if (WARN_ON_ONCE(type == NOTHING_FIT))
3284                         /* It is a BUG(), but trigger recovery instead. */
3285                         goto recovery;
3286
3287                 ret = adjust_va_to_fit_type(va, start, size, type);
3288                 if (unlikely(ret))
3289                         goto recovery;
3290
3291                 /* Allocated area. */
3292                 va = vas[area];
3293                 va->va_start = start;
3294                 va->va_end = start + size;
3295         }
3296
3297         spin_unlock(&free_vmap_area_lock);
3298
3299         /* populate the kasan shadow space */
3300         for (area = 0; area < nr_vms; area++) {
3301                 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
3302                         goto err_free_shadow;
3303
3304                 kasan_unpoison_vmalloc((void *)vas[area]->va_start,
3305                                        sizes[area]);
3306         }
3307
3308         /* insert all vm's */
3309         spin_lock(&vmap_area_lock);
3310         for (area = 0; area < nr_vms; area++) {
3311                 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
3312
3313                 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
3314                                  pcpu_get_vm_areas);
3315         }
3316         spin_unlock(&vmap_area_lock);
3317
3318         kfree(vas);
3319         return vms;
3320
3321 recovery:
3322         /*
3323          * Remove previously allocated areas. There is no
3324          * need in removing these areas from the busy tree,
3325          * because they are inserted only on the final step
3326          * and when pcpu_get_vm_areas() is success.
3327          */
3328         while (area--) {
3329                 orig_start = vas[area]->va_start;
3330                 orig_end = vas[area]->va_end;
3331                 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3332                                             &free_vmap_area_list);
3333                 kasan_release_vmalloc(orig_start, orig_end,
3334                                       va->va_start, va->va_end);
3335                 vas[area] = NULL;
3336         }
3337
3338 overflow:
3339         spin_unlock(&free_vmap_area_lock);
3340         if (!purged) {
3341                 purge_vmap_area_lazy();
3342                 purged = true;
3343
3344                 /* Before "retry", check if we recover. */
3345                 for (area = 0; area < nr_vms; area++) {
3346                         if (vas[area])
3347                                 continue;
3348
3349                         vas[area] = kmem_cache_zalloc(
3350                                 vmap_area_cachep, GFP_KERNEL);
3351                         if (!vas[area])
3352                                 goto err_free;
3353                 }
3354
3355                 goto retry;
3356         }
3357
3358 err_free:
3359         for (area = 0; area < nr_vms; area++) {
3360                 if (vas[area])
3361                         kmem_cache_free(vmap_area_cachep, vas[area]);
3362
3363                 kfree(vms[area]);
3364         }
3365 err_free2:
3366         kfree(vas);
3367         kfree(vms);
3368         return NULL;
3369
3370 err_free_shadow:
3371         spin_lock(&free_vmap_area_lock);
3372         /*
3373          * We release all the vmalloc shadows, even the ones for regions that
3374          * hadn't been successfully added. This relies on kasan_release_vmalloc
3375          * being able to tolerate this case.
3376          */
3377         for (area = 0; area < nr_vms; area++) {
3378                 orig_start = vas[area]->va_start;
3379                 orig_end = vas[area]->va_end;
3380                 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3381                                             &free_vmap_area_list);
3382                 kasan_release_vmalloc(orig_start, orig_end,
3383                                       va->va_start, va->va_end);
3384                 vas[area] = NULL;
3385                 kfree(vms[area]);
3386         }
3387         spin_unlock(&free_vmap_area_lock);
3388         kfree(vas);
3389         kfree(vms);
3390         return NULL;
3391 }
3392
3393 /**
3394  * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
3395  * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
3396  * @nr_vms: the number of allocated areas
3397  *
3398  * Free vm_structs and the array allocated by pcpu_get_vm_areas().
3399  */
3400 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
3401 {
3402         int i;
3403
3404         for (i = 0; i < nr_vms; i++)
3405                 free_vm_area(vms[i]);
3406         kfree(vms);
3407 }
3408 #endif  /* CONFIG_SMP */
3409
3410 #ifdef CONFIG_PROC_FS
3411 static void *s_start(struct seq_file *m, loff_t *pos)
3412         __acquires(&vmap_purge_lock)
3413         __acquires(&vmap_area_lock)
3414 {
3415         mutex_lock(&vmap_purge_lock);
3416         spin_lock(&vmap_area_lock);
3417
3418         return seq_list_start(&vmap_area_list, *pos);
3419 }
3420
3421 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3422 {
3423         return seq_list_next(p, &vmap_area_list, pos);
3424 }
3425
3426 static void s_stop(struct seq_file *m, void *p)
3427         __releases(&vmap_purge_lock)
3428         __releases(&vmap_area_lock)
3429 {
3430         mutex_unlock(&vmap_purge_lock);
3431         spin_unlock(&vmap_area_lock);
3432 }
3433
3434 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
3435 {
3436         if (IS_ENABLED(CONFIG_NUMA)) {
3437                 unsigned int nr, *counters = m->private;
3438
3439                 if (!counters)
3440                         return;
3441
3442                 if (v->flags & VM_UNINITIALIZED)
3443                         return;
3444                 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
3445                 smp_rmb();
3446
3447                 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
3448
3449                 for (nr = 0; nr < v->nr_pages; nr++)
3450                         counters[page_to_nid(v->pages[nr])]++;
3451
3452                 for_each_node_state(nr, N_HIGH_MEMORY)
3453                         if (counters[nr])
3454                                 seq_printf(m, " N%u=%u", nr, counters[nr]);
3455         }
3456 }
3457
3458 static void show_purge_info(struct seq_file *m)
3459 {
3460         struct llist_node *head;
3461         struct vmap_area *va;
3462
3463         head = READ_ONCE(vmap_purge_list.first);
3464         if (head == NULL)
3465                 return;
3466
3467         llist_for_each_entry(va, head, purge_list) {
3468                 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
3469                         (void *)va->va_start, (void *)va->va_end,
3470                         va->va_end - va->va_start);
3471         }
3472 }
3473
3474 static int s_show(struct seq_file *m, void *p)
3475 {
3476         struct vmap_area *va;
3477         struct vm_struct *v;
3478
3479         va = list_entry(p, struct vmap_area, list);
3480
3481         /*
3482          * s_show can encounter race with remove_vm_area, !vm on behalf
3483          * of vmap area is being tear down or vm_map_ram allocation.
3484          */
3485         if (!va->vm) {
3486                 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
3487                         (void *)va->va_start, (void *)va->va_end,
3488                         va->va_end - va->va_start);
3489
3490                 return 0;
3491         }
3492
3493         v = va->vm;
3494
3495         seq_printf(m, "0x%pK-0x%pK %7ld",
3496                 v->addr, v->addr + v->size, v->size);
3497
3498         if (v->caller)
3499                 seq_printf(m, " %pS", v->caller);
3500
3501         if (v->nr_pages)
3502                 seq_printf(m, " pages=%d", v->nr_pages);
3503
3504         if (v->phys_addr)
3505                 seq_printf(m, " phys=%pa", &v->phys_addr);
3506
3507         if (v->flags & VM_IOREMAP)
3508                 seq_puts(m, " ioremap");
3509
3510         if (v->flags & VM_ALLOC)
3511                 seq_puts(m, " vmalloc");
3512
3513         if (v->flags & VM_MAP)
3514                 seq_puts(m, " vmap");
3515
3516         if (v->flags & VM_USERMAP)
3517                 seq_puts(m, " user");
3518
3519         if (v->flags & VM_DMA_COHERENT)
3520                 seq_puts(m, " dma-coherent");
3521
3522         if (is_vmalloc_addr(v->pages))
3523                 seq_puts(m, " vpages");
3524
3525         show_numa_info(m, v);
3526         seq_putc(m, '\n');
3527
3528         /*
3529          * As a final step, dump "unpurged" areas. Note,
3530          * that entire "/proc/vmallocinfo" output will not
3531          * be address sorted, because the purge list is not
3532          * sorted.
3533          */
3534         if (list_is_last(&va->list, &vmap_area_list))
3535                 show_purge_info(m);
3536
3537         return 0;
3538 }
3539
3540 static const struct seq_operations vmalloc_op = {
3541         .start = s_start,
3542         .next = s_next,
3543         .stop = s_stop,
3544         .show = s_show,
3545 };
3546
3547 static int __init proc_vmalloc_init(void)
3548 {
3549         if (IS_ENABLED(CONFIG_NUMA))
3550                 proc_create_seq_private("vmallocinfo", 0400, NULL,
3551                                 &vmalloc_op,
3552                                 nr_node_ids * sizeof(unsigned int), NULL);
3553         else
3554                 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
3555         return 0;
3556 }
3557 module_init(proc_vmalloc_init);
3558
3559 #endif