/*
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, 0);
}
static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}
static void put_compound_page(struct page *page)
{
	if (unlikely(PageTail(page))) {
		/* __split_huge_page_refcount can run under us */
		struct page *page_head = page->first_page;
		smp_rmb();
		/*
		 * If PageTail is still set after smp_rmb() we can be sure
		 * that the page->first_page we read wasn't a dangling pointer.
		 * See __split_huge_page_refcount() smp_wmb().
		 */
		if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
			unsigned long flags;
			/*
			 * Verify that our page_head wasn't converted
			 * to a regular page before we got a
			 * reference and the compound page was split
			 * by an irq.
			 */
			if (unlikely(!PageHead(page_head))) {
				/* PageHead is cleared after PageTail */
				smp_rmb();
				VM_BUG_ON(PageTail(page));
				goto out_put_head;
			}
			/*
			 * Only run compound_lock on a valid PageHead,
			 * after having it pinned with
			 * get_page_unless_zero() above.
			 */
			smp_mb();
			/* page_head wasn't a dangling pointer */
			flags = compound_lock_irqsave(page_head);
			if (unlikely(!PageTail(page))) {
				/* __split_huge_page_refcount run before us */
				compound_unlock_irqrestore(page_head, flags);
				VM_BUG_ON(PageHead(page_head));
			out_put_head:
				if (put_page_testzero(page_head))
					__put_single_page(page_head);
			out_put_single:
				if (put_page_testzero(page))
					__put_single_page(page);
				return;
			}
			VM_BUG_ON(page_head != page->first_page);
			/*
			 * We can release the refcount taken by
			 * get_page_unless_zero now that
			 * split_huge_page_refcount is blocked on the
			 * compound_lock.
			 */
			if (put_page_testzero(page_head))
				VM_BUG_ON(1);
			/* __split_huge_page_refcount will wait now */
			VM_BUG_ON(atomic_read(&page->_count) <= 0);
			atomic_dec(&page->_count);
			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
			compound_unlock_irqrestore(page_head, flags);
			if (put_page_testzero(page_head))
				__put_compound_page(page_head);
		} else {
			/* page_head is a dangling pointer */
			VM_BUG_ON(PageTail(page));
			goto out_put_single;
		}
	} else if (put_page_testzero(page)) {
		if (PageHead(page))
			__put_compound_page(page);
		else
			__put_single_page(page);
	}
}
void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);
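
/*
 * Illustrative sketch, not compiled: put_page() is the release half of the
 * usual get_page()/put_page() pairing.  The helper below is hypothetical and
 * only shows the pattern a typical caller follows.
 */
#if 0
static void example_use_then_drop(struct page *page)
{
	get_page(page);		/* pin: the page cannot be freed under us */
	/* ... touch page contents, page->mapping, etc. ... */
	put_page(page);		/* drop: may free the page right here */
}
#endif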
/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
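
/*
 * Illustrative sketch, not compiled: callers such as read_cache_pages()
 * thread pages onto a private list via page->lru and hand the whole list
 * back in one call.  The function below is hypothetical.
 */
#if 0
static void example_release_list(struct page *a, struct page *b)
{
	LIST_HEAD(pages);

	list_add(&a->lru, &pages);
	list_add(&b->lru, &pages);
	put_pages_list(&pages);		/* drops one reference per page */
}
#endif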
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int i;
	int pgmoved = 0;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock(&zone->lru_lock);
			zone = pagezone;
			spin_lock(&zone->lru_lock);
		}
		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
			int lru = page_lru_base_type(page);
			list_move_tail(&page->lru, &zone->lru[lru].list);
			pgmoved++;
		}
	}
	if (zone)
		spin_unlock(&zone->lru_lock);
	__count_vm_events(PGROTATED, pgmoved);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}
/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = &__get_cpu_var(lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}
static void update_page_reclaim_stat(struct zone *zone, struct page *page,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
	struct zone_reclaim_stat *memcg_reclaim_stat;

	memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;

	if (!memcg_reclaim_stat)
		return;

	memcg_reclaim_stat->recent_scanned[file]++;
	if (rotated)
		memcg_reclaim_stat->recent_rotated[file]++;
}
/*
 * FIXME: speed this up?
 */
void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);
		del_page_from_lru_list(zone, page, lru);

		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(zone, page, lru);
		__count_vm_event(PGACTIVATE);

		update_page_reclaim_stat(zone, page, file, 1);
	}
	spin_unlock_irq(&zone->lru_lock);
}
/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);
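
/*
 * Illustrative sketch, not compiled: two calls against an inactive page walk
 * it through the transitions documented above - the first marks it
 * referenced, the second promotes it to the active list.  Hypothetical
 * helper, for illustration only.
 */
#if 0
static void example_two_touches(struct page *page)
{
	mark_page_accessed(page);	/* inactive,unreferenced -> inactive,referenced */
	mark_page_accessed(page);	/* inactive,referenced   -> active,unreferenced */
}
#endif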
void __lru_cache_add(struct page *page, enum lru_list lru)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		____pagevec_lru_add(pvec, lru);
	put_cpu_var(lru_add_pvecs);
}
EXPORT_SYMBOL(__lru_cache_add);
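
/*
 * Illustrative sketch, not compiled: the lru_cache_add_*() helpers in
 * <linux/swap.h> are thin wrappers that pick the LRU list for the caller,
 * along these (simplified) lines.
 */
#if 0
static inline void example_lru_cache_add_file(struct page *page)
{
	__lru_cache_add(page, LRU_INACTIVE_FILE);
}
#endif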
/**
 * lru_cache_add_lru - add a page to a page list
 * @page: the page to be added to the LRU.
 * @lru: the LRU list to which the page is added.
 */
void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
	if (PageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		ClearPageActive(page);
	} else if (PageUnevictable(page)) {
		VM_BUG_ON(PageActive(page));
		ClearPageUnevictable(page);
	}

	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
	__lru_cache_add(page, lru);
}
/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}
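
/*
 * Illustrative sketch, not compiled: per the comment above, a caller keeps
 * the page "invisible" (here: locked) while it is off the LRU so nobody can
 * race to make it evictable halfway through.  Hypothetical helper.
 */
#if 0
static void example_park_unevictable(struct page *page)
{
	lock_page(page);
	if (!PageLRU(page))
		add_page_to_unevictable_list(page);
	unlock_page(page);
}
#endif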
/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
	struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
	struct pagevec *pvec;
	int lru;

	for_each_lru(lru) {
		pvec = &pvecs[lru - LRU_BASE];
		if (pagevec_count(pvec))
			____pagevec_lru_add(pvec, lru);
	}

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}
void lru_add_drain(void)
{
	drain_cpu_pagevecs(get_cpu());
	put_cpu();
}
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}
/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
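
/*
 * Illustrative sketch, not compiled: a caller that must see every recently
 * added page on the zone LRU lists (e.g. before scanning them) drains the
 * per-cpu pagevecs first and checks for failure.  Hypothetical helper.
 */
#if 0
static int example_flush_then_scan(void)
{
	int err = lru_add_drain_all();	/* returns 0 for success */

	if (err)
		return err;
	/* ... pages added earlier are now visible on the LRU lists ... */
	return 0;
}
#endif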
/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;
	unsigned long uninitialized_var(flags);

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	pagevec_free(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);
/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
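
/*
 * Illustrative sketch, not compiled: the usual batching pattern fills a
 * pagevec and lets pagevec_release() (which calls __pagevec_release() when
 * the pagevec is non-empty) drop the references in bulk.  Hypothetical
 * caller, shown for illustration; each page is assumed to carry a reference
 * the caller wants to give up.
 */
#if 0
static void example_put_batch(struct page **pages, int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);
	for (i = 0; i < nr; i++)
		if (!pagevec_add(&pvec, pages[i]))	/* returns 0 when full */
			pagevec_release(&pvec);
	pagevec_release(&pvec);				/* release the remainder */
}
#endif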
/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
	int i;
	struct zone *zone = NULL;

	VM_BUG_ON(is_unevictable_lru(lru));

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);
		int file;
		int active;

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(PageUnevictable(page));
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		active = is_active_lru(lru);
		file = is_file_lru(lru);
		if (active)
			SetPageActive(page);
		update_page_reclaim_stat(zone, page, file, active);
		add_page_to_lru_list(zone, page, lru);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(____pagevec_lru_add);
/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (page_has_private(page) && trylock_page(page)) {
			if (page_has_private(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);
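
/*
 * Illustrative sketch, not compiled: the common gang-lookup loop (as used by
 * truncate and writeback paths) walks a mapping in PAGEVEC_SIZE batches and
 * drops the references pagevec_lookup() took.  Hypothetical function.
 */
#if 0
static void example_walk_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
		int i;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = page->index + 1;	/* advance past this page */
			/* ... examine or lock the page here ... */
		}
		pagevec_release(&pvec);
	}
}
#endif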
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);
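
/*
 * Illustrative sketch, not compiled: pagevec_lookup_tag() is the same idea
 * restricted to pages carrying a radix-tree tag, e.g. dirty pages; @index is
 * advanced by the lookup itself, so the loop simply repeats.  Hypothetical
 * function.
 */
#if 0
static void example_walk_dirty_pages(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_tag(&pvec, mapping, &index,
				  PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
		/* ... each pvec.pages[i] is tagged dirty ... */
		pagevec_release(&pvec);
	}
}
#endif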
/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
	bdi_init(swapper_space.backing_dev_info);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system means that we
	 * _really_ don't want to cluster much more
	 */
}
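
/*
 * Illustrative arithmetic, assumptions noted: with 4KiB pages PAGE_SHIFT is
 * 12, so "totalram_pages >> (20 - PAGE_SHIFT)" converts a page count into
 * MiB.  The resulting page_cluster sizes the swap readahead window at
 * 1 << page_cluster pages: 2 -> 4 pages (16KiB) on machines under 16MiB,
 * 3 -> 8 pages (32KiB) otherwise.
 */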