X-Git-Url: https://git.kernel.dk/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=a762be57e46e14efa571b967eb75696c35ebd034;hb=15dbc136dff62ebefb03353cfb7d308d49b275f3;hp=c46b75d14b6f8732b513065f1bd79d124d703579;hpb=588ab3f9afdfa1a6b1e5761c858b2c4ab6098285;p=linux-2.6-block.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d14b6f..a762be57e46e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -307,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { + unsigned long max_initialise; + /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > max_initialise) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -498,6 +506,7 @@ void prep_compound_page(struct page *page, unsigned int order) unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -542,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value\n"); return 0; } _debug_guardpage_minorder = res; - printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); @@ -764,7 +773,7 @@ static inline int free_pages_check(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; @@ -1460,7 +1469,7 @@ static inline int check_new_page(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; @@ -2348,19 +2357,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, list_del(&page->lru); pcp->count--; } else { - if (unlikely(gfp_flags & __GFP_NOFAIL)) { - /* - * __GFP_NOFAIL is not to be used in new code. - * - * All __GFP_NOFAIL callers should be fixed so that they - * properly detect and handle allocation failures. - * - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with - * __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); - } + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); page = NULL; @@ -2857,8 +2858,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but * keep looping as per tradition. + * + * But do not keep looping if oom_killer_disable() + * was already called, for the system is trying to + * enter a quiescent state during suspend. */ - *did_some_progress = 1; + *did_some_progress = !oom_killer_disabled; goto out; } if (pm_suspended_storage()) @@ -3117,14 +3122,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3481,7 +3478,7 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(size - 1, &page->_count); + page_ref_add(page, size - 1); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); @@ -3493,7 +3490,7 @@ refill: if (unlikely(offset < 0)) { page = virt_to_page(nc->va); - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -3501,7 +3498,7 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); + set_page_count(page, size); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = size; @@ -3712,6 +3709,49 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + */ + available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; @@ -4044,9 +4084,7 @@ static int __parse_numa_zonelist_order(char *s) } else if (*s == 'z' || *s == 'Z') { user_zonelist_order = ZONELIST_ORDER_ZONE; } else { - printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4510,12 +4548,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -5404,6 +5441,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { @@ -5428,8 +5468,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } @@ -5637,8 +5676,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { - printk(KERN_WARNING - "Could not find start_pfn for node %d\n", nid); + pr_warn("Could not find start_pfn for node %d\n", nid); return 0; } @@ -6110,22 +6148,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? str : ""); } /** @@ -6300,8 +6337,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. + */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6442,6 +6488,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -6633,11 +6694,8 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, - (1UL << log2qty), - ilog2(size) - PAGE_SHIFT, - size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) *_hash_shift = log2qty; @@ -6788,7 +6846,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * This check already skips compound tails of THP * because their page->_count is zero at all time. */ - if (!atomic_read(&page->_count)) { + if (!page_ref_count(page)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; @@ -7138,8 +7196,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); + pr_info("remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); #endif list_del(&page->lru); rmv_page_order(page); @@ -7152,7 +7210,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7228,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif