* appear as a "reserved" entry instead of simply dangling with incorrect
* counts.
*/
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
- if (restore_reserve && rsv_adjust) {
+ if (rsv_adjust) {
struct hstate *h = hstate_inode(inode);
hugetlb_acct_memory(h, 1);
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
/*
* Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
+ * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
+ * number of free hugepages would be reduced below the number of reserved
+ * hugepages.
*/
-static void dissolve_free_huge_page(struct page *page)
+static int dissolve_free_huge_page(struct page *page)
{
+ int rc = 0;
+
spin_lock(&hugetlb_lock);
if (PageHuge(page) && !page_count(page)) {
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- list_del(&page->lru);
+ struct page *head = compound_head(page);
+ struct hstate *h = page_hstate(head);
+ int nid = page_to_nid(head);
+ if (h->free_huge_pages - h->resv_huge_pages == 0) {
+ rc = -EBUSY;
+ goto out;
+ }
+ list_del(&head->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
- update_and_free_page(h, page);
+ update_and_free_page(h, head);
}
+out:
spin_unlock(&hugetlb_lock);
+ return rc;
}
/*
* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
* make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
*/
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
+ struct page *page;
+ int rc = 0;
if (!hugepages_supported())
- return;
+ return rc;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+ page = pfn_to_page(pfn);
+ if (PageHuge(page) && !page_count(page)) {
+ rc = dissolve_free_huge_page(page);
+ if (rc)
+ break;
+ }
+ }
- VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
- dissolve_free_huge_page(pfn_to_page(pfn));
+ return rc;
}
/*
}
/*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ * in unused_resv_pages. This corresponds to the prior adjustments made
+ * to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ * the reservation. As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held. However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing. Do this by ensuring resv_huge_page always is greater than the
+ * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- /* Uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
-
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
- return;
+ goto out;
+ /*
+ * Part (or even all) of the reservation could have been backed
+ * by pre-allocated pages. Only free surplus pages.
+ */
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
* when the nodes with surplus pages have no free pages.
* free_pool_huge_page() will balance the the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
+ *
+ * Note that we decrement resv_huge_pages as we free the pages. If
+ * we drop the lock, resv_huge_pages will still be sufficiently large
+ * to cover subsequent pages we may free.
*/
while (nr_pages--) {
+ h->resv_huge_pages--;
+ unused_resv_pages--;
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
- break;
+ goto out;
cond_resched_lock(&hugetlb_lock);
}
+
+out:
+ /* Fully uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
}
* is not the case is if a reserve map was changed between calls. It
* is the responsibility of the caller to notice the difference and
* take appropriate action.
+ *
+ * vma_add_reservation is used in error paths where a reservation must
+ * be restored when a newly allocated huge page must be freed. It is
+ * to be called after calling vma_needs_reservation to determine if a
+ * reservation exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
+ VMA_ADD_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
region_abort(resv, idx, idx + 1);
ret = 0;
break;
+ case VMA_ADD_RESV:
+ if (vma->vm_flags & VM_MAYSHARE)
+ ret = region_add(resv, idx, idx + 1);
+ else {
+ region_abort(resv, idx, idx + 1);
+ ret = region_del(resv, idx, idx + 1);
+ }
+ break;
default:
BUG();
}
(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}
+static long vma_add_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
+}
+
+/*
+ * This routine is called to restore a reservation on error paths. In the
+ * specific error paths, a huge page was allocated (via alloc_huge_page)
+ * and is about to be freed. If a reservation for the page existed,
+ * alloc_huge_page would have consumed the reservation and set PagePrivate
+ * in the newly allocated page. When the page is freed via free_huge_page,
+ * the global reservation count will be incremented if PagePrivate is set.
+ * However, free_huge_page can not adjust the reserve map. Adjust the
+ * reserve map here to be consistent with global reserve count adjustments
+ * to be made by free_huge_page.
+ */
+static void restore_reserve_on_error(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address,
+ struct page *page)
+{
+ if (unlikely(PagePrivate(page))) {
+ long rc = vma_needs_reservation(h, vma, address);
+
+ if (unlikely(rc < 0)) {
+ /*
+ * Rare out of memory condition in reserve map
+ * manipulation. Clear PagePrivate so that
+ * global reserve count will not be incremented
+ * by free_huge_page. This will make it appear
+ * as though the reservation for this page was
+ * consumed. This may prevent the task from
+ * faulting in the page at a later time. This
+ * is better than inconsistent global huge page
+ * accounting of reserve counts.
+ */
+ ClearPagePrivate(page);
+ } else if (rc) {
+ rc = vma_add_reservation(h, vma, address);
+ if (unlikely(rc < 0))
+ /*
+ * See above comment about rare out of
+ * memory condition.
+ */
+ ClearPagePrivate(page);
+ } else
+ vma_end_reservation(h, vma, address);
+ }
+}
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ /*
+ * This is a hugetlb vma, all the pte entries should point
+ * to huge page.
+ */
+ tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- tlb_remove_tlb_entry(tlb, ptep, address);
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, pte_t pte,
- struct page *pagecache_page, spinlock_t *ptl)
+ unsigned long address, pte_t *ptep,
+ struct page *pagecache_page, spinlock_t *ptl)
{
+ pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
+ restore_reserve_on_error(h, vma, address, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
vma_end_reservation(h, vma, address);
}
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, mm, ptep);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
+ restore_reserve_on_error(h, vma, address, page);
put_page(page);
goto out;
}
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep, entry,
- pagecache_page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep,
+ pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
if (!spte)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));