This reverts
c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54
The reverted commit caused ABBA deadlocks when file migration raced with
file eviction for specific hugetlbfs files. This was discovered with a
modified version of the LTP move_pages12 test.
The purpose of the reverted patch was to close a long existing race
between hugetlbfs file truncation and page faults. After more analysis
of the patch and impacted code, it was determined that i_mmap_rwsem can
not be used for all required synchronization. Therefore, revert this
patch while working an another approach to the underlying issue.
Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts.
+ * maps and global counts. Page faults can not race with truncation
+ * in this routine. hugetlb_no_page() prevents page faults in the
+ * truncated range. It checks i_size before allocation, and again after
+ * with the page table lock for the page held. The same lock must be
+ * acquired to unmap a page.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
* deleted. The region/reserv map for ranges without associated
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
* deleted. The region/reserv map for ranges without associated
- * pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ * pages are not modified. Page faults can race with hole punch.
+ * This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
*/
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
*/
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, index, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
- * A mapped page is impossible as callers should unmap
- * all references before calling. And, i_mmap_rwsem
- * prevents the creation of additional mappings.
+ * If page is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ *
+ * This race can only happen in the hole punch case.
+ * Getting here in a truncate operation is a bug.
- VM_BUG_ON(page_mapped(page));
+ if (unlikely(page_mapped(page))) {
+ BUG_ON(truncate_op);
+
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ index * pages_per_huge_page(h),
+ (index + 1) * pages_per_huge_page(h));
+ i_mmap_unlock_write(mapping);
+ }
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
cond_resched();
}
huge_pagevec_release(&pvec);
cond_resched();
static void hugetlbfs_evict_inode(struct inode *inode)
{
static void hugetlbfs_evict_inode(struct inode *inode)
{
- struct address_space *mapping = inode->i_mapping;
struct resv_map *resv_map;
struct resv_map *resv_map;
- /*
- * The vfs layer guarantees that there are no other users of this
- * inode. Therefore, it would be safe to call remove_inode_hugepages
- * without holding i_mmap_rwsem. We acquire and hold here to be
- * consistent with other callers. Since there will be no contention
- * on the semaphore, overhead is negligible.
- */
- i_mmap_lock_write(mapping);
remove_inode_hugepages(inode, 0, LLONG_MAX);
remove_inode_hugepages(inode, 0, LLONG_MAX);
- i_mmap_unlock_write(mapping);
-
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
i_mmap_unlock_write(mapping);
i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT);
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT);
- remove_inode_hugepages(inode, hole_start, hole_end);
i_mmap_unlock_write(mapping);
i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /*
- * fault mutex taken here, protects against fault path
- * and hole punch. inode_lock previously taken protects
- * against truncation.
- */
+ /* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
- * We can not race with truncation due to holding i_mmap_rwsem.
- * Check once here for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
retry:
page = find_lock_page(mapping, idx);
if (!page) {
retry:
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
/*
* Check for page in userfault range
*/
/*
* Check for page in userfault range
*/
}
ptl = huge_pte_lock(h, mm, ptep);
}
ptl = huge_pte_lock(h, mm, ptep);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto backout;
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
/*
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
/*
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with file truncation.
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
*
* ptep could have already be assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
*
* ptep could have already be assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless