* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
* page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The disadvantage is that pages are refcounted
- * (which can be slower and simply not an option for some PFNMAP users). The
- * advantage is that we don't have to follow the strict linearity rule of
- * PFNMAP mappings in order to support COWable mappings.
+ * normal pages by the VM. The only exception are zeropages, which are
+ * *never* refcounted.
+ *
+ * The disadvantage is that pages are refcounted (which can be slower and
+ * simply not an option for some PFNMAP users). The advantage is that we
+ * don't have to follow the strict linearity rule of PFNMAP mappings in
+ * order to support COWable mappings.
*
*/
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
+ if (is_zero_pfn(pfn))
+ return NULL;
goto out;
} else {
unsigned long off;
* eg. VDSO mappings can cause them to exist.
*/
out:
+ VM_WARN_ON_ONCE(is_zero_pfn(pfn));
return pfn_to_page(pfn);
}
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
-static int validate_page_before_insert(struct page *page)
+static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
+{
+ VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);
+ /*
+ * Whoever wants to forbid the zeropage after some zeropages
+ * might already have been mapped has to scan the page tables and
+ * bail out on any zeropages. Zeropages in COW mappings can
+ * be unshared using FAULT_FLAG_UNSHARE faults.
+ */
+ if (mm_forbids_zeropage(vma->vm_mm))
+ return false;
+ /* zeropages in COW mappings are common and unproblematic. */
+ if (is_cow_mapping(vma->vm_flags))
+ return true;
+ /* Mappings that do not allow for writable PTEs are unproblematic. */
+ if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
+ return true;
+ /*
+ * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
+ * find the shared zeropage and longterm-pin it, which would
+ * be problematic as soon as the zeropage gets replaced by a different
+ * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
+ * now differ to what GUP looked up. FSDAX is incompatible to
+ * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see
+ * check_vma_flags).
+ */
+ return vma->vm_ops && vma->vm_ops->pfn_mkwrite &&
+ (vma_is_fsdax(vma) || vma->vm_flags & VM_IO);
+}
+
+static int validate_page_before_insert(struct vm_area_struct *vma,
+ struct page *page)
{
struct folio *folio = page_folio(page);
if (!folio_ref_count(folio))
return -EINVAL;
+ if (unlikely(is_zero_folio(folio))) {
+ if (!vm_mixed_zeropage_allowed(vma))
+ return -EINVAL;
+ return 0;
+ }
if (folio_test_anon(folio) || folio_test_slab(folio) ||
page_has_type(page))
return -EINVAL;
unsigned long addr, struct page *page, pgprot_t prot)
{
struct folio *folio = page_folio(page);
+ pte_t pteval;
if (!pte_none(ptep_get(pte)))
return -EBUSY;
/* Ok, finally just insert the thing.. */
- folio_get(folio);
- inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
- folio_add_file_rmap_pte(folio, page, vma);
- set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
+ pteval = mk_pte(page, prot);
+ if (unlikely(is_zero_folio(folio))) {
+ pteval = pte_mkspecial(pteval);
+ } else {
+ folio_get(folio);
+ inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
+ folio_add_file_rmap_pte(folio, page, vma);
+ }
+ set_pte_at(vma->vm_mm, addr, pte, pteval);
return 0;
}
-/*
- * This is the old fallback for page remapping.
- *
- * For historical reasons, it only allows reserved pages. Only
- * old drivers should use this, and they needed to mark their
- * pages reserved for the old functions anyway.
- */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
pte_t *pte;
spinlock_t *ptl;
- retval = validate_page_before_insert(page);
+ retval = validate_page_before_insert(vma, page);
if (retval)
goto out;
retval = -ENOMEM;
{
int err;
- err = validate_page_before_insert(page);
+ err = validate_page_before_insert(vma, page);
if (err)
return err;
return insert_page_into_pte_locked(vma, pte, addr, page, prot);
* @page: source kernel page
*
* This allows drivers to insert individual pages they've allocated
- * into a user vma.
+ * into a user vma. The zeropage is supported in some VMAs,
+ * see vm_mixed_zeropage_allowed().
*
* The page has to be a nice clean _individual_ kernel allocation.
* If you allocate a compound page, you need to have marked it as
* @offset: user's requested vm_pgoff
*
* This allows drivers to map range of kernel pages into a user vma.
+ * The zeropage is supported in some VMAs, see
+ * vm_mixed_zeropage_allowed().
*
* Return: 0 on success and error code otherwise.
*/
}
EXPORT_SYMBOL(vmf_insert_pfn);
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
+static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
{
+ if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+ (mkwrite || !vm_mixed_zeropage_allowed(vma)))
+ return false;
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
pgprot_t pgprot = vma->vm_page_prot;
int err;
- BUG_ON(!vm_mixed_ok(vma, pfn));
+ if (!vm_mixed_ok(vma, pfn, mkwrite))
+ return VM_FAULT_SIGBUS;
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
pte_t entry;
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
+ VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
if (folio) {
VM_BUG_ON(folio_test_anon(folio) &&