[linux-block.git] / arch / x86 / kernel / ldt.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2002 Andi Kleen
 *
 * This handles calls from both 32bit and 64bit mode.
 *
 * Lock order:
 *	context.ldt_usr_sem
 *	  mmap_lock
 *	    context.lock
 */

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

#include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/pgtable_areas.h>

#include <xen/xen.h>

/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)

/*
 * Return the virtual address at which LDT slot @slot is aliased: slots are
 * laid out back to back, LDT_SLOT_STRIDE bytes apart, from LDT_BASE_ADDR.
 */
static inline void *ldt_slot_va(int slot)
{
	unsigned long slot_addr = LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot;

	return (void *)slot_addr;
}

/*
 * Point this CPU's LDT register at the LDT of @mm, or clear it when @mm
 * has no LDT.
 */
void load_mm_ldt(struct mm_struct *mm)
{
	/* READ_ONCE synchronizes with smp_store_release */
	struct ldt_struct *ldt = READ_ONCE(mm->context.ldt);

	/*
	 * Any change to mm->context.ldt is followed by an IPI to all
	 * CPUs with the mm active.  The LDT will not be freed until
	 * after the IPI is handled by all such CPUs.  This means that,
	 * if the ldt_struct changes before we return, the values we see
	 * will be safe, and the new values will be loaded before we run
	 * any user code.
	 *
	 * NB: don't try to convert this to use RCU without extreme care.
	 * We would still need IRQs off, because we don't want to change
	 * the local LDT after an IPI loaded a newer value than the one
	 * that we can see.
	 */

	/* Common case: this mm never installed an LDT. */
	if (likely(!ldt)) {
		clear_LDT();
		return;
	}

	/* Without PTI the table can be used at its allocation address. */
	if (!static_cpu_has(X86_FEATURE_PTI)) {
		set_ldt(ldt->entries, ldt->nr_entries);
		return;
	}

	if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
		/*
		 * Whoops -- either the new LDT isn't mapped
		 * (if slot == -1) or is mapped into a bogus
		 * slot (if slot > 1).
		 */
		clear_LDT();
		return;
	}

	/*
	 * If page table isolation is enabled, ldt->entries
	 * will not be mapped in the userspace pagetables.
	 * Tell the CPU to access the LDT through the alias
	 * at ldt_slot_va(ldt->slot).
	 */
	set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
}

/*
 * Called when switching from @prev to @next: reload the LDT register if
 * either mm has an LDT.
 */
void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
	unsigned long either_ldt;

	/*
	 * An mm will never go from having an LDT to not having an LDT.  Two
	 * mms never share an LDT, so we don't gain anything by checking to
	 * see whether the LDT changed.  There's also no guarantee that
	 * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
	 * then prev->context.ldt will also be non-NULL.
	 *
	 * If we really cared, we could optimize the case where prev == next
	 * and we're exiting lazy mode.  Most of the time, if this happens,
	 * we don't actually need to reload LDTR, but modify_ldt() is mostly
	 * used by legacy code and emulators where we don't need this level of
	 * performance.
	 *
	 * The two pointers are combined with | rather than || because it
	 * generates better code.
	 */
	either_ldt = (unsigned long)prev->context.ldt |
		     (unsigned long)next->context.ldt;
	if (unlikely(either_ldt))
		load_mm_ldt(next);

	DEBUG_LOCKS_WARN_ON(preemptible());
}

/*
 * On 64-bit kernels, reload DS and ES when their selectors reference the
 * LDT, so that the cached descriptors match the updated LDT.  Compiles to
 * nothing when CONFIG_X86_64 is not set.
 */
static void refresh_ldt_segments(void)
{
#ifdef CONFIG_X86_64
	unsigned short ds_sel, es_sel;

	/* DS: reload only if the selector's table indicator says "LDT". */
	savesegment(ds, ds_sel);
	if ((ds_sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
		loadsegment(ds, ds_sel);

	/* ES: same treatment. */
	savesegment(es, es_sel);
	if ((es_sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
		loadsegment(es, es_sel);
#endif
}

/*
 * SMP function-call handler: if this CPU currently has @__mm loaded,
 * reload its LDT and refresh the LDT-based data segments.
 *
 * context.lock is held by the task which issued the smp function call.
 */
static void flush_ldt(void *__mm)
{
	struct mm_struct *target = __mm;

	if (this_cpu_read(cpu_tlbstate.loaded_mm) == target) {
		load_mm_ldt(target);
		refresh_ldt_segments();
	}
}

/*
 * Allocate an ldt_struct with room for @num_entries descriptors.
 *
 * Returns NULL if @num_entries exceeds LDT_ENTRIES or an allocation fails.
 * The descriptor table starts zeroed and the result is not yet aliased for
 * PTI (slot == -1).  The caller must call finalize_ldt_struct on the result.
 */
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
	struct ldt_struct *ldt;
	unsigned int table_bytes;
	void *table;

	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));

	if (num_entries > LDT_ENTRIES)
		return NULL;

	ldt = kmalloc(sizeof(*ldt), GFP_KERNEL_ACCOUNT);
	if (!ldt)
		return NULL;

	table_bytes = num_entries * LDT_ENTRY_SIZE;

	/*
	 * Xen is very picky: it requires a page-aligned LDT that has no
	 * trailing nonzero bytes in any page that contains LDT descriptors.
	 * Keep it simple: zero the whole allocation and never allocate less
	 * than PAGE_SIZE.
	 */
	if (table_bytes <= PAGE_SIZE)
		table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	else
		table = __vmalloc(table_bytes, GFP_KERNEL_ACCOUNT | __GFP_ZERO);

	if (!table) {
		kfree(ldt);
		return NULL;
	}

	ldt->entries = table;
	ldt->nr_entries = num_entries;
	/* The new LDT isn't aliased for PTI yet. */
	ldt->slot = -1;

	return ldt;
}

#ifdef CONFIG_PAGE_TABLE_ISOLATION

/*
 * Warn if the observed state of the LDT page-table entries does not match
 * whether @mm already has an LDT installed.
 *
 * @had_kernel_mapping: the kernel page-table entry was populated
 * @had_user_mapping:   the user page-table entry was populated (only
 *                      checked when PTI is enabled)
 */
static void do_sanity_check(struct mm_struct *mm,
			    bool had_kernel_mapping,
			    bool had_user_mapping)
{
	const bool pti = boot_cpu_has(X86_FEATURE_PTI);

	if (!mm->context.ldt) {
		/*
		 * This is the first time we're mapping an LDT for this
		 * process, so neither table should have an entry yet.
		 */
		WARN_ON(had_kernel_mapping);
		if (pti)
			WARN_ON(had_user_mapping);
		return;
	}

	/*
	 * We already had an LDT.  The top-level entry should already
	 * have been allocated and synchronized with the usermode
	 * tables.
	 */
	WARN_ON(!had_kernel_mapping);
	if (pti)
		WARN_ON(!had_user_mapping);
}

#ifdef CONFIG_X86_PAE

/*
 * Walk from @pgd down to the PMD entry covering @va.
 *
 * Returns NULL if the PGD, P4D or PUD entry on the way is empty.
 */
static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
{
	p4d_t *p4d;
	pud_t *pud;

	if (!pgd->pgd)
		return NULL;

	p4d = p4d_offset(pgd, va);
	if (p4d_none(*p4d))
		return NULL;

	pud = pud_offset(p4d, va);

	return pud_none(*pud) ? NULL : pmd_offset(pud, va);
}

/*
 * PAE variant: when PTI is enabled and @mm has no LDT installed yet, copy
 * the kernel PMD entry covering LDT_BASE_ADDR into the user page table.
 *
 * NOTE(review): pgd_to_pmd_walk() can return NULL; the results are used
 * unchecked here, exactly as before - confirm the upper levels are always
 * populated on this path.
 */
static void map_ldt_struct_to_user(struct mm_struct *mm)
{
	pgd_t *k_pgd, *u_pgd;
	pmd_t *k_pmd, *u_pmd;

	if (!boot_cpu_has(X86_FEATURE_PTI) || mm->context.ldt)
		return;

	k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	u_pgd = kernel_to_user_pgdp(k_pgd);

	k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
	u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);

	set_pmd(u_pmd, *k_pmd);
}

/*
 * PAE variant: look up the kernel and user PMD entries covering
 * LDT_BASE_ADDR and hand their "populated" state to do_sanity_check().
 */
static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	bool had_kernel, had_user;
	pmd_t *k_pmd, *u_pmd;

	k_pmd      = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
	u_pmd      = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);

	/*
	 * pgd_to_pmd_walk() returns NULL when an upper-level entry is
	 * empty.  That means "nothing mapped"; report it as such instead
	 * of dereferencing a NULL pointer inside a diagnostic helper.
	 */
	had_kernel = k_pmd && k_pmd->pmd != 0;
	had_user   = u_pmd && u_pmd->pmd != 0;

	do_sanity_check(mm, had_kernel, had_user);
}

#else /* !CONFIG_X86_PAE */

/*
 * Non-PAE variant: when PTI is enabled and @mm has no LDT installed yet,
 * copy the kernel PGD entry covering LDT_BASE_ADDR into the user PGD.
 */
static void map_ldt_struct_to_user(struct mm_struct *mm)
{
	pgd_t *k_pgd;

	if (!boot_cpu_has(X86_FEATURE_PTI) || mm->context.ldt)
		return;

	k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	set_pgd(kernel_to_user_pgdp(k_pgd), *k_pgd);
}

/*
 * Non-PAE variant: check whether the kernel and user PGD entries covering
 * LDT_BASE_ADDR are populated and pass the result to do_sanity_check().
 */
static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);

	do_sanity_check(mm, k_pgd->pgd != 0, u_pgd->pgd != 0);
}

#endif /* CONFIG_X86_PAE */

/*
 * If PTI is enabled, this maps the LDT into the kernelmode and
 * usermode tables for the given mm, using alias slot @slot.
 *
 * Returns 0 on success (or when PTI is off) and -ENOMEM if a page-table
 * entry could not be obtained.  On failure, pages mapped so far stay mapped
 * and ldt->slot is left untouched; cleanup is up to the caller.
 */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
	unsigned long slot_base;
	bool vmalloced;
	int page, nr_pages;

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return 0;

	/*
	 * Any given ldt_struct should have map_ldt_struct() called at most
	 * once.
	 */
	WARN_ON(ldt->slot != -1);

	/* Check if the current mappings are sane */
	sanity_check_ldt_mapping(mm);

	vmalloced = is_vmalloc_addr(ldt->entries);
	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
	slot_base = (unsigned long)ldt_slot_va(slot);

	for (page = 0; page < nr_pages; page++) {
		unsigned long offset = page << PAGE_SHIFT;
		unsigned long va = slot_base + offset;
		const void *src = (char *)ldt->entries + offset;
		unsigned long pfn;
		pgprot_t prot;
		spinlock_t *ptl;
		pte_t *ptep;

		/* The table is either vmalloc memory or a single page. */
		if (vmalloced)
			pfn = vmalloc_to_pfn(src);
		else
			pfn = page_to_pfn(virt_to_page(src));

		/*
		 * Treat the PTI LDT range as a *userspace* range.
		 * get_locked_pte() will allocate all needed pagetables
		 * and account for them in this mm.
		 */
		ptep = get_locked_pte(mm, va, &ptl);
		if (!ptep)
			return -ENOMEM;

		/*
		 * Map it RO so the easy to find address is not a primary
		 * target via some kernel interface which misses a
		 * permission check.
		 */
		prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
		/* Filter out unsupported __PAGE_KERNEL* bits: */
		pgprot_val(prot) &= __supported_pte_mask;

		set_pte_at(mm, va, ptep, pfn_pte(pfn, prot));
		pte_unmap_unlock(ptep, ptl);
	}

	/* Propagate LDT mapping to the user page-table */
	map_ldt_struct_to_user(mm);

	ldt->slot = slot;
	return 0;
}

/*
 * Remove the PTI alias mapping of @ldt (at ldt_slot_va(ldt->slot)) from
 * @mm and flush the TLB for that range.
 *
 * Does nothing when @ldt is NULL or when PTI is not enabled.
 */
static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
	unsigned long va;
	int i, nr_pages;

	if (!ldt)
		return;

	/* LDT map/unmap is only required for PTI */
	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);

	for (i = 0; i < nr_pages; i++) {
		unsigned long offset = i << PAGE_SHIFT;
		spinlock_t *ptl;
		pte_t *ptep;

		va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
		ptep = get_locked_pte(mm, va, &ptl);
		/*
		 * get_locked_pte() can return NULL (map_ldt_struct() handles
		 * that case too).  Warn and skip the page rather than passing
		 * a NULL PTE pointer to pte_clear()/pte_unmap_unlock().
		 */
		if (!WARN_ON_ONCE(!ptep)) {
			pte_clear(mm, va, ptep);
			pte_unmap_unlock(ptep, ptl);
		}
	}

	va = (unsigned long)ldt_slot_va(ldt->slot);
	flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
}

#else /* !CONFIG_PAGE_TABLE_ISOLATION */

/*
 * !CONFIG_PAGE_TABLE_ISOLATION stub: there is no alias mapping to set up,
 * so this always reports success and leaves @ldt untouched.
 */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
	return 0;
}

/* !CONFIG_PAGE_TABLE_ISOLATION stub: nothing was mapped, nothing to undo. */
static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */

/*
 * Free the page tables covering [LDT_BASE_ADDR, LDT_END_ADDR) in @mm.
 * Does nothing unless CONFIG_PAGE_TABLE_ISOLATION is set and PTI is
 * enabled on this system.
 */
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	struct mmu_gather tlb;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * Although free_pgd_range() is intended for freeing user
		 * page-tables, it also works out for kernel mappings on x86.
		 * We use tlb_gather_mmu_fullmm() to avoid confusing the
		 * range-tracking logic in __tlb_adjust_range().
		 */
		tlb_gather_mmu_fullmm(&tlb, mm);
		free_pgd_range(&tlb, LDT_BASE_ADDR, LDT_END_ADDR,
			       LDT_BASE_ADDR, LDT_END_ADDR);
		tlb_finish_mmu(&tlb);
	}
#endif
}

/*
 * Hand the filled-in descriptor table to paravirt_alloc_ldt().  Callers in
 * this file invoke it once all entries have been written and before the LDT
 * is mapped or installed.
 *
 * After calling this, the LDT is immutable.
 */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}

/*
 * Publish @ldt as the LDT of @mm and make every CPU in mm_cpumask(mm) run
 * flush_ldt(), waiting for all of them to finish.  The whole sequence runs
 * under mm->context.lock.
 */
static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
	mutex_lock(&mm->context.lock);

	/* Synchronizes with READ_ONCE in load_mm_ldt. */
	smp_store_release(&mm->context.ldt, ldt);

	/*
	 * Activate the LDT on all CPUs in this mm's cpumask; the final
	 * 'true' argument makes the call wait until every handler has run.
	 */
	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);

	mutex_unlock(&mm->context.lock);
}

/*
 * Release @ldt and its descriptor table.  NULL is accepted and ignored.
 *
 * The table is freed the same way alloc_ldt_struct() obtained it: a single
 * page when it fits in PAGE_SIZE, vmalloc memory otherwise.
 */
static void free_ldt_struct(struct ldt_struct *ldt)
{
	unsigned int table_bytes;

	if (likely(!ldt))
		return;

	paravirt_free_ldt(ldt->entries, ldt->nr_entries);

	table_bytes = ldt->nr_entries * LDT_ENTRY_SIZE;
	if (table_bytes <= PAGE_SIZE)
		free_page((unsigned long)ldt->entries);
	else
		vfree_atomic(ldt->entries);

	kfree(ldt);
}

/*
 * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
 * the new task is not running, so nothing can be installed.
 *
 * Returns 0 on success (including when there is nothing to copy), -ENOMEM
 * if the new LDT cannot be allocated, or the error from map_ldt_struct().
 */
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
	struct ldt_struct *parent_ldt, *child_ldt;
	int retval = 0;

	if (!old_mm)
		return 0;

	mutex_lock(&old_mm->context.lock);

	parent_ldt = old_mm->context.ldt;
	if (!parent_ldt)
		goto out_unlock;

	child_ldt = alloc_ldt_struct(parent_ldt->nr_entries);
	if (!child_ldt) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	memcpy(child_ldt->entries, parent_ldt->entries,
	       child_ldt->nr_entries * LDT_ENTRY_SIZE);
	finalize_ldt_struct(child_ldt);

	retval = map_ldt_struct(mm, child_ldt, 0);
	if (!retval) {
		mm->context.ldt = child_ldt;
	} else {
		/* Drop any partially populated page tables and the copy. */
		free_ldt_pgtables(mm);
		free_ldt_struct(child_ldt);
	}

out_unlock:
	mutex_unlock(&old_mm->context.lock);
	return retval;
}

/*
 * Free the LDT of @mm and clear the pointer to it.
 *
 * No need to lock the MM as we are the last user
 *
 * 64bit: Don't touch the LDT register - we're already in the next thread.
 */
void destroy_context_ldt(struct mm_struct *mm)
{
	struct ldt_struct *doomed = mm->context.ldt;

	free_ldt_struct(doomed);
	mm->context.ldt = NULL;
}

/*
 * Release the page tables backing the LDT alias range of @mm.  This is a
 * thin wrapper around free_ldt_pgtables(), which itself does nothing unless
 * PTI is compiled in and enabled.
 */
void ldt_arch_exit_mmap(struct mm_struct *mm)
{
	free_ldt_pgtables(mm);
}

/*
 * Copy the current task's LDT to the user buffer @ptr.
 *
 * At most LDT_ENTRY_SIZE * LDT_ENTRIES bytes are transferred.  If the LDT
 * is shorter than the (clamped) @bytecount, the remainder of the buffer is
 * zero-filled.  Returns the clamped @bytecount, 0 when the mm has no LDT,
 * or -EFAULT if the user buffer cannot be written.
 */
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
	const unsigned long max_bytes = LDT_ENTRY_SIZE * LDT_ENTRIES;
	struct mm_struct *mm = current->mm;
	unsigned long copy_len, pad_len;
	struct ldt_struct *ldt;
	int retval = 0;

	down_read(&mm->context.ldt_usr_sem);

	ldt = mm->context.ldt;
	if (!ldt)
		goto out_unlock;

	if (bytecount > max_bytes)
		bytecount = max_bytes;

	copy_len = ldt->nr_entries * LDT_ENTRY_SIZE;
	if (copy_len > bytecount)
		copy_len = bytecount;
	pad_len = bytecount - copy_len;

	retval = -EFAULT;
	if (copy_to_user(ptr, ldt->entries, copy_len))
		goto out_unlock;

	/* Zero-fill the rest and pretend we read bytecount bytes. */
	if (pad_len && clear_user(ptr + copy_len, pad_len))
		goto out_unlock;

	retval = bytecount;

out_unlock:
	up_read(&mm->context.ldt_usr_sem);
	return retval;
}

/*
 * "Read" the default LDT: zero up to @bytecount bytes of the user buffer
 * @ptr, capped at 5 descriptors on 32-bit and 128 bytes otherwise.
 *
 * Returns the number of bytes cleared or -EFAULT.
 */
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
{
	/* CHECKME: Can we use _one_ random number ? */
	const unsigned long limit = IS_ENABLED(CONFIG_X86_32) ?
				    5 * sizeof(struct desc_struct) : 128;

	if (bytecount > limit)
		bytecount = limit;

	return clear_user(ptr, bytecount) ? -EFAULT : bytecount;
}

/*
 * Report whether 16-bit LDT segments may be installed: requires
 * CONFIG_X86_16BIT and, when built with CONFIG_XEN_PV, not running as a
 * Xen PV domain.
 */
static bool allow_16bit_segments(void)
{
	bool allowed = IS_ENABLED(CONFIG_X86_16BIT);

#ifdef CONFIG_XEN_PV
	/*
	 * Xen PV does not implement ESPFIX64, which means that 16-bit
	 * segments will not work correctly.  Until either Xen PV implements
	 * ESPFIX64 and can signal this fact to the guest or unless someone
	 * provides compelling evidence that allowing broken 16-bit segments
	 * is worthwhile, disallow 16-bit segments under Xen PV.
	 */
	if (allowed && xen_pv_domain()) {
		pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
		allowed = false;
	}
#endif

	return allowed;
}

/*
 * Install or clear one LDT entry for the current task.
 *
 * @ptr:       user pointer to a struct user_desc describing the entry
 * @bytecount: must equal sizeof(struct user_desc)
 * @oldmode:   non-zero for the legacy write mode (func 1 of modify_ldt)
 *
 * A fresh ldt_struct is built from the old table plus the new entry,
 * installed, and the old table is then unmapped and freed.
 *
 * Returns 0 on success, -EINVAL for malformed requests, -EFAULT if @ptr
 * cannot be read, -EINTR if waiting for ldt_usr_sem was killed, -ENOMEM if
 * the new table cannot be allocated, or the error from map_ldt_struct().
 */
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
	struct mm_struct *mm = current->mm;
	struct ldt_struct *new_ldt, *old_ldt;
	unsigned int old_nr_entries, new_nr_entries;
	struct user_desc ldt_info;
	struct desc_struct ldt;
	bool clear_entry;
	int slot, error;

	/* Nothing is locked or allocated yet, so failures can just return. */
	if (bytecount != sizeof(ldt_info))
		return -EINVAL;
	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
		return -EFAULT;

	if (ldt_info.entry_number >= LDT_ENTRIES)
		return -EINVAL;
	if (ldt_info.contents == 3 &&
	    (oldmode || ldt_info.seg_not_present == 0))
		return -EINVAL;

	clear_entry = (oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
		      LDT_empty(&ldt_info);
	if (clear_entry) {
		/* The user wants to clear the entry. */
		memset(&ldt, 0, sizeof(ldt));
	} else {
		if (!ldt_info.seg_32bit && !allow_16bit_segments())
			return -EINVAL;

		fill_ldt(&ldt, &ldt_info);
		if (oldmode)
			ldt.avl = 0;
	}

	if (down_write_killable(&mm->context.ldt_usr_sem))
		return -EINTR;

	old_ldt = mm->context.ldt;
	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
	new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);

	new_ldt = alloc_ldt_struct(new_nr_entries);
	if (!new_ldt) {
		error = -ENOMEM;
		goto out_unlock;
	}

	if (old_ldt)
		memcpy(new_ldt->entries, old_ldt->entries,
		       old_nr_entries * LDT_ENTRY_SIZE);

	new_ldt->entries[ldt_info.entry_number] = ldt;
	finalize_ldt_struct(new_ldt);

	/*
	 * If we are using PTI, map the new LDT into the userspace pagetables.
	 * If there is already an LDT, use the other slot so that other CPUs
	 * will continue to use the old LDT until install_ldt() switches
	 * them over to the new LDT.
	 */
	slot = old_ldt ? !old_ldt->slot : 0;
	error = map_ldt_struct(mm, new_ldt, slot);
	if (error) {
		/*
		 * This only can fail for the first LDT setup. If an LDT is
		 * already installed then the PTE page is already
		 * populated. Mop up a half populated page table.
		 */
		if (!WARN_ON_ONCE(old_ldt))
			free_ldt_pgtables(mm);
		free_ldt_struct(new_ldt);
		goto out_unlock;
	}

	/* error is 0 from here on. */
	install_ldt(mm, new_ldt);
	unmap_ldt_struct(mm, old_ldt);
	free_ldt_struct(old_ldt);

out_unlock:
	up_write(&mm->context.ldt_usr_sem);
	return error;
}

/*
 * modify_ldt() system call: dispatch on @func.
 *
 *   0     read the LDT
 *   1     write an entry (legacy mode)
 *   2     read the default LDT
 *   0x11  write an entry
 *
 * Any other @func yields -ENOSYS.
 */
SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
		unsigned long , bytecount)
{
	int ret;

	switch (func) {
	case 0:
		ret = read_ldt(ptr, bytecount);
		break;
	case 1:
		ret = write_ldt(ptr, bytecount, 1);
		break;
	case 2:
		ret = read_default_ldt(ptr, bytecount);
		break;
	case 0x11:
		ret = write_ldt(ptr, bytecount, 0);
		break;
	default:
		ret = -ENOSYS;
		break;
	}

	/*
	 * The SYSCALL_DEFINE() macros give us an 'unsigned long'
	 * return type, but the ABI for sys_modify_ldt() expects
	 * 'int'.  This cast gives us an int-sized value in %rax
	 * for the return code.  The 'unsigned' is necessary so
	 * the compiler does not try to sign-extend the negative
	 * return codes into the high half of the register when
	 * taking the value from int->long.
	 */
	return (unsigned int)ret;
}
Commit	Line	Data
	1	// SPDX-License-Identifier: GPL-2.0
	2	/*
	3	* Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
	4	* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
	5	* Copyright (C) 2002 Andi Kleen
	6	*
	7	* This handles calls from both 32bit and 64bit mode.
	8	*
	9	* Lock order:
	10	* contex.ldt_usr_sem
	11	* mmap_lock
	12	* context.lock
	13	*/
	14
	15	#include <linux/errno.h>
	16	#include <linux/gfp.h>
	17	#include <linux/sched.h>
	18	#include <linux/string.h>
	19	#include <linux/mm.h>
	20	#include <linux/smp.h>
	21	#include <linux/syscalls.h>
	22	#include <linux/slab.h>
	23	#include <linux/vmalloc.h>
	24	#include <linux/uaccess.h>
	25
	26	#include <asm/ldt.h>
	27	#include <asm/tlb.h>
	28	#include <asm/desc.h>
	29	#include <asm/mmu_context.h>
	30	#include <asm/pgtable_areas.h>
	31
	32	#include <xen/xen.h>
	33
	34	/* This is a multiple of PAGE_SIZE. */
	35	#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
	36
	37	static inline void *ldt_slot_va(int slot)
	38	{
	39	return (void )(LDT_BASE_ADDR + LDT_SLOT_STRIDE slot);
	40	}
	41
	42	void load_mm_ldt(struct mm_struct *mm)
	43	{
	44	struct ldt_struct *ldt;
	45
	46	/* READ_ONCE synchronizes with smp_store_release */
	47	ldt = READ_ONCE(mm->context.ldt);
	48
	49	/*
	50	* Any change to mm->context.ldt is followed by an IPI to all
	51	* CPUs with the mm active. The LDT will not be freed until
	52	* after the IPI is handled by all such CPUs. This means that,
	53	* if the ldt_struct changes before we return, the values we see
	54	* will be safe, and the new values will be loaded before we run
	55	* any user code.
	56	*
	57	* NB: don't try to convert this to use RCU without extreme care.
	58	* We would still need IRQs off, because we don't want to change
	59	* the local LDT after an IPI loaded a newer value than the one
	60	* that we can see.
	61	*/
	62
	63	if (unlikely(ldt)) {
	64	if (static_cpu_has(X86_FEATURE_PTI)) {
	65	if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
	66	/*
	67	* Whoops -- either the new LDT isn't mapped
	68	* (if slot == -1) or is mapped into a bogus
	69	* slot (if slot > 1).
	70	*/
	71	clear_LDT();
	72	return;
	73	}
	74
	75	/*
	76	* If page table isolation is enabled, ldt->entries
	77	* will not be mapped in the userspace pagetables.
	78	* Tell the CPU to access the LDT through the alias
	79	* at ldt_slot_va(ldt->slot).
	80	*/
	81	set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
	82	} else {
	83	set_ldt(ldt->entries, ldt->nr_entries);
	84	}
	85	} else {
	86	clear_LDT();
	87	}
	88	}
	89
	90	void switch_ldt(struct mm_struct prev, struct mm_struct next)
	91	{
	92	/*
	93	* Load the LDT if either the old or new mm had an LDT.
	94	*
	95	* An mm will never go from having an LDT to not having an LDT. Two
	96	* mms never share an LDT, so we don't gain anything by checking to
	97	* see whether the LDT changed. There's also no guarantee that
	98	* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
	99	* then prev->context.ldt will also be non-NULL.
	100	*
	101	* If we really cared, we could optimize the case where prev == next
	102	* and we're exiting lazy mode. Most of the time, if this happens,
	103	* we don't actually need to reload LDTR, but modify_ldt() is mostly
	104	* used by legacy code and emulators where we don't need this level of
	105	* performance.
	106	*
	107	* This uses \| instead of \|\| because it generates better code.
	108	*/
	109	if (unlikely((unsigned long)prev->context.ldt \|
	110	(unsigned long)next->context.ldt))
	111	load_mm_ldt(next);
	112
	113	DEBUG_LOCKS_WARN_ON(preemptible());
	114	}
	115
	116	static void refresh_ldt_segments(void)
	117	{
	118	#ifdef CONFIG_X86_64
	119	unsigned short sel;
	120
	121	/*
	122	* Make sure that the cached DS and ES descriptors match the updated
	123	* LDT.
	124	*/
	125	savesegment(ds, sel);
	126	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
	127	loadsegment(ds, sel);
	128
	129	savesegment(es, sel);
	130	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
	131	loadsegment(es, sel);
	132	#endif
	133	}
	134
	135	/* context.lock is held by the task which issued the smp function call */
	136	static void flush_ldt(void *__mm)
	137	{
	138	struct mm_struct *mm = __mm;
	139
	140	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
	141	return;
	142
	143	load_mm_ldt(mm);
	144
	145	refresh_ldt_segments();
	146	}
	147
	148	/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
	149	static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
	150	{
	151	struct ldt_struct *new_ldt;
	152	unsigned int alloc_size;
	153
	154	if (num_entries > LDT_ENTRIES)
	155	return NULL;
	156
	157	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
	158	if (!new_ldt)
	159	return NULL;
	160
	161	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
	162	alloc_size = num_entries * LDT_ENTRY_SIZE;
	163
	164	/*
	165	* Xen is very picky: it requires a page-aligned LDT that has no
	166	* trailing nonzero bytes in any page that contains LDT descriptors.
	167	* Keep it simple: zero the whole allocation and never allocate less
	168	* than PAGE_SIZE.
	169	*/
	170	if (alloc_size > PAGE_SIZE)
	171	new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT \| __GFP_ZERO);
	172	else
	173	new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	174
	175	if (!new_ldt->entries) {
	176	kfree(new_ldt);
	177	return NULL;
	178	}
	179
	180	/* The new LDT isn't aliased for PTI yet. */
	181	new_ldt->slot = -1;
	182
	183	new_ldt->nr_entries = num_entries;
	184	return new_ldt;
	185	}
	186
	187	#ifdef CONFIG_PAGE_TABLE_ISOLATION
	188
	189	static void do_sanity_check(struct mm_struct *mm,
	190	bool had_kernel_mapping,
	191	bool had_user_mapping)
	192	{
	193	if (mm->context.ldt) {
	194	/*
	195	* We already had an LDT. The top-level entry should already
	196	* have been allocated and synchronized with the usermode
	197	* tables.
	198	*/
	199	WARN_ON(!had_kernel_mapping);
	200	if (boot_cpu_has(X86_FEATURE_PTI))
	201	WARN_ON(!had_user_mapping);
	202	} else {
	203	/*
	204	* This is the first time we're mapping an LDT for this process.
	205	* Sync the pgd to the usermode tables.
	206	*/
	207	WARN_ON(had_kernel_mapping);
	208	if (boot_cpu_has(X86_FEATURE_PTI))
	209	WARN_ON(had_user_mapping);
	210	}
	211	}
	212
	213	#ifdef CONFIG_X86_PAE
	214
	215	static pmd_t pgd_to_pmd_walk(pgd_t pgd, unsigned long va)
	216	{
	217	p4d_t *p4d;
	218	pud_t *pud;
	219
	220	if (pgd->pgd == 0)
	221	return NULL;
	222
	223	p4d = p4d_offset(pgd, va);
	224	if (p4d_none(*p4d))
	225	return NULL;
	226
	227	pud = pud_offset(p4d, va);
	228	if (pud_none(*pud))
	229	return NULL;
	230
	231	return pmd_offset(pud, va);
	232	}
	233
	234	static void map_ldt_struct_to_user(struct mm_struct *mm)
	235	{
	236	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	237	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	238	pmd_t k_pmd, u_pmd;
	239
	240	k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
	241	u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
	242
	243	if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
	244	set_pmd(u_pmd, *k_pmd);
	245	}
	246
	247	static void sanity_check_ldt_mapping(struct mm_struct *mm)
	248	{
	249	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
	250	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	251	bool had_kernel, had_user;
	252	pmd_t k_pmd, u_pmd;
	253
	254	k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
	255	u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
	256	had_kernel = (k_pmd->pmd != 0);
	257	had_user = (u_pmd->pmd != 0);
	258
	259	do_sanity_check(mm, had_kernel, had_user);
	260	}
	261
	262	#else /* !CONFIG_X86_PAE */
	263
	264	static void map_ldt_struct_to_user(struct mm_struct *mm)
	265	{
	266	pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
	267
	268	if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
	269	set_pgd(kernel_to_user_pgdp(pgd), *pgd);
	270	}
	271
	272	static void sanity_check_ldt_mapping(struct mm_struct *mm)
	273	{
	274	pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
	275	bool had_kernel = (pgd->pgd != 0);
	276	bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
	277
	278	do_sanity_check(mm, had_kernel, had_user);
	279	}
	280
	281	#endif /* CONFIG_X86_PAE */
	282
	283	/*
	284	* If PTI is enabled, this maps the LDT into the kernelmode and
	285	* usermode tables for the given mm.
	286	*/
	287	static int
	288	map_ldt_struct(struct mm_struct mm, struct ldt_struct ldt, int slot)
	289	{
	290	unsigned long va;
	291	bool is_vmalloc;
	292	spinlock_t *ptl;
	293	int i, nr_pages;
	294
	295	if (!boot_cpu_has(X86_FEATURE_PTI))
	296	return 0;
	297
	298	/*
	299	* Any given ldt_struct should have map_ldt_struct() called at most
	300	* once.
	301	*/
	302	WARN_ON(ldt->slot != -1);
	303
	304	/* Check if the current mappings are sane */
	305	sanity_check_ldt_mapping(mm);
	306
	307	is_vmalloc = is_vmalloc_addr(ldt->entries);
	308
	309	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
	310
	311	for (i = 0; i < nr_pages; i++) {
	312	unsigned long offset = i << PAGE_SHIFT;
	313	const void src = (char )ldt->entries + offset;
	314	unsigned long pfn;
	315	pgprot_t pte_prot;
	316	pte_t pte, *ptep;
	317
	318	va = (unsigned long)ldt_slot_va(slot) + offset;
	319	pfn = is_vmalloc ? vmalloc_to_pfn(src) :
	320	page_to_pfn(virt_to_page(src));
	321	/*
	322	* Treat the PTI LDT range as a userspace range.
	323	* get_locked_pte() will allocate all needed pagetables
	324	* and account for them in this mm.
	325	*/
	326	ptep = get_locked_pte(mm, va, &ptl);
	327	if (!ptep)
	328	return -ENOMEM;
	329	/*
	330	* Map it RO so the easy to find address is not a primary
	331	* target via some kernel interface which misses a
	332	* permission check.
	333	*/
	334	pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
	335	/* Filter out unsuppored __PAGE_KERNEL* bits: */
	336	pgprot_val(pte_prot) &= __supported_pte_mask;
	337	pte = pfn_pte(pfn, pte_prot);
	338	set_pte_at(mm, va, ptep, pte);
	339	pte_unmap_unlock(ptep, ptl);
	340	}
	341
	342	/* Propagate LDT mapping to the user page-table */
	343	map_ldt_struct_to_user(mm);
	344
	345	ldt->slot = slot;
	346	return 0;
	347	}
	348
	349	static void unmap_ldt_struct(struct mm_struct mm, struct ldt_struct ldt)
	350	{
	351	unsigned long va;
	352	int i, nr_pages;
	353
	354	if (!ldt)
	355	return;
	356
	357	/* LDT map/unmap is only required for PTI */
	358	if (!boot_cpu_has(X86_FEATURE_PTI))
	359	return;
	360
	361	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
	362
	363	for (i = 0; i < nr_pages; i++) {
	364	unsigned long offset = i << PAGE_SHIFT;
	365	spinlock_t *ptl;
	366	pte_t *ptep;
	367
	368	va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
	369	ptep = get_locked_pte(mm, va, &ptl);
	370	pte_clear(mm, va, ptep);
	371	pte_unmap_unlock(ptep, ptl);
	372	}
	373
	374	va = (unsigned long)ldt_slot_va(ldt->slot);
	375	flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
	376	}
	377
	378	#else /* !CONFIG_PAGE_TABLE_ISOLATION */
	379
	380	static int
	381	map_ldt_struct(struct mm_struct mm, struct ldt_struct ldt, int slot)
	382	{
	383	return 0;
	384	}
	385
	386	static void unmap_ldt_struct(struct mm_struct mm, struct ldt_struct ldt)
	387	{
	388	}
	389	#endif /* CONFIG_PAGE_TABLE_ISOLATION */
	390
	391	static void free_ldt_pgtables(struct mm_struct *mm)
	392	{
	393	#ifdef CONFIG_PAGE_TABLE_ISOLATION
	394	struct mmu_gather tlb;
	395	unsigned long start = LDT_BASE_ADDR;
	396	unsigned long end = LDT_END_ADDR;
	397
	398	if (!boot_cpu_has(X86_FEATURE_PTI))
	399	return;
	400
	401	/*
	402	* Although free_pgd_range() is intended for freeing user
	403	* page-tables, it also works out for kernel mappings on x86.
	404	* We use tlb_gather_mmu_fullmm() to avoid confusing the
	405	* range-tracking logic in __tlb_adjust_range().
	406	*/
	407	tlb_gather_mmu_fullmm(&tlb, mm);
	408	free_pgd_range(&tlb, start, end, start, end);
	409	tlb_finish_mmu(&tlb);
	410	#endif
	411	}
	412
	413	/* After calling this, the LDT is immutable. */
	414	static void finalize_ldt_struct(struct ldt_struct *ldt)
	415	{
	416	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
	417	}
	418
	419	static void install_ldt(struct mm_struct mm, struct ldt_struct ldt)
	420	{
	421	mutex_lock(&mm->context.lock);
	422
	423	/* Synchronizes with READ_ONCE in load_mm_ldt. */
	424	smp_store_release(&mm->context.ldt, ldt);
	425
	426	/* Activate the LDT for all CPUs using currents mm. */
	427	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
	428
	429	mutex_unlock(&mm->context.lock);
	430	}
	431
	432	static void free_ldt_struct(struct ldt_struct *ldt)
	433	{
	434	if (likely(!ldt))
	435	return;
	436
	437	paravirt_free_ldt(ldt->entries, ldt->nr_entries);
	438	if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
	439	vfree_atomic(ldt->entries);
	440	else
	441	free_page((unsigned long)ldt->entries);
	442	kfree(ldt);
	443	}
	444
	445	/*
	446	* Called on fork from arch_dup_mmap(). Just copy the current LDT state,
	447	* the new task is not running, so nothing can be installed.
	448	*/
	449	int ldt_dup_context(struct mm_struct old_mm, struct mm_struct mm)
	450	{
	451	struct ldt_struct *new_ldt;
	452	int retval = 0;
	453
	454	if (!old_mm)
	455	return 0;
	456
	457	mutex_lock(&old_mm->context.lock);
	458	if (!old_mm->context.ldt)
	459	goto out_unlock;
	460
	461	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
	462	if (!new_ldt) {
	463	retval = -ENOMEM;
	464	goto out_unlock;
	465	}
	466
	467	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
	468	new_ldt->nr_entries * LDT_ENTRY_SIZE);
	469	finalize_ldt_struct(new_ldt);
	470
	471	retval = map_ldt_struct(mm, new_ldt, 0);
	472	if (retval) {
	473	free_ldt_pgtables(mm);
	474	free_ldt_struct(new_ldt);
	475	goto out_unlock;
	476	}
	477	mm->context.ldt = new_ldt;
	478
	479	out_unlock:
	480	mutex_unlock(&old_mm->context.lock);
	481	return retval;
	482	}
	483
	484	/*
	485	* No need to lock the MM as we are the last user
	486	*
	487	* 64bit: Don't touch the LDT register - we're already in the next thread.
	488	*/
	489	void destroy_context_ldt(struct mm_struct *mm)
	490	{
	491	free_ldt_struct(mm->context.ldt);
	492	mm->context.ldt = NULL;
	493	}
	494
	495	void ldt_arch_exit_mmap(struct mm_struct *mm)
	496	{
	497	free_ldt_pgtables(mm);
	498	}
	499
	500	static int read_ldt(void __user *ptr, unsigned long bytecount)
	501	{
	502	struct mm_struct *mm = current->mm;
	503	unsigned long entries_size;
	504	int retval;
	505
	506	down_read(&mm->context.ldt_usr_sem);
	507
	508	if (!mm->context.ldt) {
	509	retval = 0;
	510	goto out_unlock;
	511	}
	512
	513	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
	514	bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
	515
	516	entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
	517	if (entries_size > bytecount)
	518	entries_size = bytecount;
	519
	520	if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
	521	retval = -EFAULT;
	522	goto out_unlock;
	523	}
	524
	525	if (entries_size != bytecount) {
	526	/* Zero-fill the rest and pretend we read bytecount bytes. */
	527	if (clear_user(ptr + entries_size, bytecount - entries_size)) {
	528	retval = -EFAULT;
	529	goto out_unlock;
	530	}
	531	}
	532	retval = bytecount;
	533
	534	out_unlock:
	535	up_read(&mm->context.ldt_usr_sem);
	536	return retval;
	537	}
	538
	539	static int read_default_ldt(void __user *ptr, unsigned long bytecount)
	540	{
	541	/* CHECKME: Can we use _one_ random number ? */
	542	#ifdef CONFIG_X86_32
	543	unsigned long size = 5 * sizeof(struct desc_struct);
	544	#else
	545	unsigned long size = 128;
	546	#endif
	547	if (bytecount > size)
	548	bytecount = size;
	549	if (clear_user(ptr, bytecount))
	550	return -EFAULT;
	551	return bytecount;
	552	}
	553
	554	static bool allow_16bit_segments(void)
	555	{
	556	if (!IS_ENABLED(CONFIG_X86_16BIT))
	557	return false;
	558
	559	#ifdef CONFIG_XEN_PV
	560	/*
	561	* Xen PV does not implement ESPFIX64, which means that 16-bit
	562	* segments will not work correctly. Until either Xen PV implements
	563	* ESPFIX64 and can signal this fact to the guest or unless someone
	564	* provides compelling evidence that allowing broken 16-bit segments
	565	* is worthwhile, disallow 16-bit segments under Xen PV.
	566	*/
	567	if (xen_pv_domain()) {
	568	pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
	569	return false;
	570	}
	571	#endif
	572
	573	return true;
	574	}
	575
	576	static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
	577	{
	578	struct mm_struct *mm = current->mm;
	579	struct ldt_struct new_ldt, old_ldt;
	580	unsigned int old_nr_entries, new_nr_entries;
	581	struct user_desc ldt_info;
	582	struct desc_struct ldt;
	583	int error;
	584
	585	error = -EINVAL;
	586	if (bytecount != sizeof(ldt_info))
	587	goto out;
	588	error = -EFAULT;
	589	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
	590	goto out;
	591
	592	error = -EINVAL;
	593	if (ldt_info.entry_number >= LDT_ENTRIES)
	594	goto out;
	595	if (ldt_info.contents == 3) {
	596	if (oldmode)
	597	goto out;
	598	if (ldt_info.seg_not_present == 0)
	599	goto out;
	600	}
	601
	602	if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) \|\|
	603	LDT_empty(&ldt_info)) {
	604	/* The user wants to clear the entry. */
	605	memset(&ldt, 0, sizeof(ldt));
	606	} else {
	607	if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
	608	error = -EINVAL;
	609	goto out;
	610	}
	611
	612	fill_ldt(&ldt, &ldt_info);
	613	if (oldmode)
	614	ldt.avl = 0;
	615	}
	616
	617	if (down_write_killable(&mm->context.ldt_usr_sem))
	618	return -EINTR;
	619
	620	old_ldt = mm->context.ldt;
	621	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
	622	new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
	623
	624	error = -ENOMEM;
	625	new_ldt = alloc_ldt_struct(new_nr_entries);
	626	if (!new_ldt)
	627	goto out_unlock;
	628
	629	if (old_ldt)
	630	memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
	631
	632	new_ldt->entries[ldt_info.entry_number] = ldt;
	633	finalize_ldt_struct(new_ldt);
	634
	635	/*
	636	* If we are using PTI, map the new LDT into the userspace pagetables.
	637	* If there is already an LDT, use the other slot so that other CPUs
	638	* will continue to use the old LDT until install_ldt() switches
	639	* them over to the new LDT.
	640	*/
	641	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
	642	if (error) {
	643	/*
	644	* This only can fail for the first LDT setup. If an LDT is
	645	* already installed then the PTE page is already
	646	* populated. Mop up a half populated page table.
	647	*/
	648	if (!WARN_ON_ONCE(old_ldt))
	649	free_ldt_pgtables(mm);
	650	free_ldt_struct(new_ldt);
	651	goto out_unlock;
	652	}
	653
	654	install_ldt(mm, new_ldt);
	655	unmap_ldt_struct(mm, old_ldt);
	656	free_ldt_struct(old_ldt);
	657	error = 0;
	658
	659	out_unlock:
	660	up_write(&mm->context.ldt_usr_sem);
	661	out:
	662	return error;
	663	}
	664
	665	SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
	666	unsigned long , bytecount)
	667	{
	668	int ret = -ENOSYS;
	669
	670	switch (func) {
	671	case 0:
	672	ret = read_ldt(ptr, bytecount);
	673	break;
	674	case 1:
	675	ret = write_ldt(ptr, bytecount, 1);
	676	break;
	677	case 2:
	678	ret = read_default_ldt(ptr, bytecount);
	679	break;
	680	case 0x11:
	681	ret = write_ldt(ptr, bytecount, 0);
	682	break;
	683	}
	684	/*
	685	* The SYSCALL_DEFINE() macros give us an 'unsigned long'
	686	* return type, but tht ABI for sys_modify_ldt() expects
	687	* 'int'. This cast gives us an int-sized value in %rax
	688	* for the return code. The 'unsigned' is necessary so
	689	* the compiler does not try to sign-extend the negative
	690	* return codes into the high half of the register when
	691	* taking the value from int->long.
	692	*/
	693	return (unsigned int)ret;
	694	}