[linux-block.git] / arch / arm64 / mm / init.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/init.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/cache.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/initrd.h>
#include <linux/gfp.h>
#include <linux/math.h>
#include <linux/memblock.h>
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
#include <linux/acpi_iort.h>
#include <linux/kmemleak.h>
#include <linux/execmem.h>

#include <asm/boot.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/kvm_host.h>
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/rsi.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
#include <asm/xen/swiotlb-xen.h>

/*
 * We need to be able to catch inadvertent references to memstart_addr
 * that occur (potentially in generic code) before arm64_memblock_init()
 * executes, which assigns it its actual value. So use a default value
 * that cannot be mistaken for a real physical address.
 */
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);

/*
 * If the corresponding config options are enabled, we create both ZONE_DMA
 * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
 * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
 * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
 * otherwise it is empty.
 */
phys_addr_t __ro_after_init arm64_dma_phys_limit;

/*
 * To make optimal use of block mappings when laying out the linear
 * mapping, round down the base of physical memory to a size that can
 * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
 * (64k granule), or a multiple that can be mapped using contiguous bits
 * in the page tables: 32 * PMD_SIZE (16k granule)
 */
#if defined(CONFIG_ARM64_4K_PAGES)
#define ARM64_MEMSTART_SHIFT		PUD_SHIFT
#elif defined(CONFIG_ARM64_16K_PAGES)
#define ARM64_MEMSTART_SHIFT		CONT_PMD_SHIFT
#else
#define ARM64_MEMSTART_SHIFT		PMD_SHIFT
#endif

/*
 * sparsemem vmemmap imposes an additional requirement on the alignment of
 * memstart_addr, due to the fact that the base of the vmemmap region
 * has a direct correspondence, and needs to appear sufficiently aligned
 * in the virtual address space.
 */
#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
#define ARM64_MEMSTART_ALIGN	(1UL << SECTION_SIZE_BITS)
#else
#define ARM64_MEMSTART_ALIGN	(1UL << ARM64_MEMSTART_SHIFT)
#endif

static void __init arch_reserve_crashkernel(void)
{
	unsigned long long low_size = 0;
	unsigned long long crash_base, crash_size;
	bool high = false;
	int ret;

	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
		return;

	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
				&crash_size, &crash_base,
				&low_size, &high);
	if (ret)
		return;

	reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
}

static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
{
	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
}

static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
	phys_addr_t __maybe_unused acpi_zone_dma_limit;
	phys_addr_t __maybe_unused dt_zone_dma_limit;
	phys_addr_t __maybe_unused dma32_phys_limit =
		max_zone_phys(DMA_BIT_MASK(32));

#ifdef CONFIG_ZONE_DMA
	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
	/*
	 * Information we get from firmware (e.g. DT dma-ranges) describe DMA
	 * bus constraints. Devices using DMA might have their own limitations.
	 * Some of them rely on DMA zone in low 32-bit memory. Keep low RAM
	 * DMA zone on platforms that have RAM there.
	 */
	if (memblock_start_of_DRAM() < U32_MAX)
		zone_dma_limit = min(zone_dma_limit, U32_MAX);
	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = dma32_phys_limit;
#endif
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = PHYS_MASK + 1;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init(max_zone_pfns);
}

int pfn_is_map_memory(unsigned long pfn)
{
	phys_addr_t addr = PFN_PHYS(pfn);

	/* avoid false positives for bogus PFNs, see comment in pfn_valid() */
	if (PHYS_PFN(addr) != pfn)
		return 0;

	return memblock_is_map_memory(addr);
}
EXPORT_SYMBOL(pfn_is_map_memory);

static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;

/*
 * Limit the memory size that was specified via FDT.
 */
static int __init early_mem(char *p)
{
	if (!p)
		return 1;

	memory_limit = memparse(p, &p) & PAGE_MASK;
	pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);

	return 0;
}
early_param("mem", early_mem);

void __init arm64_memblock_init(void)
{
	s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);

	/*
	 * Corner case: 52-bit VA capable systems running KVM in nVHE mode may
	 * be limited in their ability to support a linear map that exceeds 51
	 * bits of VA space, depending on the placement of the ID map. Given
	 * that the placement of the ID map may be randomized, let's simply
	 * limit the kernel's linear map to 51 bits as well if we detect this
	 * configuration.
	 */
	if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
	    is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
		pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
		linear_region_size = min_t(u64, linear_region_size, BIT(51));
	}

	/* Remove memory above our supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/*
	 * Select a suitable value for the base of physical memory.
	 */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
		pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * If we are running with a 52-bit kernel VA config on a system that
	 * does not support it, we have to place the available physical
	 * memory in the 48-bit addressable part of the linear region, i.e.,
	 * we have to move it upward. Since memstart_addr represents the
	 * physical address of PAGE_OFFSET, we have to *subtract* from it.
	 */
	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
		memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if it results in the
		 * initrd to become inaccessible via the linear mapping.
		 * Otherwise, this is a no-op
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			phys_initrd_size = 0;
		} else {
			memblock_add(base, size);
			memblock_clear_nomap(base, size);
			memblock_reserve(base, size);
		}
	}

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	memblock_reserve(__pa_symbol(_stext), _end - _stext);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();
}

void __init bootmem_init(void)
{
	unsigned long min, max;

	min = PFN_UP(memblock_start_of_DRAM());
	max = PFN_DOWN(memblock_end_of_DRAM());

	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

	max_pfn = max_low_pfn = max;
	min_low_pfn = min;

	arch_numa_init();

	/*
	 * must be done after arch_numa_init() which calls numa_init() to
	 * initialize node_online_map that gets used in hugetlb_cma_reserve()
	 * while allocating required CMA size across online nodes.
	 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
	arm64_hugetlb_cma_reserve();
#endif

	kvm_hyp_reserve();

	/*
	 * sparse_init() tries to allocate memory from memblock, so must be
	 * done after the fixed reservations
	 */
	sparse_init();
	zone_sizes_init();

	/*
	 * Reserve the CMA area after arm64_dma_phys_limit was initialised.
	 */
	dma_contiguous_reserve(arm64_dma_phys_limit);

	/*
	 * request_standard_resources() depends on crashkernel's memory being
	 * reserved, so do it here.
	 */
	arch_reserve_crashkernel();

	memblock_dump_all();
}

void __init arch_mm_preinit(void)
{
	unsigned int flags = SWIOTLB_VERBOSE;
	bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);

	if (is_realm_world()) {
		swiotlb = true;
		flags |= SWIOTLB_FORCE;
	}

	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
		/*
		 * If no bouncing needed for ZONE_DMA, reduce the swiotlb
		 * buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
		 */
		unsigned long size =
			DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
		swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
		swiotlb = true;
	}

	swiotlb_init(swiotlb, flags);
	swiotlb_update_mem_attributes();

	/*
	 * Check boundaries twice: Some fundamental inconsistencies can be
	 * detected at build time already.
	 */
#ifdef CONFIG_COMPAT
	BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

	/*
	 * Selected page table levels should match when derived from
	 * scratch using the virtual address range and page size.
	 */
	BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
		     CONFIG_PGTABLE_LEVELS);

	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;
		/*
		 * On a machine this small we won't get anywhere without
		 * overcommit, so turn it on by default.
		 */
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}

void free_initmem(void)
{
	void *lm_init_begin = lm_alias(__init_begin);
	void *lm_init_end = lm_alias(__init_end);

	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));

	/* Delete __init region from memblock.reserved. */
	memblock_free(lm_init_begin, lm_init_end - lm_init_begin);

	free_reserved_area(lm_init_begin, lm_init_end,
			   POISON_FREE_INITMEM, "unused kernel");
	/*
	 * Unmap the __init region but leave the VM area in place. This
	 * prevents the region from being reused for kernel modules, which
	 * is not supported by kallsyms.
	 */
	vunmap_range((u64)__init_begin, (u64)__init_end);
}

void dump_mem_limit(void)
{
	if (memory_limit != PHYS_ADDR_MAX) {
		pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
	} else {
		pr_emerg("Memory Limit: none\n");
	}
}

#ifdef CONFIG_EXECMEM
static u64 module_direct_base __ro_after_init = 0;
static u64 module_plt_base __ro_after_init = 0;

/*
 * Choose a random page-aligned base address for a window of 'size' bytes which
 * entirely contains the interval [start, end - 1].
 */
static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
{
	u64 max_pgoff, pgoff;

	if ((end - start) >= size)
		return 0;

	max_pgoff = (size - (end - start)) / PAGE_SIZE;
	pgoff = get_random_u32_inclusive(0, max_pgoff);

	return start - pgoff * PAGE_SIZE;
}

/*
 * Modules may directly reference data and text anywhere within the kernel
 * image and other modules. References using PREL32 relocations have a +/-2G
 * range, and so we need to ensure that the entire kernel image and all modules
 * fall within a 2G window such that these are always within range.
 *
 * Modules may directly branch to functions and code within the kernel text,
 * and to functions and code within other modules. These branches will use
 * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
 * that the entire kernel text and all module text falls within a 128M window
 * such that these are always within range. With PLTs, we can expand this to a
 * 2G window.
 *
 * We chose the 128M region to surround the entire kernel image (rather than
 * just the text) as using the same bounds for the 128M and 2G regions ensures
 * by construction that we never select a 128M region that is not a subset of
 * the 2G region. For very large and unusual kernel configurations this means
 * we may fall back to PLTs where they could have been avoided, but this keeps
 * the logic significantly simpler.
 */
static int __init module_init_limits(void)
{
	u64 kernel_end = (u64)_end;
	u64 kernel_start = (u64)_text;
	u64 kernel_size = kernel_end - kernel_start;

	/*
	 * The default modules region is placed immediately below the kernel
	 * image, and is large enough to use the full 2G relocation range.
	 */
	BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
	BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);

	if (!kaslr_enabled()) {
		if (kernel_size < SZ_128M)
			module_direct_base = kernel_end - SZ_128M;
		if (kernel_size < SZ_2G)
			module_plt_base = kernel_end - SZ_2G;
	} else {
		u64 min = kernel_start;
		u64 max = kernel_end;

		if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
			pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
		} else {
			module_direct_base = random_bounding_box(SZ_128M, min, max);
			if (module_direct_base) {
				min = module_direct_base;
				max = module_direct_base + SZ_128M;
			}
		}

		module_plt_base = random_bounding_box(SZ_2G, min, max);
	}

	pr_info("%llu pages in range for non-PLT usage",
		module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
	pr_info("%llu pages in range for PLT usage",
		module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);

	return 0;
}

static struct execmem_info execmem_info __ro_after_init;

struct execmem_info __init *execmem_arch_setup(void)
{
	unsigned long fallback_start = 0, fallback_end = 0;
	unsigned long start = 0, end = 0;

	module_init_limits();

	/*
	 * Where possible, prefer to allocate within direct branch range of the
	 * kernel such that no PLTs are necessary.
	 */
	if (module_direct_base) {
		start = module_direct_base;
		end = module_direct_base + SZ_128M;

		if (module_plt_base) {
			fallback_start = module_plt_base;
			fallback_end = module_plt_base + SZ_2G;
		}
	} else if (module_plt_base) {
		start = module_plt_base;
		end = module_plt_base + SZ_2G;
	}

	execmem_info = (struct execmem_info){
		.ranges = {
			[EXECMEM_DEFAULT] = {
				.start	= start,
				.end	= end,
				.pgprot	= PAGE_KERNEL,
				.alignment = 1,
				.fallback_start	= fallback_start,
				.fallback_end	= fallback_end,
			},
			[EXECMEM_KPROBES] = {
				.start	= VMALLOC_START,
				.end	= VMALLOC_END,
				.pgprot	= PAGE_KERNEL_ROX,
				.alignment = 1,
			},
			[EXECMEM_BPF] = {
				.start	= VMALLOC_START,
				.end	= VMALLOC_END,
				.pgprot	= PAGE_KERNEL,
				.alignment = 1,
			},
		},
	};

	return &execmem_info;
}
#endif /* CONFIG_EXECMEM */
Commit	Line	Data
caab277b	1	// SPDX-License-Identifier: GPL-2.0-only
c1cc1552 CM	2	/*
	3	* Based on arch/arm/mm/init.c
	4	*
	5	* Copyright (C) 1995-2005 Russell King
	6	* Copyright (C) 2012 ARM Ltd.
c1cc1552 CM	7	*/
	8
	9	#include <linux/kernel.h>
	10	#include <linux/export.h>
	11	#include <linux/errno.h>
	12	#include <linux/swap.h>
	13	#include <linux/init.h>
5a9e3e15	14	#include <linux/cache.h>
c1cc1552 CM	15	#include <linux/mman.h>
	16	#include <linux/nodemask.h>
	17	#include <linux/initrd.h>
	18	#include <linux/gfp.h>
65033574	19	#include <linux/math.h>
c1cc1552 CM	20	#include <linux/memblock.h>
c1cc1552 CM	21	#include <linux/sort.h>
764b51ea	22	#include <linux/of.h>
c1cc1552	23	#include <linux/of_fdt.h>
8b5369ea	24	#include <linux/dma-direct.h>
0b1abd1f	25	#include <linux/dma-map-ops.h>
86c8b27a	26	#include <linux/efi.h>
a1e50a82	27	#include <linux/swiotlb.h>
dae8c235	28	#include <linux/vmalloc.h>
2077be67	29	#include <linux/mm.h>
764b51ea	30	#include <linux/kexec.h>
e62aaeac	31	#include <linux/crash_dump.h>
cf11e85f	32	#include <linux/hugetlb.h>
2b865293	33	#include <linux/acpi_iort.h>
85f58eb1	34	#include <linux/kmemleak.h>
0cc2dc49	35	#include <linux/execmem.h>
c1cc1552	36
a7f8de16	37	#include <asm/boot.h>
08375198	38	#include <asm/fixmap.h>
f9040773	39	#include <asm/kasan.h>
a7f8de16	40	#include <asm/kernel-pgtable.h>
f320bc74	41	#include <asm/kvm_host.h>
aa03c428	42	#include <asm/memory.h>
1a2db300	43	#include <asm/numa.h>
fbf979a0	44	#include <asm/rsi.h>
c1cc1552 CM	45	#include <asm/sections.h>
c1cc1552 CM	46	#include <asm/setup.h>
87dfb311	47	#include <linux/sizes.h>
c1cc1552	48	#include <asm/tlb.h>
e039ee4e	49	#include <asm/alternative.h>
687842ec	50	#include <asm/xen/swiotlb-xen.h>
c1cc1552	51
a7f8de16 AB	52	/*
	53	* We need to be able to catch inadvertent references to memstart_addr
	54	* that occur (potentially in generic code) before arm64_memblock_init()
	55	* executes, which assigns it its actual value. So use a default value
	56	* that cannot be mistaken for a real physical address.
	57	*/
5a9e3e15	58	s64 memstart_addr __ro_after_init = -1;
03ef055f MR	59	EXPORT_SYMBOL(memstart_addr);
03ef055f MR	60
1a8e1cef	61	/*
d78050ee CM	62	* If the corresponding config options are enabled, we create both ZONE_DMA
	63	* and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
	64	* unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
	65	* In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
	66	* otherwise it is empty.
1a8e1cef	67	*/
03149563	68	phys_addr_t __ro_after_init arm64_dma_phys_limit;
c1cc1552	69
4e0bacd6 ZJ	70	/*
	71	* To make optimal use of block mappings when laying out the linear
	72	* mapping, round down the base of physical memory to a size that can
	73	* be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
	74	* (64k granule), or a multiple that can be mapped using contiguous bits
	75	* in the page tables: 32 * PMD_SIZE (16k granule)
	76	*/
	77	#if defined(CONFIG_ARM64_4K_PAGES)
	78	#define ARM64_MEMSTART_SHIFT PUD_SHIFT
	79	#elif defined(CONFIG_ARM64_16K_PAGES)
	80	#define ARM64_MEMSTART_SHIFT CONT_PMD_SHIFT
	81	#else
	82	#define ARM64_MEMSTART_SHIFT PMD_SHIFT
	83	#endif
	84
	85	/*
	86	* sparsemem vmemmap imposes an additional requirement on the alignment of
	87	* memstart_addr, due to the fact that the base of the vmemmap region
	88	* has a direct correspondence, and needs to appear sufficiently aligned
	89	* in the virtual address space.
	90	*/
	91	#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
	92	#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS)
	93	#else
	94	#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT)
	95	#endif
	96
fdc26823	97	static void __init arch_reserve_crashkernel(void)
764b51ea	98	{
fdc26823	99	unsigned long long low_size = 0;
6c4dcadd	100	unsigned long long crash_base, crash_size;
6c4dcadd BH	101	bool high = false;
6c4dcadd BH	102	int ret;
764b51ea	103
40254101	104	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
d339f158 JZ	105	return;
d339f158 JZ	106
7b54a96f	107	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
fdc26823 BH	108	&crash_size, &crash_base,
	109	&low_size, &high);
	110	if (ret)
764b51ea	111	return;
944a45ab	112
7b54a96f	113	reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
764b51ea	114	}
764b51ea	115
ba0fb44a	116	static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
d50314a6	117	{
ba0fb44a	118	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
d50314a6 CM	119	}
d50314a6 CM	120
f41ef4c2	121	static void __init zone_sizes_init(void)
1a2db300 GK	122	{
1a2db300 GK	123	unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
ba0fb44a CM	124	phys_addr_t __maybe_unused acpi_zone_dma_limit;
	125	phys_addr_t __maybe_unused dt_zone_dma_limit;
	126	phys_addr_t __maybe_unused dma32_phys_limit =
	127	max_zone_phys(DMA_BIT_MASK(32));
1a2db300	128
1a8e1cef	129	#ifdef CONFIG_ZONE_DMA
ba0fb44a CM	130	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
	131	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
	132	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
56a70874 YS	133	/*
	134	* Information we get from firmware (e.g. DT dma-ranges) describe DMA
	135	* bus constraints. Devices using DMA might have their own limitations.
	136	* Some of them rely on DMA zone in low 32-bit memory. Keep low RAM
	137	* DMA zone on platforms that have RAM there.
	138	*/
	139	if (memblock_start_of_DRAM() < U32_MAX)
	140	zone_dma_limit = min(zone_dma_limit, U32_MAX);
ba0fb44a	141	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
1a8e1cef NSJ	142	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
1a8e1cef NSJ	143	#endif
0c1f14ed	144	#ifdef CONFIG_ZONE_DMA32
d78050ee CM	145	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
	146	if (!arm64_dma_phys_limit)
	147	arm64_dma_phys_limit = dma32_phys_limit;
0c1f14ed	148	#endif
504cae45 BH	149	if (!arm64_dma_phys_limit)
504cae45 BH	150	arm64_dma_phys_limit = PHYS_MASK + 1;
f41ef4c2	151	max_zone_pfns[ZONE_NORMAL] = max_pfn;
1a2db300	152
9691a071	153	free_area_init(max_zone_pfns);
1a2db300 GK	154	}
1a2db300 GK	155
873ba463	156	int pfn_is_map_memory(unsigned long pfn)
c1cc1552	157	{
093bbe21	158	phys_addr_t addr = PFN_PHYS(pfn);
4ab21506	159
873ba463 MR	160	/* avoid false positives for bogus PFNs, see comment in pfn_valid() */
873ba463 MR	161	if (PHYS_PFN(addr) != pfn)
4ab21506	162	return 0;
eeb0753b	163
5ad356ea	164	return memblock_is_map_memory(addr);
c1cc1552	165	}
873ba463	166	EXPORT_SYMBOL(pfn_is_map_memory);
c1cc1552	167
bb425a75	168	static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;
6083fe74 MR	169
	170	/*
	171	* Limit the memory size that was specified via FDT.
	172	*/
	173	static int __init early_mem(char *p)
	174	{
	175	if (!p)
	176	return 1;
	177
	178	memory_limit = memparse(p, &p) & PAGE_MASK;
	179	pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);
	180
	181	return 0;
	182	}
	183	early_param("mem", early_mem);
	184
c1cc1552 CM	185	void __init arm64_memblock_init(void)
c1cc1552 CM	186	{
88053ec8 AB	187	s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
	188
	189	/*
	190	* Corner case: 52-bit VA capable systems running KVM in nVHE mode may
	191	* be limited in their ability to support a linear map that exceeds 51
	192	* bits of VA space, depending on the placement of the ID map. Given
	193	* that the placement of the ID map may be randomized, let's simply
	194	* limit the kernel's linear map to 51 bits as well if we detect this
	195	* configuration.
	196	*/
	197	if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
	198	is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
	199	pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
	200	linear_region_size = min_t(u64, linear_region_size, BIT(51));
	201	}
a7f8de16	202
e9eaa805 KM	203	/* Remove memory above our supported physical address size */
	204	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);
	205
a7f8de16 AB	206	/*
	207	* Select a suitable value for the base of physical memory.
	208	*/
	209	memstart_addr = round_down(memblock_start_of_DRAM(),
	210	ARM64_MEMSTART_ALIGN);
	211
31f80a4e MZ	212	if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
	213	pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");
	214
a7f8de16 AB	215	/*
	216	* Remove the memory that we will not be able to cover with the
	217	* linear mapping. Take care not to clip the kernel which may be
	218	* high in memory.
	219	*/
2077be67 LA	220	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
2077be67 LA	221	__pa_symbol(_end)), ULLONG_MAX);
2958987f AB	222	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
	223	/* ensure that memstart_addr remains sufficiently aligned */
	224	memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
	225	ARM64_MEMSTART_ALIGN);
	226	memblock_remove(0, memstart_addr);
	227	}
a7f8de16	228
7bc1a0f9 AB	229	/*
	230	* If we are running with a 52-bit kernel VA config on a system that
	231	* does not support it, we have to place the available physical
	232	* memory in the 48-bit addressable part of the linear region, i.e.,
	233	* we have to move it upward. Since memstart_addr represents the
	234	* physical address of PAGE_OFFSET, we have to subtract from it.
	235	*/
	236	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
9684ec18	237	memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);
7bc1a0f9	238
a7f8de16 AB	239	/*
	240	* Apply the memory limit if it was set. Since the kernel may be loaded
	241	* high up in memory, add back the kernel region that must be accessible
	242	* via the linear mapping.
	243	*/
d7dc899a	244	if (memory_limit != PHYS_ADDR_MAX) {
cb0a6502	245	memblock_mem_limit_remove_map(memory_limit);
2077be67	246	memblock_add(__pa_symbol(_text), (u64)(_end - _text));
a7f8de16	247	}
6083fe74	248
c756c592	249	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
177e15f0 AB	250	/*
	251	* Add back the memory we just removed if it results in the
	252	* initrd to become inaccessible via the linear mapping.
	253	* Otherwise, this is a no-op
	254	*/
c756c592	255	u64 base = phys_initrd_start & PAGE_MASK;
d4d18e3e	256	u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
177e15f0 AB	257
	258	/*
	259	* We can only add back the initrd memory if we don't end up
	260	* with more memory than we can address via the linear mapping.
	261	* It is up to the bootloader to position the kernel and the
	262	* initrd reasonably close to each other (i.e., within 32 GB of
	263	* each other) so that all granule/#levels combinations can
	264	* always access both.
	265	*/
	266	if (WARN(base < memblock_start_of_DRAM() \|\|
	267	base + size > memblock_start_of_DRAM() +
	268	linear_region_size,
	269	"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
70b3d237	270	phys_initrd_size = 0;
177e15f0	271	} else {
177e15f0	272	memblock_add(base, size);
c0b978fe	273	memblock_clear_nomap(base, size);
177e15f0 AB	274	memblock_reserve(base, size);
	275	}
	276	}
	277
bd00cd5f MR	278	/*
	279	* Register the kernel text, kernel data, initrd, and initial
	280	* pagetables with memblock.
	281	*/
e2a073dd	282	memblock_reserve(__pa_symbol(_stext), _end - _stext);
c756c592	283	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
a89dea58	284	/* the generic initrd code expects virtual addresses */
c756c592 FF	285	initrd_start = __phys_to_virt(phys_initrd_start);
c756c592 FF	286	initrd_end = initrd_start + phys_initrd_size;
a89dea58	287	}
c1cc1552	288
0ceac9e0	289	early_init_fdt_scan_reserved_mem();
c1cc1552 CM	290	}
	291
	292	void __init bootmem_init(void)
	293	{
	294	unsigned long min, max;
	295
	296	min = PFN_UP(memblock_start_of_DRAM());
	297	max = PFN_DOWN(memblock_end_of_DRAM());
	298
36dd9086 VM	299	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
36dd9086 VM	300
1a2db300	301	max_pfn = max_low_pfn = max;
19d6242e	302	min_low_pfn = min;
1a2db300	303
eb75541f	304	arch_numa_init();
618e0786 BS	305
618e0786 BS	306	/*
eb75541f	307	* must be done after arch_numa_init() which calls numa_init() to
618e0786 BS	308	* initialize node_online_map that gets used in hugetlb_cma_reserve()
	309	* while allocating required CMA size across online nodes.
	310	*/
abb7962a AK	311	#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
abb7962a AK	312	arm64_hugetlb_cma_reserve();
618e0786 BS	313	#endif
618e0786 BS	314
f320bc74 QP	315	kvm_hyp_reserve();
f320bc74 QP	316
c1cc1552	317	/*
c89ab04f MR	318	* sparse_init() tries to allocate memory from memblock, so must be
c89ab04f MR	319	* done after the fixed reservations
c1cc1552	320	*/
c1cc1552	321	sparse_init();
f41ef4c2	322	zone_sizes_init();
c1cc1552	323
d78050ee CM	324	/*
	325	* Reserve the CMA area after arm64_dma_phys_limit was initialised.
	326	*/
	327	dma_contiguous_reserve(arm64_dma_phys_limit);
	328
0a30c535 NSJ	329	/*
	330	* request_standard_resources() depends on crashkernel's memory being
	331	* reserved, so do it here.
	332	*/
fdc26823	333	arch_reserve_crashkernel();
0a30c535	334
1a2db300	335	memblock_dump_all();
c1cc1552 CM	336	}
c1cc1552 CM	337
0d98484e	338	void __init arch_mm_preinit(void)
c1cc1552	339	{
fbf979a0	340	unsigned int flags = SWIOTLB_VERBOSE;
1c1a429e CM	341	bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
1c1a429e CM	342
fbf979a0 SP	343	if (is_realm_world()) {
	344	swiotlb = true;
	345	flags \|= SWIOTLB_FORCE;
	346	}
	347
65033574 CM	348	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
	349	/*
	350	* If no bouncing needed for ZONE_DMA, reduce the swiotlb
	351	* buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
	352	*/
	353	unsigned long size =
	354	DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
	355	swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
1c1a429e	356	swiotlb = true;
65033574	357	}
1c1a429e	358
fbf979a0 SP	359	swiotlb_init(swiotlb, flags);
fbf979a0 SP	360	swiotlb_update_mem_attributes();
a1e50a82	361
c1cc1552 CM	362	/*
	363	* Check boundaries twice: Some fundamental inconsistencies can be
	364	* detected at build time already.
	365	*/
	366	#ifdef CONFIG_COMPAT
363524d2	367	BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
c1cc1552	368	#endif
c1cc1552	369
7e04cc91 AK	370	/*
	371	* Selected page table levels should match when derived from
	372	* scratch using the virtual address range and page size.
	373	*/
	374	BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
	375	CONFIG_PGTABLE_LEVELS);
	376
bee4ebd1	377	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
c1cc1552 CM	378	extern int sysctl_overcommit_memory;
	379	/*
	380	* On a machine this small we won't get anywhere without
	381	* overcommit, so turn it on by default.
	382	*/
	383	sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	384	}
	385	}
	386
	387	void free_initmem(void)
	388	{
c02e7c5c JG	389	void *lm_init_begin = lm_alias(__init_begin);
	390	void *lm_init_end = lm_alias(__init_end);
	391
	392	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
	393	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));
1db9716d RQ	394
1db9716d RQ	395	/* Delete __init region from memblock.reserved. */
c02e7c5c	396	memblock_free(lm_init_begin, lm_init_end - lm_init_begin);
1db9716d	397
c02e7c5c	398	free_reserved_area(lm_init_begin, lm_init_end,
6ec939f8	399	POISON_FREE_INITMEM, "unused kernel");
dae8c235 KW	400	/*
	401	* Unmap the __init region but leave the VM area in place. This
	402	* prevents the region from being reused for kernel modules, which
	403	* is not supported by kallsyms.
	404	*/
4ad0ae8c	405	vunmap_range((u64)__init_begin, (u64)__init_end);
c1cc1552 CM	406	}
c1cc1552 CM	407
638d5031	408	void dump_mem_limit(void)
a7f8de16	409	{
d7dc899a	410	if (memory_limit != PHYS_ADDR_MAX) {
a7f8de16 AB	411	pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
	412	} else {
	413	pr_emerg("Memory Limit: none\n");
	414	}
a7f8de16	415	}
0cc2dc49 MRI	416
	417	#ifdef CONFIG_EXECMEM
	418	static u64 module_direct_base __ro_after_init = 0;
	419	static u64 module_plt_base __ro_after_init = 0;
	420
	421	/*
	422	* Choose a random page-aligned base address for a window of 'size' bytes which
	423	* entirely contains the interval [start, end - 1].
	424	*/
	425	static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
	426	{
	427	u64 max_pgoff, pgoff;
	428
	429	if ((end - start) >= size)
	430	return 0;
	431
	432	max_pgoff = (size - (end - start)) / PAGE_SIZE;
	433	pgoff = get_random_u32_inclusive(0, max_pgoff);
	434
	435	return start - pgoff * PAGE_SIZE;
	436	}
	437
	438	/*
	439	* Modules may directly reference data and text anywhere within the kernel
	440	* image and other modules. References using PREL32 relocations have a +/-2G
	441	* range, and so we need to ensure that the entire kernel image and all modules
	442	* fall within a 2G window such that these are always within range.
	443	*
	444	* Modules may directly branch to functions and code within the kernel text,
	445	* and to functions and code within other modules. These branches will use
	446	* CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
	447	* that the entire kernel text and all module text falls within a 128M window
	448	* such that these are always within range. With PLTs, we can expand this to a
	449	* 2G window.
	450	*
	451	* We chose the 128M region to surround the entire kernel image (rather than
	452	* just the text) as using the same bounds for the 128M and 2G regions ensures
	453	* by construction that we never select a 128M region that is not a subset of
	454	* the 2G region. For very large and unusual kernel configurations this means
	455	* we may fall back to PLTs where they could have been avoided, but this keeps
	456	* the logic significantly simpler.
	457	*/
	458	static int __init module_init_limits(void)
	459	{
	460	u64 kernel_end = (u64)_end;
	461	u64 kernel_start = (u64)_text;
	462	u64 kernel_size = kernel_end - kernel_start;
	463
	464	/*
	465	* The default modules region is placed immediately below the kernel
	466	* image, and is large enough to use the full 2G relocation range.
	467	*/
	468	BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
	469	BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);
	470
	471	if (!kaslr_enabled()) {
	472	if (kernel_size < SZ_128M)
	473	module_direct_base = kernel_end - SZ_128M;
	474	if (kernel_size < SZ_2G)
	475	module_plt_base = kernel_end - SZ_2G;
	476	} else {
	477	u64 min = kernel_start;
	478	u64 max = kernel_end;
	479
480	if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
481	pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
482	} else {
483	module_direct_base = random_bounding_box(SZ_128M, min, max);
484	if (module_direct_base) {
485	min = module_direct_base;
486	max = module_direct_base + SZ_128M;
487	}
488	}
489
490	module_plt_base = random_bounding_box(SZ_2G, min, max);
491	}
492
493	pr_info("%llu pages in range for non-PLT usage",
494	module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
495	pr_info("%llu pages in range for PLT usage",
496	module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);
497
498	return 0;
499	}
500
501	static struct execmem_info execmem_info __ro_after_init;
502
503	struct execmem_info __init *execmem_arch_setup(void)
504	{
505	unsigned long fallback_start = 0, fallback_end = 0;
506	unsigned long start = 0, end = 0;
507
508	module_init_limits();
509
510	/*
511	* Where possible, prefer to allocate within direct branch range of the
512	* kernel such that no PLTs are necessary.
513	*/
514	if (module_direct_base) {
515	start = module_direct_base;
516	end = module_direct_base + SZ_128M;
517
518	if (module_plt_base) {
519	fallback_start = module_plt_base;
520	fallback_end = module_plt_base + SZ_2G;
521	}
522	} else if (module_plt_base) {
523	start = module_plt_base;
524	end = module_plt_base + SZ_2G;
525	}
526
527	execmem_info = (struct execmem_info){
528	.ranges = {
529	[EXECMEM_DEFAULT] = {
530	.start = start,
531	.end = end,
532	.pgprot = PAGE_KERNEL,
533	.alignment = 1,
534	.fallback_start = fallback_start,
535	.fallback_end = fallback_end,
536	},
537	[EXECMEM_KPROBES] = {
538	.start = VMALLOC_START,
539	.end = VMALLOC_END,
540	.pgprot = PAGE_KERNEL_ROX,
541	.alignment = 1,
542	},
543	[EXECMEM_BPF] = {
544	.start = VMALLOC_START,
545	.end = VMALLOC_END,
546	.pgprot = PAGE_KERNEL,
547	.alignment = 1,
548	},
549	},
550	};
551
552	return &execmem_info;
553	}
554	#endif /* CONFIG_EXECMEM */