[linux-2.6-block.git] / arch / arm64 / mm / init.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/init.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/cache.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/initrd.h>
#include <linux/gfp.h>
#include <linux/math.h>
#include <linux/memblock.h>
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
#include <linux/acpi_iort.h>
#include <linux/kmemleak.h>
#include <linux/execmem.h>

#include <asm/boot.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/kvm_host.h>
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/rsi.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
#include <asm/xen/swiotlb-xen.h>

/*
 * memstart_addr is the physical address that corresponds to PAGE_OFFSET,
 * i.e. the base of the linear mapping.
 *
 * We need to be able to catch inadvertent references to memstart_addr
 * that occur (potentially in generic code) before arm64_memblock_init()
 * executes, which assigns it its actual value. So use a default value
 * that cannot be mistaken for a real physical address.
 */
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);

/*
 * If the corresponding config options are enabled, we create both ZONE_DMA
 * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
 * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
 * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
 * otherwise it is empty.
 *
 * arm64_dma_phys_limit is assigned in zone_sizes_init() and holds one past
 * the highest physical address of the lowest DMA zone that is configured
 * (or PHYS_MASK + 1 when it would otherwise remain zero). It is read by
 * bootmem_init() for the CMA reservation and by arch_mm_preinit() to decide
 * whether swiotlb is needed.
 */
phys_addr_t __ro_after_init arm64_dma_phys_limit;

/*
 * To make optimal use of block mappings when laying out the linear
 * mapping, round down the base of physical memory to a size that can
 * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
 * (64k granule), or a multiple that can be mapped using contiguous bits
 * in the page tables: 32 * PMD_SIZE (16k granule)
 */
#if defined(CONFIG_ARM64_4K_PAGES)
#define ARM64_MEMSTART_SHIFT		PUD_SHIFT
#elif defined(CONFIG_ARM64_16K_PAGES)
#define ARM64_MEMSTART_SHIFT		CONT_PMD_SHIFT
#else
#define ARM64_MEMSTART_SHIFT		PMD_SHIFT
#endif

/*
 * sparsemem vmemmap imposes an additional requirement on the alignment of
 * memstart_addr, due to the fact that the base of the vmemmap region
 * has a direct correspondence, and needs to appear sufficiently aligned
 * in the virtual address space.
 *
 * ARM64_MEMSTART_ALIGN is therefore the larger of the block-mapping
 * alignment selected above and the sparsemem section size; it is the
 * alignment arm64_memblock_init() applies when it picks memstart_addr.
 */
#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
#define ARM64_MEMSTART_ALIGN	(1UL << SECTION_SIZE_BITS)
#else
#define ARM64_MEMSTART_ALIGN	(1UL << ARM64_MEMSTART_SHIFT)
#endif

/*
 * Parse the crash kernel reservation request out of boot_command_line and,
 * when parsing succeeds, pass the result on to the generic reservation code.
 * Nothing is done when CONFIG_CRASH_RESERVE is disabled or parsing fails.
 */
static void __init arch_reserve_crashkernel(void)
{
	unsigned long long base, size;
	unsigned long long size_low = 0;
	bool want_high = false;

	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
		return;

	/* a non-zero result means there is nothing valid to reserve */
	if (parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
			      &size, &base, &size_low, &want_high))
		return;

	reserve_crashkernel_generic(size, base, size_low, want_high);
}

/*
 * Turn an inclusive zone limit into an exclusive upper bound, clamped so
 * that it never lies beyond the end of DRAM as reported by memblock.
 */
static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
{
	/* highest valid DRAM address (inclusive) */
	phys_addr_t last_dram_addr = memblock_end_of_DRAM() - 1;

	return min(zone_limit, last_dram_addr) + 1;
}

/*
 * Fill in the upper PFN bound of each memory zone and hand the table to
 * free_area_init(). As a side effect this settles arm64_dma_phys_limit
 * (and zone_dma_limit when CONFIG_ZONE_DMA is enabled).
 */
static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
	phys_addr_t __maybe_unused acpi_zone_dma_limit;
	phys_addr_t __maybe_unused dt_zone_dma_limit;
	/* end of the 32-bit addressable range, clamped to the end of DRAM */
	phys_addr_t __maybe_unused dma32_phys_limit =
		max_zone_phys(DMA_BIT_MASK(32));

#ifdef CONFIG_ZONE_DMA
	/* use the more restrictive of the ACPI- and DT-derived limits */
	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
	/*
	 * Information we get from firmware (e.g. DT dma-ranges) describe DMA
	 * bus constraints. Devices using DMA might have their own limitations.
	 * Some of them rely on DMA zone in low 32-bit memory. Keep low RAM
	 * DMA zone on platforms that have RAM there.
	 */
	if (memblock_start_of_DRAM() < U32_MAX)
		zone_dma_limit = min(zone_dma_limit, U32_MAX);
	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
	/* no limit was established above: fall back to the 32-bit one */
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = dma32_phys_limit;
#endif
	/*
	 * Still unset (presumably neither DMA zone is configured): use the
	 * end of the physical address space as the limit.
	 */
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = PHYS_MASK + 1;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init(max_zone_pfns);
}

/*
 * Report whether @pfn refers to memory that memblock considers mapped
 * memory. Returns non-zero if so, 0 otherwise.
 */
int pfn_is_map_memory(unsigned long pfn)
{
	phys_addr_t phys = PFN_PHYS(pfn);

	/*
	 * Only trust PFNs that survive the PFN -> physical -> PFN round trip;
	 * this avoids false positives for bogus PFNs, see comment in
	 * pfn_valid().
	 */
	if (PHYS_PFN(phys) == pfn)
		return memblock_is_map_memory(phys);

	return 0;
}
EXPORT_SYMBOL(pfn_is_map_memory);

/*
 * Memory limit requested through the "mem" early parameter, rounded down to
 * a page boundary by early_mem(). PHYS_ADDR_MAX means that no limit was
 * requested. Applied in arm64_memblock_init(), reported by dump_mem_limit().
 */
static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;

/*
 * Handler for the "mem" early parameter: limit the memory size that was
 * specified via FDT. The parsed value is rounded down to a page boundary
 * and stored in memory_limit.
 *
 * Returns 0 on success, or 1 when the parameter carries no value.
 */
static int __init early_mem(char *p)
{
	char *rest;

	if (p == NULL)
		return 1;

	memory_limit = memparse(p, &rest) & PAGE_MASK;
	pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);

	return 0;
}
early_param("mem", early_mem);

/*
 * Early memblock setup: pick memstart_addr, drop memory that lies above the
 * supported physical address size or that the linear mapping cannot cover,
 * apply the "mem" limit if one was given, and reserve the kernel image and
 * the initrd. Finishes by scanning the FDT for reserved-memory regions.
 */
void __init arm64_memblock_init(void)
{
	/* size of the virtual address range available to the linear map */
	s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);

	/*
	 * Corner case: 52-bit VA capable systems running KVM in nVHE mode may
	 * be limited in their ability to support a linear map that exceeds 51
	 * bits of VA space, depending on the placement of the ID map. Given
	 * that the placement of the ID map may be randomized, let's simply
	 * limit the kernel's linear map to 51 bits as well if we detect this
	 * configuration.
	 */
	if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
	    is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
		pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
		linear_region_size = min_t(u64, linear_region_size, BIT(51));
	}

	/* Remove memory above our supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/*
	 * Select a suitable value for the base of physical memory.
	 */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
		pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	/*
	 * If DRAM still extends past the linear region (because the clip
	 * above was moved up to keep the kernel), raise the base instead and
	 * drop everything below it.
	 */
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * If we are running with a 52-bit kernel VA config on a system that
	 * does not support it, we have to place the available physical
	 * memory in the 48-bit addressable part of the linear region, i.e.,
	 * we have to move it upward. Since memstart_addr represents the
	 * physical address of PAGE_OFFSET, we have to *subtract* from it.
	 */
	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
		memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if its removal made the
		 * initrd inaccessible via the linear mapping. Otherwise,
		 * this is a no-op.
		 */
		/* page-aligned range that fully covers the initrd */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			/* give up on the initrd; the check further down tests this */
			phys_initrd_size = 0;
		} else {
			memblock_add(base, size);
			memblock_clear_nomap(base, size);
			memblock_reserve(base, size);
		}
	}

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	memblock_reserve(__pa_symbol(_stext), _end - _stext);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();
}

/*
 * Record the PFN range of DRAM in the global PFN bounds and perform the
 * early memblock-based initialisation steps. The steps below are order
 * dependent; see the comment ahead of each one.
 */
void __init bootmem_init(void)
{
	unsigned long first_pfn = PFN_UP(memblock_start_of_DRAM());
	unsigned long end_pfn = PFN_DOWN(memblock_end_of_DRAM());

	early_memtest(first_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);

	min_low_pfn = first_pfn;
	max_low_pfn = end_pfn;
	max_pfn = end_pfn;

	arch_numa_init();

	/*
	 * The hugetlb CMA reservation allocates across online nodes, so it
	 * has to follow arch_numa_init(), which calls numa_init() to
	 * initialize the node_online_map used by hugetlb_cma_reserve().
	 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
	arm64_hugetlb_cma_reserve();
#endif

	kvm_hyp_reserve();

	/*
	 * The fixed reservations have to be in place by now, because
	 * sparse_init() allocates memory from memblock.
	 */
	sparse_init();
	zone_sizes_init();

	/*
	 * arm64_dma_phys_limit is valid only once zone_sizes_init() has run,
	 * so the CMA area is reserved afterwards.
	 */
	dma_contiguous_reserve(arm64_dma_phys_limit);

	/*
	 * Reserve the crashkernel memory here, since
	 * request_standard_resources() depends on it being reserved.
	 */
	arch_reserve_crashkernel();

	memblock_dump_all();
}

/*
 * Decide whether a swiotlb bounce buffer is needed (and how large it should
 * be), initialise it, run build-time consistency checks on the address
 * space configuration, and enable overcommit on very small machines.
 */
void __init arch_mm_preinit(void)
{
	unsigned int flags = SWIOTLB_VERBOSE;
	/* bouncing is needed when some RAM lies above the DMA limit */
	bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);

	/* in a realm, always enable swiotlb and request forced bouncing */
	if (is_realm_world()) {
		swiotlb = true;
		flags |= SWIOTLB_FORCE;
	}

	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
		/*
		 * If no bouncing needed for ZONE_DMA, reduce the swiotlb
		 * buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
		 */
		unsigned long size =
			DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
		/* never grow the buffer beyond the configured/default size */
		swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
		swiotlb = true;
	}

	swiotlb_init(swiotlb, flags);
	swiotlb_update_mem_attributes();

	/*
	 * Check boundaries twice: Some fundamental inconsistencies can be
	 * detected at build time already.
	 */
#ifdef CONFIG_COMPAT
	BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

	/*
	 * Selected page table levels should match when derived from
	 * scratch using the virtual address range and page size.
	 */
	BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
		     CONFIG_PGTABLE_LEVELS);

	/* applies only with pages of 16k or larger and at most 128 of them */
	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;
		/*
		 * On a machine this small we won't get anywhere without
		 * overcommit, so turn it on by default.
		 */
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}

void free_initmem(void)
{
	void *lm_init_begin = lm_alias(__init_begin);
	void *lm_init_end = lm_alias(__init_end);

	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));

	/* Delete __init region from memblock.reserved. */
	memblock_free(lm_init_begin, lm_init_end - lm_init_begin);

	free_reserved_area(lm_init_begin, lm_init_end,
			   POISON_FREE_INITMEM, "unused kernel");
	/*
	 * Unmap the __init region but leave the VM area in place. This
	 * prevents the region from being reused for kernel modules, which
	 * is not supported by kallsyms.
	 */
	vunmap_range((u64)__init_begin, (u64)__init_end);
}

/*
 * Print the memory limit in MB at emergency log level, or "none" when
 * memory_limit still holds its PHYS_ADDR_MAX default.
 */
void dump_mem_limit(void)
{
	if (memory_limit == PHYS_ADDR_MAX) {
		pr_emerg("Memory Limit: none\n");
		return;
	}

	pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
}

#ifdef CONFIG_EXECMEM
/*
 * Base addresses of the module allocation windows, chosen once by
 * module_init_limits(). module_direct_base is the start of a 128M window and
 * module_plt_base the start of a 2G window (see execmem_arch_setup()); a
 * value of 0 means that the corresponding window is not available.
 */
static u64 module_direct_base __ro_after_init = 0;
static u64 module_plt_base __ro_after_init = 0;

/*
 * Pick, at random, the base address of a window of 'size' bytes that
 * entirely contains the interval [start, end - 1]. The base lies a whole
 * number of pages below 'start'.
 *
 * Returns 0 when the interval is too large to fit inside such a window.
 */
static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
{
	u64 span = end - start;
	u64 slack_pages, shift_pages;

	if (span >= size)
		return 0;

	/* number of whole pages by which the window may be shifted down */
	slack_pages = (size - span) / PAGE_SIZE;
	shift_pages = get_random_u32_inclusive(0, slack_pages);

	return start - shift_pages * PAGE_SIZE;
}

/*
 * Modules may directly reference data and text anywhere within the kernel
 * image and other modules. References using PREL32 relocations have a +/-2G
 * range, and so we need to ensure that the entire kernel image and all modules
 * fall within a 2G window such that these are always within range.
 *
 * Modules may directly branch to functions and code within the kernel text,
 * and to functions and code within other modules. These branches will use
 * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
 * that the entire kernel text and all module text falls within a 128M window
 * such that these are always within range. With PLTs, we can expand this to a
 * 2G window.
 *
 * We chose the 128M region to surround the entire kernel image (rather than
 * just the text) as using the same bounds for the 128M and 2G regions ensures
 * by construction that we never select a 128M region that is not a subset of
 * the 2G region. For very large and unusual kernel configurations this means
 * we may fall back to PLTs where they could have been avoided, but this keeps
 * the logic significantly simpler.
 *
 * On return, module_direct_base and module_plt_base hold the base of the
 * 128M and 2G windows respectively, or remain 0 when the kernel image does
 * not fit in (or, for the 128M window, when
 * CONFIG_RANDOMIZE_MODULE_REGION_FULL forbids) the window in question.
 *
 * Always returns 0.
 */
static int __init module_init_limits(void)
{
	u64 kernel_end = (u64)_end;
	u64 kernel_start = (u64)_text;
	u64 kernel_size = kernel_end - kernel_start;

	/*
	 * The default modules region is placed immediately below the kernel
	 * image, and is large enough to use the full 2G relocation range.
	 */
	BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
	BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);

	if (!kaslr_enabled()) {
		/* no randomization: both windows end where the kernel ends */
		if (kernel_size < SZ_128M)
			module_direct_base = kernel_end - SZ_128M;
		if (kernel_size < SZ_2G)
			module_plt_base = kernel_end - SZ_2G;
	} else {
		u64 min = kernel_start;
		u64 max = kernel_end;

		if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
			pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
		} else {
			module_direct_base = random_bounding_box(SZ_128M, min, max);
			if (module_direct_base) {
				/*
				 * Make the 2G window enclose the whole 128M
				 * window, not just the kernel image.
				 */
				min = module_direct_base;
				max = module_direct_base + SZ_128M;
			}
		}

		module_plt_base = random_bounding_box(SZ_2G, min, max);
	}

	/*
	 * Terminate each message with a newline so that it is emitted as a
	 * complete log record rather than being left open for continuation.
	 */
	pr_info("%llu pages in range for non-PLT usage\n",
		module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
	pr_info("%llu pages in range for PLT usage\n",
		module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);

	return 0;
}

/* Range table handed out by execmem_arch_setup(); written once at init. */
static struct execmem_info execmem_info __ro_after_init;

/*
 * Compute the module address windows via module_init_limits() and fill in
 * the execmem range table for the default, kprobes and BPF range types.
 *
 * Returns a pointer to the file-local execmem_info.
 */
struct execmem_info __init *execmem_arch_setup(void)
{
	unsigned long main_start = 0, main_end = 0;
	unsigned long alt_start = 0, alt_end = 0;

	module_init_limits();

	if (!module_direct_base) {
		/* no 128M window: the 2G window, if any, is the only range */
		if (module_plt_base) {
			main_start = module_plt_base;
			main_end = module_plt_base + SZ_2G;
		}
	} else {
		/*
		 * Where possible, allocate within direct branch range of the
		 * kernel first so that no PLTs are necessary, and keep the
		 * 2G window only as a fallback.
		 */
		main_start = module_direct_base;
		main_end = module_direct_base + SZ_128M;

		if (module_plt_base) {
			alt_start = module_plt_base;
			alt_end = module_plt_base + SZ_2G;
		}
	}

	execmem_info = (struct execmem_info){
		.ranges = {
			[EXECMEM_DEFAULT] = {
				.start		= main_start,
				.end		= main_end,
				.fallback_start	= alt_start,
				.fallback_end	= alt_end,
				.pgprot		= PAGE_KERNEL,
				.alignment	= 1,
			},
			[EXECMEM_KPROBES] = {
				.start		= VMALLOC_START,
				.end		= VMALLOC_END,
				.pgprot		= PAGE_KERNEL_ROX,
				.alignment	= 1,
			},
			[EXECMEM_BPF] = {
				.start		= VMALLOC_START,
				.end		= VMALLOC_END,
				.pgprot		= PAGE_KERNEL,
				.alignment	= 1,
			},
		},
	};

	return &execmem_info;
}
#endif /* CONFIG_EXECMEM */
Commit	Line	Data
	1	// SPDX-License-Identifier: GPL-2.0-only
	2	/*
	3	* Based on arch/arm/mm/init.c
	4	*
	5	* Copyright (C) 1995-2005 Russell King
	6	* Copyright (C) 2012 ARM Ltd.
	7	*/
	8
	9	#include <linux/kernel.h>
	10	#include <linux/export.h>
	11	#include <linux/errno.h>
	12	#include <linux/swap.h>
	13	#include <linux/init.h>
	14	#include <linux/cache.h>
	15	#include <linux/mman.h>
	16	#include <linux/nodemask.h>
	17	#include <linux/initrd.h>
	18	#include <linux/gfp.h>
	19	#include <linux/math.h>
	20	#include <linux/memblock.h>
	21	#include <linux/sort.h>
	22	#include <linux/of.h>
	23	#include <linux/of_fdt.h>
	24	#include <linux/dma-direct.h>
	25	#include <linux/dma-map-ops.h>
	26	#include <linux/efi.h>
	27	#include <linux/swiotlb.h>
	28	#include <linux/vmalloc.h>
	29	#include <linux/mm.h>
	30	#include <linux/kexec.h>
	31	#include <linux/crash_dump.h>
	32	#include <linux/hugetlb.h>
	33	#include <linux/acpi_iort.h>
	34	#include <linux/kmemleak.h>
	35	#include <linux/execmem.h>
	36
	37	#include <asm/boot.h>
	38	#include <asm/fixmap.h>
	39	#include <asm/kasan.h>
	40	#include <asm/kernel-pgtable.h>
	41	#include <asm/kvm_host.h>
	42	#include <asm/memory.h>
	43	#include <asm/numa.h>
	44	#include <asm/rsi.h>
	45	#include <asm/sections.h>
	46	#include <asm/setup.h>
	47	#include <linux/sizes.h>
	48	#include <asm/tlb.h>
	49	#include <asm/alternative.h>
	50	#include <asm/xen/swiotlb-xen.h>
	51
	52	/*
	53	* We need to be able to catch inadvertent references to memstart_addr
	54	* that occur (potentially in generic code) before arm64_memblock_init()
	55	* executes, which assigns it its actual value. So use a default value
	56	* that cannot be mistaken for a real physical address.
	57	*/
	58	s64 memstart_addr __ro_after_init = -1;
	59	EXPORT_SYMBOL(memstart_addr);
	60
	61	/*
	62	* If the corresponding config options are enabled, we create both ZONE_DMA
	63	* and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
	64	* unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
	65	* In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
	66	* otherwise it is empty.
	67	*/
	68	phys_addr_t __ro_after_init arm64_dma_phys_limit;
	69
	70	/*
	71	* To make optimal use of block mappings when laying out the linear
	72	* mapping, round down the base of physical memory to a size that can
	73	* be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
	74	* (64k granule), or a multiple that can be mapped using contiguous bits
	75	* in the page tables: 32 * PMD_SIZE (16k granule)
	76	*/
	77	#if defined(CONFIG_ARM64_4K_PAGES)
	78	#define ARM64_MEMSTART_SHIFT PUD_SHIFT
	79	#elif defined(CONFIG_ARM64_16K_PAGES)
	80	#define ARM64_MEMSTART_SHIFT CONT_PMD_SHIFT
	81	#else
	82	#define ARM64_MEMSTART_SHIFT PMD_SHIFT
	83	#endif
	84
	85	/*
	86	* sparsemem vmemmap imposes an additional requirement on the alignment of
	87	* memstart_addr, due to the fact that the base of the vmemmap region
	88	* has a direct correspondence, and needs to appear sufficiently aligned
	89	* in the virtual address space.
	90	*/
	91	#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
	92	#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS)
	93	#else
	94	#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT)
	95	#endif
	96
	97	static void __init arch_reserve_crashkernel(void)
	98	{
	99	unsigned long long low_size = 0;
	100	unsigned long long crash_base, crash_size;
	101	bool high = false;
	102	int ret;
	103
	104	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
	105	return;
	106
	107	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
	108	&crash_size, &crash_base,
	109	&low_size, &high);
	110	if (ret)
	111	return;
	112
	113	reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
	114	}
	115
	116	static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
	117	{
	118	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
	119	}
	120
	121	static void __init zone_sizes_init(void)
	122	{
	123	unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
	124	phys_addr_t __maybe_unused acpi_zone_dma_limit;
	125	phys_addr_t __maybe_unused dt_zone_dma_limit;
	126	phys_addr_t __maybe_unused dma32_phys_limit =
	127	max_zone_phys(DMA_BIT_MASK(32));
	128
	129	#ifdef CONFIG_ZONE_DMA
	130	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
	131	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
	132	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
	133	/*
	134	* Information we get from firmware (e.g. DT dma-ranges) describe DMA
	135	* bus constraints. Devices using DMA might have their own limitations.
	136	* Some of them rely on DMA zone in low 32-bit memory. Keep low RAM
	137	* DMA zone on platforms that have RAM there.
	138	*/
	139	if (memblock_start_of_DRAM() < U32_MAX)
	140	zone_dma_limit = min(zone_dma_limit, U32_MAX);
	141	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
	142	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
	143	#endif
	144	#ifdef CONFIG_ZONE_DMA32
	145	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
	146	if (!arm64_dma_phys_limit)
	147	arm64_dma_phys_limit = dma32_phys_limit;
	148	#endif
	149	if (!arm64_dma_phys_limit)
	150	arm64_dma_phys_limit = PHYS_MASK + 1;
	151	max_zone_pfns[ZONE_NORMAL] = max_pfn;
	152
	153	free_area_init(max_zone_pfns);
	154	}
	155
	156	int pfn_is_map_memory(unsigned long pfn)
	157	{
	158	phys_addr_t addr = PFN_PHYS(pfn);
	159
	160	/* avoid false positives for bogus PFNs, see comment in pfn_valid() */
	161	if (PHYS_PFN(addr) != pfn)
	162	return 0;
	163
	164	return memblock_is_map_memory(addr);
	165	}
	166	EXPORT_SYMBOL(pfn_is_map_memory);
	167
	168	static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;
	169
	170	/*
	171	* Limit the memory size that was specified via FDT.
	172	*/
	173	static int __init early_mem(char *p)
	174	{
	175	if (!p)
	176	return 1;
	177
	178	memory_limit = memparse(p, &p) & PAGE_MASK;
	179	pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);
	180
	181	return 0;
	182	}
	183	early_param("mem", early_mem);
	184
	185	void __init arm64_memblock_init(void)
	186	{
	187	s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
	188
	189	/*
	190	* Corner case: 52-bit VA capable systems running KVM in nVHE mode may
	191	* be limited in their ability to support a linear map that exceeds 51
	192	* bits of VA space, depending on the placement of the ID map. Given
	193	* that the placement of the ID map may be randomized, let's simply
	194	* limit the kernel's linear map to 51 bits as well if we detect this
	195	* configuration.
	196	*/
	197	if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
	198	is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
	199	pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
	200	linear_region_size = min_t(u64, linear_region_size, BIT(51));
	201	}
	202
	203	/* Remove memory above our supported physical address size */
	204	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);
	205
	206	/*
	207	* Select a suitable value for the base of physical memory.
	208	*/
	209	memstart_addr = round_down(memblock_start_of_DRAM(),
	210	ARM64_MEMSTART_ALIGN);
	211
	212	if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
	213	pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");
	214
	215	/*
	216	* Remove the memory that we will not be able to cover with the
	217	* linear mapping. Take care not to clip the kernel which may be
	218	* high in memory.
	219	*/
	220	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
	221	__pa_symbol(_end)), ULLONG_MAX);
	222	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
	223	/* ensure that memstart_addr remains sufficiently aligned */
	224	memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
	225	ARM64_MEMSTART_ALIGN);
	226	memblock_remove(0, memstart_addr);
	227	}
	228
	229	/*
	230	* If we are running with a 52-bit kernel VA config on a system that
	231	* does not support it, we have to place the available physical
	232	* memory in the 48-bit addressable part of the linear region, i.e.,
	233	* we have to move it upward. Since memstart_addr represents the
	234	* physical address of PAGE_OFFSET, we have to subtract from it.
	235	*/
	236	if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
	237	memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);
	238
	239	/*
	240	* Apply the memory limit if it was set. Since the kernel may be loaded
	241	* high up in memory, add back the kernel region that must be accessible
	242	* via the linear mapping.
	243	*/
	244	if (memory_limit != PHYS_ADDR_MAX) {
	245	memblock_mem_limit_remove_map(memory_limit);
	246	memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	247	}
	248
	249	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
	250	/*
	251	* Add back the memory we just removed if it results in the
	252	* initrd to become inaccessible via the linear mapping.
	253	* Otherwise, this is a no-op
	254	*/
	255	u64 base = phys_initrd_start & PAGE_MASK;
	256	u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
	257
	258	/*
	259	* We can only add back the initrd memory if we don't end up
	260	* with more memory than we can address via the linear mapping.
	261	* It is up to the bootloader to position the kernel and the
	262	* initrd reasonably close to each other (i.e., within 32 GB of
	263	* each other) so that all granule/#levels combinations can
	264	* always access both.
	265	*/
	266	if (WARN(base < memblock_start_of_DRAM() \|\|
	267	base + size > memblock_start_of_DRAM() +
	268	linear_region_size,
	269	"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
	270	phys_initrd_size = 0;
	271	} else {
	272	memblock_add(base, size);
	273	memblock_clear_nomap(base, size);
	274	memblock_reserve(base, size);
	275	}
	276	}
	277
	278	/*
	279	* Register the kernel text, kernel data, initrd, and initial
	280	* pagetables with memblock.
	281	*/
	282	memblock_reserve(__pa_symbol(_stext), _end - _stext);
	283	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
	284	/* the generic initrd code expects virtual addresses */
	285	initrd_start = __phys_to_virt(phys_initrd_start);
	286	initrd_end = initrd_start + phys_initrd_size;
	287	}
	288
	289	early_init_fdt_scan_reserved_mem();
	290	}
	291
	292	void __init bootmem_init(void)
	293	{
	294	unsigned long min, max;
	295
	296	min = PFN_UP(memblock_start_of_DRAM());
	297	max = PFN_DOWN(memblock_end_of_DRAM());
	298
	299	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
	300
	301	max_pfn = max_low_pfn = max;
	302	min_low_pfn = min;
	303
	304	arch_numa_init();
	305
	306	/*
	307	* must be done after arch_numa_init() which calls numa_init() to
	308	* initialize node_online_map that gets used in hugetlb_cma_reserve()
	309	* while allocating required CMA size across online nodes.
	310	*/
	311	#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
	312	arm64_hugetlb_cma_reserve();
	313	#endif
	314
	315	kvm_hyp_reserve();
	316
	317	/*
	318	* sparse_init() tries to allocate memory from memblock, so must be
	319	* done after the fixed reservations
	320	*/
	321	sparse_init();
	322	zone_sizes_init();
	323
	324	/*
	325	* Reserve the CMA area after arm64_dma_phys_limit was initialised.
	326	*/
	327	dma_contiguous_reserve(arm64_dma_phys_limit);
	328
	329	/*
	330	* request_standard_resources() depends on crashkernel's memory being
	331	* reserved, so do it here.
	332	*/
	333	arch_reserve_crashkernel();
	334
	335	memblock_dump_all();
	336	}
	337
	338	void __init arch_mm_preinit(void)
	339	{
	340	unsigned int flags = SWIOTLB_VERBOSE;
	341	bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
	342
	343	if (is_realm_world()) {
	344	swiotlb = true;
	345	flags \|= SWIOTLB_FORCE;
	346	}
	347
	348	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
	349	/*
	350	* If no bouncing needed for ZONE_DMA, reduce the swiotlb
	351	* buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
	352	*/
	353	unsigned long size =
	354	DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
	355	swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
	356	swiotlb = true;
	357	}
	358
	359	swiotlb_init(swiotlb, flags);
	360	swiotlb_update_mem_attributes();
	361
	362	/*
	363	* Check boundaries twice: Some fundamental inconsistencies can be
	364	* detected at build time already.
	365	*/
	366	#ifdef CONFIG_COMPAT
	367	BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
	368	#endif
	369
	370	/*
	371	* Selected page table levels should match when derived from
	372	* scratch using the virtual address range and page size.
	373	*/
	374	BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
	375	CONFIG_PGTABLE_LEVELS);
	376
	377	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
	378	extern int sysctl_overcommit_memory;
	379	/*
	380	* On a machine this small we won't get anywhere without
	381	* overcommit, so turn it on by default.
	382	*/
	383	sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	384	}
	385	}
	386
	387	void free_initmem(void)
	388	{
	389	void *lm_init_begin = lm_alias(__init_begin);
	390	void *lm_init_end = lm_alias(__init_end);
	391
	392	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
	393	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));
	394
	395	/* Delete __init region from memblock.reserved. */
	396	memblock_free(lm_init_begin, lm_init_end - lm_init_begin);
	397
	398	free_reserved_area(lm_init_begin, lm_init_end,
	399	POISON_FREE_INITMEM, "unused kernel");
	400	/*
	401	* Unmap the __init region but leave the VM area in place. This
	402	* prevents the region from being reused for kernel modules, which
	403	* is not supported by kallsyms.
	404	*/
	405	vunmap_range((u64)__init_begin, (u64)__init_end);
	406	}
	407
	408	void dump_mem_limit(void)
	409	{
	410	if (memory_limit != PHYS_ADDR_MAX) {
	411	pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
	412	} else {
	413	pr_emerg("Memory Limit: none\n");
	414	}
	415	}
	416
	417	#ifdef CONFIG_EXECMEM
	418	static u64 module_direct_base __ro_after_init = 0;
	419	static u64 module_plt_base __ro_after_init = 0;
	420
	421	/*
	422	* Choose a random page-aligned base address for a window of 'size' bytes which
	423	* entirely contains the interval [start, end - 1].
	424	*/
	425	static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
	426	{
	427	u64 max_pgoff, pgoff;
	428
	429	if ((end - start) >= size)
	430	return 0;
	431
	432	max_pgoff = (size - (end - start)) / PAGE_SIZE;
	433	pgoff = get_random_u32_inclusive(0, max_pgoff);
	434
	435	return start - pgoff * PAGE_SIZE;
	436	}
	437
	438	/*
	439	* Modules may directly reference data and text anywhere within the kernel
	440	* image and other modules. References using PREL32 relocations have a +/-2G
	441	* range, and so we need to ensure that the entire kernel image and all modules
	442	* fall within a 2G window such that these are always within range.
	443	*
	444	* Modules may directly branch to functions and code within the kernel text,
	445	* and to functions and code within other modules. These branches will use
	446	* CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
	447	* that the entire kernel text and all module text falls within a 128M window
	448	* such that these are always within range. With PLTs, we can expand this to a
	449	* 2G window.
	450	*
	451	* We chose the 128M region to surround the entire kernel image (rather than
	452	* just the text) as using the same bounds for the 128M and 2G regions ensures
	453	* by construction that we never select a 128M region that is not a subset of
	454	* the 2G region. For very large and unusual kernel configurations this means
	455	* we may fall back to PLTs where they could have been avoided, but this keeps
	456	* the logic significantly simpler.
	457	*/
	458	static int __init module_init_limits(void)
	459	{
	460	u64 kernel_end = (u64)_end;
	461	u64 kernel_start = (u64)_text;
	462	u64 kernel_size = kernel_end - kernel_start;
	463
	464	/*
	465	* The default modules region is placed immediately below the kernel
	466	* image, and is large enough to use the full 2G relocation range.
	467	*/
	468	BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
	469	BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);
	470
	471	if (!kaslr_enabled()) {
	472	if (kernel_size < SZ_128M)
	473	module_direct_base = kernel_end - SZ_128M;
	474	if (kernel_size < SZ_2G)
	475	module_plt_base = kernel_end - SZ_2G;
	476	} else {
	477	u64 min = kernel_start;
	478	u64 max = kernel_end;
	479
	480	if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
	481	pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
	482	} else {
	483	module_direct_base = random_bounding_box(SZ_128M, min, max);
	484	if (module_direct_base) {
	485	min = module_direct_base;
	486	max = module_direct_base + SZ_128M;
	487	}
	488	}
	489
	490	module_plt_base = random_bounding_box(SZ_2G, min, max);
	491	}
	492
	493	pr_info("%llu pages in range for non-PLT usage",
	494	module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
	495	pr_info("%llu pages in range for PLT usage",
	496	module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);
	497
	498	return 0;
	499	}
	500
	501	static struct execmem_info execmem_info __ro_after_init;
	502
	503	struct execmem_info __init *execmem_arch_setup(void)
	504	{
	505	unsigned long fallback_start = 0, fallback_end = 0;
	506	unsigned long start = 0, end = 0;
	507
	508	module_init_limits();
	509
	510	/*
	511	* Where possible, prefer to allocate within direct branch range of the
	512	* kernel such that no PLTs are necessary.
	513	*/
	514	if (module_direct_base) {
	515	start = module_direct_base;
	516	end = module_direct_base + SZ_128M;
	517
	518	if (module_plt_base) {
	519	fallback_start = module_plt_base;
	520	fallback_end = module_plt_base + SZ_2G;
	521	}
	522	} else if (module_plt_base) {
	523	start = module_plt_base;
	524	end = module_plt_base + SZ_2G;
	525	}
	526
	527	execmem_info = (struct execmem_info){
	528	.ranges = {
	529	[EXECMEM_DEFAULT] = {
	530	.start = start,
	531	.end = end,
	532	.pgprot = PAGE_KERNEL,
	533	.alignment = 1,
	534	.fallback_start = fallback_start,
	535	.fallback_end = fallback_end,
	536	},
	537	[EXECMEM_KPROBES] = {
	538	.start = VMALLOC_START,
	539	.end = VMALLOC_END,
	540	.pgprot = PAGE_KERNEL_ROX,
	541	.alignment = 1,
	542	},
	543	[EXECMEM_BPF] = {
	544	.start = VMALLOC_START,
	545	.end = VMALLOC_END,
	546	.pgprot = PAGE_KERNEL,
	547	.alignment = 1,
	548	},
	549	},
	550	};
	551
	552	return &execmem_info;
	553	}
	554	#endif /* CONFIG_EXECMEM */