Commit | Line | Data |
---|---|---|
9f4c815c IM |
1 | /* |
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | |
1da177e4 | 3 | * Thanks to Ben LaHaise for precious feedback. |
9f4c815c | 4 | */ |
1da177e4 | 5 | #include <linux/highmem.h> |
8192206d | 6 | #include <linux/bootmem.h> |
1da177e4 | 7 | #include <linux/module.h> |
9f4c815c | 8 | #include <linux/sched.h> |
1da177e4 | 9 | #include <linux/slab.h> |
9f4c815c IM |
10 | #include <linux/mm.h> |
11 | ||
950f9d95 | 12 | #include <asm/e820.h> |
1da177e4 LT |
13 | #include <asm/processor.h> |
14 | #include <asm/tlbflush.h> | |
f8af095d | 15 | #include <asm/sections.h> |
9f4c815c IM |
16 | #include <asm/uaccess.h> |
17 | #include <asm/pgalloc.h> | |
1da177e4 | 18 | |
ed724be6 AV |
/*
 * Return non-zero when @addr lies inside the half-open interval
 * [@start, @end), zero otherwise.
 */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	if (addr < start)
		return 0;

	return addr < end;
}
24 | ||
25 | /* | |
26 | * Certain areas of memory on x86 require very specific protection flags, | |
27 | * for example the BIOS area or kernel text. Callers don't always get this | |
28 | * right (again, ioremap() on BIOS memory is not uncommon) so this function | |
29 | * checks and fixes these known static required protection bits. | |
30 | */ | |
31 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) | |
32 | { | |
33 | pgprot_t forbidden = __pgprot(0); | |
34 | ||
687c4825 | 35 | /* |
ed724be6 AV |
36 | * The BIOS area between 640k and 1Mb needs to be executable for |
37 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | |
687c4825 | 38 | */ |
ed724be6 AV |
39 | if (within(__pa(address), BIOS_BEGIN, BIOS_END)) |
40 | pgprot_val(forbidden) |= _PAGE_NX; | |
41 | ||
42 | /* | |
43 | * The kernel text needs to be executable for obvious reasons | |
44 | * Does not cover __inittext since that is gone later on | |
45 | */ | |
46 | if (within(address, (unsigned long)_text, (unsigned long)_etext)) | |
47 | pgprot_val(forbidden) |= _PAGE_NX; | |
48 | ||
49 | #ifdef CONFIG_DEBUG_RODATA | |
50 | /* The .rodata section needs to be read-only */ | |
51 | if (within(address, (unsigned long)__start_rodata, | |
52 | (unsigned long)__end_rodata)) | |
53 | pgprot_val(forbidden) |= _PAGE_RW; | |
54 | #endif | |
55 | ||
56 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | |
687c4825 IM |
57 | |
58 | return prot; | |
59 | } | |
60 | ||
f0646e43 | 61 | pte_t *lookup_address(unsigned long address, int *level) |
9f4c815c | 62 | { |
1da177e4 LT |
63 | pgd_t *pgd = pgd_offset_k(address); |
64 | pud_t *pud; | |
65 | pmd_t *pmd; | |
9f4c815c | 66 | |
30551bb3 TG |
67 | *level = PG_LEVEL_NONE; |
68 | ||
1da177e4 LT |
69 | if (pgd_none(*pgd)) |
70 | return NULL; | |
71 | pud = pud_offset(pgd, address); | |
72 | if (pud_none(*pud)) | |
73 | return NULL; | |
74 | pmd = pmd_offset(pud, address); | |
75 | if (pmd_none(*pmd)) | |
76 | return NULL; | |
30551bb3 TG |
77 | |
78 | *level = PG_LEVEL_2M; | |
1da177e4 LT |
79 | if (pmd_large(*pmd)) |
80 | return (pte_t *)pmd; | |
1da177e4 | 81 | |
30551bb3 | 82 | *level = PG_LEVEL_4K; |
9f4c815c IM |
83 | return pte_offset_kernel(pmd, address); |
84 | } | |
85 | ||
9a3dc780 | 86 | static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) |
9f4c815c | 87 | { |
9f4c815c IM |
88 | /* change init_mm */ |
89 | set_pte_atomic(kpte, pte); | |
44af6c41 | 90 | #ifdef CONFIG_X86_32 |
e4b71dcf | 91 | if (!SHARED_KERNEL_PMD) { |
44af6c41 IM |
92 | struct page *page; |
93 | ||
94 | for (page = pgd_list; page; page = (struct page *)page->index) { | |
95 | pgd_t *pgd; | |
96 | pud_t *pud; | |
97 | pmd_t *pmd; | |
98 | ||
99 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
100 | pud = pud_offset(pgd, address); | |
101 | pmd = pmd_offset(pud, address); | |
102 | set_pte_atomic((pte_t *)pmd, pte); | |
103 | } | |
1da177e4 | 104 | } |
44af6c41 | 105 | #endif |
1da177e4 LT |
106 | } |
107 | ||
7afe15b9 | 108 | static int split_large_page(pte_t *kpte, unsigned long address) |
bb5c2dbd | 109 | { |
7afe15b9 | 110 | pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
12d6f21e | 111 | gfp_t gfp_flags = GFP_KERNEL; |
9a3dc780 | 112 | unsigned long flags; |
bb5c2dbd IM |
113 | unsigned long addr; |
114 | pte_t *pbase, *tmp; | |
115 | struct page *base; | |
7afe15b9 | 116 | int i, level; |
bb5c2dbd | 117 | |
12d6f21e IM |
118 | #ifdef CONFIG_DEBUG_PAGEALLOC |
119 | gfp_flags = GFP_ATOMIC; | |
120 | #endif | |
121 | base = alloc_pages(gfp_flags, 0); | |
bb5c2dbd IM |
122 | if (!base) |
123 | return -ENOMEM; | |
124 | ||
9a3dc780 | 125 | spin_lock_irqsave(&pgd_lock, flags); |
bb5c2dbd IM |
126 | /* |
127 | * Check for races, another CPU might have split this page | |
128 | * up for us already: | |
129 | */ | |
130 | tmp = lookup_address(address, &level); | |
5508a748 IM |
131 | if (tmp != kpte) { |
132 | WARN_ON_ONCE(1); | |
bb5c2dbd | 133 | goto out_unlock; |
5508a748 | 134 | } |
bb5c2dbd IM |
135 | |
136 | address = __pa(address); | |
137 | addr = address & LARGE_PAGE_MASK; | |
138 | pbase = (pte_t *)page_address(base); | |
44af6c41 | 139 | #ifdef CONFIG_X86_32 |
bb5c2dbd | 140 | paravirt_alloc_pt(&init_mm, page_to_pfn(base)); |
44af6c41 | 141 | #endif |
bb5c2dbd IM |
142 | |
143 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) | |
144 | set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); | |
145 | ||
146 | /* | |
4c881ca1 HY |
147 | * Install the new, split up pagetable. Important detail here: |
148 | * | |
149 | * On Intel the NX bit of all levels must be cleared to make a | |
150 | * page executable. See section 4.13.2 of Intel 64 and IA-32 | |
151 | * Architectures Software Developer's Manual). | |
bb5c2dbd | 152 | */ |
4c881ca1 | 153 | ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); |
9a3dc780 | 154 | __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); |
bb5c2dbd IM |
155 | base = NULL; |
156 | ||
157 | out_unlock: | |
9a3dc780 | 158 | spin_unlock_irqrestore(&pgd_lock, flags); |
bb5c2dbd IM |
159 | |
160 | if (base) | |
161 | __free_pages(base, 0); | |
162 | ||
163 | return 0; | |
164 | } | |
165 | ||
44af6c41 | 166 | static int |
8192206d | 167 | __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot) |
9f4c815c | 168 | { |
1da177e4 | 169 | struct page *kpte_page; |
bb5c2dbd | 170 | int level, err = 0; |
9f4c815c | 171 | pte_t *kpte; |
1da177e4 | 172 | |
8192206d IM |
173 | #ifdef CONFIG_X86_32 |
174 | BUG_ON(pfn > max_low_pfn); | |
175 | #endif | |
1da177e4 | 176 | |
97f99fed | 177 | repeat: |
f0646e43 | 178 | kpte = lookup_address(address, &level); |
1da177e4 LT |
179 | if (!kpte) |
180 | return -EINVAL; | |
9f4c815c | 181 | |
1da177e4 | 182 | kpte_page = virt_to_page(kpte); |
65d2f0bc AK |
183 | BUG_ON(PageLRU(kpte_page)); |
184 | BUG_ON(PageCompound(kpte_page)); | |
185 | ||
ed724be6 | 186 | prot = static_protections(prot, address); |
65d2f0bc | 187 | |
30551bb3 | 188 | if (level == PG_LEVEL_4K) { |
8192206d | 189 | set_pte_atomic(kpte, pfn_pte(pfn, canon_pgprot(prot))); |
78c94aba | 190 | } else { |
7afe15b9 | 191 | err = split_large_page(kpte, address); |
bb5c2dbd IM |
192 | if (!err) |
193 | goto repeat; | |
1da177e4 | 194 | } |
bb5c2dbd | 195 | return err; |
9f4c815c | 196 | } |
1da177e4 | 197 | |
44af6c41 IM |
198 | /** |
199 | * change_page_attr_addr - Change page table attributes in linear mapping | |
200 | * @address: Virtual address in linear mapping. | |
201 | * @numpages: Number of pages to change | |
202 | * @prot: New page table attribute (PAGE_*) | |
1da177e4 | 203 | * |
44af6c41 IM |
204 | * Change page attributes of a page in the direct mapping. This is a variant |
205 | * of change_page_attr() that also works on memory holes that do not have | |
206 | * mem_map entry (pfn_valid() is false). | |
9f4c815c | 207 | * |
44af6c41 | 208 | * See change_page_attr() documentation for more details. |
75cbade8 AV |
209 | * |
210 | * Modules and drivers should use the set_memory_* APIs instead. | |
1da177e4 | 211 | */ |
44af6c41 | 212 | |
d1028a15 AV |
213 | static int change_page_attr_addr(unsigned long address, int numpages, |
214 | pgprot_t prot) | |
1da177e4 | 215 | { |
44af6c41 IM |
216 | int err = 0, kernel_map = 0, i; |
217 | ||
218 | #ifdef CONFIG_X86_64 | |
219 | if (address >= __START_KERNEL_map && | |
220 | address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { | |
1da177e4 | 221 | |
44af6c41 IM |
222 | address = (unsigned long)__va(__pa(address)); |
223 | kernel_map = 1; | |
224 | } | |
225 | #endif | |
226 | ||
227 | for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | |
228 | unsigned long pfn = __pa(address) >> PAGE_SHIFT; | |
229 | ||
230 | if (!kernel_map || pte_present(pfn_pte(0, prot))) { | |
8192206d | 231 | err = __change_page_attr(address, pfn, prot); |
44af6c41 IM |
232 | if (err) |
233 | break; | |
234 | } | |
235 | #ifdef CONFIG_X86_64 | |
236 | /* | |
237 | * Handle kernel mapping too which aliases part of | |
238 | * lowmem: | |
239 | */ | |
240 | if (__pa(address) < KERNEL_TEXT_SIZE) { | |
241 | unsigned long addr2; | |
242 | pgprot_t prot2; | |
243 | ||
244 | addr2 = __START_KERNEL_map + __pa(address); | |
245 | /* Make sure the kernel mappings stay executable */ | |
246 | prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); | |
8192206d | 247 | err = __change_page_attr(addr2, pfn, prot2); |
44af6c41 IM |
248 | } |
249 | #endif | |
9f4c815c | 250 | } |
9f4c815c | 251 | |
1da177e4 LT |
252 | return err; |
253 | } | |
254 | ||
75cbade8 AV |
255 | /** |
256 | * change_page_attr_set - Change page table attributes in the linear mapping. | |
257 | * @addr: Virtual address in linear mapping. | |
258 | * @numpages: Number of pages to change | |
259 | * @prot: Protection/caching type bits to set (PAGE_*) | |
260 | * | |
261 | * Returns 0 on success, otherwise a negated errno. | |
262 | * | |
263 | * This should be used when a page is mapped with a different caching policy | |
264 | * than write-back somewhere - some CPUs do not like it when mappings with | |
265 | * different caching policies exist. This changes the page attributes of the | |
266 | * in kernel linear mapping too. | |
267 | * | |
75cbade8 AV |
268 | * The caller needs to ensure that there are no conflicting mappings elsewhere |
269 | * (e.g. in user space) * This function only deals with the kernel linear map. | |
270 | * | |
271 | * This function is different from change_page_attr() in that only selected bits | |
272 | * are impacted, all other bits remain as is. | |
273 | */ | |
d1028a15 AV |
274 | static int change_page_attr_set(unsigned long addr, int numpages, |
275 | pgprot_t prot) | |
75cbade8 AV |
276 | { |
277 | pgprot_t current_prot; | |
278 | int level; | |
279 | pte_t *pte; | |
280 | ||
281 | pte = lookup_address(addr, &level); | |
282 | if (pte) | |
283 | current_prot = pte_pgprot(*pte); | |
284 | else | |
285 | pgprot_val(current_prot) = 0; | |
286 | ||
287 | pgprot_val(prot) = pgprot_val(current_prot) | pgprot_val(prot); | |
288 | ||
289 | return change_page_attr_addr(addr, numpages, prot); | |
290 | } | |
291 | ||
292 | /** | |
293 | * change_page_attr_clear - Change page table attributes in the linear mapping. | |
294 | * @addr: Virtual address in linear mapping. | |
295 | * @numpages: Number of pages to change | |
296 | * @prot: Protection/caching type bits to clear (PAGE_*) | |
297 | * | |
298 | * Returns 0 on success, otherwise a negated errno. | |
299 | * | |
300 | * This should be used when a page is mapped with a different caching policy | |
301 | * than write-back somewhere - some CPUs do not like it when mappings with | |
302 | * different caching policies exist. This changes the page attributes of the | |
303 | * in kernel linear mapping too. | |
304 | * | |
75cbade8 AV |
305 | * The caller needs to ensure that there are no conflicting mappings elsewhere |
306 | * (e.g. in user space) * This function only deals with the kernel linear map. | |
307 | * | |
308 | * This function is different from change_page_attr() in that only selected bits | |
309 | * are impacted, all other bits remain as is. | |
310 | */ | |
d1028a15 AV |
311 | static int change_page_attr_clear(unsigned long addr, int numpages, |
312 | pgprot_t prot) | |
75cbade8 AV |
313 | { |
314 | pgprot_t current_prot; | |
315 | int level; | |
316 | pte_t *pte; | |
317 | ||
318 | pte = lookup_address(addr, &level); | |
319 | if (pte) | |
320 | current_prot = pte_pgprot(*pte); | |
321 | else | |
322 | pgprot_val(current_prot) = 0; | |
323 | ||
324 | pgprot_val(prot) = pgprot_val(current_prot) & ~pgprot_val(prot); | |
325 | ||
326 | return change_page_attr_addr(addr, numpages, prot); | |
327 | } | |
328 | ||
75cbade8 AV |
329 | int set_memory_uc(unsigned long addr, int numpages) |
330 | { | |
331 | pgprot_t uncached; | |
332 | ||
333 | pgprot_val(uncached) = _PAGE_PCD | _PAGE_PWT; | |
334 | return change_page_attr_set(addr, numpages, uncached); | |
335 | } | |
336 | EXPORT_SYMBOL(set_memory_uc); | |
337 | ||
338 | int set_memory_wb(unsigned long addr, int numpages) | |
339 | { | |
340 | pgprot_t uncached; | |
341 | ||
342 | pgprot_val(uncached) = _PAGE_PCD | _PAGE_PWT; | |
343 | return change_page_attr_clear(addr, numpages, uncached); | |
344 | } | |
345 | EXPORT_SYMBOL(set_memory_wb); | |
346 | ||
347 | int set_memory_x(unsigned long addr, int numpages) | |
348 | { | |
349 | pgprot_t nx; | |
350 | ||
351 | pgprot_val(nx) = _PAGE_NX; | |
352 | return change_page_attr_clear(addr, numpages, nx); | |
353 | } | |
354 | EXPORT_SYMBOL(set_memory_x); | |
355 | ||
356 | int set_memory_nx(unsigned long addr, int numpages) | |
357 | { | |
358 | pgprot_t nx; | |
359 | ||
360 | pgprot_val(nx) = _PAGE_NX; | |
361 | return change_page_attr_set(addr, numpages, nx); | |
362 | } | |
363 | EXPORT_SYMBOL(set_memory_nx); | |
364 | ||
365 | int set_memory_ro(unsigned long addr, int numpages) | |
366 | { | |
367 | pgprot_t rw; | |
368 | ||
369 | pgprot_val(rw) = _PAGE_RW; | |
370 | return change_page_attr_clear(addr, numpages, rw); | |
371 | } | |
75cbade8 AV |
372 | |
373 | int set_memory_rw(unsigned long addr, int numpages) | |
374 | { | |
375 | pgprot_t rw; | |
376 | ||
377 | pgprot_val(rw) = _PAGE_RW; | |
378 | return change_page_attr_set(addr, numpages, rw); | |
379 | } | |
f62d0f00 IM |
380 | |
381 | int set_memory_np(unsigned long addr, int numpages) | |
382 | { | |
383 | pgprot_t present; | |
384 | ||
385 | pgprot_val(present) = _PAGE_PRESENT; | |
386 | return change_page_attr_clear(addr, numpages, present); | |
387 | } | |
75cbade8 AV |
388 | |
389 | int set_pages_uc(struct page *page, int numpages) | |
390 | { | |
391 | unsigned long addr = (unsigned long)page_address(page); | |
392 | pgprot_t uncached; | |
393 | ||
394 | pgprot_val(uncached) = _PAGE_PCD | _PAGE_PWT; | |
395 | return change_page_attr_set(addr, numpages, uncached); | |
396 | } | |
397 | EXPORT_SYMBOL(set_pages_uc); | |
398 | ||
399 | int set_pages_wb(struct page *page, int numpages) | |
400 | { | |
401 | unsigned long addr = (unsigned long)page_address(page); | |
402 | pgprot_t uncached; | |
403 | ||
404 | pgprot_val(uncached) = _PAGE_PCD | _PAGE_PWT; | |
405 | return change_page_attr_clear(addr, numpages, uncached); | |
406 | } | |
407 | EXPORT_SYMBOL(set_pages_wb); | |
408 | ||
409 | int set_pages_x(struct page *page, int numpages) | |
410 | { | |
411 | unsigned long addr = (unsigned long)page_address(page); | |
412 | pgprot_t nx; | |
413 | ||
414 | pgprot_val(nx) = _PAGE_NX; | |
415 | return change_page_attr_clear(addr, numpages, nx); | |
416 | } | |
417 | EXPORT_SYMBOL(set_pages_x); | |
418 | ||
419 | int set_pages_nx(struct page *page, int numpages) | |
420 | { | |
421 | unsigned long addr = (unsigned long)page_address(page); | |
422 | pgprot_t nx; | |
423 | ||
424 | pgprot_val(nx) = _PAGE_NX; | |
425 | return change_page_attr_set(addr, numpages, nx); | |
426 | } | |
427 | EXPORT_SYMBOL(set_pages_nx); | |
428 | ||
429 | int set_pages_ro(struct page *page, int numpages) | |
430 | { | |
431 | unsigned long addr = (unsigned long)page_address(page); | |
432 | pgprot_t rw; | |
433 | ||
434 | pgprot_val(rw) = _PAGE_RW; | |
435 | return change_page_attr_clear(addr, numpages, rw); | |
436 | } | |
75cbade8 AV |
437 | |
438 | int set_pages_rw(struct page *page, int numpages) | |
439 | { | |
440 | unsigned long addr = (unsigned long)page_address(page); | |
441 | pgprot_t rw; | |
442 | ||
443 | pgprot_val(rw) = _PAGE_RW; | |
444 | return change_page_attr_set(addr, numpages, rw); | |
445 | } | |
75cbade8 | 446 | |
e81d5dc4 IM |
447 | void clflush_cache_range(void *addr, int size) |
448 | { | |
449 | int i; | |
450 | ||
451 | for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) | |
452 | clflush(addr+i); | |
453 | } | |
454 | ||
78c94aba IM |
455 | static void flush_kernel_map(void *arg) |
456 | { | |
457 | /* | |
458 | * Flush all to work around Errata in early athlons regarding | |
459 | * large page flushing. | |
460 | */ | |
461 | __flush_tlb_all(); | |
462 | ||
463 | if (boot_cpu_data.x86_model >= 4) | |
464 | wbinvd(); | |
465 | } | |
466 | ||
467 | void global_flush_tlb(void) | |
468 | { | |
1da177e4 LT |
469 | BUG_ON(irqs_disabled()); |
470 | ||
78c94aba | 471 | on_each_cpu(flush_kernel_map, NULL, 1, 1); |
626ab0e6 | 472 | } |
9f4c815c | 473 | EXPORT_SYMBOL(global_flush_tlb); |
1da177e4 LT |
474 | |
475 | #ifdef CONFIG_DEBUG_PAGEALLOC | |
f62d0f00 IM |
476 | |
477 | static int __set_pages_p(struct page *page, int numpages) | |
478 | { | |
479 | unsigned long addr = (unsigned long)page_address(page); | |
480 | return change_page_attr_set(addr, numpages, | |
481 | __pgprot(_PAGE_PRESENT | _PAGE_RW)); | |
482 | } | |
483 | ||
484 | static int __set_pages_np(struct page *page, int numpages) | |
485 | { | |
486 | unsigned long addr = (unsigned long)page_address(page); | |
487 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); | |
488 | } | |
489 | ||
1da177e4 LT |
490 | void kernel_map_pages(struct page *page, int numpages, int enable) |
491 | { | |
492 | if (PageHighMem(page)) | |
493 | return; | |
9f4c815c | 494 | if (!enable) { |
f9b8404c IM |
495 | debug_check_no_locks_freed(page_address(page), |
496 | numpages * PAGE_SIZE); | |
9f4c815c | 497 | } |
de5097c2 | 498 | |
12d6f21e IM |
499 | /* |
500 | * If page allocator is not up yet then do not call c_p_a(): | |
501 | */ | |
502 | if (!debug_pagealloc_enabled) | |
503 | return; | |
504 | ||
9f4c815c | 505 | /* |
e4b71dcf IM |
506 | * The return value is ignored - the calls cannot fail, |
507 | * large pages are disabled at boot time: | |
1da177e4 | 508 | */ |
f62d0f00 IM |
509 | if (enable) |
510 | __set_pages_p(page, numpages); | |
511 | else | |
512 | __set_pages_np(page, numpages); | |
9f4c815c IM |
513 | |
514 | /* | |
e4b71dcf IM |
515 | * We should perform an IPI and flush all tlbs, |
516 | * but that can deadlock->flush only current cpu: | |
1da177e4 LT |
517 | */ |
518 | __flush_tlb_all(); | |
519 | } | |
520 | #endif | |
d1028a15 AV |
521 | |
522 | /* | |
523 | * The testcases use internal knowledge of the implementation that shouldn't | |
524 | * be exposed to the rest of the kernel. Include these directly here. | |
525 | */ | |
526 | #ifdef CONFIG_CPA_DEBUG | |
527 | #include "pageattr-test.c" | |
528 | #endif |