tracing, Text Edit Lock - SMP alternatives support
[linux-2.6-block.git] arch/x86/kernel/alternative.c
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/vsyscall.h>
#include <asm/cacheflush.h>
#include <asm/io.h>

#define MAX_PATCH_LEN (255-1)

#ifdef CONFIG_HOTPLUG_CPU
static int smp_alt_once;

static int __init bootonly(char *str)
{
        smp_alt_once = 1;
        return 1;
}
__setup("smp-alt-boot", bootonly);
#else
#define smp_alt_once 1
#endif

static int debug_alternative;

static int __init debug_alt(char *str)
{
        debug_alternative = 1;
        return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
        noreplace_smp = 1;
        return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#ifdef CONFIG_PARAVIRT
static int noreplace_paravirt = 0;

static int __init setup_noreplace_paravirt(char *str)
{
        noreplace_paravirt = 1;
        return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif

#define DPRINTK(fmt, args...) if (debug_alternative) \
        printk(KERN_DEBUG fmt, args)

#ifdef GENERIC_NOP1
/* Use inline assembly to define this because the nops are defined
   as inline assembly strings in the include files and we cannot
   get them easily into strings. */
asm("\t.section .rodata, \"a\"\nintelnops: "
        GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
        GENERIC_NOP7 GENERIC_NOP8
    "\t.previous");
extern const unsigned char intelnops[];
static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
        NULL,
        intelnops,
        intelnops + 1,
        intelnops + 1 + 2,
        intelnops + 1 + 2 + 3,
        intelnops + 1 + 2 + 3 + 4,
        intelnops + 1 + 2 + 3 + 4 + 5,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

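/*
 * Layout note: each of the *_nops[] tables in this file indexes into a flat
 * byte array in which the 1..8-byte nop sequences are packed back to back,
 * so entry n points at the start of the n-byte sequence; e.g.
 * intel_nops[3] == intelnops + 1 + 2, just past the 1- and 2-byte nops.
 */
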
#ifdef K8_NOP1
asm("\t.section .rodata, \"a\"\nk8nops: "
        K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
        K8_NOP7 K8_NOP8
    "\t.previous");
extern const unsigned char k8nops[];
static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
        NULL,
        k8nops,
        k8nops + 1,
        k8nops + 1 + 2,
        k8nops + 1 + 2 + 3,
        k8nops + 1 + 2 + 3 + 4,
        k8nops + 1 + 2 + 3 + 4 + 5,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

#ifdef K7_NOP1
asm("\t.section .rodata, \"a\"\nk7nops: "
        K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
        K7_NOP7 K7_NOP8
    "\t.previous");
extern const unsigned char k7nops[];
static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
        NULL,
        k7nops,
        k7nops + 1,
        k7nops + 1 + 2,
        k7nops + 1 + 2 + 3,
        k7nops + 1 + 2 + 3 + 4,
        k7nops + 1 + 2 + 3 + 4 + 5,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

#ifdef P6_NOP1
asm("\t.section .rodata, \"a\"\np6nops: "
        P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
        P6_NOP7 P6_NOP8
    "\t.previous");
extern const unsigned char p6nops[];
static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
        NULL,
        p6nops,
        p6nops + 1,
        p6nops + 1 + 2,
        p6nops + 1 + 2 + 3,
        p6nops + 1 + 2 + 3 + 4,
        p6nops + 1 + 2 + 3 + 4 + 5,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

#ifdef CONFIG_X86_64

extern char __vsyscall_0;
const unsigned char *const *find_nop_table(void)
{
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
            boot_cpu_has(X86_FEATURE_NOPL))
                return p6_nops;
        else
                return k8_nops;
}

#else /* CONFIG_X86_64 */

const unsigned char *const *find_nop_table(void)
{
        if (boot_cpu_has(X86_FEATURE_K8))
                return k8_nops;
        else if (boot_cpu_has(X86_FEATURE_K7))
                return k7_nops;
        else if (boot_cpu_has(X86_FEATURE_NOPL))
                return p6_nops;
        else
                return intel_nops;
}

#endif /* CONFIG_X86_64 */

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
void add_nops(void *insns, unsigned int len)
{
        const unsigned char *const *noptable = find_nop_table();

        while (len > 0) {
                unsigned int noplen = len;
                if (noplen > ASM_NOP_MAX)
                        noplen = ASM_NOP_MAX;
                memcpy(insns, noptable[noplen], noplen);
                insns += noplen;
                len -= noplen;
        }
}
EXPORT_SYMBOL_GPL(add_nops);

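/*
 * Usage sketch: to patch a 5-byte site with a 2-byte replacement, copy the
 * two replacement bytes into a local buffer, pad the tail with
 * add_nops(buf + 2, 3), and patch the full five bytes in one go -- the same
 * pattern apply_alternatives() and apply_paravirt() follow below.
 */
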
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];

/* Replace instructions with better alternatives for this CPU type.
   This runs before SMP is initialized to avoid SMP problems with
   self modifying code. This implies that asymmetric systems where
   APs have fewer capabilities than the boot processor are not handled.
   Tough. Make sure you disable such features by hand. */

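/*
 * Rough shape of struct alt_instr as consumed below: the original location
 * and length (instr, instrlen), the replacement bytes and their length
 * (replacement, replacementlen), and the cpuid feature bit that decides
 * whether the replacement is applied. Only the fields referenced here are
 * listed; the authoritative definition lives in asm/alternative.h.
 */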
void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
        struct alt_instr *a;
        char insnbuf[MAX_PATCH_LEN];

        DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
        for (a = start; a < end; a++) {
                u8 *instr = a->instr;
                BUG_ON(a->replacementlen > a->instrlen);
                BUG_ON(a->instrlen > sizeof(insnbuf));
                if (!boot_cpu_has(a->cpuid))
                        continue;
#ifdef CONFIG_X86_64
                /* vsyscall code is not mapped yet. resolve it manually. */
                if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
                        instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
                        DPRINTK("%s: vsyscall fixup: %p => %p\n",
                                __func__, a->instr, instr);
                }
#endif
                memcpy(insnbuf, a->replacement, a->replacementlen);
                add_nops(insnbuf + a->replacementlen,
                         a->instrlen - a->replacementlen);
                text_poke_early(instr, insnbuf, a->instrlen);
        }
}

#ifdef CONFIG_SMP

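/*
 * SMP lock-prefix handling: each LOCK prefix emitted through the LOCK_PREFIX
 * macro (asm/alternative.h) has its address recorded in an .smp_locks range
 * (__smp_locks..__smp_locks_end for the core kernel, or a module's own
 * range). On a UP system the 0xf0 LOCK byte is rewritten below to a 0x3e DS
 * segment override, which the CPU treats as a harmless prefix, and it is
 * flipped back to 0xf0 when switching to SMP.
 */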
static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
        u8 **ptr;

        mutex_lock(&text_mutex);
        for (ptr = start; ptr < end; ptr++) {
                if (*ptr < text)
                        continue;
                if (*ptr > text_end)
                        continue;
                /* turn DS segment override prefix into lock prefix */
                text_poke(*ptr, ((unsigned char []){0xf0}), 1);
        };
        mutex_unlock(&text_mutex);
}

static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
        u8 **ptr;

        if (noreplace_smp)
                return;

        mutex_lock(&text_mutex);
        for (ptr = start; ptr < end; ptr++) {
                if (*ptr < text)
                        continue;
                if (*ptr > text_end)
                        continue;
                /* turn lock prefix into DS segment override prefix */
                text_poke(*ptr, ((unsigned char []){0x3E}), 1);
        };
        mutex_unlock(&text_mutex);
}

struct smp_alt_module {
        /* what is this ??? */
        struct module *mod;
        char *name;

        /* ptrs to lock prefixes */
        u8 **locks;
        u8 **locks_end;

        /* .text segment, needed to avoid patching init code ;) */
        u8 *text;
        u8 *text_end;

        struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
static int smp_mode = 1;	/* protected by smp_alt */

void alternatives_smp_module_add(struct module *mod, char *name,
                                 void *locks, void *locks_end,
                                 void *text, void *text_end)
{
        struct smp_alt_module *smp;

        if (noreplace_smp)
                return;

        if (smp_alt_once) {
                if (boot_cpu_has(X86_FEATURE_UP))
                        alternatives_smp_unlock(locks, locks_end,
                                                text, text_end);
                return;
        }

        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
        if (NULL == smp)
                return; /* we'll run the (safe but slow) SMP code then ... */

        smp->mod = mod;
        smp->name = name;
        smp->locks = locks;
        smp->locks_end = locks_end;
        smp->text = text;
        smp->text_end = text_end;
        DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
                __func__, smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);

        mutex_lock(&smp_alt);
        list_add_tail(&smp->next, &smp_alt_modules);
        if (boot_cpu_has(X86_FEATURE_UP))
                alternatives_smp_unlock(smp->locks, smp->locks_end,
                                        smp->text, smp->text_end);
        mutex_unlock(&smp_alt);
}

void alternatives_smp_module_del(struct module *mod)
{
        struct smp_alt_module *item;

        if (smp_alt_once || noreplace_smp)
                return;

        mutex_lock(&smp_alt);
        list_for_each_entry(item, &smp_alt_modules, next) {
                if (mod != item->mod)
                        continue;
                list_del(&item->next);
                mutex_unlock(&smp_alt);
                DPRINTK("%s: %s\n", __func__, item->name);
                kfree(item);
                return;
        }
        mutex_unlock(&smp_alt);
}

void alternatives_smp_switch(int smp)
{
        struct smp_alt_module *mod;

#ifdef CONFIG_LOCKDEP
        /*
         * An older binutils section-handling bug prevented alternatives
         * replacement from working reliably.
         *
         * If this still occurs then you should see a hang
         * or crash shortly after this line:
         */
        printk("lockdep: fixing up alternatives.\n");
#endif

        if (noreplace_smp || smp_alt_once)
                return;
        BUG_ON(!smp && (num_online_cpus() > 1));

        mutex_lock(&smp_alt);

        /*
         * Avoid unnecessary switches because it forces JIT based VMs to
         * throw away all cached translations, which can be quite costly.
         */
        if (smp == smp_mode) {
                /* nothing */
        } else if (smp) {
                printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
        } else {
                printk(KERN_INFO "SMP alternatives: switching to UP code\n");
                set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_unlock(mod->locks, mod->locks_end,
                                                mod->text, mod->text_end);
        }
        smp_mode = smp;
        mutex_unlock(&smp_alt);
}

#endif

#ifdef CONFIG_PARAVIRT
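/*
 * Shape of struct paravirt_patch_site as used below: the call site address
 * and length (instr, len), which pv operation the site invokes (instrtype),
 * and which registers it may clobber (clobbers). pv_init_ops.patch() rewrites
 * the site into insnbuf where it can and returns the number of bytes it
 * emitted; the definitive struct layout is in the paravirt headers.
 */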
void apply_paravirt(struct paravirt_patch_site *start,
                    struct paravirt_patch_site *end)
{
        struct paravirt_patch_site *p;
        char insnbuf[MAX_PATCH_LEN];

        if (noreplace_paravirt)
                return;

        for (p = start; p < end; p++) {
                unsigned int used;

                BUG_ON(p->len > MAX_PATCH_LEN);
                /* prep the buffer with the original instructions */
                memcpy(insnbuf, p->instr, p->len);
                used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
                                         (unsigned long)p->instr, p->len);

                BUG_ON(used > p->len);

                /* Pad the rest with nops */
                add_nops(insnbuf + used, p->len - used);
                text_poke_early(p->instr, insnbuf, p->len);
        }
}
extern struct paravirt_patch_site __start_parainstructions[],
        __stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */

void __init alternative_instructions(void)
{
        /* The patching is not fully atomic, so try to avoid local interruptions
           that might execute the code being patched.
           Other CPUs are not running. */
        stop_nmi();

        /*
         * Don't stop machine check exceptions while patching.
         * MCEs only happen when something got corrupted and in this
         * case we must do something about the corruption.
         * Ignoring it is worse than an unlikely patching race.
         * Also machine checks tend to be broadcast and if one CPU
         * goes into machine check the others follow quickly, so we don't
         * expect a machine check to cause undue problems during
         * code patching.
         */

        apply_alternatives(__alt_instructions, __alt_instructions_end);

        /* switch to patch-once-at-boottime-only mode and free the
         * tables in case we know the number of CPUs will never ever
         * change */
#ifdef CONFIG_HOTPLUG_CPU
        if (num_possible_cpus() < 2)
                smp_alt_once = 1;
#endif

#ifdef CONFIG_SMP
        if (smp_alt_once) {
                if (1 == num_possible_cpus()) {
                        printk(KERN_INFO "SMP alternatives: switching to UP code\n");
                        set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                        set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);

                        alternatives_smp_unlock(__smp_locks, __smp_locks_end,
                                                _text, _etext);
                }
        } else {
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);

                /* Only switch to UP mode if we don't immediately boot others */
                if (num_present_cpus() == 1 || setup_max_cpus <= 1)
                        alternatives_smp_switch(0);
        }
#endif
        apply_paravirt(__parainstructions, __parainstructions_end);

        if (smp_alt_once)
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);

        restart_nmi();
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void *text_poke_early(void *addr, const void *opcode, size_t len)
{
        unsigned long flags;
        local_irq_save(flags);
        memcpy(addr, opcode, len);
        local_irq_restore(flags);
        sync_core();
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 */
void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
{
        char *vaddr;
        int nr_pages = 2;
        struct page *pages[2];
        int i;

        might_sleep();
        if (!core_kernel_text((unsigned long)addr)) {
                pages[0] = vmalloc_to_page(addr);
                pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
        } else {
                pages[0] = virt_to_page(addr);
                WARN_ON(!PageReserved(pages[0]));
                pages[1] = virt_to_page(addr + PAGE_SIZE);
        }
        BUG_ON(!pages[0]);
        if (!pages[1])
                nr_pages = 1;
        /* write through a temporary writable alias of the target page(s) */
        vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
        BUG_ON(!vaddr);
        local_irq_disable();
        memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
        local_irq_enable();
        vunmap(vaddr);
        sync_core();
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        for (i = 0; i < len; i++)
                BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
        return addr;
}
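
/*
 * Usage sketch, mirroring alternatives_smp_lock() above: a single prefix byte
 * is patched by handing text_poke() a one-byte buffer,
 *
 *        text_poke(ptr, ((unsigned char []){0xf0}), 1);
 *
 * rather than writing to the address directly, so the patch does not depend
 * on the permanent kernel text mapping being writable.
 */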