x86: add the "print code before the trapping instruction" feature to 64 bit
[linux-2.6-block.git] arch/x86/kernel/process_64.c
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
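
/*
 * Note (illustrative, not part of the original file): users of the idle
 * notifier embed a struct notifier_block and register it, e.g.
 *
 *	static int my_idle(struct notifier_block *nb, unsigned long val, void *p)
 *	{
 *		return NOTIFY_OK;	// val is IDLE_START or IDLE_END
 *	}
 *	static struct notifier_block my_idle_nb = { .notifier_call = my_idle };
 *	...
 *	idle_notifier_register(&my_idle_nb);
 *
 * The chain is invoked with IDLE_START from enter_idle() and IDLE_END from
 * __exit_idle() above.
 */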

/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		ktime_t t0, t1;
		u64 t0n, t1n;

		t0 = ktime_get();
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		local_irq_disable();
		t1 = ktime_get();
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
	}
	local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}
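
/*
 * Note (added for clarity): the ktime_get() pair above measures how long the
 * CPU actually sat in hlt; sched_clock_idle_wakeup_event() reports that
 * duration to the scheduler-clock code, presumably so it can account for the
 * time the CPU spent stopped.
 */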

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

static void do_nothing(void *unused)
{
}

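/*
 * Summary (added for clarity): cpu_idle_wait() makes sure every online CPU
 * has passed through the idle loop again, i.e. has left any previously
 * selected idle routine.  It sets cpu_idle_state on each CPU, then waits for
 * the idle loops to notice and clear the flag (see the top of the inner loop
 * in cpu_idle() above); CPUs that are sound asleep are nudged with a dummy
 * cross-call so they run through the loop once more.
 */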
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
		/*
		 * We waited 1 sec, if a CPU still did not call idle
		 * it may be because it is in idle and not waking up
		 * because it has nothing to do.
		 * Give all the remaining CPUs a kick.
		 */
		smp_call_function_mask(map, do_nothing, 0, 0);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(ax, cx);
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}
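
/*
 * Note (added for clarity): mwait_idle() is entered with interrupts off
 * (see cpu_idle()).  __sti_mwait() issues "sti" immediately before "mwait",
 * so the interrupt window only opens once we are committed to the wait;
 * either a write to the monitored flags word or an interrupt ends it, which
 * closes the lost-wakeup race with the need_resched() checks above.
 */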

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			if (!printed) {
				printk(KERN_INFO "using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);
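
/*
 * Note (added for clarity): this implements the "idle=" kernel command line
 * option: "idle=poll" selects poll_idle() above (low wakeup latency at the
 * cost of power), while "idle=mwait" sets force_mwait, which biases the idle
 * selection done elsewhere toward the MWAIT-based routine.
 */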

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, regs->bp);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
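
/*
 * Note (added for clarity): filling the TSS I/O bitmap with 0xff marks every
 * port as "trap" again, since a set bit in the permission bitmap denies
 * access; this undoes whatever the exiting task had been granted via
 * ioperm().
 */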

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
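
/*
 * Note (added for clarity): the child's pt_regs frame is placed at the very
 * top of its kernel stack (THREAD_SIZE above task_stack_page()), so
 * thread.sp0 ends up pointing just past it and thread.sp points at the frame
 * itself; the child later returns to user mode through ret_from_fork using
 * this frame.
 */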

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread,
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
	return prev_p;
}
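
/*
 * Note (added for clarity): while running in the kernel the active GS base
 * (MSR_GS_BASE) points at this CPU's PDA, and the user's GS base is parked
 * in MSR_KERNEL_GS_BASE until the swapgs on the return-to-user path.  That
 * is why next->gs is written to MSR_KERNEL_GS_BASE above rather than to
 * MSR_GS_BASE.
 */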

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
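
/*
 * Note (added for clarity): the loop above relies on frame pointers: at each
 * step *fp is the caller's saved rbp and *(fp + 8) is the return address,
 * so the walk stops at the first return address outside the scheduler
 * (in_sched_functions()), which is reported as the task's "wait channel".
 */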

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
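
/*
 * Note (illustrative, not part of the original file): userspace reaches the
 * ARCH_SET_FS/ARCH_SET_GS/ARCH_GET_FS/ARCH_GET_GS operations above through
 * the arch_prctl(2) system call (sys_arch_prctl() below), roughly:
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 * which is how 64-bit thread libraries establish the FS base used for TLS.
 */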

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
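
/*
 * Note (added for clarity): arch_align_stack() shifts the initial user stack
 * down by a random amount of up to 8 KiB and 16-byte aligns it (only when
 * address space randomization is enabled), and arch_randomize_brk() picks a
 * heap start somewhere in the 32 MiB (0x02000000 bytes) above mm->brk.
 */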