/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
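/*
 * Idle notifiers let other kernel code hook CPU idle transitions: the
 * chain is run with IDLE_START on every idle entry and IDLE_END on every
 * idle exit.  It is an atomic chain because it is invoked from the idle
 * loop and from interrupt context, so callbacks must not sleep.  An
 * illustrative (hypothetical) user:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			my_quiesce();	// hypothetical helper
 *		else if (action == IDLE_END)
 *			my_resume();	// hypothetical helper
 *		return NOTIFY_OK;
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *	idle_notifier_register(&my_idle_nb);
 */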
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif
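/*
 * On SMP kernels the real play_dead() is provided by the CPU-hotplug
 * code and actually parks the offlined CPU; this !SMP stub can only be
 * reached if cpu_is_offline() returns true without hotplug, which would
 * be a bug.
 */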
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
173 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
174 current->pid, current->comm, print_tainted(),
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version, board);
178 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
179 printk_address(regs->ip, 1);
180 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
181 regs->sp, regs->flags);
182 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
183 regs->ax, regs->bx, regs->cx);
184 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
185 regs->dx, regs->si, regs->di);
186 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
187 regs->bp, regs->r8, regs->r9);
188 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
189 regs->r10, regs->r11, regs->r12);
190 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
191 regs->r13, regs->r14, regs->r15);
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
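	/*
	 * x86-64 keeps two GS bases: MSR_GS_BASE is the base of the live
	 * %gs, and MSR_KERNEL_GS_BASE holds the inactive base that swapgs
	 * exchanges on kernel entry/exit.  In kernel context "shadowgs"
	 * is therefore the user-mode GS base.
	 */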
211 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
212 fs, fsindex, gs, gsindex, shadowgs);
213 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
215 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
221 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
225 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm && dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
			dead_task->comm,
			dead_task->mm->context.ldt,
			dead_task->mm->context.size);
		BUG();
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;

	desc += tls;
	fill_ldt(desc, &ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
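/*
 * The two helpers above handle 32-bit style TLS: instead of using the
 * 64-bit FS/GS base MSRs, a compat segment gets a GDT descriptor in
 * tls_array whose 32-bit base acts as the segment base, so reading the
 * base back is just a matter of decoding that descriptor.
 */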
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);
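	/*
	 * At this point p->thread.sp points at the child's pt_regs frame
	 * on its kernel stack, and TIF_FORK makes the first switch into
	 * the new task enter through ret_from_fork, which unwinds that
	 * frame to user mode; ax == 0 is what fork() returns in the child.
	 */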
	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
		       IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}
	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
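/*
 * Note that start_thread() is only called from the execve() path: it
 * points the saved user context at the new image (new_ip/new_sp), resets
 * the segment registers to the flat __USER_CS/__USER_DS layout and drops
 * the old task's FPU state, so the IRET-style return enters the fresh
 * program with interrupts enabled (EFLAGS = 0x200).
 */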
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure cpu is ready for new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);
	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);
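	/*
	 * old_rsp is the per-cpu slot where the syscall entry code saved
	 * the user stack pointer; it must follow the incoming task, as
	 * must the per-cpu kernel_stack pointer that the syscall entry
	 * path loads into %rsp.
	 */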
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
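/*
 * get_wchan() below walks the sleeping task's frame-pointer chain to
 * find the first return address outside the scheduler, i.e. the function
 * the task is blocked in.  The bounds checks keep a corrupted chain from
 * wandering off the task's stack, and the walk is only reliable when the
 * kernel is built with frame pointers.
 */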
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
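		/*
		 * Note that the wrmsrl() above targets MSR_KERNEL_GS_BASE,
		 * not MSR_GS_BASE: while a task executes in the kernel its
		 * user GS base sits in the shadow slot, and swapgs moves it
		 * back into MSR_GS_BASE on return to user mode.
		 */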
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
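/*
 * Userspace reaches do_arch_prctl() through the arch_prctl(2) syscall.
 * A sketch of typical usage (hypothetical address, raw syscall since
 * libc may not wrap it):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long tcb = ...;	// hypothetical TLS block address
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, tcb);
 *
 * This is how the C library installs the main thread's TLS base; threads
 * created later normally pass CLONE_SETTLS to clone() instead, which is
 * handled in copy_thread() above.
 */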