/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/hw_breakpoint.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
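
/* Flags kernel threads are created with: share the VM, never force-traced */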
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
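
/*
 * Illustrative sketch of a user of this notifier interface; the names
 * "my_idle_notify" and "my_idle_nb" are hypothetical, not part of the
 * kernel:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		switch (action) {
 *		case IDLE_START:
 *			break;		(CPU is entering idle)
 *		case IDLE_END:
 *			break;		(CPU is leaving idle)
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */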

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
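
/*
 * Note: pm_idle above is a function pointer, set up at boot to one of
 * the available idle routines (e.g. the HLT-based default_idle() or an
 * MWAIT-based variant), so the actual power-saving instruction executed
 * in the loop is platform dependent.
 */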

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk(KERN_INFO "\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
			regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
			regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
			regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
			regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
			regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
			fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
	if (unlikely(dead_task->thread.debugreg7))
		flush_thread_hw_breakpoint(dead_task);
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
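
/*
 * The two helpers above back the "small base" fast path of ARCH_SET_FS/GS
 * in do_arch_prctl() below: a base that fits in 32 bits is installed as a
 * TLS descriptor in the GDT, so reloading it is a cheap segment register
 * load instead of a wrmsr of MSR_FS_BASE/MSR_KERNEL_GS_BASE.
 */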

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
		if (copy_thread_hw_breakpoint(me, p, clone_flags))
			goto out;

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	if (err)
		flush_thread_hw_breakpoint(p);

	return err;
}
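
/*
 * start_thread() below is invoked on exec by the binfmt loaders (e.g.
 * load_elf_binary()) to point the freshly built user context at the new
 * program's entry point and initial stack.
 */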

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip	= new_ip;
	regs->sp	= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs	= __USER_CS;
	regs->ss	= __USER_DS;
	regs->flags	= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	/*
	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
	 * call before current is updated. Suppose a kernel breakpoint is
	 * triggered in between the two, the hw-breakpoint handler will see that
	 * the 'current' task does not have TIF_DEBUG flag set and will think it
	 * is leftover from an old task (lazy switching) and will erase it. Then
	 * until the next context switch, no user-breakpoints will be installed.
	 *
	 * The real problem is that it's impossible to update both current and
	 * physical debug registers at the same instant, so there will always be
	 * a window in which they disagree and a breakpoint might get triggered.
	 * Since we use lazy switching, we are forced to assume that a
	 * disagreement means that current is correct and the exception is due
	 * to lazy debug register switching.
	 */
	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
		arch_install_thread_hw_breakpoint(next_p);

	return prev_p;
}
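
/*
 * Note that nothing calls __switch_to() directly: the scheduler core
 * reaches it through the switch_to() macro (arch/x86/include/asm/system.h),
 * which saves the outgoing task's %rsp and switches kernel stacks before
 * control arrives here.
 */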

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
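
/*
 * get_wchan() below walks the sleeping task's kernel stack by following
 * saved frame pointers: *fp holds the caller's frame pointer and
 * *(fp + 8) the return address. The first return address outside the
 * scheduler is the "wait channel"; the walk gives up after 16 frames or
 * when a frame pointer leaves the task's stack page.
 */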

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
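
/*
 * Userspace reaches this via the arch_prctl(2) syscall. An illustrative
 * sketch of a caller, using the raw syscall since a libc wrapper may not
 * be available:
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, base);
 */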

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
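
/*
 * arch_randomize_brk() below places the heap start at a random offset
 * within the 32 MiB (0x02000000 byte) window above mm->brk, falling back
 * to mm->brk itself if randomize_range() yields nothing.
 */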

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}