[PATCH] x86_64: CPU hotplug support
arch/x86_64/kernel/smpboot.c
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
38 */
39
40
41#include <linux/config.h>
42#include <linux/init.h>
43
44#include <linux/mm.h>
45#include <linux/kernel_stat.h>
46#include <linux/smp_lock.h>
47#include <linux/irq.h>
48#include <linux/bootmem.h>
49#include <linux/thread_info.h>
50#include <linux/module.h>
51
52#include <linux/delay.h>
53#include <linux/mc146818rtc.h>
54#include <asm/mtrr.h>
55#include <asm/pgalloc.h>
56#include <asm/desc.h>
57#include <asm/kdebug.h>
58#include <asm/tlbflush.h>
59#include <asm/proto.h>
60#include <asm/nmi.h>
61
62/* Number of siblings per CPU package */
63int smp_num_siblings = 1;
64/* Package ID of each logical CPU */
65u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
66u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
67EXPORT_SYMBOL(phys_proc_id);
68EXPORT_SYMBOL(cpu_core_id);
69
70/* Bitmask of currently online CPUs */
71cpumask_t cpu_online_map;
72
73EXPORT_SYMBOL(cpu_online_map);
74
75/*
76 * Private maps to synchronize booting between AP and BP.
77 * Probably not needed anymore, but it makes for easier debugging. -AK
78 */
79cpumask_t cpu_callin_map;
80cpumask_t cpu_callout_map;
81
82cpumask_t cpu_possible_map;
83EXPORT_SYMBOL(cpu_possible_map);
84
85/* Per CPU bogomips and other parameters */
86struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
87
88/* Set when the idlers are all forked */
89int smp_threads_ready;
90
91cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
92cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
93EXPORT_SYMBOL(cpu_core_map);
94
95/*
96 * Trampoline 80x86 program as an array.
97 */
98
99extern unsigned char trampoline_data[];
100extern unsigned char trampoline_end[];
101
102/* State of each CPU */
103DEFINE_PER_CPU(int, cpu_state) = { 0 };
104
105/*
106 * Store all idle threads; these can be reused instead of creating
107 * new threads. This also avoids complicated thread-destroy functionality
108 * for idle threads.
109 */
110struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
111
112#define get_idle_for_cpu(x) (idle_thread_array[(x)])
113#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
114
115/*
116 * cpu_possible_map should be static; it cannot change as CPUs
117 * are onlined or offlined. The reason is that per-cpu data structures
118 * are allocated by some modules at init time, and they don't expect to
119 * do this dynamically on CPU arrival/departure.
120 * cpu_present_map, on the other hand, can change dynamically.
121 * When CPU hotplug is not compiled in, we keep the current
122 * behaviour, which is cpu_possible == cpu_present.
123 * If CPU hotplug is supported, we need to preallocate for all
124 * of NR_CPUS, hence cpu_possible_map covers the entire NR_CPUS range.
125 * - Ashok Raj
126 */
127#ifdef CONFIG_HOTPLUG_CPU
128#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
129#else
130#define fixup_cpu_possible_map(x)
131#endif
132
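/*
 * Illustrative note (not part of the original source): the effect of
 * fixup_cpu_possible_map() shows up in smp_prepare_cpus() below. With
 * CONFIG_HOTPLUG_CPU it marks every slot 0..NR_CPUS-1 in cpu_possible_map,
 * so per-cpu data is preallocated even for CPUs that may be added later;
 * without it the macro is a no-op and cpu_possible_map simply mirrors
 * cpu_present_map.
 */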
133/*
134 * Currently trivial. Write the real->protected mode
135 * bootstrap into the page concerned. The caller
136 * has made sure it's suitably aligned.
137 */
138
139static unsigned long __cpuinit setup_trampoline(void)
140{
141 void *tramp = __va(SMP_TRAMPOLINE_BASE);
142 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
143 return virt_to_phys(tramp);
144}
145
146/*
147 * The bootstrap kernel entry code has set these up. Save them for
148 * a given CPU
149 */
150
151static void __cpuinit smp_store_cpu_info(int id)
152{
153 struct cpuinfo_x86 *c = cpu_data + id;
154
155 *c = boot_cpu_data;
156 identify_cpu(c);
157 print_cpu_info(c);
158}
159
160/*
161 * New Funky TSC sync algorithm borrowed from IA64.
162 * Main advantage is that it doesn't reset the TSCs fully and
163 * in general looks more robust and it works better than my earlier
164 * attempts. I believe it was written by David Mosberger. Some minor
165 * adjustments for x86-64 by me -AK
166 *
167 * Original comment reproduced below.
168 *
169 * Synchronize TSC of the current (slave) CPU with the TSC of the
170 * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
171 * eliminate the possibility of unaccounted-for errors (such as
172 * getting a machine check in the middle of a calibration step). The
173 * basic idea is for the slave to ask the master what itc value it has
174 * and to read its own itc before and after the master responds. Each
175 * iteration gives us three timestamps:
176 *
177 * slave master
178 *
179 * t0 ---\
180 * ---\
181 * --->
182 * tm
183 * /---
184 * /---
185 * t1 <---
186 *
187 *
188 * The goal is to adjust the slave's TSC such that tm falls exactly
189 * half-way between t0 and t1. If we achieve this, the clocks are
190 * synchronized provided the interconnect between the slave and the
191 * master is symmetric. Even if the interconnect were asymmetric, we
192 * would still know that the synchronization error is smaller than the
193 * roundtrip latency (t1 - t0).
194 *
195 * When the interconnect is quiet and symmetric, this lets us
196 * synchronize the TSC to within one or two cycles. However, we can
197 * only *guarantee* that the synchronization is accurate to within a
198 * round-trip time, which is typically in the range of several hundred
199 * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
200 * are usually almost perfectly synchronized, but we shouldn't assume
201 * that the accuracy is much better than half a micro second or so.
202 *
203 * [there are other errors like the latency of RDTSC and of the
204 * WRMSR. These can also amount to hundreds of cycles. So it's
205 * probably worse. It claims 153 cycles error on a dual Opteron,
206 * but I suspect the numbers are actually somewhat worse -AK]
207 */
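/*
 * Worked example with illustrative numbers (not from the original comment):
 * if the slave reads t0 = 1000, the master replies with tm = 1600, and the
 * slave reads t1 = 1100, the midpoint is (1000 + 1100)/2 = 1050. Since the
 * master's reply should land on that midpoint, the slave's TSC is
 * 1600 - 1050 = 550 cycles behind (get_delta() below returns -550), so the
 * slave writes its TSC forward by 550. The guaranteed error bound is the
 * roundtrip t1 - t0 = 100 cycles.
 */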
208
209#define MASTER 0
210#define SLAVE (SMP_CACHE_BYTES/8)
211
212/* Intentionally don't use cpu_relax() during TSC synchronization,
213 because we don't want to go into funky power-save modes or cause
214 hypervisors to schedule us away. Going to sleep would likely affect
215 latency and low latency is the primary objective here. -AK */
216#define no_cpu_relax() barrier()
217
218static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
219static volatile __cpuinitdata unsigned long go[SLAVE + 1];
220static int notscsync __cpuinitdata;
221
222#undef DEBUG_TSC_SYNC
223
224#define NUM_ROUNDS 64 /* magic value */
225#define NUM_ITERS 5 /* likewise */
226
227/* Callback on boot CPU */
228static __cpuinit void sync_master(void *arg)
229{
230 unsigned long flags, i;
231
232 if (smp_processor_id() != boot_cpu_id)
233 return;
234
235 go[MASTER] = 0;
236
237 local_irq_save(flags);
238 {
239 for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
240 while (!go[MASTER])
241 no_cpu_relax();
242 go[MASTER] = 0;
243 rdtscll(go[SLAVE]);
244 }
245 }
246 local_irq_restore(flags);
247}
248
249/*
250 * Return the number of cycles by which our tsc differs from the tsc
251 * on the master (time-keeper) CPU. A positive number indicates our
252 * tsc is ahead of the master, negative that it is behind.
253 */
254static inline long
255get_delta(long *rt, long *master)
256{
257 unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
258 unsigned long tcenter, t0, t1, tm;
259 int i;
260
261 for (i = 0; i < NUM_ITERS; ++i) {
262 rdtscll(t0);
263 go[MASTER] = 1;
264 while (!(tm = go[SLAVE]))
265 no_cpu_relax();
266 go[SLAVE] = 0;
267 rdtscll(t1);
268
269 if (t1 - t0 < best_t1 - best_t0)
270 best_t0 = t0, best_t1 = t1, best_tm = tm;
271 }
272
273 *rt = best_t1 - best_t0;
274 *master = best_tm - best_t0;
275
276 /* average best_t0 and best_t1 without overflow: */
277 tcenter = (best_t0/2 + best_t1/2);
278 if (best_t0 % 2 + best_t1 % 2 == 2)
279 ++tcenter;
280 return tcenter - best_tm;
281}
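/*
 * Illustrative note on the midpoint arithmetic above: (best_t0 + best_t1)/2
 * computed directly could overflow an unsigned long when both timestamps are
 * large, so the code uses best_t0/2 + best_t1/2 and adds back the 1 that is
 * lost when both values are odd. For example, with t0 = 7 and t1 = 9:
 * 7/2 + 9/2 = 3 + 4 = 7, both are odd so the correction gives 8, which
 * equals (7 + 9)/2.
 */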
282
283static __cpuinit void sync_tsc(void)
284{
285 int i, done = 0;
286 long delta, adj, adjust_latency = 0;
287 unsigned long flags, rt, master_time_stamp, bound;
288#if DEBUG_TSC_SYNC
289 static struct syncdebug {
290 long rt; /* roundtrip time */
291 long master; /* master's timestamp */
292 long diff; /* difference between midpoint and master's timestamp */
293 long lat; /* estimate of tsc adjustment latency */
294 } t[NUM_ROUNDS] __cpuinitdata;
295#endif
296
297 go[MASTER] = 1;
298
299 smp_call_function(sync_master, NULL, 1, 0);
300
301 while (go[MASTER]) /* wait for master to be ready */
302 no_cpu_relax();
303
304 spin_lock_irqsave(&tsc_sync_lock, flags);
305 {
306 for (i = 0; i < NUM_ROUNDS; ++i) {
307 delta = get_delta(&rt, &master_time_stamp);
308 if (delta == 0) {
309 done = 1; /* let's lock on to this... */
310 bound = rt;
311 }
312
313 if (!done) {
314 unsigned long t;
315 if (i > 0) {
316 adjust_latency += -delta;
317 adj = -delta + adjust_latency/4;
318 } else
319 adj = -delta;
320
321 rdtscll(t);
322 wrmsrl(MSR_IA32_TSC, t + adj);
323 }
324#if DEBUG_TSC_SYNC
325 t[i].rt = rt;
326 t[i].master = master_time_stamp;
327 t[i].diff = delta;
328 t[i].lat = adjust_latency/4;
329#endif
330 }
331 }
332 spin_unlock_irqrestore(&tsc_sync_lock, flags);
333
334#if DEBUG_TSC_SYNC
335 for (i = 0; i < NUM_ROUNDS; ++i)
336 printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
337 t[i].rt, t[i].master, t[i].diff, t[i].lat);
338#endif
339
340 printk(KERN_INFO
341 "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
342 "maxerr %lu cycles)\n",
343 smp_processor_id(), boot_cpu_id, delta, rt);
344}
345
346static void __cpuinit tsc_sync_wait(void)
347{
348 if (notscsync || !cpu_has_tsc)
349 return;
350 printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
351 boot_cpu_id);
352 sync_tsc();
353}
354
355static __init int notscsync_setup(char *s)
356{
357 notscsync = 1;
358 return 0;
359}
360__setup("notscsync", notscsync_setup);
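/*
 * Usage note (inferred from the __setup() registration above): booting with
 * "notscsync" on the kernel command line calls notscsync_setup(), which sets
 * the notscsync flag, so tsc_sync_wait() returns early and no TSC
 * synchronization is attempted.
 */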
361
362static atomic_t init_deasserted __cpuinitdata;
363
364/*
365 * Report back to the Boot Processor.
366 * Running on AP.
367 */
368void __cpuinit smp_callin(void)
369{
370 int cpuid, phys_id;
371 unsigned long timeout;
372
373 /*
374 * If woken up by an INIT in an 82489DX configuration
375 * we may get here before an INIT-deassert IPI reaches
376 * our local APIC. We have to wait for the IPI or we'll
377 * lock up on an APIC access.
378 */
379 while (!atomic_read(&init_deasserted))
380 cpu_relax();
381
382 /*
383 * (This works even if the APIC is not enabled.)
384 */
385 phys_id = GET_APIC_ID(apic_read(APIC_ID));
386 cpuid = smp_processor_id();
387 if (cpu_isset(cpuid, cpu_callin_map)) {
388 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
389 phys_id, cpuid);
390 }
391 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
392
393 /*
394 * STARTUP IPIs are fragile beasts as they might sometimes
395 * trigger some glue motherboard logic. Complete APIC bus
396 * silence for 1 second; this overestimates, by a factor of two,
397 * the time the boot CPU spends sending the up to 2 STARTUP IPIs.
398 * This should be enough.
399 */
400
401 /*
402 * Waiting 2s total for startup (udelay is not yet working)
403 */
404 timeout = jiffies + 2*HZ;
405 while (time_before(jiffies, timeout)) {
406 /*
407 * Has the boot CPU finished its STARTUP sequence?
408 */
409 if (cpu_isset(cpuid, cpu_callout_map))
410 break;
411 cpu_relax();
412 }
413
414 if (!time_before(jiffies, timeout)) {
415 panic("smp_callin: CPU%d started up but did not get a callout!\n",
416 cpuid);
417 }
418
419 /*
420 * the boot CPU has finished the init stage and is spinning
421 * on callin_map until we finish. We are free to set up this
422 * CPU, first the APIC. (this is probably redundant on most
423 * boards)
424 */
425
426 Dprintk("CALLIN, before setup_local_APIC().\n");
427 setup_local_APIC();
428
429 /*
430 * Get our bogomips.
431 */
432 calibrate_delay();
433 Dprintk("Stack at about %p\n",&cpuid);
434
435 disable_APIC_timer();
436
437 /*
438 * Save our processor parameters
439 */
440 smp_store_cpu_info(cpuid);
441
442 /*
443 * Allow the master to continue.
444 */
445 cpu_set(cpuid, cpu_callin_map);
446}
447
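/*
 * Sketch of the BP/AP handshake implemented by smp_callin() above and
 * do_boot_cpu()/__cpu_up() below (summary only, not original source):
 *
 *   BP (do_boot_cpu)                     AP (start_secondary -> smp_callin)
 *   ----------------                     ----------------------------------
 *   send INIT/STARTUP IPIs
 *   cpu_set(cpu, cpu_callout_map)   -->  wait for our bit in cpu_callout_map
 *   poll cpu_callin_map             <--  cpu_set(cpuid, cpu_callin_map)
 *   wait for cpu_online_map         <--  cpu_set(cpu, cpu_online_map)
 */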
448/*
449 * Setup code for the secondary processor (after coming out of the trampoline)
450 */
451void __cpuinit start_secondary(void)
452{
453 /*
454 * Don't put anything before smp_callin(); SMP
455 * booting is so fragile that we want to limit the
456 * things done here to the most necessary things.
457 */
458 cpu_init();
459 smp_callin();
460
461 /* otherwise gcc will move up the smp_processor_id before the cpu_init */
462 barrier();
463
464 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
465 setup_secondary_APIC_clock();
466
467 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
468
469 if (nmi_watchdog == NMI_IO_APIC) {
470 disable_8259A_irq(0);
471 enable_NMI_through_LVT0(NULL);
472 enable_8259A_irq(0);
473 }
474
475 enable_APIC_timer();
476
477 /*
478 * Allow the master to continue.
479 */
480 cpu_set(smp_processor_id(), cpu_online_map);
481 mb();
482
483 /* Wait for the TSC sync so that nothing gets scheduled before the clocks agree.
484 We still process interrupts, which could unfortunately see an
485 inconsistent time in that window. */
486 tsc_sync_wait();
487
488 cpu_idle();
489}
490
491extern volatile unsigned long init_rsp;
492extern void (*initial_code)(void);
493
494#if APIC_DEBUG
495static void inquire_remote_apic(int apicid)
496{
497 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
498 char *names[] = { "ID", "VERSION", "SPIV" };
499 int timeout, status;
500
501 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
502
503 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
504 printk("... APIC #%d %s: ", apicid, names[i]);
505
506 /*
507 * Wait for idle.
508 */
509 apic_wait_icr_idle();
510
511 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
512 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
513
514 timeout = 0;
515 do {
516 udelay(100);
517 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
518 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
519
520 switch (status) {
521 case APIC_ICR_RR_VALID:
522 status = apic_read(APIC_RRR);
523 printk("%08x\n", status);
524 break;
525 default:
526 printk("failed\n");
527 }
528 }
529}
530#endif
531
532/*
533 * Kick the secondary to wake up.
534 */
535static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
1da177e4
LT
536{
537 unsigned long send_status = 0, accept_status = 0;
538 int maxlvt, timeout, num_starts, j;
539
540 Dprintk("Asserting INIT.\n");
541
542 /*
543 * Turn INIT on target chip
544 */
545 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
546
547 /*
548 * Send IPI
549 */
550 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
551 | APIC_DM_INIT);
552
553 Dprintk("Waiting for send to finish...\n");
554 timeout = 0;
555 do {
556 Dprintk("+");
557 udelay(100);
558 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
559 } while (send_status && (timeout++ < 1000));
560
561 mdelay(10);
562
563 Dprintk("Deasserting INIT.\n");
564
565 /* Target chip */
566 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
567
568 /* Send IPI */
569 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
570
571 Dprintk("Waiting for send to finish...\n");
572 timeout = 0;
573 do {
574 Dprintk("+");
575 udelay(100);
576 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
577 } while (send_status && (timeout++ < 1000));
578
579 atomic_set(&init_deasserted, 1);
580
581 /*
582 * Should we send STARTUP IPIs ?
583 *
584 * Determine this based on the APIC version.
585 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
586 */
587 if (APIC_INTEGRATED(apic_version[phys_apicid]))
588 num_starts = 2;
589 else
590 num_starts = 0;
591
592 /*
593 * Run STARTUP IPI loop.
594 */
595 Dprintk("#startup loops: %d.\n", num_starts);
596
597 maxlvt = get_maxlvt();
598
599 for (j = 1; j <= num_starts; j++) {
600 Dprintk("Sending STARTUP #%d.\n",j);
601 apic_read_around(APIC_SPIV);
602 apic_write(APIC_ESR, 0);
603 apic_read(APIC_ESR);
604 Dprintk("After apic_write.\n");
605
606 /*
607 * STARTUP IPI
608 */
609
610 /* Target chip */
611 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
612
613 /* Boot on the stack */
614 /* Kick the second */
615 apic_write_around(APIC_ICR, APIC_DM_STARTUP
616 | (start_rip >> 12));
617
618 /*
619 * Give the other CPU some time to accept the IPI.
620 */
621 udelay(300);
622
623 Dprintk("Startup point 1.\n");
624
625 Dprintk("Waiting for send to finish...\n");
626 timeout = 0;
627 do {
628 Dprintk("+");
629 udelay(100);
630 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
631 } while (send_status && (timeout++ < 1000));
632
633 /*
634 * Give the other CPU some time to accept the IPI.
635 */
636 udelay(200);
637 /*
638 * Due to the Pentium erratum 3AP.
639 */
640 if (maxlvt > 3) {
641 apic_read_around(APIC_SPIV);
642 apic_write(APIC_ESR, 0);
643 }
644 accept_status = (apic_read(APIC_ESR) & 0xEF);
645 if (send_status || accept_status)
646 break;
647 }
648 Dprintk("After Startup.\n");
649
650 if (send_status)
651 printk(KERN_ERR "APIC never delivered???\n");
652 if (accept_status)
653 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
654
655 return (send_status | accept_status);
656}
657
658struct create_idle {
659 struct task_struct *idle;
660 struct completion done;
661 int cpu;
662};
663
664void do_fork_idle(void *_c_idle)
665{
666 struct create_idle *c_idle = _c_idle;
667
668 c_idle->idle = fork_idle(c_idle->cpu);
669 complete(&c_idle->done);
670}
671
672/*
673 * Boot one CPU.
674 */
675static int __cpuinit do_boot_cpu(int cpu, int apicid)
676{
677 unsigned long boot_error;
678 int timeout;
679 unsigned long start_rip;
680 struct create_idle c_idle = {
681 .cpu = cpu,
682 .done = COMPLETION_INITIALIZER(c_idle.done),
683 };
684 DECLARE_WORK(work, do_fork_idle, &c_idle);
685
686 c_idle.idle = get_idle_for_cpu(cpu);
687
688 if (c_idle.idle) {
689 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
690 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
691 init_idle(c_idle.idle, cpu);
692 goto do_rest;
693 }
694
695 /*
696 * During the cold boot process, the keventd thread is not spun up yet.
697 * When we do CPU hot-add, we create idle threads on the fly; we should
698 * not acquire any attributes from the calling context. Hence the clean
699 * way to create kernel threads is to do that from keventd().
700 * We check current_is_keventd() because the ACPI notifier also
701 * queues work to keventd(), and if the caller is already running in
702 * the context of keventd(), we would end up deadlocking the keventd
703 * thread.
704 */
705 if (!keventd_up() || current_is_keventd())
706 work.func(work.data);
707 else {
708 schedule_work(&work);
709 wait_for_completion(&c_idle.done);
710 }
711
712 if (IS_ERR(c_idle.idle)) {
713 printk("failed fork for CPU %d\n", cpu);
714 return PTR_ERR(c_idle.idle);
715 }
716
717 set_idle_for_cpu(cpu, c_idle.idle);
718
719do_rest:
720
721 cpu_pda[cpu].pcurrent = c_idle.idle;
1da177e4
LT
722
723 start_rip = setup_trampoline();
724
725 init_rsp = c_idle.idle->thread.rsp;
726 per_cpu(init_tss,cpu).rsp0 = init_rsp;
727 initial_code = start_secondary;
728 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
729
730 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
731 start_rip, init_rsp);
732
733 /*
734 * This grunge runs the startup process for
735 * the targeted processor.
736 */
737
738 atomic_set(&init_deasserted, 0);
739
740 Dprintk("Setting warm reset code and vector.\n");
741
742 CMOS_WRITE(0xa, 0xf);
743 local_flush_tlb();
744 Dprintk("1.\n");
745 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
746 Dprintk("2.\n");
747 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
748 Dprintk("3.\n");
749
750 /*
751 * Be paranoid about clearing APIC errors.
752 */
753 if (APIC_INTEGRATED(apic_version[apicid])) {
754 apic_read_around(APIC_SPIV);
755 apic_write(APIC_ESR, 0);
756 apic_read(APIC_ESR);
757 }
758
759 /*
760 * Status is now clean
761 */
762 boot_error = 0;
763
764 /*
765 * Starting actual IPI sequence...
766 */
767 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
768
769 if (!boot_error) {
770 /*
771 * allow APs to start initializing.
772 */
773 Dprintk("Before Callout %d.\n", cpu);
774 cpu_set(cpu, cpu_callout_map);
775 Dprintk("After Callout %d.\n", cpu);
776
777 /*
778 * Wait 5s total for a response
779 */
780 for (timeout = 0; timeout < 50000; timeout++) {
781 if (cpu_isset(cpu, cpu_callin_map))
782 break; /* It has booted */
783 udelay(100);
784 }
785
786 if (cpu_isset(cpu, cpu_callin_map)) {
787 /* number CPUs logically, starting from 1 (BSP is 0) */
788 Dprintk("CPU has booted.\n");
789 } else {
790 boot_error = 1;
791 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
792 == 0xA5)
793 /* trampoline started but...? */
794 printk("Stuck ??\n");
795 else
796 /* trampoline code not run */
797 printk("Not responding.\n");
798#if APIC_DEBUG
799 inquire_remote_apic(apicid);
800#endif
801 }
802 }
803 if (boot_error) {
804 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
805 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
806 cpu_clear(cpu, cpu_present_map);
807 cpu_clear(cpu, cpu_possible_map);
808 x86_cpu_to_apicid[cpu] = BAD_APICID;
809 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
810 return -EIO;
811 }
812
813 return 0;
814}
815
816cycles_t cacheflush_time;
817unsigned long cache_decay_ticks;
818
819/*
820 * Construct cpu_sibling_map[], so that we can tell the sibling CPU
821 * on SMT systems efficiently.
822 */
823static __cpuinit void detect_siblings(void)
1da177e4 824{
a8ab26fe 825 int cpu;
1da177e4 826
a8ab26fe
AK
827 for (cpu = 0; cpu < NR_CPUS; cpu++) {
828 cpus_clear(cpu_sibling_map[cpu]);
829 cpus_clear(cpu_core_map[cpu]);
830 }
831
832 for_each_online_cpu (cpu) {
833 struct cpuinfo_x86 *c = cpu_data + cpu;
834 int siblings = 0;
835 int i;
836 if (smp_num_siblings > 1) {
837 for_each_online_cpu (i) {
838 if (cpu_core_id[cpu] == cpu_core_id[i]) {
839 siblings++;
840 cpu_set(i, cpu_sibling_map[cpu]);
841 }
842 }
843 } else {
844 siblings++;
845 cpu_set(cpu, cpu_sibling_map[cpu]);
846 }
847
848 if (siblings != smp_num_siblings) {
849 printk(KERN_WARNING
850 "WARNING: %d siblings found for CPU%d, should be %d\n",
851 siblings, cpu, smp_num_siblings);
852 smp_num_siblings = siblings;
853 }
854 if (c->x86_num_cores > 1) {
855 for_each_online_cpu(i) {
856 if (phys_proc_id[cpu] == phys_proc_id[i])
857 cpu_set(i, cpu_core_map[cpu]);
858 }
859 } else
860 cpu_core_map[cpu] = cpu_sibling_map[cpu];
861 }
862}
863
864/*
865 * Cleanup possible dangling ends...
866 */
867static __cpuinit void smp_cleanup_boot(void)
868{
869 /*
870 * Paranoid: Set warm reset code and vector here back
871 * to default values.
872 */
873 CMOS_WRITE(0, 0xf);
874
875 /*
876 * Reset trampoline flag
877 */
878 *((volatile int *) phys_to_virt(0x467)) = 0;
879
880#ifndef CONFIG_HOTPLUG_CPU
881 /*
882 * Free pages reserved for SMP bootup.
883 * When you add hotplug CPU support later remove this
884 * Note there is more work to be done for later CPU bootup.
885 */
886
887 free_page((unsigned long) __va(PAGE_SIZE));
888 free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE));
889#endif
890}
891
892/*
893 * Fall back to non SMP mode after errors.
894 *
895 * RED-PEN audit/test this more. I bet there is more state messed up here.
896 */
897static __init void disable_smp(void)
898{
899 cpu_present_map = cpumask_of_cpu(0);
900 cpu_possible_map = cpumask_of_cpu(0);
901 if (smp_found_config)
902 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
903 else
904 phys_cpu_present_map = physid_mask_of_physid(0);
905 cpu_set(0, cpu_sibling_map[0]);
906 cpu_set(0, cpu_core_map[0]);
907}
908
909/*
910 * Handle user cpus=... parameter.
911 */
912static __init void enforce_max_cpus(unsigned max_cpus)
913{
914 int i, k;
915 k = 0;
916 for (i = 0; i < NR_CPUS; i++) {
917 if (!cpu_possible(i))
918 continue;
919 if (++k > max_cpus) {
920 cpu_clear(i, cpu_possible_map);
921 cpu_clear(i, cpu_present_map);
922 }
923 }
924}
925
926/*
927 * Various sanity checks.
928 */
929static int __init smp_sanity_check(unsigned max_cpus)
930{
931 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
932 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
933 hard_smp_processor_id());
934 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
935 }
936
937 /*
938 * If we couldn't find an SMP configuration at boot time,
939 * get out of here now!
940 */
941 if (!smp_found_config) {
942 printk(KERN_NOTICE "SMP motherboard not detected.\n");
943 disable_smp();
944 if (APIC_init_uniprocessor())
945 printk(KERN_NOTICE "Local APIC not detected."
946 " Using dummy APIC emulation.\n");
947 return -1;
948 }
949
950 /*
951 * Should not be necessary because the MP table should list the boot
952 * CPU too, but we do it for the sake of robustness anyway.
953 */
954 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
955 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
956 boot_cpu_id);
957 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
958 }
959
960 /*
961 * If we couldn't find a local APIC, then get out of here now!
962 */
963 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
964 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
965 boot_cpu_id);
966 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
967 nr_ioapics = 0;
968 return -1;
969 }
970
971 /*
972 * If SMP should be disabled, then really disable it!
973 */
974 if (!max_cpus) {
975 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
976 nr_ioapics = 0;
977 return -1;
1da177e4
LT
978 }
979
a8ab26fe
AK
980 return 0;
981}
982
983/*
984 * Prepare for SMP bootup. The MP table or ACPI has been read
985 * earlier. Just do some sanity checking here and enable APIC mode.
986 */
987void __init smp_prepare_cpus(unsigned int max_cpus)
988{
989 int i;
990
991 nmi_watchdog_default();
992 current_cpu_data = boot_cpu_data;
993 current_thread_info()->cpu = 0; /* needed? */
994
995 enforce_max_cpus(max_cpus);
996
997 /*
998 * Fill in cpu_present_map
999 */
1000 for (i = 0; i < NR_CPUS; i++) {
1001 int apicid = cpu_present_to_apicid(i);
1002 if (physid_isset(apicid, phys_cpu_present_map)) {
1003 cpu_set(i, cpu_present_map);
1004 cpu_set(i, cpu_possible_map);
1005 }
1006 fixup_cpu_possible_map(i);
1007 }
1008
a8ab26fe
AK
1009 if (smp_sanity_check(max_cpus) < 0) {
1010 printk(KERN_INFO "SMP disabled\n");
1011 disable_smp();
1012 return;
1da177e4
LT
1013 }
1014
1015
1016 /*
1017 * Switch from PIC to APIC mode.
1018 */
1019 connect_bsp_APIC();
1020 setup_local_APIC();
1021
1022 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
1023 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1024 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
1025 /* Or can we switch back to PIC here? */
1026 }
1027
1028 /*
1029 * Now start the IO-APICs
1030 */
1031 if (!skip_ioapic_setup && nr_ioapics)
1032 setup_IO_APIC();
1033 else
1034 nr_ioapics = 0;
1035
1036 /*
1037 * Set up local APIC timer on boot CPU.
1038 */
1039
1040 setup_boot_APIC_clock();
1041}
1042
1043/*
1044 * Early setup to make printk work.
1045 */
1046void __init smp_prepare_boot_cpu(void)
1047{
1048 int me = smp_processor_id();
1049 cpu_set(me, cpu_online_map);
1050 cpu_set(me, cpu_callout_map);
1051}
1052
1053/*
1054 * Entry point to boot a CPU.
1055 */
1056int __cpuinit __cpu_up(unsigned int cpu)
1057{
1058 int err;
1059 int apicid = cpu_present_to_apicid(cpu);
1060
1061 WARN_ON(irqs_disabled());
1062
1063 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1064
1065 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
1066 !physid_isset(apicid, phys_cpu_present_map)) {
1067 printk("__cpu_up: bad cpu %d\n", cpu);
1068 return -EINVAL;
1069 }
1070
1071 /*
1072 * Already booted CPU?
1073 */
1074 if (cpu_isset(cpu, cpu_callin_map)) {
1075 Dprintk("do_boot_cpu %d Already started\n", cpu);
1076 return -ENOSYS;
1077 }
1078
a8ab26fe
AK
1079 /* Boot it! */
1080 err = do_boot_cpu(cpu, apicid);
1081 if (err < 0) {
1082 Dprintk("do_boot_cpu failed %d\n", err);
1083 return err;
1084 }
1085
1086 /* Unleash the CPU! */
1087 Dprintk("waiting for cpu %d\n", cpu);
1088
1089 while (!cpu_isset(cpu, cpu_online_map))
1090 cpu_relax();
1091 err = 0;
1092
1093 return err;
1da177e4
LT
1094}
1095
1096/*
1097 * Finish the SMP boot.
1098 */
1099void __init smp_cpus_done(unsigned int max_cpus)
1100{
1101#ifndef CONFIG_HOTPLUG_CPU
1102 zap_low_mappings();
1103#endif
1104 smp_cleanup_boot();
1105
1da177e4
LT
1106#ifdef CONFIG_X86_IO_APIC
1107 setup_ioapic_dest();
1108#endif
1109
1110 detect_siblings();
1111 time_init_gtod();
1112
1113 check_nmi_watchdog();
1114}
1115
1116#ifdef CONFIG_HOTPLUG_CPU
1117
1118static void
1119remove_siblinginfo(int cpu)
1120{
1121 int sibling;
1122
1123 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1124 cpu_clear(cpu, cpu_sibling_map[sibling]);
1125 for_each_cpu_mask(sibling, cpu_core_map[cpu])
1126 cpu_clear(cpu, cpu_core_map[sibling]);
1127 cpus_clear(cpu_sibling_map[cpu]);
1128 cpus_clear(cpu_core_map[cpu]);
1129 phys_proc_id[cpu] = BAD_APICID;
1130 cpu_core_id[cpu] = BAD_APICID;
1131}
1132
1133void remove_cpu_from_maps(void)
1134{
1135 int cpu = smp_processor_id();
1136
1137 cpu_clear(cpu, cpu_callout_map);
1138 cpu_clear(cpu, cpu_callin_map);
1139 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1140}
1141
1142int __cpu_disable(void)
1143{
1144 int cpu = smp_processor_id();
1145
1146 /*
1147 * Perhaps use cpufreq to drop frequency, but that could go
1148 * into generic code.
1149 *
1150 * We won't take down the boot processor on i386 due to some
1151 * interrupts only being able to be serviced by the BSP.
1152 * Especially so if we're not using an IOAPIC -zwane
1153 */
1154 if (cpu == 0)
1155 return -EBUSY;
1156
1157 disable_APIC_timer();
1158
1159 /*
1160 * HACK:
1161 * Allow any queued timer interrupts to get serviced
1162 * This is only a temporary solution until we cleanup
1163 * fixup_irqs as we do for IA64.
1164 */
1165 local_irq_enable();
1166 mdelay(1);
1167
1168 local_irq_disable();
1169 remove_siblinginfo(cpu);
1170
1171 /* It's now safe to remove this processor from the online map */
1172 cpu_clear(cpu, cpu_online_map);
1173 remove_cpu_from_maps();
1174 fixup_irqs(cpu_online_map);
1175 return 0;
1176}
1177
1178void __cpu_die(unsigned int cpu)
1179{
1180 /* We don't do anything here: idle task is faking death itself. */
1181 unsigned int i;
1182
1183 for (i = 0; i < 10; i++) {
1184 /* They ack this in play_dead by setting CPU_DEAD */
1185 if (per_cpu(cpu_state, cpu) == CPU_DEAD)
1186 return;
1187 current->state = TASK_UNINTERRUPTIBLE;
1188 schedule_timeout(HZ/10);
1189 }
1190 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1191}
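/*
 * Illustrative note (the sysfs path below comes from the generic hotplug
 * core, not from this file): an offline request such as
 * "echo 0 > /sys/devices/system/cpu/cpuN/online" ends up calling
 * __cpu_disable() above on the dying CPU and then __cpu_die() on another
 * CPU, which polls cpu_state until the idle task's play_dead() path marks
 * the CPU as CPU_DEAD.
 */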
1192
1193#else /* ... !CONFIG_HOTPLUG_CPU */
1194
1195int __cpu_disable(void)
1196{
1197 return -ENOSYS;
1198}
1199
1200void __cpu_die(unsigned int cpu)
1201{
1202 /* We said "no" in __cpu_disable */
1203 BUG();
1204}
1205#endif /* CONFIG_HOTPLUG_CPU */