/*
 * Xen SMP support
 *
 * This file implements the Xen versions of smp_ops. SMP under Xen is
 * very straightforward. Bringing a CPU up is simply a matter of
 * loading its initial context and setting it running.
 *
 * IPIs are handled through the Xen event mechanism.
 *
 * Because virtual CPUs can be scheduled onto any real CPU, there's no
 * useful topology information for the kernel to make use of. As a
 * result, all CPUs are treated as if they're single-core and
 * single-threaded.
 */
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>

#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>

#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/xenpmu.h>

#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>

#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "pmu.h"

cpumask_var_t xen_cpu_initialized_map;

static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };

static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);

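/*
 * First C code run on a newly booted secondary vCPU, entered from
 * cpu_bringup_and_idle() below: performs per-CPU init, publishes the
 * CPU's data, and marks the CPU online before enabling interrupts.
 */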
static void cpu_bringup(void)
{
	int cpu;

	cpu_init();
	touch_softlockup_watchdog();
	preempt_disable();

	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
		xen_enable_sysenter();
		xen_enable_syscall();
	}
	cpu = smp_processor_id();
	smp_store_cpu_info(cpu);
	cpu_data(cpu).x86_max_cores = 1;
	set_cpu_sibling_map(cpu);

	xen_setup_cpu_clockevents();

	notify_cpu_starting(cpu);

	set_cpu_online(cpu, true);

	cpu_set_state_online(cpu);  /* Implies full memory barrier. */

	/* We can take interrupts now: we're officially "up". */
	local_irq_enable();
}

asmlinkage __visible void cpu_bringup_and_idle(void)
{
	cpu_bringup();
	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

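/*
 * Tear down the PV-only per-CPU interrupts (IRQ work and, where bound,
 * the PMU VIRQ) set up by xen_smp_intr_init_pv() below. Safe to call on
 * a partially initialized CPU: unbound entries stay at irq == -1.
 */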
void xen_smp_intr_free_pv(unsigned int cpu)
{
	if (per_cpu(xen_irq_work, cpu).irq >= 0) {
		unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
		per_cpu(xen_irq_work, cpu).irq = -1;
		kfree(per_cpu(xen_irq_work, cpu).name);
		per_cpu(xen_irq_work, cpu).name = NULL;
	}

	if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
		unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
		per_cpu(xen_pmu_irq, cpu).irq = -1;
		kfree(per_cpu(xen_pmu_irq, cpu).name);
		per_cpu(xen_pmu_irq, cpu).name = NULL;
	}
}

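/*
 * Bind the PV-only per-CPU interrupt sources: the IRQ-work IPI and, if
 * this vCPU has a PMU, the VIRQ_XENPMU virtual interrupt. On failure,
 * everything bound so far is unwound via xen_smp_intr_free_pv().
 */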
int xen_smp_intr_init_pv(unsigned int cpu)
{
	int rc;
	char *callfunc_name, *pmu_name;

	callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
	rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
				    cpu,
				    xen_irq_work_interrupt,
				    IRQF_PERCPU|IRQF_NOBALANCING,
				    callfunc_name,
				    NULL);
	if (rc < 0)
		goto fail;
	per_cpu(xen_irq_work, cpu).irq = rc;
	per_cpu(xen_irq_work, cpu).name = callfunc_name;

	if (is_xen_pmu(cpu)) {
		pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
		rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
					     xen_pmu_irq_handler,
					     IRQF_PERCPU|IRQF_NOBALANCING,
					     pmu_name, NULL);
		if (rc < 0)
			goto fail;
		per_cpu(xen_pmu_irq, cpu).irq = rc;
		per_cpu(xen_pmu_irq, cpu).name = pmu_name;
	}

	return 0;

fail:
	xen_smp_intr_free_pv(cpu);
	return rc;
}

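/*
 * For domU guests: probe each vCPU with VCPUOP_is_up and mark the ones
 * the hypervisor knows about as possible. Dom0 is handled separately by
 * xen_filter_cpu_maps() below.
 */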
static void __init xen_fill_possible_map(void)
{
	int i, rc;

	if (xen_initial_domain())
		return;

	for (i = 0; i < nr_cpu_ids; i++) {
		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
		if (rc >= 0) {
			num_processors++;
			set_cpu_possible(i, true);
		}
	}
}

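/*
 * For dom0: rebuild the possible/present maps from what the hypervisor
 * actually provides, and (with CPU hotplug enabled) shrink nr_cpu_ids
 * when dom0_max_vcpus gives us fewer vCPUs than the kernel expected.
 */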
static void __init xen_filter_cpu_maps(void)
{
	int i, rc;
	unsigned int subtract = 0;

	if (!xen_initial_domain())
		return;

	num_processors = 0;
	disabled_cpus = 0;
	for (i = 0; i < nr_cpu_ids; i++) {
		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
		if (rc >= 0) {
			num_processors++;
			set_cpu_possible(i, true);
		} else {
			set_cpu_possible(i, false);
			set_cpu_present(i, false);
			subtract++;
		}
	}
#ifdef CONFIG_HOTPLUG_CPU
	/* This is akin to using 'nr_cpus' on the Linux command line:
	 * with 'dom0_max_vcpus=X' we can only have up to X vCPUs, while
	 * nr_cpu_ids may be greater than X. That is normally not a
	 * problem, except when CPU hotplugging is involved: then there
	 * might be more than X CPUs in the guest, which cannot work as
	 * there is no hypercall to expand the maximum number of vCPUs of
	 * an already running guest. So cap nr_cpu_ids at X. */
	if (subtract)
		nr_cpu_ids = nr_cpu_ids - subtract;
#endif

}

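/*
 * Boot-CPU counterpart of smp_prepare_boot_cpu: runs once on CPU 0, on
 * top of the native setup, to handle the Xen-specific pieces (GDT page
 * permissions, vcpu_info placement, PV spinlocks).
 */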
static void __init xen_pv_smp_prepare_boot_cpu(void)
{
	BUG_ON(smp_processor_id() != 0);
	native_smp_prepare_boot_cpu();

	if (!xen_feature(XENFEAT_writable_page_tables))
		/* We've switched to the "real" per-cpu gdt, so make
		 * sure the old memory can be recycled. */
		make_lowmem_page_readwrite(xen_initial_gdt);

#ifdef CONFIG_X86_32
	/*
	 * Xen starts us with XEN_FLAT_RING1_DS, but Linux code
	 * expects __USER_DS
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	xen_filter_cpu_maps();
	xen_setup_vcpu_info_placement();

	/*
	 * The alternative logic (which patches the unlock/lock) runs before
	 * the SMP bootup code is activated. Hence we need to set this up
	 * before the core kernel is patched. Otherwise we will have only
	 * modules patched but not core code.
	 */
	xen_init_spinlocks();
}

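/*
 * Called once from the boot CPU before any secondaries are started:
 * allocates the per-CPU topology masks, sets up CPU 0's interrupts and
 * PMU, and clamps the possible map to max_cpus.
 */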
static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
	unsigned cpu;
	unsigned int i;

	if (skip_ioapic_setup) {
		char *m = (max_cpus == 0) ?
			"The nosmp parameter is incompatible with Xen; " \
			"use Xen dom0_max_vcpus=1 parameter" :
			"The noapic parameter is incompatible with Xen";

		xen_raw_printk(m);
		panic(m);
	}
	xen_init_lock_cpu(0);

	smp_store_boot_cpu_info();
	cpu_data(0).x86_max_cores = 1;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
	}
	set_cpu_sibling_map(0);

	xen_pmu_init(0);

	if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0))
		BUG();

	if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
		panic("could not allocate xen_cpu_initialized_map\n");

	cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));

	/* Restrict the possible_map according to max_cpus. */
	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
		for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
			continue;
		set_cpu_possible(cpu, false);
	}

	for_each_possible_cpu(cpu)
		set_cpu_present(cpu, true);
}

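/*
 * Build the initial register state, segment setup, GDT frame list and
 * page-table base for a new vCPU and hand it to the hypervisor via
 * VCPUOP_initialise. Runs once per CPU; repeat bring-ups after a
 * hot-unplug skip straight to VCPUOP_up.
 */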
static int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
	struct vcpu_guest_context *ctxt;
	struct desc_struct *gdt;
	unsigned long gdt_mfn;

	/* used to tell cpu_init() that it can proceed with initialization */
	cpumask_set_cpu(cpu, cpu_callout_mask);
	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
		return 0;

	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
	if (ctxt == NULL)
		return -ENOMEM;

	gdt = get_cpu_gdt_rw(cpu);

#ifdef CONFIG_X86_32
	ctxt->user_regs.fs = __KERNEL_PERCPU;
	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#endif
	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
	ctxt->flags = VGCF_IN_KERNEL;
	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
	ctxt->user_regs.ds = __USER_DS;
	ctxt->user_regs.es = __USER_DS;
	ctxt->user_regs.ss = __KERNEL_DS;

	xen_copy_trap_info(ctxt->trap_ctxt);

	ctxt->ldt_ents = 0;

	BUG_ON((unsigned long)gdt & ~PAGE_MASK);

	gdt_mfn = arbitrary_virt_to_mfn(gdt);
	make_lowmem_page_readonly(gdt);
	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));

	ctxt->gdt_frames[0] = gdt_mfn;
	ctxt->gdt_ents = GDT_ENTRIES;

	ctxt->kernel_ss = __KERNEL_DS;
	ctxt->kernel_sp = idle->thread.sp0;

#ifdef CONFIG_X86_32
	ctxt->event_callback_cs = __KERNEL_CS;
	ctxt->failsafe_callback_cs = __KERNEL_CS;
#else
	ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
	ctxt->event_callback_eip =
		(unsigned long)xen_hypervisor_callback;
	ctxt->failsafe_callback_eip =
		(unsigned long)xen_failsafe_callback;
	ctxt->user_regs.cs = __KERNEL_CS;
	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);

	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
		BUG();

	kfree(ctxt);
	return 0;
}

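/*
 * Bring up CPU @cpu: register its initial context with the hypervisor,
 * kick it with VCPUOP_up, then spin (yielding to Xen) until the new
 * vCPU reports itself online.
 */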
static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
{
	int rc;

	common_cpu_up(cpu, idle);

	xen_setup_runstate_info(cpu);

	/*
	 * PV VCPUs are always successfully taken down (see the 'while' loop
	 * in xen_pv_cpu_die()), so -EBUSY is an error.
	 */
	rc = cpu_check_up_prepare(cpu);
	if (rc)
		return rc;

	/* make sure interrupts start blocked */
	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;

	rc = cpu_initialize_context(cpu, idle);
	if (rc)
		return rc;

	xen_pmu_init(cpu);

	rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
	BUG_ON(rc);

	while (cpu_report_state(cpu) != CPU_ONLINE)
		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);

	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
static int xen_pv_cpu_disable(void)
{
	unsigned int cpu = smp_processor_id();
	if (cpu == 0)
		return -EBUSY;

	cpu_disable_common();

	load_cr3(swapper_pg_dir);
	return 0;
}

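/*
 * Runs on a surviving CPU after xen_pv_cpu_disable(): waits for the
 * hypervisor to confirm the vCPU is down, then releases its interrupts,
 * spinlock state, timer and PMU resources.
 */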
static void xen_pv_cpu_die(unsigned int cpu)
{
	while (HYPERVISOR_vcpu_op(VCPUOP_is_up,
				  xen_vcpu_nr(cpu), NULL)) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(HZ/10);
	}

	if (common_cpu_die(cpu) == 0) {
		xen_smp_intr_free(cpu);
		xen_uninit_lock_cpu(cpu);
		xen_teardown_timer(cpu);
		xen_pmu_finish(cpu);
	}
}

static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
	play_dead_common();
	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
	cpu_bringup();
	/*
	 * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
	 * clears certain data that the cpu_idle loop (which called us
	 * and that we return from) expects. The only way to get that
	 * data back is to call:
	 */
	tick_nohz_idle_enter();

	cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
}

#else /* !CONFIG_HOTPLUG_CPU */
static int xen_pv_cpu_disable(void)
{
	return -ENOSYS;
}

static void xen_pv_cpu_die(unsigned int cpu)
{
	BUG();
}

static void xen_pv_play_dead(void)
{
	BUG();
}

#endif
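/*
 * Shutdown/reboot path: each CPU marks itself offline and asks the
 * hypervisor to take its vCPU down. VCPUOP_down does not return on
 * success, hence the BUG() below.
 */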
static void stop_self(void *v)
{
	int cpu = smp_processor_id();

	/* make sure we're not pinning something down */
	load_cr3(swapper_pg_dir);
	/* should set up a minimal gdt */

	set_cpu_online(cpu, false);

	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
	BUG();
}

static void xen_pv_stop_other_cpus(int wait)
{
	smp_call_function(stop_self, NULL, wait);
}

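/* Event-channel upcall for the IRQ-work IPI bound in xen_smp_intr_init_pv(). */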
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
	irq_enter();
	irq_work_run();
	inc_irq_stat(apic_irq_work_irqs);
	irq_exit();

	return IRQ_HANDLED;
}

static const struct smp_ops xen_smp_ops __initconst = {
	.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
	.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
	.smp_cpus_done = xen_smp_cpus_done,

	.cpu_up = xen_pv_cpu_up,
	.cpu_die = xen_pv_cpu_die,
	.cpu_disable = xen_pv_cpu_disable,
	.play_dead = xen_pv_play_dead,

	.stop_other_cpus = xen_pv_stop_other_cpus,
	.smp_send_reschedule = xen_smp_send_reschedule,

	.send_call_func_ipi = xen_smp_send_call_function_ipi,
	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};

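/*
 * Install the PV smp_ops and discover which vCPUs this domain may use.
 * Called once during early PV boot, before any secondary CPUs start.
 */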
void __init xen_smp_init(void)
{
	smp_ops = xen_smp_ops;
	xen_fill_possible_map();
}