Commit | Line | Data |
---|---|---|
5f1f79bb DH |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* | |
3 | * Virtio-mem device driver. | |
4 | * | |
5 | * Copyright Red Hat, Inc. 2020 | |
6 | * | |
7 | * Author(s): David Hildenbrand <david@redhat.com> | |
8 | */ | |
9 | ||
10 | #include <linux/virtio.h> | |
11 | #include <linux/virtio_mem.h> | |
12 | #include <linux/workqueue.h> | |
13 | #include <linux/slab.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/mm.h> | |
16 | #include <linux/memory_hotplug.h> | |
17 | #include <linux/memory.h> | |
18 | #include <linux/hrtimer.h> | |
19 | #include <linux/crash_dump.h> | |
20 | #include <linux/mutex.h> | |
21 | #include <linux/bitmap.h> | |
22 | #include <linux/lockdep.h> | |
6639032a | 23 | #include <linux/log2.h> |
5f1f79bb | 24 | |
f2af6d39 DH |
25 | #include <acpi/acpi_numa.h> |
26 | ||
255f5985 DH |
/*
 * Module parameter: whether to try unplugging memory that is online.
 * Mode 0644, enabled by default.
 */
static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

/*
 * Module parameter: force Big Block Mode (BBM) instead of relying on the
 * automatic mode selection. Mode 0444, disabled by default.
 */
static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
		"Force Big Block Mode. Default is 0 (auto-selection)");

/*
 * Module parameter: big block size in bytes to use in BBM. Mode 0444; the
 * default of 0 requests auto-detection.
 */
static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
		 "Big Block size in bytes. Default is 0 (auto-detection).");
40 | ||
d5614944 DH |
41 | /* |
42 | * virtio-mem currently supports the following modes of operation: | |
43 | * | |
4ba50cd3 | 44 | * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The |
d5614944 DH |
45 | * size of a Sub Block (SB) is determined based on the device block size, the |
46 | * pageblock size, and the maximum allocation granularity of the buddy. | |
47 | * Subblocks within a Linux memory block might either be plugged or unplugged. | |
48 | * Memory is added/removed to Linux MM in Linux memory block granularity. | |
49 | * | |
4ba50cd3 DH |
50 | * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. |
51 | * Memory is added/removed to Linux MM in Big Block granularity. | |
52 | * | |
53 | * The mode is determined automatically based on the Linux memory block size | |
54 | * and the device block size. | |
55 | * | |
d5614944 DH |
56 | * User space / core MM (auto onlining) is responsible for onlining added |
57 | * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are | |
58 | * always onlined separately, and all memory within a Linux memory block is | |
59 | * onlined to the same zone - virtio-mem relies on this behavior. | |
60 | */ | |
61 | ||
99f0b55e DH |
/*
 * State of a Linux memory block in SBM.
 *
 * One state is tracked per memory block in vm->sbm.mb_states, and the number
 * of memory blocks per state is tracked in vm->sbm.mb_count[].
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_SBM_MB_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL,
	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE,
	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
	/* Number of states; sizes vm->sbm.mb_count[]. Must stay last. */
	VIRTIO_MEM_SBM_MB_COUNT
};
84 | ||
4ba50cd3 DH |
/*
 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
 *
 * One state is tracked per big block in vm->bbm.bb_states, and the number of
 * big blocks per state is tracked in vm->bbm.bb_count[].
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_BBM_BB_PLUGGED,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED,
	/* All online parts are fake-offline, ready to remove. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
	/* Number of states; sizes vm->bbm.bb_count[]. Must stay last. */
	VIRTIO_MEM_BBM_BB_COUNT
};
99 | ||
5f1f79bb DH |
/*
 * Per-device driver state of a virtio-mem device.
 */
struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The determined node id for all memory of the device. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;
	/*
	 * Copy of "System RAM (virtio_mem)" to be used for
	 * add_memory_driver_managed().
	 */
	const char *resource_name;
	/* Memory group identification. */
	int mgid;

	/*
	 * We don't want to add too much memory if it's not getting onlined,
	 * to avoid running OOM. Besides this threshold, we allow to have at
	 * least two offline blocks at a time (whatever is bigger).
	 */
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
	atomic64_t offline_size;
	uint64_t offline_threshold;

	/* If set, the driver is in SBM, otherwise in BBM. */
	bool in_sbm;

	/* Mode-specific state; which member is valid is selected by in_sbm. */
	union {
		struct {
			/* Id of the first memory block of this device. */
			unsigned long first_mb_id;
			/* Id of the last usable memory block of this device. */
			unsigned long last_usable_mb_id;
			/* Id of the next memory block to prepare when needed. */
			unsigned long next_mb_id;

			/* The subblock size. */
			uint64_t sb_size;
			/* The number of subblocks per Linux memory block. */
			uint32_t sbs_per_mb;

			/*
			 * Some of the Linux memory blocks tracked as "partially
			 * plugged" are completely unplugged and can be offlined
			 * and removed -- which previously failed.
			 */
			bool have_unplugged_mb;

			/* Summary of all memory block states. */
			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];

			/*
			 * One byte state per memory block. Allocated via
			 * vmalloc(). Resized (alloc+copy+free) on demand.
			 *
			 * With 128 MiB memory blocks, we have states for 512
			 * GiB of memory in one 4 KiB page.
			 */
			uint8_t *mb_states;

			/*
			 * Bitmap: one bit per subblock. Allocated similar to
			 * sbm.mb_states.
			 *
			 * A set bit means the corresponding subblock is
			 * plugged, otherwise it's unplugged.
			 *
			 * With 4 MiB subblocks, we manage 128 GiB of memory
			 * in one 4 KiB page.
			 */
			unsigned long *sb_states;
		} sbm;

		struct {
			/* Id of the first big block of this device. */
			unsigned long first_bb_id;
			/* Id of the last usable big block of this device. */
			unsigned long last_usable_bb_id;
			/* Id of the next big block to prepare when needed. */
			unsigned long next_bb_id;

			/* Summary of all big block states. */
			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];

			/* One byte state per big block. See sbm.mb_states. */
			uint8_t *bb_states;

			/* The block size used for plugging/adding/removing. */
			uint64_t bb_size;
		} bbm;
	};

	/*
	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states.
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 *
	 * In kdump mode, used to serialize requests, last_block_addr and
	 * last_block_plugged.
	 */
	struct mutex hotplug_mutex;
	bool hotplug_active;

	/* An error occurred we cannot handle - stop processing requests. */
	bool broken;

	/* Cached value of is_kdump_kernel() when the device was probed. */
	bool in_kdump;

	/* The driver is being removed. */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	/* Retry interval in milliseconds; bounds are defined right below. */
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

#ifdef CONFIG_PROC_VMCORE
	/* vmcore callback for /proc/vmcore handling in kdump mode */
	struct vmcore_cb vmcore_cb;
	uint64_t last_block_addr;
	bool last_block_plugged;
#endif /* CONFIG_PROC_VMCORE */

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};
265 | ||
/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 *
 * virtio_mem_mutex is taken by register_virtio_mem_device() and
 * unregister_virtio_mem_device() around list modifications and around
 * installing/restoring the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

/* Forward declarations of helpers that are used before their definition. */
static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);
static int virtio_mem_create_resource(struct virtio_mem *vm);
static void virtio_mem_delete_resource(struct virtio_mem *vm);
5f1f79bb DH |
281 | |
282 | /* | |
283 | * Register a virtio-mem device so it will be considered for the online_page | |
284 | * callback. | |
285 | */ | |
286 | static int register_virtio_mem_device(struct virtio_mem *vm) | |
287 | { | |
288 | int rc = 0; | |
289 | ||
290 | /* First device registers the callback. */ | |
291 | mutex_lock(&virtio_mem_mutex); | |
292 | if (list_empty(&virtio_mem_devices)) | |
293 | rc = set_online_page_callback(&virtio_mem_online_page_cb); | |
294 | if (!rc) | |
295 | list_add_rcu(&vm->next, &virtio_mem_devices); | |
296 | mutex_unlock(&virtio_mem_mutex); | |
297 | ||
298 | return rc; | |
299 | } | |
300 | ||
301 | /* | |
302 | * Unregister a virtio-mem device so it will no longer be considered for the | |
303 | * online_page callback. | |
304 | */ | |
305 | static void unregister_virtio_mem_device(struct virtio_mem *vm) | |
306 | { | |
307 | /* Last device unregisters the callback. */ | |
308 | mutex_lock(&virtio_mem_mutex); | |
309 | list_del_rcu(&vm->next); | |
310 | if (list_empty(&virtio_mem_devices)) | |
311 | restore_online_page_callback(&virtio_mem_online_page_cb); | |
312 | mutex_unlock(&virtio_mem_mutex); | |
313 | ||
314 | synchronize_rcu(); | |
315 | } | |
316 | ||
/*
 * Translate a physical address into the id of the Linux memory block
 * containing it (address divided by the memory block size).
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	const unsigned long mb_size = memory_block_size_bytes();

	return addr / mb_size;
}
324 | ||
/*
 * Translate a Linux memory block id into the physical start address of that
 * memory block (id multiplied by the memory block size).
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	const unsigned long mb_size = memory_block_size_bytes();

	return mb_id * mb_size;
}
332 | ||
4ba50cd3 DH |
333 | /* |
334 | * Calculate the big block id of a given address. | |
335 | */ | |
336 | static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, | |
337 | uint64_t addr) | |
338 | { | |
339 | return addr / vm->bbm.bb_size; | |
340 | } | |
341 | ||
342 | /* | |
343 | * Calculate the physical start address of a given big block id. | |
344 | */ | |
345 | static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, | |
346 | unsigned long bb_id) | |
347 | { | |
348 | return bb_id * vm->bbm.bb_size; | |
349 | } | |
350 | ||
5f1f79bb DH |
351 | /* |
352 | * Calculate the subblock id of a given address. | |
353 | */ | |
354 | static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, | |
355 | unsigned long addr) | |
356 | { | |
357 | const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); | |
358 | const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); | |
359 | ||
905c4c51 | 360 | return (addr - mb_addr) / vm->sbm.sb_size; |
5f1f79bb DH |
361 | } |
362 | ||
4ba50cd3 DH |
363 | /* |
364 | * Set the state of a big block, taking care of the state counter. | |
365 | */ | |
366 | static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, | |
367 | unsigned long bb_id, | |
368 | enum virtio_mem_bbm_bb_state state) | |
369 | { | |
370 | const unsigned long idx = bb_id - vm->bbm.first_bb_id; | |
371 | enum virtio_mem_bbm_bb_state old_state; | |
372 | ||
373 | old_state = vm->bbm.bb_states[idx]; | |
374 | vm->bbm.bb_states[idx] = state; | |
375 | ||
376 | BUG_ON(vm->bbm.bb_count[old_state] == 0); | |
377 | vm->bbm.bb_count[old_state]--; | |
378 | vm->bbm.bb_count[state]++; | |
379 | } | |
380 | ||
381 | /* | |
382 | * Get the state of a big block. | |
383 | */ | |
384 | static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, | |
385 | unsigned long bb_id) | |
386 | { | |
387 | return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; | |
388 | } | |
389 | ||
390 | /* | |
391 | * Prepare the big block state array for the next big block. | |
392 | */ | |
393 | static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) | |
394 | { | |
395 | unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; | |
396 | unsigned long new_bytes = old_bytes + 1; | |
397 | int old_pages = PFN_UP(old_bytes); | |
398 | int new_pages = PFN_UP(new_bytes); | |
399 | uint8_t *new_array; | |
400 | ||
401 | if (vm->bbm.bb_states && old_pages == new_pages) | |
402 | return 0; | |
403 | ||
404 | new_array = vzalloc(new_pages * PAGE_SIZE); | |
405 | if (!new_array) | |
406 | return -ENOMEM; | |
407 | ||
408 | mutex_lock(&vm->hotplug_mutex); | |
409 | if (vm->bbm.bb_states) | |
410 | memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); | |
411 | vfree(vm->bbm.bb_states); | |
412 | vm->bbm.bb_states = new_array; | |
413 | mutex_unlock(&vm->hotplug_mutex); | |
414 | ||
415 | return 0; | |
416 | } | |
417 | ||
/*
 * Iterate over all big blocks of @_vm that are in state @_state, in ascending
 * id order, assigning each matching id to @_bb_id. The loop terminates early
 * as soon as the counter for @_state is zero.
 *
 * Only the @_vm argument is used to access the device; the macro does not
 * depend on a variable named "vm" being in scope at the call site.
 */
#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for (_bb_id = (_vm)->bbm.first_bb_id; \
	     _bb_id < (_vm)->bbm.next_bb_id && (_vm)->bbm.bb_count[_state]; \
	     _bb_id++) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
423 | ||
269ac938 DH |
/*
 * Iterate over all big blocks of @_vm that are in state @_state, in
 * descending id order, assigning each matching id to @_bb_id. The loop
 * terminates early as soon as the counter for @_state is zero.
 *
 * Only the @_vm argument is used to access the device; the macro does not
 * depend on a variable named "vm" being in scope at the call site.
 */
#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for (_bb_id = (_vm)->bbm.next_bb_id - 1; \
	     _bb_id >= (_vm)->bbm.first_bb_id && (_vm)->bbm.bb_count[_state]; \
	     _bb_id--) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
429 | ||
5f1f79bb DH |
430 | /* |
431 | * Set the state of a memory block, taking care of the state counter. | |
432 | */ | |
99f0b55e DH |
433 | static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, |
434 | unsigned long mb_id, uint8_t state) | |
5f1f79bb | 435 | { |
8a6f082b | 436 | const unsigned long idx = mb_id - vm->sbm.first_mb_id; |
99f0b55e | 437 | uint8_t old_state; |
5f1f79bb | 438 | |
99f0b55e DH |
439 | old_state = vm->sbm.mb_states[idx]; |
440 | vm->sbm.mb_states[idx] = state; | |
5f1f79bb | 441 | |
99f0b55e DH |
442 | BUG_ON(vm->sbm.mb_count[old_state] == 0); |
443 | vm->sbm.mb_count[old_state]--; | |
444 | vm->sbm.mb_count[state]++; | |
5f1f79bb DH |
445 | } |
446 | ||
447 | /* | |
448 | * Get the state of a memory block. | |
449 | */ | |
99f0b55e DH |
450 | static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, |
451 | unsigned long mb_id) | |
5f1f79bb | 452 | { |
8a6f082b | 453 | const unsigned long idx = mb_id - vm->sbm.first_mb_id; |
5f1f79bb | 454 | |
99f0b55e | 455 | return vm->sbm.mb_states[idx]; |
5f1f79bb DH |
456 | } |
457 | ||
458 | /* | |
459 | * Prepare the state array for the next memory block. | |
460 | */ | |
99f0b55e | 461 | static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) |
5f1f79bb | 462 | { |
8a6f082b DH |
463 | int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); |
464 | int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); | |
99f0b55e | 465 | uint8_t *new_array; |
5f1f79bb | 466 | |
99f0b55e | 467 | if (vm->sbm.mb_states && old_pages == new_pages) |
5f1f79bb DH |
468 | return 0; |
469 | ||
99f0b55e DH |
470 | new_array = vzalloc(new_pages * PAGE_SIZE); |
471 | if (!new_array) | |
5f1f79bb DH |
472 | return -ENOMEM; |
473 | ||
474 | mutex_lock(&vm->hotplug_mutex); | |
99f0b55e DH |
475 | if (vm->sbm.mb_states) |
476 | memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); | |
477 | vfree(vm->sbm.mb_states); | |
478 | vm->sbm.mb_states = new_array; | |
5f1f79bb DH |
479 | mutex_unlock(&vm->hotplug_mutex); |
480 | ||
481 | return 0; | |
482 | } | |
483 | ||
/*
 * Iterate over all memory blocks of @_vm that are in state @_state, in
 * ascending id order, assigning each matching id to @_mb_id. The loop
 * terminates early as soon as the counter for @_state is zero.
 *
 * @_vm is parenthesized on expansion so arbitrary pointer expressions work.
 */
#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
	for (_mb_id = (_vm)->sbm.first_mb_id; \
	     _mb_id < (_vm)->sbm.next_mb_id && (_vm)->sbm.mb_count[_state]; \
	     _mb_id++) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
5f1f79bb | 489 | |
/*
 * Iterate over all memory blocks of @_vm that are in state @_state, in
 * descending id order, assigning each matching id to @_mb_id. The loop
 * terminates early as soon as the counter for @_state is zero.
 *
 * @_vm is parenthesized on expansion so arbitrary pointer expressions work.
 */
#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
	for (_mb_id = (_vm)->sbm.next_mb_id - 1; \
	     _mb_id >= (_vm)->sbm.first_mb_id && (_vm)->sbm.mb_count[_state]; \
	     _mb_id--) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
c627ff5d | 495 | |
41e6215c DH |
496 | /* |
497 | * Calculate the bit number in the subblock bitmap for the given subblock | |
498 | * inside the given memory block. | |
499 | */ | |
54c6a6ba DH |
500 | static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, |
501 | unsigned long mb_id, int sb_id) | |
41e6215c | 502 | { |
8a6f082b | 503 | return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; |
41e6215c DH |
504 | } |
505 | ||
5f1f79bb DH |
506 | /* |
507 | * Mark all selected subblocks plugged. | |
508 | * | |
509 | * Will not modify the state of the memory block. | |
510 | */ | |
54c6a6ba DH |
511 | static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, |
512 | unsigned long mb_id, int sb_id, | |
513 | int count) | |
5f1f79bb | 514 | { |
54c6a6ba | 515 | const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); |
5f1f79bb | 516 | |
54c6a6ba | 517 | __bitmap_set(vm->sbm.sb_states, bit, count); |
5f1f79bb DH |
518 | } |
519 | ||
520 | /* | |
521 | * Mark all selected subblocks unplugged. | |
522 | * | |
523 | * Will not modify the state of the memory block. | |
524 | */ | |
54c6a6ba DH |
525 | static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, |
526 | unsigned long mb_id, int sb_id, | |
527 | int count) | |
5f1f79bb | 528 | { |
54c6a6ba | 529 | const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); |
5f1f79bb | 530 | |
54c6a6ba | 531 | __bitmap_clear(vm->sbm.sb_states, bit, count); |
5f1f79bb DH |
532 | } |
533 | ||
534 | /* | |
535 | * Test if all selected subblocks are plugged. | |
536 | */ | |
54c6a6ba DH |
537 | static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, |
538 | unsigned long mb_id, int sb_id, | |
539 | int count) | |
5f1f79bb | 540 | { |
54c6a6ba | 541 | const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); |
5f1f79bb DH |
542 | |
543 | if (count == 1) | |
54c6a6ba | 544 | return test_bit(bit, vm->sbm.sb_states); |
5f1f79bb DH |
545 | |
546 | /* TODO: Helper similar to bitmap_set() */ | |
54c6a6ba | 547 | return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= |
5f1f79bb DH |
548 | bit + count; |
549 | } | |
550 | ||
c627ff5d DH |
551 | /* |
552 | * Test if all selected subblocks are unplugged. | |
553 | */ | |
54c6a6ba DH |
554 | static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, |
555 | unsigned long mb_id, int sb_id, | |
556 | int count) | |
c627ff5d | 557 | { |
54c6a6ba | 558 | const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); |
c627ff5d DH |
559 | |
560 | /* TODO: Helper similar to bitmap_set() */ | |
54c6a6ba DH |
561 | return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= |
562 | bit + count; | |
c627ff5d DH |
563 | } |
564 | ||
5f1f79bb | 565 | /* |
905c4c51 | 566 | * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is |
5f1f79bb DH |
567 | * none. |
568 | */ | |
54c6a6ba | 569 | static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, |
5f1f79bb DH |
570 | unsigned long mb_id) |
571 | { | |
54c6a6ba | 572 | const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); |
5f1f79bb | 573 | |
54c6a6ba | 574 | return find_next_zero_bit(vm->sbm.sb_states, |
905c4c51 | 575 | bit + vm->sbm.sbs_per_mb, bit) - bit; |
5f1f79bb DH |
576 | } |
577 | ||
578 | /* | |
579 | * Prepare the subblock bitmap for the next memory block. | |
580 | */ | |
54c6a6ba | 581 | static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) |
5f1f79bb | 582 | { |
8a6f082b | 583 | const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; |
905c4c51 DH |
584 | const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; |
585 | const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; | |
5f1f79bb DH |
586 | int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); |
587 | int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); | |
54c6a6ba | 588 | unsigned long *new_bitmap, *old_bitmap; |
5f1f79bb | 589 | |
54c6a6ba | 590 | if (vm->sbm.sb_states && old_pages == new_pages) |
5f1f79bb DH |
591 | return 0; |
592 | ||
54c6a6ba DH |
593 | new_bitmap = vzalloc(new_pages * PAGE_SIZE); |
594 | if (!new_bitmap) | |
5f1f79bb DH |
595 | return -ENOMEM; |
596 | ||
597 | mutex_lock(&vm->hotplug_mutex); | |
cf4a4493 | 598 | if (vm->sbm.sb_states) |
54c6a6ba | 599 | memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); |
5f1f79bb | 600 | |
54c6a6ba DH |
601 | old_bitmap = vm->sbm.sb_states; |
602 | vm->sbm.sb_states = new_bitmap; | |
5f1f79bb DH |
603 | mutex_unlock(&vm->hotplug_mutex); |
604 | ||
54c6a6ba | 605 | vfree(old_bitmap); |
5f1f79bb DH |
606 | return 0; |
607 | } | |
608 | ||
98ff9f94 DH |
609 | /* |
610 | * Test if we could add memory without creating too much offline memory - | |
611 | * to avoid running OOM if memory is getting onlined deferred. | |
612 | */ | |
613 | static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) | |
614 | { | |
615 | if (WARN_ON_ONCE(size > vm->offline_threshold)) | |
616 | return false; | |
617 | ||
618 | return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; | |
619 | } | |
620 | ||
5f1f79bb | 621 | /* |
01afdee2 | 622 | * Try adding memory to Linux. Will usually only fail if out of memory. |
5f1f79bb DH |
623 | * |
624 | * Must not be called with the vm->hotplug_mutex held (possible deadlock with | |
625 | * onlining code). | |
626 | * | |
01afdee2 | 627 | * Will not modify the state of memory blocks in virtio-mem. |
5f1f79bb | 628 | */ |
01afdee2 DH |
629 | static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, |
630 | uint64_t size) | |
5f1f79bb | 631 | { |
98ff9f94 | 632 | int rc; |
5f1f79bb | 633 | |
b3562c60 DH |
634 | /* |
635 | * When force-unloading the driver and we still have memory added to | |
636 | * Linux, the resource name has to stay. | |
637 | */ | |
638 | if (!vm->resource_name) { | |
639 | vm->resource_name = kstrdup_const("System RAM (virtio_mem)", | |
640 | GFP_KERNEL); | |
641 | if (!vm->resource_name) | |
642 | return -ENOMEM; | |
643 | } | |
644 | ||
01afdee2 DH |
645 | dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, |
646 | addr + size - 1); | |
98ff9f94 DH |
647 | /* Memory might get onlined immediately. */ |
648 | atomic64_add(size, &vm->offline_size); | |
ffaa6ce8 DH |
649 | rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, |
650 | MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); | |
01afdee2 | 651 | if (rc) { |
98ff9f94 | 652 | atomic64_sub(size, &vm->offline_size); |
01afdee2 DH |
653 | dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); |
654 | /* | |
655 | * TODO: Linux MM does not properly clean up yet in all cases | |
656 | * where adding of memory failed - especially on -ENOMEM. | |
657 | */ | |
658 | } | |
98ff9f94 | 659 | return rc; |
5f1f79bb DH |
660 | } |
661 | ||
/*
 * See virtio_mem_add_memory(): Try adding the single Linux memory block
 * @mb_id.
 */
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	return virtio_mem_add_memory(vm, virtio_mem_mb_id_to_phys(mb_id),
				     memory_block_size_bytes());
}
672 | ||
4ba50cd3 DH |
673 | /* |
674 | * See virtio_mem_add_memory(): Try adding a big block. | |
675 | */ | |
676 | static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) | |
677 | { | |
678 | const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); | |
679 | const uint64_t size = vm->bbm.bb_size; | |
680 | ||
681 | return virtio_mem_add_memory(vm, addr, size); | |
682 | } | |
683 | ||
01afdee2 DH |
684 | /* |
685 | * Try removing memory from Linux. Will only fail if memory blocks aren't | |
686 | * offline. | |
5f1f79bb DH |
687 | * |
688 | * Must not be called with the vm->hotplug_mutex held (possible deadlock with | |
689 | * onlining code). | |
690 | * | |
01afdee2 | 691 | * Will not modify the state of memory blocks in virtio-mem. |
5f1f79bb | 692 | */ |
01afdee2 DH |
693 | static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, |
694 | uint64_t size) | |
5f1f79bb | 695 | { |
1d33c2ca | 696 | int rc; |
5f1f79bb | 697 | |
01afdee2 DH |
698 | dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, |
699 | addr + size - 1); | |
e1c158e4 | 700 | rc = remove_memory(addr, size); |
98ff9f94 DH |
701 | if (!rc) { |
702 | atomic64_sub(size, &vm->offline_size); | |
1d33c2ca DH |
703 | /* |
704 | * We might have freed up memory we can now unplug, retry | |
705 | * immediately instead of waiting. | |
706 | */ | |
707 | virtio_mem_retry(vm); | |
01afdee2 DH |
708 | } else { |
709 | dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); | |
98ff9f94 | 710 | } |
1d33c2ca | 711 | return rc; |
5f1f79bb DH |
712 | } |
713 | ||
/*
 * See virtio_mem_remove_memory(): Try removing the single Linux memory block
 * @mb_id.
 */
static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	return virtio_mem_remove_memory(vm, virtio_mem_mb_id_to_phys(mb_id),
					memory_block_size_bytes());
}
724 | ||
725 | /* | |
726 | * Try offlining and removing memory from Linux. | |
a5732387 DH |
727 | * |
728 | * Must not be called with the vm->hotplug_mutex held (possible deadlock with | |
729 | * onlining code). | |
730 | * | |
01afdee2 | 731 | * Will not modify the state of memory blocks in virtio-mem. |
a5732387 | 732 | */ |
01afdee2 DH |
733 | static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, |
734 | uint64_t addr, | |
735 | uint64_t size) | |
a5732387 | 736 | { |
1d33c2ca | 737 | int rc; |
a5732387 | 738 | |
01afdee2 DH |
739 | dev_dbg(&vm->vdev->dev, |
740 | "offlining and removing memory: 0x%llx - 0x%llx\n", addr, | |
741 | addr + size - 1); | |
742 | ||
e1c158e4 | 743 | rc = offline_and_remove_memory(addr, size); |
98ff9f94 DH |
744 | if (!rc) { |
745 | atomic64_sub(size, &vm->offline_size); | |
1d33c2ca DH |
746 | /* |
747 | * We might have freed up memory we can now unplug, retry | |
748 | * immediately instead of waiting. | |
749 | */ | |
750 | virtio_mem_retry(vm); | |
ddf40985 | 751 | return 0; |
98ff9f94 | 752 | } |
ddf40985 DH |
753 | dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc); |
754 | /* | |
755 | * We don't really expect this to fail, because we fake-offlined all | |
756 | * memory already. But it could fail in corner cases. | |
757 | */ | |
758 | WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY); | |
759 | return rc == -ENOMEM ? -ENOMEM : -EBUSY; | |
a5732387 DH |
760 | } |
761 | ||
01afdee2 DH |
/*
 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 * the single Linux memory block @mb_id.
 */
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
						unsigned long mb_id)
{
	return virtio_mem_offline_and_remove_memory(vm,
					virtio_mem_mb_id_to_phys(mb_id),
					memory_block_size_bytes());
}
774 | ||
a31648fd DH |
775 | /* |
776 | * Try (offlining and) removing memory from Linux in case all subblocks are | |
777 | * unplugged. Can be called on online and offline memory blocks. | |
778 | * | |
779 | * May modify the state of memory blocks in virtio-mem. | |
780 | */ | |
781 | static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm, | |
782 | unsigned long mb_id) | |
783 | { | |
784 | int rc; | |
785 | ||
786 | /* | |
787 | * Once all subblocks of a memory block were unplugged, offline and | |
788 | * remove it. | |
789 | */ | |
790 | if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) | |
791 | return 0; | |
792 | ||
793 | /* offline_and_remove_memory() works for online and offline memory. */ | |
794 | mutex_unlock(&vm->hotplug_mutex); | |
795 | rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); | |
796 | mutex_lock(&vm->hotplug_mutex); | |
797 | if (!rc) | |
798 | virtio_mem_sbm_set_mb_state(vm, mb_id, | |
799 | VIRTIO_MEM_SBM_MB_UNUSED); | |
800 | return rc; | |
801 | } | |
802 | ||
269ac938 DH |
803 | /* |
804 | * See virtio_mem_offline_and_remove_memory(): Try to offline and remove | |
805 | * all Linux memory blocks covered by the big block. | |
806 | */ | |
807 | static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, | |
808 | unsigned long bb_id) | |
809 | { | |
810 | const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); | |
811 | const uint64_t size = vm->bbm.bb_size; | |
812 | ||
813 | return virtio_mem_offline_and_remove_memory(vm, addr, size); | |
814 | } | |
815 | ||
5f1f79bb DH |
816 | /* |
817 | * Trigger the workqueue so the device can perform its magic. | |
818 | */ | |
819 | static void virtio_mem_retry(struct virtio_mem *vm) | |
820 | { | |
821 | unsigned long flags; | |
822 | ||
823 | spin_lock_irqsave(&vm->removal_lock, flags); | |
824 | if (!vm->removing) | |
825 | queue_work(system_freezable_wq, &vm->wq); | |
826 | spin_unlock_irqrestore(&vm->removal_lock, flags); | |
827 | } | |
828 | ||
f2af6d39 DH |
829 | static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) |
830 | { | |
831 | int node = NUMA_NO_NODE; | |
832 | ||
833 | #if defined(CONFIG_ACPI_NUMA) | |
834 | if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) | |
835 | node = pxm_to_node(node_id); | |
836 | #endif | |
837 | return node; | |
838 | } | |
839 | ||
5f1f79bb DH |
840 | /* |
841 | * Test if a virtio-mem device overlaps with the given range. Can be called | |
842 | * from (notifier) callbacks lockless. | |
843 | */ | |
835491c5 DH |
844 | static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, |
845 | uint64_t size) | |
5f1f79bb | 846 | { |
835491c5 | 847 | return start < vm->addr + vm->region_size && vm->addr < start + size; |
5f1f79bb DH |
848 | } |
849 | ||
850 | /* | |
8464e3bd | 851 | * Test if a virtio-mem device contains a given range. Can be called from |
5f1f79bb DH |
852 | * (notifier) callbacks lockless. |
853 | */ | |
8464e3bd DH |
854 | static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, |
855 | uint64_t size) | |
5f1f79bb | 856 | { |
8464e3bd | 857 | return start >= vm->addr && start + size <= vm->addr + vm->region_size; |
5f1f79bb DH |
858 | } |
859 | ||
d46dfb62 DH |
860 | static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, |
861 | unsigned long mb_id) | |
5f1f79bb | 862 | { |
99f0b55e DH |
863 | switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { |
864 | case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: | |
865 | case VIRTIO_MEM_SBM_MB_OFFLINE: | |
5f1f79bb DH |
866 | return NOTIFY_OK; |
867 | default: | |
868 | break; | |
869 | } | |
870 | dev_warn_ratelimited(&vm->vdev->dev, | |
871 | "memory block onlining denied\n"); | |
872 | return NOTIFY_BAD; | |
873 | } | |
874 | ||
d46dfb62 DH |
875 | static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, |
876 | unsigned long mb_id) | |
5f1f79bb | 877 | { |
99f0b55e | 878 | switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { |
c740bb97 DH |
879 | case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: |
880 | case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: | |
99f0b55e DH |
881 | virtio_mem_sbm_set_mb_state(vm, mb_id, |
882 | VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); | |
5f1f79bb | 883 | break; |
c740bb97 DH |
884 | case VIRTIO_MEM_SBM_MB_KERNEL: |
885 | case VIRTIO_MEM_SBM_MB_MOVABLE: | |
99f0b55e DH |
886 | virtio_mem_sbm_set_mb_state(vm, mb_id, |
887 | VIRTIO_MEM_SBM_MB_OFFLINE); | |
5f1f79bb DH |
888 | break; |
889 | default: | |
890 | BUG(); | |
891 | break; | |
892 | } | |
893 | } | |
894 | ||
d46dfb62 | 895 | static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, |
c740bb97 DH |
896 | unsigned long mb_id, |
897 | unsigned long start_pfn) | |
5f1f79bb | 898 | { |
07252dfe | 899 | const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); |
c740bb97 DH |
900 | int new_state; |
901 | ||
99f0b55e DH |
902 | switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { |
903 | case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: | |
c740bb97 DH |
904 | new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; |
905 | if (is_movable) | |
906 | new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; | |
5f1f79bb | 907 | break; |
99f0b55e | 908 | case VIRTIO_MEM_SBM_MB_OFFLINE: |
c740bb97 DH |
909 | new_state = VIRTIO_MEM_SBM_MB_KERNEL; |
910 | if (is_movable) | |
911 | new_state = VIRTIO_MEM_SBM_MB_MOVABLE; | |
5f1f79bb DH |
912 | break; |
913 | default: | |
914 | BUG(); | |
915 | break; | |
916 | } | |
c740bb97 | 917 | virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); |
5f1f79bb DH |
918 | } |
919 | ||
d46dfb62 DH |
920 | static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, |
921 | unsigned long mb_id) | |
8e5c921c | 922 | { |
905c4c51 | 923 | const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); |
8e5c921c | 924 | unsigned long pfn; |
7a34c77d | 925 | int sb_id; |
8e5c921c | 926 | |
905c4c51 | 927 | for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { |
54c6a6ba | 928 | if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) |
8e5c921c | 929 | continue; |
8e5c921c | 930 | pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + |
905c4c51 | 931 | sb_id * vm->sbm.sb_size); |
7a34c77d | 932 | virtio_mem_fake_offline_going_offline(pfn, nr_pages); |
8e5c921c DH |
933 | } |
934 | } | |
935 | ||
d46dfb62 DH |
936 | static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, |
937 | unsigned long mb_id) | |
8e5c921c | 938 | { |
905c4c51 | 939 | const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); |
8e5c921c | 940 | unsigned long pfn; |
7a34c77d | 941 | int sb_id; |
8e5c921c | 942 | |
905c4c51 | 943 | for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { |
54c6a6ba | 944 | if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) |
8e5c921c | 945 | continue; |
8e5c921c | 946 | pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + |
905c4c51 | 947 | sb_id * vm->sbm.sb_size); |
7a34c77d | 948 | virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); |
8e5c921c DH |
949 | } |
950 | } | |
951 | ||
3711387a DH |
952 | static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, |
953 | unsigned long bb_id, | |
954 | unsigned long pfn, | |
955 | unsigned long nr_pages) | |
956 | { | |
957 | /* | |
958 | * When marked as "fake-offline", all online memory of this device block | |
959 | * is allocated by us. Otherwise, we don't have any memory allocated. | |
960 | */ | |
961 | if (virtio_mem_bbm_get_bb_state(vm, bb_id) != | |
962 | VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) | |
963 | return; | |
964 | virtio_mem_fake_offline_going_offline(pfn, nr_pages); | |
965 | } | |
966 | ||
967 | static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, | |
968 | unsigned long bb_id, | |
969 | unsigned long pfn, | |
970 | unsigned long nr_pages) | |
971 | { | |
972 | if (virtio_mem_bbm_get_bb_state(vm, bb_id) != | |
973 | VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) | |
974 | return; | |
975 | virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); | |
976 | } | |
977 | ||
5f1f79bb DH |
978 | /* |
979 | * This callback will either be called synchronously from add_memory() or | |
980 | * asynchronously (e.g., triggered via user space). We have to be careful | |
981 | * with locking when calling add_memory(). | |
982 | */ | |
983 | static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, | |
984 | unsigned long action, void *arg) | |
985 | { | |
986 | struct virtio_mem *vm = container_of(nb, struct virtio_mem, | |
987 | memory_notifier); | |
988 | struct memory_notify *mhp = arg; | |
989 | const unsigned long start = PFN_PHYS(mhp->start_pfn); | |
990 | const unsigned long size = PFN_PHYS(mhp->nr_pages); | |
5f1f79bb | 991 | int rc = NOTIFY_OK; |
4ba50cd3 | 992 | unsigned long id; |
5f1f79bb DH |
993 | |
994 | if (!virtio_mem_overlaps_range(vm, start, size)) | |
995 | return NOTIFY_DONE; | |
996 | ||
4ba50cd3 DH |
997 | if (vm->in_sbm) { |
998 | id = virtio_mem_phys_to_mb_id(start); | |
999 | /* | |
1000 | * In SBM, we add memory in separate memory blocks - we expect | |
1001 | * it to be onlined/offlined in the same granularity. Bail out | |
1002 | * if this ever changes. | |
1003 | */ | |
1004 | if (WARN_ON_ONCE(size != memory_block_size_bytes() || | |
1005 | !IS_ALIGNED(start, memory_block_size_bytes()))) | |
1006 | return NOTIFY_BAD; | |
1007 | } else { | |
1008 | id = virtio_mem_phys_to_bb_id(vm, start); | |
1009 | /* | |
1010 | * In BBM, we only care about onlining/offlining happening | |
1011 | * within a single big block, we don't care about the | |
1012 | * actual granularity as we don't track individual Linux | |
1013 | * memory blocks. | |
1014 | */ | |
1015 | if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) | |
1016 | return NOTIFY_BAD; | |
1017 | } | |
5f1f79bb DH |
1018 | |
1019 | /* | |
1020 | * Avoid circular locking lockdep warnings. We lock the mutex | |
1021 | * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The | |
1022 | * blocking_notifier_call_chain() has its own lock, which gets unlocked | |
1023 | * between both notifier calls and will bail out. False positive. | |
1024 | */ | |
1025 | lockdep_off(); | |
1026 | ||
1027 | switch (action) { | |
1028 | case MEM_GOING_OFFLINE: | |
1029 | mutex_lock(&vm->hotplug_mutex); | |
1030 | if (vm->removing) { | |
1031 | rc = notifier_from_errno(-EBUSY); | |
1032 | mutex_unlock(&vm->hotplug_mutex); | |
1033 | break; | |
1034 | } | |
1035 | vm->hotplug_active = true; | |
4ba50cd3 DH |
1036 | if (vm->in_sbm) |
1037 | virtio_mem_sbm_notify_going_offline(vm, id); | |
3711387a DH |
1038 | else |
1039 | virtio_mem_bbm_notify_going_offline(vm, id, | |
1040 | mhp->start_pfn, | |
1041 | mhp->nr_pages); | |
5f1f79bb DH |
1042 | break; |
1043 | case MEM_GOING_ONLINE: | |
1044 | mutex_lock(&vm->hotplug_mutex); | |
1045 | if (vm->removing) { | |
1046 | rc = notifier_from_errno(-EBUSY); | |
1047 | mutex_unlock(&vm->hotplug_mutex); | |
1048 | break; | |
1049 | } | |
1050 | vm->hotplug_active = true; | |
4ba50cd3 DH |
1051 | if (vm->in_sbm) |
1052 | rc = virtio_mem_sbm_notify_going_online(vm, id); | |
5f1f79bb DH |
1053 | break; |
1054 | case MEM_OFFLINE: | |
4ba50cd3 DH |
1055 | if (vm->in_sbm) |
1056 | virtio_mem_sbm_notify_offline(vm, id); | |
1d33c2ca | 1057 | |
98ff9f94 | 1058 | atomic64_add(size, &vm->offline_size); |
1d33c2ca DH |
1059 | /* |
1060 | * Trigger the workqueue. Now that we have some offline memory, | |
1061 | * maybe we can handle pending unplug requests. | |
1062 | */ | |
1063 | if (!unplug_online) | |
1064 | virtio_mem_retry(vm); | |
1065 | ||
5f1f79bb DH |
1066 | vm->hotplug_active = false; |
1067 | mutex_unlock(&vm->hotplug_mutex); | |
1068 | break; | |
1069 | case MEM_ONLINE: | |
4ba50cd3 | 1070 | if (vm->in_sbm) |
c740bb97 | 1071 | virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); |
98ff9f94 DH |
1072 | |
1073 | atomic64_sub(size, &vm->offline_size); | |
1074 | /* | |
1075 | * Start adding more memory once we onlined half of our | |
1076 | * threshold. Don't trigger if it's possibly due to our action | |
1077 | * (e.g., us adding memory which gets onlined immediately from | |
1078 | * the core). | |
1079 | */ | |
1080 | if (!atomic_read(&vm->wq_active) && | |
1081 | virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) | |
1082 | virtio_mem_retry(vm); | |
1083 | ||
5f1f79bb DH |
1084 | vm->hotplug_active = false; |
1085 | mutex_unlock(&vm->hotplug_mutex); | |
1086 | break; | |
1087 | case MEM_CANCEL_OFFLINE: | |
8e5c921c DH |
1088 | if (!vm->hotplug_active) |
1089 | break; | |
4ba50cd3 DH |
1090 | if (vm->in_sbm) |
1091 | virtio_mem_sbm_notify_cancel_offline(vm, id); | |
3711387a DH |
1092 | else |
1093 | virtio_mem_bbm_notify_cancel_offline(vm, id, | |
1094 | mhp->start_pfn, | |
1095 | mhp->nr_pages); | |
8e5c921c DH |
1096 | vm->hotplug_active = false; |
1097 | mutex_unlock(&vm->hotplug_mutex); | |
1098 | break; | |
5f1f79bb DH |
1099 | case MEM_CANCEL_ONLINE: |
1100 | if (!vm->hotplug_active) | |
1101 | break; | |
1102 | vm->hotplug_active = false; | |
1103 | mutex_unlock(&vm->hotplug_mutex); | |
1104 | break; | |
1105 | default: | |
1106 | break; | |
1107 | } | |
1108 | ||
1109 | lockdep_on(); | |
1110 | ||
1111 | return rc; | |
1112 | } | |
1113 | ||
1114 | /* | |
255f5985 DH |
1115 | * Set a range of pages PG_offline. Remember pages that were never onlined |
1116 | * (via generic_online_page()) using PageDirty(). | |
5f1f79bb DH |
1117 | */ |
1118 | static void virtio_mem_set_fake_offline(unsigned long pfn, | |
2a628511 | 1119 | unsigned long nr_pages, bool onlined) |
5f1f79bb | 1120 | { |
6cc26d77 | 1121 | page_offline_begin(); |
255f5985 DH |
1122 | for (; nr_pages--; pfn++) { |
1123 | struct page *page = pfn_to_page(pfn); | |
1124 | ||
1125 | __SetPageOffline(page); | |
8e5c921c | 1126 | if (!onlined) { |
255f5985 | 1127 | SetPageDirty(page); |
8e5c921c DH |
1128 | /* FIXME: remove after cleanups */ |
1129 | ClearPageReserved(page); | |
1130 | } | |
255f5985 | 1131 | } |
6cc26d77 | 1132 | page_offline_end(); |
5f1f79bb DH |
1133 | } |
1134 | ||
1135 | /* | |
255f5985 DH |
1136 | * Clear PG_offline from a range of pages. If the pages were never onlined, |
1137 | * (via generic_online_page()), clear PageDirty(). | |
5f1f79bb DH |
1138 | */ |
1139 | static void virtio_mem_clear_fake_offline(unsigned long pfn, | |
2a628511 | 1140 | unsigned long nr_pages, bool onlined) |
5f1f79bb | 1141 | { |
255f5985 DH |
1142 | for (; nr_pages--; pfn++) { |
1143 | struct page *page = pfn_to_page(pfn); | |
1144 | ||
1145 | __ClearPageOffline(page); | |
1146 | if (!onlined) | |
1147 | ClearPageDirty(page); | |
1148 | } | |
5f1f79bb DH |
1149 | } |
1150 | ||
1151 | /* | |
1152 | * Release a range of fake-offline pages to the buddy, effectively | |
1153 | * fake-onlining them. | |
1154 | */ | |
2a628511 | 1155 | static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) |
5f1f79bb | 1156 | { |
23baf831 | 1157 | unsigned long order = MAX_ORDER; |
2a628511 | 1158 | unsigned long i; |
5f1f79bb DH |
1159 | |
1160 | /* | |
57c5a5b3 | 1161 | * We might get called for ranges that don't cover properly aligned |
23baf831 KS |
1162 | * MAX_ORDER pages; however, we can only online properly aligned |
1163 | * pages with an order of MAX_ORDER at maximum. | |
5f1f79bb | 1164 | */ |
57c5a5b3 DH |
1165 | while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) |
1166 | order--; | |
1167 | ||
1168 | for (i = 0; i < nr_pages; i += 1 << order) { | |
255f5985 | 1169 | struct page *page = pfn_to_page(pfn + i); |
5f1f79bb | 1170 | |
255f5985 DH |
1171 | /* |
1172 | * If the page is PageDirty(), it was kept fake-offline when | |
1173 | * onlining the memory block. Otherwise, it was allocated | |
1174 | * using alloc_contig_range(). All pages in a subblock are | |
1175 | * alike. | |
1176 | */ | |
1177 | if (PageDirty(page)) { | |
57c5a5b3 DH |
1178 | virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); |
1179 | generic_online_page(page, order); | |
255f5985 | 1180 | } else { |
57c5a5b3 DH |
1181 | virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); |
1182 | free_contig_range(pfn + i, 1 << order); | |
1183 | adjust_managed_page_count(page, 1 << order); | |
255f5985 DH |
1184 | } |
1185 | } | |
5f1f79bb DH |
1186 | } |
1187 | ||
89c486c4 DH |
1188 | /* |
1189 | * Try to allocate a range, marking pages fake-offline, effectively | |
1190 | * fake-offlining them. | |
1191 | */ | |
f55484fd DH |
1192 | static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn, |
1193 | unsigned long nr_pages) | |
89c486c4 | 1194 | { |
07252dfe | 1195 | const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); |
f2d799d5 DH |
1196 | int rc, retry_count; |
1197 | ||
1198 | /* | |
1199 | * TODO: We want an alloc_contig_range() mode that tries to allocate | |
1200 | * harder (e.g., dealing with temporarily pinned pages, PCP), especially | |
1201 | * with ZONE_MOVABLE. So for now, retry a couple of times with | |
1202 | * ZONE_MOVABLE before giving up - because that zone is supposed to give | |
1203 | * some guarantees. | |
1204 | */ | |
1205 | for (retry_count = 0; retry_count < 5; retry_count++) { | |
f55484fd DH |
1206 | /* |
1207 | * If the config changed, stop immediately and go back to the | |
1208 | * main loop: avoid trying to keep unplugging if the device | |
1209 | * might have decided to not remove any more memory. | |
1210 | */ | |
1211 | if (atomic_read(&vm->config_changed)) | |
1212 | return -EAGAIN; | |
1213 | ||
f2d799d5 DH |
1214 | rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, |
1215 | GFP_KERNEL); | |
1216 | if (rc == -ENOMEM) | |
1217 | /* whoops, out of memory */ | |
1218 | return rc; | |
1219 | else if (rc && !is_movable) | |
1220 | break; | |
1221 | else if (rc) | |
1222 | continue; | |
1223 | ||
1224 | virtio_mem_set_fake_offline(pfn, nr_pages, true); | |
1225 | adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); | |
1226 | return 0; | |
1227 | } | |
1228 | ||
1229 | return -EBUSY; | |
89c486c4 DH |
1230 | } |
1231 | ||
7a34c77d DH |
1232 | /* |
1233 | * Handle fake-offline pages when memory is going offline - such that the | |
1234 | * pages can be skipped by mm-core when offlining. | |
1235 | */ | |
1236 | static void virtio_mem_fake_offline_going_offline(unsigned long pfn, | |
1237 | unsigned long nr_pages) | |
1238 | { | |
1239 | struct page *page; | |
1240 | unsigned long i; | |
1241 | ||
1242 | /* | |
1243 | * Drop our reference to the pages so the memory can get offlined | |
1244 | * and add the unplugged pages to the managed page counters (so | |
1245 | * offlining code can correctly subtract them again). | |
1246 | */ | |
1247 | adjust_managed_page_count(pfn_to_page(pfn), nr_pages); | |
1248 | /* Drop our reference to the pages so the memory can get offlined. */ | |
1249 | for (i = 0; i < nr_pages; i++) { | |
1250 | page = pfn_to_page(pfn + i); | |
1251 | if (WARN_ON(!page_ref_dec_and_test(page))) | |
1252 | dump_page(page, "fake-offline page referenced"); | |
1253 | } | |
1254 | } | |
1255 | ||
1256 | /* | |
1257 | * Handle fake-offline pages when memory offlining is canceled - to undo | |
1258 | * what we did in virtio_mem_fake_offline_going_offline(). | |
1259 | */ | |
1260 | static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, | |
1261 | unsigned long nr_pages) | |
1262 | { | |
1263 | unsigned long i; | |
1264 | ||
1265 | /* | |
1266 | * Get the reference we dropped when going offline and subtract the | |
1267 | * unplugged pages from the managed page counters. | |
1268 | */ | |
1269 | adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); | |
1270 | for (i = 0; i < nr_pages; i++) | |
1271 | page_ref_inc(pfn_to_page(pfn + i)); | |
1272 | } | |
1273 | ||
6639032a DH |
1274 | static void virtio_mem_online_page(struct virtio_mem *vm, |
1275 | struct page *page, unsigned int order) | |
5f1f79bb | 1276 | { |
6639032a DH |
1277 | const unsigned long start = page_to_phys(page); |
1278 | const unsigned long end = start + PFN_PHYS(1 << order); | |
1279 | unsigned long addr, next, id, sb_id, count; | |
4ba50cd3 | 1280 | bool do_online; |
5f1f79bb | 1281 | |
6639032a | 1282 | /* |
23baf831 KS |
1283 | * We can get called with any order up to MAX_ORDER. If our subblock |
1284 | * size is smaller than that and we have a mixture of plugged and | |
1285 | * unplugged subblocks within such a page, we have to process in | |
6639032a DH |
1286 | * smaller granularity. In that case we'll adjust the order exactly once |
1287 | * within the loop. | |
1288 | */ | |
1289 | for (addr = start; addr < end; ) { | |
1290 | next = addr + PFN_PHYS(1 << order); | |
5f1f79bb | 1291 | |
4ba50cd3 | 1292 | if (vm->in_sbm) { |
4ba50cd3 DH |
1293 | id = virtio_mem_phys_to_mb_id(addr); |
1294 | sb_id = virtio_mem_phys_to_sb_id(vm, addr); | |
6639032a DH |
1295 | count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; |
1296 | ||
1297 | if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { | |
1298 | /* Fully plugged. */ | |
1299 | do_online = true; | |
1300 | } else if (count == 1 || | |
1301 | virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { | |
1302 | /* Fully unplugged. */ | |
1303 | do_online = false; | |
1304 | } else { | |
1305 | /* | |
1306 | * Mixture, process sub-blocks instead. This | |
1307 | * will be at least the size of a pageblock. | |
1308 | * We'll run into this case exactly once. | |
1309 | */ | |
1310 | order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; | |
1311 | do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); | |
1312 | continue; | |
1313 | } | |
4ba50cd3 | 1314 | } else { |
3711387a DH |
1315 | /* |
1316 | * If the whole block is marked fake offline, keep | |
1317 | * everything that way. | |
1318 | */ | |
1319 | id = virtio_mem_phys_to_bb_id(vm, addr); | |
1320 | do_online = virtio_mem_bbm_get_bb_state(vm, id) != | |
1321 | VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; | |
4ba50cd3 | 1322 | } |
425bec00 | 1323 | |
6639032a DH |
1324 | if (do_online) |
1325 | generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); | |
1326 | else | |
1327 | virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, | |
1328 | false); | |
1329 | addr = next; | |
1330 | } | |
1331 | } | |
1332 | ||
1333 | static void virtio_mem_online_page_cb(struct page *page, unsigned int order) | |
1334 | { | |
1335 | const unsigned long addr = page_to_phys(page); | |
1336 | struct virtio_mem *vm; | |
1337 | ||
1338 | rcu_read_lock(); | |
1339 | list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { | |
1340 | /* | |
1341 | * Pages we're onlining will never cross memory blocks and, | |
1342 | * therefore, not virtio-mem devices. | |
1343 | */ | |
1344 | if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) | |
1345 | continue; | |
1346 | ||
425bec00 | 1347 | /* |
6639032a DH |
1348 | * virtio_mem_set_fake_offline() might sleep. We can safely |
1349 | * drop the RCU lock at this point because the device | |
1350 | * cannot go away. See virtio_mem_remove() how races | |
425bec00 DH |
1351 | * between memory onlining and device removal are handled. |
1352 | */ | |
1353 | rcu_read_unlock(); | |
1354 | ||
6639032a | 1355 | virtio_mem_online_page(vm, page, order); |
5f1f79bb DH |
1356 | return; |
1357 | } | |
1358 | rcu_read_unlock(); | |
1359 | ||
1360 | /* not virtio-mem memory, but e.g., a DIMM. online it */ | |
1361 | generic_online_page(page, order); | |
1362 | } | |
1363 | ||
1364 | static uint64_t virtio_mem_send_request(struct virtio_mem *vm, | |
1365 | const struct virtio_mem_req *req) | |
1366 | { | |
1367 | struct scatterlist *sgs[2], sg_req, sg_resp; | |
1368 | unsigned int len; | |
1369 | int rc; | |
1370 | ||
1371 | /* don't use the request residing on the stack (vaddr) */ | |
1372 | vm->req = *req; | |
1373 | ||
1374 | /* out: buffer for request */ | |
1375 | sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); | |
1376 | sgs[0] = &sg_req; | |
1377 | ||
1378 | /* in: buffer for response */ | |
1379 | sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); | |
1380 | sgs[1] = &sg_resp; | |
1381 | ||
1382 | rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); | |
1383 | if (rc < 0) | |
1384 | return rc; | |
1385 | ||
1386 | virtqueue_kick(vm->vq); | |
1387 | ||
1388 | /* wait for a response */ | |
1389 | wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); | |
1390 | ||
1391 | return virtio16_to_cpu(vm->vdev, vm->resp.type); | |
1392 | } | |
1393 | ||
1394 | static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, | |
1395 | uint64_t size) | |
1396 | { | |
1397 | const uint64_t nb_vm_blocks = size / vm->device_block_size; | |
1398 | const struct virtio_mem_req req = { | |
1399 | .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), | |
1400 | .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), | |
1401 | .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), | |
1402 | }; | |
6beb3a94 | 1403 | int rc = -ENOMEM; |
5f1f79bb DH |
1404 | |
1405 | if (atomic_read(&vm->config_changed)) | |
1406 | return -EAGAIN; | |
1407 | ||
6beb3a94 DH |
1408 | dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, |
1409 | addr + size - 1); | |
1410 | ||
5f1f79bb DH |
1411 | switch (virtio_mem_send_request(vm, &req)) { |
1412 | case VIRTIO_MEM_RESP_ACK: | |
1413 | vm->plugged_size += size; | |
1414 | return 0; | |
1415 | case VIRTIO_MEM_RESP_NACK: | |
6beb3a94 DH |
1416 | rc = -EAGAIN; |
1417 | break; | |
5f1f79bb | 1418 | case VIRTIO_MEM_RESP_BUSY: |
6beb3a94 DH |
1419 | rc = -ETXTBSY; |
1420 | break; | |
5f1f79bb | 1421 | case VIRTIO_MEM_RESP_ERROR: |
6beb3a94 DH |
1422 | rc = -EINVAL; |
1423 | break; | |
5f1f79bb | 1424 | default: |
6beb3a94 | 1425 | break; |
5f1f79bb | 1426 | } |
6beb3a94 DH |
1427 | |
1428 | dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); | |
1429 | return rc; | |
5f1f79bb DH |
1430 | } |
1431 | ||
1432 | static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, | |
1433 | uint64_t size) | |
1434 | { | |
1435 | const uint64_t nb_vm_blocks = size / vm->device_block_size; | |
1436 | const struct virtio_mem_req req = { | |
1437 | .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), | |
1438 | .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), | |
1439 | .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), | |
1440 | }; | |
6beb3a94 | 1441 | int rc = -ENOMEM; |
5f1f79bb DH |
1442 | |
1443 | if (atomic_read(&vm->config_changed)) | |
1444 | return -EAGAIN; | |
1445 | ||
6beb3a94 DH |
1446 | dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, |
1447 | addr + size - 1); | |
1448 | ||
5f1f79bb DH |
1449 | switch (virtio_mem_send_request(vm, &req)) { |
1450 | case VIRTIO_MEM_RESP_ACK: | |
1451 | vm->plugged_size -= size; | |
1452 | return 0; | |
1453 | case VIRTIO_MEM_RESP_BUSY: | |
6beb3a94 DH |
1454 | rc = -ETXTBSY; |
1455 | break; | |
5f1f79bb | 1456 | case VIRTIO_MEM_RESP_ERROR: |
6beb3a94 DH |
1457 | rc = -EINVAL; |
1458 | break; | |
5f1f79bb | 1459 | default: |
6beb3a94 | 1460 | break; |
5f1f79bb | 1461 | } |
6beb3a94 DH |
1462 | |
1463 | dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); | |
1464 | return rc; | |
5f1f79bb DH |
1465 | } |
1466 | ||
1467 | static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) | |
1468 | { | |
1469 | const struct virtio_mem_req req = { | |
1470 | .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), | |
1471 | }; | |
6beb3a94 DH |
1472 | int rc = -ENOMEM; |
1473 | ||
1474 | dev_dbg(&vm->vdev->dev, "unplugging all memory"); | |
5f1f79bb DH |
1475 | |
1476 | switch (virtio_mem_send_request(vm, &req)) { | |
1477 | case VIRTIO_MEM_RESP_ACK: | |
1478 | vm->unplug_all_required = false; | |
1479 | vm->plugged_size = 0; | |
1480 | /* usable region might have shrunk */ | |
1481 | atomic_set(&vm->config_changed, 1); | |
1482 | return 0; | |
1483 | case VIRTIO_MEM_RESP_BUSY: | |
6beb3a94 DH |
1484 | rc = -ETXTBSY; |
1485 | break; | |
5f1f79bb | 1486 | default: |
6beb3a94 | 1487 | break; |
5f1f79bb | 1488 | } |
6beb3a94 DH |
1489 | |
1490 | dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); | |
1491 | return rc; | |
5f1f79bb DH |
1492 | } |
1493 | ||
1494 | /* | |
1495 | * Plug selected subblocks. Updates the plugged state, but not the state | |
1496 | * of the memory block. | |
1497 | */ | |
602ef894 DH |
1498 | static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, |
1499 | int sb_id, int count) | |
5f1f79bb DH |
1500 | { |
1501 | const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + | |
905c4c51 DH |
1502 | sb_id * vm->sbm.sb_size; |
1503 | const uint64_t size = count * vm->sbm.sb_size; | |
5f1f79bb DH |
1504 | int rc; |
1505 | ||
5f1f79bb DH |
1506 | rc = virtio_mem_send_plug_request(vm, addr, size); |
1507 | if (!rc) | |
54c6a6ba | 1508 | virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); |
5f1f79bb DH |
1509 | return rc; |
1510 | } | |
1511 | ||
1512 | /* | |
1513 | * Unplug selected subblocks. Updates the plugged state, but not the state | |
1514 | * of the memory block. | |
1515 | */ | |
602ef894 DH |
1516 | static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, |
1517 | int sb_id, int count) | |
5f1f79bb DH |
1518 | { |
1519 | const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + | |
905c4c51 DH |
1520 | sb_id * vm->sbm.sb_size; |
1521 | const uint64_t size = count * vm->sbm.sb_size; | |
5f1f79bb DH |
1522 | int rc; |
1523 | ||
5f1f79bb DH |
1524 | rc = virtio_mem_send_unplug_request(vm, addr, size); |
1525 | if (!rc) | |
54c6a6ba | 1526 | virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); |
5f1f79bb DH |
1527 | return rc; |
1528 | } | |
1529 | ||
4ba50cd3 DH |
1530 | /* |
1531 | * Request to unplug a big block. | |
1532 | * | |
1533 | * Will not modify the state of the big block. | |
1534 | */ | |
1535 | static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) | |
1536 | { | |
1537 | const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); | |
1538 | const uint64_t size = vm->bbm.bb_size; | |
1539 | ||
1540 | return virtio_mem_send_unplug_request(vm, addr, size); | |
1541 | } | |
1542 | ||
1543 | /* | |
1544 | * Request to plug a big block. | |
1545 | * | |
1546 | * Will not modify the state of the big block. | |
1547 | */ | |
1548 | static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) | |
1549 | { | |
1550 | const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); | |
1551 | const uint64_t size = vm->bbm.bb_size; | |
1552 | ||
1553 | return virtio_mem_send_plug_request(vm, addr, size); | |
1554 | } | |
1555 | ||
5f1f79bb DH |
1556 | /* |
1557 | * Unplug the desired number of plugged subblocks of an offline or not-added | |
1558 | * memory block. Will fail if any subblock cannot get unplugged (instead of | |
1559 | * skipping it). | |
1560 | * | |
1561 | * Will not modify the state of the memory block. | |
1562 | * | |
1563 | * Note: can fail after some subblocks were unplugged. | |
1564 | */ | |
5304ca3d DH |
1565 | static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, |
1566 | unsigned long mb_id, uint64_t *nb_sb) | |
5f1f79bb DH |
1567 | { |
1568 | int sb_id, count; | |
1569 | int rc; | |
1570 | ||
905c4c51 | 1571 | sb_id = vm->sbm.sbs_per_mb - 1; |
5f1f79bb | 1572 | while (*nb_sb) { |
562e08cd DH |
1573 | /* Find the next candidate subblock */ |
1574 | while (sb_id >= 0 && | |
54c6a6ba | 1575 | virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) |
562e08cd DH |
1576 | sb_id--; |
1577 | if (sb_id < 0) | |
5f1f79bb | 1578 | break; |
562e08cd | 1579 | /* Try to unplug multiple subblocks at a time */ |
5f1f79bb | 1580 | count = 1; |
562e08cd | 1581 | while (count < *nb_sb && sb_id > 0 && |
54c6a6ba | 1582 | virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { |
5f1f79bb | 1583 | count++; |
562e08cd DH |
1584 | sb_id--; |
1585 | } | |
5f1f79bb | 1586 | |
602ef894 | 1587 | rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); |
5f1f79bb DH |
1588 | if (rc) |
1589 | return rc; | |
1590 | *nb_sb -= count; | |
562e08cd | 1591 | sb_id--; |
5f1f79bb DH |
1592 | } |
1593 | ||
1594 | return 0; | |
1595 | } | |
1596 | ||
1597 | /* | |
1598 | * Unplug all plugged subblocks of an offline or not-added memory block. | |
1599 | * | |
1600 | * Will not modify the state of the memory block. | |
1601 | * | |
1602 | * Note: can fail after some subblocks were unplugged. | |
1603 | */ | |
602ef894 | 1604 | static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) |
5f1f79bb | 1605 | { |
905c4c51 | 1606 | uint64_t nb_sb = vm->sbm.sbs_per_mb; |
5f1f79bb | 1607 | |
5304ca3d | 1608 | return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); |
5f1f79bb DH |
1609 | } |
1610 | ||
1611 | /* | |
1612 | * Prepare tracking data for the next memory block. | |
1613 | */ | |
602ef894 DH |
1614 | static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, |
1615 | unsigned long *mb_id) | |
5f1f79bb DH |
1616 | { |
1617 | int rc; | |
1618 | ||
8a6f082b | 1619 | if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) |
5f1f79bb DH |
1620 | return -ENOSPC; |
1621 | ||
1622 | /* Resize the state array if required. */ | |
99f0b55e | 1623 | rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); |
5f1f79bb DH |
1624 | if (rc) |
1625 | return rc; | |
1626 | ||
1627 | /* Resize the subblock bitmap if required. */ | |
54c6a6ba | 1628 | rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); |
5f1f79bb DH |
1629 | if (rc) |
1630 | return rc; | |
1631 | ||
99f0b55e | 1632 | vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; |
8a6f082b | 1633 | *mb_id = vm->sbm.next_mb_id++; |
5f1f79bb DH |
1634 | return 0; |
1635 | } | |
1636 | ||
5f1f79bb DH |
1637 | /* |
1638 | * Try to plug the desired number of subblocks and add the memory block | |
1639 | * to Linux. | |
1640 | * | |
1641 | * Will modify the state of the memory block. | |
1642 | */ | |
602ef894 DH |
1643 | static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, |
1644 | unsigned long mb_id, uint64_t *nb_sb) | |
5f1f79bb | 1645 | { |
905c4c51 | 1646 | const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); |
d76944f8 | 1647 | int rc; |
5f1f79bb DH |
1648 | |
1649 | if (WARN_ON_ONCE(!count)) | |
1650 | return -EINVAL; | |
1651 | ||
1652 | /* | |
1653 | * Plug the requested number of subblocks before adding it to linux, | |
1654 | * so that onlining will directly online all plugged subblocks. | |
1655 | */ | |
602ef894 | 1656 | rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); |
5f1f79bb DH |
1657 | if (rc) |
1658 | return rc; | |
1659 | ||
1660 | /* | |
1661 | * Mark the block properly offline before adding it to Linux, | |
1662 | * so the memory notifiers will find the block in the right state. | |
1663 | */ | |
905c4c51 | 1664 | if (count == vm->sbm.sbs_per_mb) |
99f0b55e DH |
1665 | virtio_mem_sbm_set_mb_state(vm, mb_id, |
1666 | VIRTIO_MEM_SBM_MB_OFFLINE); | |
5f1f79bb | 1667 | else |
99f0b55e DH |
1668 | virtio_mem_sbm_set_mb_state(vm, mb_id, |
1669 | VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); | |
5f1f79bb DH |
1670 | |
1671 | /* Add the memory block to linux - if that fails, try to unplug. */ | |
01afdee2 | 1672 | rc = virtio_mem_sbm_add_mb(vm, mb_id); |
5f1f79bb | 1673 | if (rc) { |
99f0b55e | 1674 | int new_state = VIRTIO_MEM_SBM_MB_UNUSED; |
5f1f79bb | 1675 | |
602ef894 | 1676 | if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) |
99f0b55e DH |
1677 | new_state = VIRTIO_MEM_SBM_MB_PLUGGED; |
1678 | virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); | |
5f1f79bb DH |
1679 | return rc; |
1680 | } | |
1681 | ||
1682 | *nb_sb -= count; | |
1683 | return 0; | |
1684 | } | |
1685 | ||
1686 | /* | |
1687 | * Try to plug the desired number of subblocks of a memory block that | |
1688 | * is already added to Linux. | |
1689 | * | |
1690 | * Will modify the state of the memory block. | |
1691 | * | |
1692 | * Note: Can fail after some subblocks were successfully plugged. | |
1693 | */ | |
602ef894 | 1694 | static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, |
f4cf803d | 1695 | unsigned long mb_id, uint64_t *nb_sb) |
5f1f79bb | 1696 | { |
f4cf803d | 1697 | const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); |
5f1f79bb DH |
1698 | unsigned long pfn, nr_pages; |
1699 | int sb_id, count; | |
1700 | int rc; | |
1701 | ||
1702 | if (WARN_ON_ONCE(!*nb_sb)) | |
1703 | return -EINVAL; | |
1704 | ||
1705 | while (*nb_sb) { | |
54c6a6ba | 1706 | sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); |
905c4c51 | 1707 | if (sb_id >= vm->sbm.sbs_per_mb) |
5f1f79bb DH |
1708 | break; |
1709 | count = 1; | |
1710 | while (count < *nb_sb && | |
905c4c51 | 1711 | sb_id + count < vm->sbm.sbs_per_mb && |
54c6a6ba | 1712 | !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) |
5f1f79bb DH |
1713 | count++; |
1714 | ||
602ef894 | 1715 | rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); |
5f1f79bb DH |
1716 | if (rc) |
1717 | return rc; | |
1718 | *nb_sb -= count; | |
f4cf803d | 1719 | if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) |
5f1f79bb DH |
1720 | continue; |
1721 | ||
1722 | /* fake-online the pages if the memory block is online */ | |
1723 | pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + | |
905c4c51 DH |
1724 | sb_id * vm->sbm.sb_size); |
1725 | nr_pages = PFN_DOWN(count * vm->sbm.sb_size); | |
5f1f79bb DH |
1726 | virtio_mem_fake_online(pfn, nr_pages); |
1727 | } | |
1728 | ||
f4cf803d DH |
1729 | if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) |
1730 | virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); | |
5f1f79bb | 1731 | |
1c3d69ab | 1732 | return 0; |
5f1f79bb DH |
1733 | } |
1734 | ||
4ba50cd3 | 1735 | static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) |
5f1f79bb | 1736 | { |
f4cf803d | 1737 | const int mb_states[] = { |
c740bb97 DH |
1738 | VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, |
1739 | VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, | |
f4cf803d DH |
1740 | VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, |
1741 | }; | |
905c4c51 | 1742 | uint64_t nb_sb = diff / vm->sbm.sb_size; |
5f1f79bb | 1743 | unsigned long mb_id; |
f4cf803d | 1744 | int rc, i; |
5f1f79bb DH |
1745 | |
1746 | if (!nb_sb) | |
1747 | return 0; | |
1748 | ||
1749 | /* Don't race with onlining/offlining */ | |
1750 | mutex_lock(&vm->hotplug_mutex); | |
1751 | ||
f4cf803d DH |
1752 | for (i = 0; i < ARRAY_SIZE(mb_states); i++) { |
1753 | virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { | |
1754 | rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); | |
1755 | if (rc || !nb_sb) | |
1756 | goto out_unlock; | |
1757 | cond_resched(); | |
1758 | } | |
5f1f79bb DH |
1759 | } |
1760 | ||
1761 | /* | |
1762 | * We won't be working on online/offline memory blocks from this point, | |
1763 | * so we can't race with memory onlining/offlining. Drop the mutex. | |
1764 | */ | |
1765 | mutex_unlock(&vm->hotplug_mutex); | |
1766 | ||
1767 | /* Try to plug and add unused blocks */ | |
99f0b55e | 1768 | virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { |
98ff9f94 | 1769 | if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) |
5f1f79bb DH |
1770 | return -ENOSPC; |
1771 | ||
602ef894 | 1772 | rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); |
5f1f79bb DH |
1773 | if (rc || !nb_sb) |
1774 | return rc; | |
1775 | cond_resched(); | |
1776 | } | |
1777 | ||
1778 | /* Try to prepare, plug and add new blocks */ | |
1779 | while (nb_sb) { | |
98ff9f94 | 1780 | if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) |
5f1f79bb DH |
1781 | return -ENOSPC; |
1782 | ||
602ef894 | 1783 | rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); |
5f1f79bb DH |
1784 | if (rc) |
1785 | return rc; | |
602ef894 | 1786 | rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); |
5f1f79bb DH |
1787 | if (rc) |
1788 | return rc; | |
1789 | cond_resched(); | |
1790 | } | |
1791 | ||
1792 | return 0; | |
1793 | out_unlock: | |
1794 | mutex_unlock(&vm->hotplug_mutex); | |
1795 | return rc; | |
1796 | } | |
1797 | ||
4ba50cd3 DH |
1798 | /* |
1799 | * Plug a big block and add it to Linux. | |
1800 | * | |
1801 | * Will modify the state of the big block. | |
1802 | */ | |
1803 | static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, | |
1804 | unsigned long bb_id) | |
1805 | { | |
1806 | int rc; | |
1807 | ||
1808 | if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != | |
1809 | VIRTIO_MEM_BBM_BB_UNUSED)) | |
1810 | return -EINVAL; | |
1811 | ||
1812 | rc = virtio_mem_bbm_plug_bb(vm, bb_id); | |
1813 | if (rc) | |
1814 | return rc; | |
1815 | virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); | |
1816 | ||
1817 | rc = virtio_mem_bbm_add_bb(vm, bb_id); | |
1818 | if (rc) { | |
1819 | if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) | |
1820 | virtio_mem_bbm_set_bb_state(vm, bb_id, | |
1821 | VIRTIO_MEM_BBM_BB_UNUSED); | |
1822 | else | |
1823 | /* Retry from the main loop. */ | |
1824 | virtio_mem_bbm_set_bb_state(vm, bb_id, | |
1825 | VIRTIO_MEM_BBM_BB_PLUGGED); | |
1826 | return rc; | |
1827 | } | |
1828 | return 0; | |
1829 | } | |
1830 | ||
1831 | /* | |
1832 | * Prepare tracking data for the next big block. | |
1833 | */ | |
1834 | static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, | |
1835 | unsigned long *bb_id) | |
1836 | { | |
1837 | int rc; | |
1838 | ||
1839 | if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) | |
1840 | return -ENOSPC; | |
1841 | ||
1842 | /* Resize the big block state array if required. */ | |
1843 | rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); | |
1844 | if (rc) | |
1845 | return rc; | |
1846 | ||
1847 | vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; | |
1848 | *bb_id = vm->bbm.next_bb_id; | |
1849 | vm->bbm.next_bb_id++; | |
1850 | return 0; | |
1851 | } | |
1852 | ||
1853 | static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) | |
1854 | { | |
1855 | uint64_t nb_bb = diff / vm->bbm.bb_size; | |
1856 | unsigned long bb_id; | |
1857 | int rc; | |
1858 | ||
1859 | if (!nb_bb) | |
1860 | return 0; | |
1861 | ||
1862 | /* Try to plug and add unused big blocks */ | |
1863 | virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { | |
1864 | if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) | |
1865 | return -ENOSPC; | |
1866 | ||
1867 | rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); | |
1868 | if (!rc) | |
1869 | nb_bb--; | |
1870 | if (rc || !nb_bb) | |
1871 | return rc; | |
1872 | cond_resched(); | |
1873 | } | |
1874 | ||
1875 | /* Try to prepare, plug and add new big blocks */ | |
1876 | while (nb_bb) { | |
1877 | if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) | |
1878 | return -ENOSPC; | |
1879 | ||
1880 | rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); | |
1881 | if (rc) | |
1882 | return rc; | |
1883 | rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); | |
1884 | if (!rc) | |
1885 | nb_bb--; | |
1886 | if (rc) | |
1887 | return rc; | |
1888 | cond_resched(); | |
1889 | } | |
1890 | ||
1891 | return 0; | |
1892 | } | |
1893 | ||
1894 | /* | |
1895 | * Try to plug the requested amount of memory. | |
1896 | */ | |
1897 | static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) | |
1898 | { | |
1899 | if (vm->in_sbm) | |
1900 | return virtio_mem_sbm_plug_request(vm, diff); | |
1901 | return virtio_mem_bbm_plug_request(vm, diff); | |
1902 | } | |
1903 | ||
c627ff5d DH |
1904 | /* |
1905 | * Unplug the desired number of plugged subblocks of an offline memory block. | |
1906 | * Will fail if any subblock cannot get unplugged (instead of skipping it). | |
1907 | * | |
1908 | * Will modify the state of the memory block. Might temporarily drop the | |
1909 | * hotplug_mutex. | |
1910 | * | |
1911 | * Note: Can fail after some subblocks were successfully unplugged. | |
1912 | */ | |
602ef894 DH |
1913 | static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, |
1914 | unsigned long mb_id, | |
1915 | uint64_t *nb_sb) | |
c627ff5d DH |
1916 | { |
1917 | int rc; | |
1918 | ||
5304ca3d | 1919 | rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); |
c627ff5d DH |
1920 | |
1921 | /* some subblocks might have been unplugged even on failure */ | |
905c4c51 | 1922 | if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) |
99f0b55e DH |
1923 | virtio_mem_sbm_set_mb_state(vm, mb_id, |
1924 | VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); | |
c627ff5d DH |
1925 | if (rc) |
1926 | return rc; | |
1927 | ||
905c4c51 | 1928 | if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { |
c627ff5d DH |
1929 | /* |
1930 | * Remove the block from Linux - this should never fail. | |
1931 | * Hinder the block from getting onlined by marking it | |
1932 | * unplugged. Temporarily drop the mutex, so | |
1933 | * any pending GOING_ONLINE requests can be serviced/rejected. | |
1934 | */ | |
99f0b55e DH |
1935 | virtio_mem_sbm_set_mb_state(vm, mb_id, |
1936 | VIRTIO_MEM_SBM_MB_UNUSED); | |
c627ff5d DH |
1937 | |
1938 | mutex_unlock(&vm->hotplug_mutex); | |
01afdee2 | 1939 | rc = virtio_mem_sbm_remove_mb(vm, mb_id); |
c627ff5d DH |
1940 | BUG_ON(rc); |
1941 | mutex_lock(&vm->hotplug_mutex); | |
1942 | } | |
1943 | return 0; | |
1944 | } | |
1945 | ||
72f9525a DH |
1946 | /* |
1947 | * Unplug the given plugged subblocks of an online memory block. | |
1948 | * | |
1949 | * Will modify the state of the memory block. | |
1950 | */ | |
602ef894 DH |
1951 | static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, |
1952 | unsigned long mb_id, int sb_id, | |
1953 | int count) | |
72f9525a | 1954 | { |
905c4c51 | 1955 | const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; |
c740bb97 | 1956 | const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); |
72f9525a DH |
1957 | unsigned long start_pfn; |
1958 | int rc; | |
1959 | ||
1960 | start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + | |
905c4c51 | 1961 | sb_id * vm->sbm.sb_size); |
72f9525a | 1962 | |
f55484fd | 1963 | rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages); |
89c486c4 DH |
1964 | if (rc) |
1965 | return rc; | |
72f9525a DH |
1966 | |
1967 | /* Try to unplug the allocated memory */ | |
602ef894 | 1968 | rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); |
72f9525a DH |
1969 | if (rc) { |
1970 | /* Return the memory to the buddy. */ | |
1971 | virtio_mem_fake_online(start_pfn, nr_pages); | |
1972 | return rc; | |
1973 | } | |
1974 | ||
c740bb97 DH |
1975 | switch (old_state) { |
1976 | case VIRTIO_MEM_SBM_MB_KERNEL: | |
1977 | virtio_mem_sbm_set_mb_state(vm, mb_id, | |
1978 | VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); | |
1979 | break; | |
1980 | case VIRTIO_MEM_SBM_MB_MOVABLE: | |
1981 | virtio_mem_sbm_set_mb_state(vm, mb_id, | |
1982 | VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); | |
1983 | break; | |
1984 | } | |
1985 | ||
72f9525a DH |
1986 | return 0; |
1987 | } | |
1988 | ||
255f5985 DH |
1989 | /* |
1990 | * Unplug the desired number of plugged subblocks of an online memory block. | |
1991 | * Will skip subblock that are busy. | |
1992 | * | |
a5732387 DH |
1993 | * Will modify the state of the memory block. Might temporarily drop the |
1994 | * hotplug_mutex. | |
255f5985 DH |
1995 | * |
1996 | * Note: Can fail after some subblocks were successfully unplugged. Can | |
1997 | * return 0 even if subblocks were busy and could not get unplugged. | |
1998 | */ | |
602ef894 DH |
1999 | static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, |
2000 | unsigned long mb_id, | |
2001 | uint64_t *nb_sb) | |
255f5985 | 2002 | { |
255f5985 DH |
2003 | int rc, sb_id; |
2004 | ||
72f9525a | 2005 | /* If possible, try to unplug the complete block in one shot. */ |
905c4c51 DH |
2006 | if (*nb_sb >= vm->sbm.sbs_per_mb && |
2007 | virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { | |
602ef894 DH |
2008 | rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, |
2009 | vm->sbm.sbs_per_mb); | |
72f9525a | 2010 | if (!rc) { |
905c4c51 | 2011 | *nb_sb -= vm->sbm.sbs_per_mb; |
72f9525a DH |
2012 | goto unplugged; |
2013 | } else if (rc != -EBUSY) | |
2014 | return rc; | |
2015 | } | |
2016 | ||
2017 | /* Fallback to single subblocks. */ | |
905c4c51 | 2018 | for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { |
255f5985 | 2019 | /* Find the next candidate subblock */ |
562e08cd | 2020 | while (sb_id >= 0 && |
54c6a6ba | 2021 | !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) |
562e08cd DH |
2022 | sb_id--; |
2023 | if (sb_id < 0) | |
255f5985 DH |
2024 | break; |
2025 | ||
602ef894 | 2026 | rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); |
72f9525a | 2027 | if (rc == -EBUSY) |
255f5985 | 2028 | continue; |
72f9525a | 2029 | else if (rc) |
255f5985 | 2030 | return rc; |
255f5985 DH |
2031 | *nb_sb -= 1; |
2032 | } | |
2033 | ||
72f9525a | 2034 | unplugged: |
a31648fd DH |
2035 | rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id); |
2036 | if (rc) | |
2037 | vm->sbm.have_unplugged_mb = 1; | |
2038 | /* Ignore errors, this is not critical. We'll retry later. */ | |
255f5985 DH |
2039 | return 0; |
2040 | } | |
2041 | ||
5304ca3d DH |
2042 | /* |
2043 | * Unplug the desired number of plugged subblocks of a memory block that is | |
2044 | * already added to Linux. Will skip subblock of online memory blocks that are | |
2045 | * busy (by the OS). Will fail if any subblock that's not busy cannot get | |
2046 | * unplugged. | |
2047 | * | |
2048 | * Will modify the state of the memory block. Might temporarily drop the | |
2049 | * hotplug_mutex. | |
2050 | * | |
2051 | * Note: Can fail after some subblocks were successfully unplugged. Can | |
2052 | * return 0 even if subblocks were busy and could not get unplugged. | |
2053 | */ | |
2054 | static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, | |
2055 | unsigned long mb_id, | |
2056 | uint64_t *nb_sb) | |
2057 | { | |
2058 | const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); | |
2059 | ||
2060 | switch (old_state) { | |
c740bb97 DH |
2061 | case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: |
2062 | case VIRTIO_MEM_SBM_MB_KERNEL: | |
2063 | case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: | |
2064 | case VIRTIO_MEM_SBM_MB_MOVABLE: | |
5304ca3d DH |
2065 | return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); |
2066 | case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: | |
2067 | case VIRTIO_MEM_SBM_MB_OFFLINE: | |
2068 | return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); | |
2069 | } | |
2070 | return -EINVAL; | |
2071 | } | |
2072 | ||
4ba50cd3 | 2073 | static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) |
c627ff5d | 2074 | { |
5304ca3d DH |
2075 | const int mb_states[] = { |
2076 | VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, | |
2077 | VIRTIO_MEM_SBM_MB_OFFLINE, | |
c740bb97 DH |
2078 | VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, |
2079 | VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, | |
2080 | VIRTIO_MEM_SBM_MB_MOVABLE, | |
2081 | VIRTIO_MEM_SBM_MB_KERNEL, | |
5304ca3d | 2082 | }; |
905c4c51 | 2083 | uint64_t nb_sb = diff / vm->sbm.sb_size; |
c627ff5d | 2084 | unsigned long mb_id; |
5304ca3d | 2085 | int rc, i; |
c627ff5d DH |
2086 | |
2087 | if (!nb_sb) | |
2088 | return 0; | |
2089 | ||
2090 | /* | |
2091 | * We'll drop the mutex a couple of times when it is safe to do so. | |
2092 | * This might result in some blocks switching the state (online/offline) | |
2093 | * and we could miss them in this run - we will retry again later. | |
2094 | */ | |
2095 | mutex_lock(&vm->hotplug_mutex); | |
2096 | ||
5304ca3d DH |
2097 | /* |
2098 | * We try unplug from partially plugged blocks first, to try removing | |
c740bb97 DH |
2099 | * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE |
2100 | * as it's more reliable to unplug memory and remove whole memory | |
2101 | * blocks, and we don't want to trigger a zone imbalances by | |
2102 | * accidentially removing too much kernel memory. | |
5304ca3d DH |
2103 | */ |
2104 | for (i = 0; i < ARRAY_SIZE(mb_states); i++) { | |
2105 | virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { | |
2106 | rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); | |
2107 | if (rc || !nb_sb) | |
2108 | goto out_unlock; | |
2109 | mutex_unlock(&vm->hotplug_mutex); | |
2110 | cond_resched(); | |
2111 | mutex_lock(&vm->hotplug_mutex); | |
2112 | } | |
2113 | if (!unplug_online && i == 1) { | |
2114 | mutex_unlock(&vm->hotplug_mutex); | |
2115 | return 0; | |
2116 | } | |
255f5985 DH |
2117 | } |
2118 | ||
c627ff5d | 2119 | mutex_unlock(&vm->hotplug_mutex); |
255f5985 | 2120 | return nb_sb ? -EBUSY : 0; |
c627ff5d DH |
2121 | out_unlock: |
2122 | mutex_unlock(&vm->hotplug_mutex); | |
2123 | return rc; | |
2124 | } | |
2125 | ||
269ac938 DH |
2126 | /* |
2127 | * Try to offline and remove a big block from Linux and unplug it. Will fail | |
2128 | * with -EBUSY if some memory is busy and cannot get unplugged. | |
2129 | * | |
2130 | * Will modify the state of the memory block. Might temporarily drop the | |
2131 | * hotplug_mutex. | |
2132 | */ | |
2133 | static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, | |
2134 | unsigned long bb_id) | |
2135 | { | |
3711387a DH |
2136 | const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); |
2137 | const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); | |
2138 | unsigned long end_pfn = start_pfn + nr_pages; | |
2139 | unsigned long pfn; | |
2140 | struct page *page; | |
269ac938 DH |
2141 | int rc; |
2142 | ||
2143 | if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != | |
2144 | VIRTIO_MEM_BBM_BB_ADDED)) | |
2145 | return -EINVAL; | |
2146 | ||
f504e15b DH |
2147 | /* |
2148 | * Start by fake-offlining all memory. Once we marked the device | |
2149 | * block as fake-offline, all newly onlined memory will | |
2150 | * automatically be kept fake-offline. Protect from concurrent | |
2151 | * onlining/offlining until we have a consistent state. | |
2152 | */ | |
2153 | mutex_lock(&vm->hotplug_mutex); | |
2154 | virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); | |
3711387a | 2155 | |
f504e15b DH |
2156 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
2157 | page = pfn_to_online_page(pfn); | |
2158 | if (!page) | |
2159 | continue; | |
3711387a | 2160 | |
f55484fd | 2161 | rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION); |
f504e15b DH |
2162 | if (rc) { |
2163 | end_pfn = pfn; | |
2164 | goto rollback; | |
3711387a | 2165 | } |
3711387a | 2166 | } |
f504e15b | 2167 | mutex_unlock(&vm->hotplug_mutex); |
3711387a | 2168 | |
269ac938 | 2169 | rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); |
3711387a | 2170 | if (rc) { |
f504e15b DH |
2171 | mutex_lock(&vm->hotplug_mutex); |
2172 | goto rollback; | |
3711387a | 2173 | } |
269ac938 DH |
2174 | |
2175 | rc = virtio_mem_bbm_unplug_bb(vm, bb_id); | |
2176 | if (rc) | |
2177 | virtio_mem_bbm_set_bb_state(vm, bb_id, | |
2178 | VIRTIO_MEM_BBM_BB_PLUGGED); | |
2179 | else | |
2180 | virtio_mem_bbm_set_bb_state(vm, bb_id, | |
2181 | VIRTIO_MEM_BBM_BB_UNUSED); | |
2182 | return rc; | |
3711387a | 2183 | |
f504e15b | 2184 | rollback: |
3711387a DH |
2185 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
2186 | page = pfn_to_online_page(pfn); | |
2187 | if (!page) | |
2188 | continue; | |
2189 | virtio_mem_fake_online(pfn, PAGES_PER_SECTION); | |
2190 | } | |
2191 | virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); | |
2192 | mutex_unlock(&vm->hotplug_mutex); | |
2193 | return rc; | |
269ac938 DH |
2194 | } |
2195 | ||
2196 | /* | |
269ac938 | 2197 | * Test if a big block is completely offline. |
269ac938 | 2198 | */ |
269ac938 DH |
2199 | static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, |
2200 | unsigned long bb_id) | |
269ac938 | 2201 | { |
269ac938 DH |
2202 | const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); |
2203 | const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); | |
2204 | unsigned long pfn; | |
269ac938 | 2205 | |
269ac938 DH |
2206 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; |
2207 | pfn += PAGES_PER_SECTION) { | |
2208 | if (pfn_to_online_page(pfn)) | |
2209 | return false; | |
2210 | } | |
269ac938 | 2211 | |
269ac938 | 2212 | return true; |
269ac938 DH |
2213 | } |
2214 | ||
2215 | /* | |
db7b3377 | 2216 | * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). |
269ac938 | 2217 | */ |
db7b3377 | 2218 | static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, |
269ac938 DH |
2219 | unsigned long bb_id) |
2220 | { | |
2221 | const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); | |
2222 | const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); | |
db7b3377 | 2223 | struct page *page; |
269ac938 DH |
2224 | unsigned long pfn; |
2225 | ||
2226 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; | |
2227 | pfn += PAGES_PER_SECTION) { | |
db7b3377 DH |
2228 | page = pfn_to_online_page(pfn); |
2229 | if (!page) | |
2230 | continue; | |
2231 | if (page_zonenum(page) != ZONE_MOVABLE) | |
269ac938 DH |
2232 | return false; |
2233 | } | |
2234 | ||
2235 | return true; | |
2236 | } | |
2237 | ||
2238 | static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) | |
2239 | { | |
2240 | uint64_t nb_bb = diff / vm->bbm.bb_size; | |
2241 | uint64_t bb_id; | |
c6bc1422 | 2242 | int rc, i; |
269ac938 DH |
2243 | |
2244 | if (!nb_bb) | |
2245 | return 0; | |
2246 | ||
c6bc1422 DH |
2247 | /* |
2248 | * Try to unplug big blocks. Similar to SBM, start with offline | |
2249 | * big blocks. | |
2250 | */ | |
db7b3377 | 2251 | for (i = 0; i < 3; i++) { |
c6bc1422 DH |
2252 | virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { |
2253 | cond_resched(); | |
269ac938 | 2254 | |
c6bc1422 DH |
2255 | /* |
2256 | * As we're holding no locks, these checks are racy, | |
2257 | * but we don't care. | |
2258 | */ | |
2259 | if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) | |
2260 | continue; | |
db7b3377 DH |
2261 | if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) |
2262 | continue; | |
c6bc1422 DH |
2263 | rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); |
2264 | if (rc == -EBUSY) | |
2265 | continue; | |
2266 | if (!rc) | |
2267 | nb_bb--; | |
2268 | if (rc || !nb_bb) | |
2269 | return rc; | |
2270 | } | |
2271 | if (i == 0 && !unplug_online) | |
2272 | return 0; | |
269ac938 DH |
2273 | } |
2274 | ||
2275 | return nb_bb ? -EBUSY : 0; | |
2276 | } | |
2277 | ||
4ba50cd3 DH |
2278 | /* |
2279 | * Try to unplug the requested amount of memory. | |
2280 | */ | |
2281 | static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) | |
2282 | { | |
2283 | if (vm->in_sbm) | |
2284 | return virtio_mem_sbm_unplug_request(vm, diff); | |
269ac938 | 2285 | return virtio_mem_bbm_unplug_request(vm, diff); |
4ba50cd3 DH |
2286 | } |
2287 | ||
5f1f79bb DH |
2288 | /* |
2289 | * Try to unplug all blocks that couldn't be unplugged before, for example, | |
a31648fd DH |
2290 | * because the hypervisor was busy. Further, offline and remove any memory |
2291 | * blocks where we previously failed. | |
5f1f79bb | 2292 | */ |
a31648fd | 2293 | static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) |
5f1f79bb | 2294 | { |
4ba50cd3 | 2295 | unsigned long id; |
a31648fd | 2296 | int rc = 0; |
5f1f79bb | 2297 | |
4ba50cd3 DH |
2298 | if (!vm->in_sbm) { |
2299 | virtio_mem_bbm_for_each_bb(vm, id, | |
2300 | VIRTIO_MEM_BBM_BB_PLUGGED) { | |
2301 | rc = virtio_mem_bbm_unplug_bb(vm, id); | |
2302 | if (rc) | |
2303 | return rc; | |
2304 | virtio_mem_bbm_set_bb_state(vm, id, | |
2305 | VIRTIO_MEM_BBM_BB_UNUSED); | |
2306 | } | |
2307 | return 0; | |
2308 | } | |
2309 | ||
2310 | virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { | |
2311 | rc = virtio_mem_sbm_unplug_mb(vm, id); | |
5f1f79bb DH |
2312 | if (rc) |
2313 | return rc; | |
4ba50cd3 | 2314 | virtio_mem_sbm_set_mb_state(vm, id, |
99f0b55e | 2315 | VIRTIO_MEM_SBM_MB_UNUSED); |
5f1f79bb DH |
2316 | } |
2317 | ||
a31648fd DH |
2318 | if (!vm->sbm.have_unplugged_mb) |
2319 | return 0; | |
2320 | ||
2321 | /* | |
2322 | * Let's retry (offlining and) removing completely unplugged Linux | |
2323 | * memory blocks. | |
2324 | */ | |
2325 | vm->sbm.have_unplugged_mb = false; | |
2326 | ||
2327 | mutex_lock(&vm->hotplug_mutex); | |
2328 | virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL) | |
2329 | rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); | |
2330 | virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL) | |
2331 | rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); | |
2332 | virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) | |
2333 | rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); | |
2334 | mutex_unlock(&vm->hotplug_mutex); | |
2335 | ||
2336 | if (rc) | |
2337 | vm->sbm.have_unplugged_mb = true; | |
2338 | /* Ignore errors, this is not critical. We'll retry later. */ | |
5f1f79bb DH |
2339 | return 0; |
2340 | } | |
2341 | ||
2342 | /* | |
2343 | * Update all parts of the config that could have changed. | |
2344 | */ | |
2345 | static void virtio_mem_refresh_config(struct virtio_mem *vm) | |
2346 | { | |
94c89453 | 2347 | const struct range pluggable_range = mhp_get_pluggable_range(true); |
5f1f79bb DH |
2348 | uint64_t new_plugged_size, usable_region_size, end_addr; |
2349 | ||
2350 | /* the plugged_size is just a reflection of what _we_ did previously */ | |
99e0d048 MT |
2351 | virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, |
2352 | &new_plugged_size); | |
5f1f79bb DH |
2353 | if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) |
2354 | vm->plugged_size = new_plugged_size; | |
2355 | ||
2356 | /* calculate the last usable memory block id */ | |
99e0d048 MT |
2357 | virtio_cread_le(vm->vdev, struct virtio_mem_config, |
2358 | usable_region_size, &usable_region_size); | |
94c89453 DH |
2359 | end_addr = min(vm->addr + usable_region_size - 1, |
2360 | pluggable_range.end); | |
4ba50cd3 | 2361 | |
94c89453 DH |
2362 | if (vm->in_sbm) { |
2363 | vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); | |
2364 | if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) | |
2365 | vm->sbm.last_usable_mb_id--; | |
2366 | } else { | |
2367 | vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, | |
2368 | end_addr); | |
2369 | if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) | |
2370 | vm->bbm.last_usable_bb_id--; | |
2371 | } | |
2372 | /* | |
2373 | * If we cannot plug any of our device memory (e.g., nothing in the | |
2374 | * usable region is addressable), the last usable memory block id will | |
2375 | * be smaller than the first usable memory block id. We'll stop | |
2376 | * attempting to add memory with -ENOSPC from our main loop. | |
2377 | */ | |
5f1f79bb DH |
2378 | |
2379 | /* see if there is a request to change the size */ | |
99e0d048 MT |
2380 | virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, |
2381 | &vm->requested_size); | |
5f1f79bb DH |
2382 | |
2383 | dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); | |
2384 | dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); | |
2385 | } | |
2386 | ||
2387 | /* | |
2388 | * Workqueue function for handling plug/unplug requests and config updates. | |
2389 | */ | |
2390 | static void virtio_mem_run_wq(struct work_struct *work) | |
2391 | { | |
2392 | struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); | |
2393 | uint64_t diff; | |
2394 | int rc; | |
2395 | ||
ce281462 DH |
2396 | if (unlikely(vm->in_kdump)) { |
2397 | dev_warn_once(&vm->vdev->dev, | |
2398 | "unexpected workqueue run in kdump kernel\n"); | |
2399 | return; | |
2400 | } | |
2401 | ||
5f1f79bb DH |
2402 | hrtimer_cancel(&vm->retry_timer); |
2403 | ||
2404 | if (vm->broken) | |
2405 | return; | |
2406 | ||
98ff9f94 | 2407 | atomic_set(&vm->wq_active, 1); |
5f1f79bb DH |
2408 | retry: |
2409 | rc = 0; | |
2410 | ||
2411 | /* Make sure we start with a clean state if there are leftovers. */ | |
2412 | if (unlikely(vm->unplug_all_required)) | |
2413 | rc = virtio_mem_send_unplug_all_request(vm); | |
2414 | ||
2415 | if (atomic_read(&vm->config_changed)) { | |
2416 | atomic_set(&vm->config_changed, 0); | |
2417 | virtio_mem_refresh_config(vm); | |
2418 | } | |
2419 | ||
a31648fd | 2420 | /* Cleanup any leftovers from previous runs */ |
5f1f79bb | 2421 | if (!rc) |
a31648fd | 2422 | rc = virtio_mem_cleanup_pending_mb(vm); |
5f1f79bb DH |
2423 | |
2424 | if (!rc && vm->requested_size != vm->plugged_size) { | |
2425 | if (vm->requested_size > vm->plugged_size) { | |
2426 | diff = vm->requested_size - vm->plugged_size; | |
2427 | rc = virtio_mem_plug_request(vm, diff); | |
c627ff5d DH |
2428 | } else { |
2429 | diff = vm->plugged_size - vm->requested_size; | |
2430 | rc = virtio_mem_unplug_request(vm, diff); | |
5f1f79bb | 2431 | } |
5f1f79bb DH |
2432 | } |
2433 | ||
a31648fd DH |
2434 | /* |
2435 | * Keep retrying to offline and remove completely unplugged Linux | |
2436 | * memory blocks. | |
2437 | */ | |
2438 | if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb) | |
2439 | rc = -EBUSY; | |
2440 | ||
5f1f79bb DH |
2441 | switch (rc) { |
2442 | case 0: | |
23e77b5d | 2443 | vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; |
5f1f79bb DH |
2444 | break; |
2445 | case -ENOSPC: | |
2446 | /* | |
2447 | * We cannot add any more memory (alignment, physical limit) | |
2448 | * or we have too many offline memory blocks. | |
2449 | */ | |
2450 | break; | |
8d4edcfe | 2451 | case -ETXTBSY: |
5f1f79bb DH |
2452 | /* |
2453 | * The hypervisor cannot process our request right now | |
8d4edcfe DH |
2454 | * (e.g., out of memory, migrating); |
2455 | */ | |
2456 | case -EBUSY: | |
2457 | /* | |
2458 | * We cannot free up any memory to unplug it (all plugged memory | |
2459 | * is busy). | |
5f1f79bb DH |
2460 | */ |
2461 | case -ENOMEM: | |
2462 | /* Out of memory, try again later. */ | |
23e77b5d | 2463 | hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms), |
5f1f79bb DH |
2464 | HRTIMER_MODE_REL); |
2465 | break; | |
2466 | case -EAGAIN: | |
2467 | /* Retry immediately (e.g., the config changed). */ | |
2468 | goto retry; | |
2469 | default: | |
2470 | /* Unknown error, mark as broken */ | |
2471 | dev_err(&vm->vdev->dev, | |
2472 | "unknown error, marking device broken: %d\n", rc); | |
2473 | vm->broken = true; | |
2474 | } | |
98ff9f94 DH |
2475 | |
2476 | atomic_set(&vm->wq_active, 0); | |
5f1f79bb DH |
2477 | } |
2478 | ||
2479 | static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) | |
2480 | { | |
2481 | struct virtio_mem *vm = container_of(timer, struct virtio_mem, | |
2482 | retry_timer); | |
2483 | ||
2484 | virtio_mem_retry(vm); | |
23e77b5d DH |
2485 | vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2, |
2486 | VIRTIO_MEM_RETRY_TIMER_MAX_MS); | |
5f1f79bb DH |
2487 | return HRTIMER_NORESTART; |
2488 | } | |
2489 | ||
2490 | static void virtio_mem_handle_response(struct virtqueue *vq) | |
2491 | { | |
2492 | struct virtio_mem *vm = vq->vdev->priv; | |
2493 | ||
2494 | wake_up(&vm->host_resp); | |
2495 | } | |
2496 | ||
2497 | static int virtio_mem_init_vq(struct virtio_mem *vm) | |
2498 | { | |
2499 | struct virtqueue *vq; | |
2500 | ||
2501 | vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, | |
2502 | "guest-request"); | |
2503 | if (IS_ERR(vq)) | |
2504 | return PTR_ERR(vq); | |
2505 | vm->vq = vq; | |
2506 | ||
2507 | return 0; | |
2508 | } | |
2509 | ||
94300fcf | 2510 | static int virtio_mem_init_hotplug(struct virtio_mem *vm) |
5f1f79bb | 2511 | { |
94c89453 | 2512 | const struct range pluggable_range = mhp_get_pluggable_range(true); |
84e17e68 DH |
2513 | uint64_t unit_pages, sb_size, addr; |
2514 | int rc; | |
6725f211 | 2515 | |
5f1f79bb DH |
2516 | /* bad device setup - warn only */ |
2517 | if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) | |
2518 | dev_warn(&vm->vdev->dev, | |
2519 | "The alignment of the physical start address can make some memory unusable.\n"); | |
2520 | if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) | |
2521 | dev_warn(&vm->vdev->dev, | |
2522 | "The alignment of the physical end address can make some memory unusable.\n"); | |
94c89453 DH |
2523 | if (vm->addr < pluggable_range.start || |
2524 | vm->addr + vm->region_size - 1 > pluggable_range.end) | |
5f1f79bb | 2525 | dev_warn(&vm->vdev->dev, |
94c89453 | 2526 | "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); |
5f1f79bb | 2527 | |
500817bf DH |
2528 | /* Prepare the offline threshold - make sure we can add two blocks. */ |
2529 | vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), | |
2530 | VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); | |
2531 | ||
5f1f79bb | 2532 | /* |
448b8ec3 ZY |
2533 | * alloc_contig_range() works reliably with pageblock |
2534 | * granularity on ZONE_NORMAL, use pageblock_nr_pages. | |
5f1f79bb | 2535 | */ |
448b8ec3 | 2536 | sb_size = PAGE_SIZE * pageblock_nr_pages; |
4ba50cd3 DH |
2537 | sb_size = max_t(uint64_t, vm->device_block_size, sb_size); |
2538 | ||
faa45ff4 | 2539 | if (sb_size < memory_block_size_bytes() && !force_bbm) { |
4ba50cd3 DH |
2540 | /* SBM: At least two subblocks per Linux memory block. */ |
2541 | vm->in_sbm = true; | |
2542 | vm->sbm.sb_size = sb_size; | |
2543 | vm->sbm.sbs_per_mb = memory_block_size_bytes() / | |
2544 | vm->sbm.sb_size; | |
2545 | ||
2546 | /* Round up to the next full memory block */ | |
94c89453 DH |
2547 | addr = max_t(uint64_t, vm->addr, pluggable_range.start) + |
2548 | memory_block_size_bytes() - 1; | |
4ba50cd3 DH |
2549 | vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); |
2550 | vm->sbm.next_mb_id = vm->sbm.first_mb_id; | |
2551 | } else { | |
2552 | /* BBM: At least one Linux memory block. */ | |
faa45ff4 DH |
2553 | vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, |
2554 | memory_block_size_bytes()); | |
2555 | ||
2556 | if (bbm_block_size) { | |
2557 | if (!is_power_of_2(bbm_block_size)) { | |
2558 | dev_warn(&vm->vdev->dev, | |
2559 | "bbm_block_size is not a power of 2"); | |
2560 | } else if (bbm_block_size < vm->bbm.bb_size) { | |
2561 | dev_warn(&vm->vdev->dev, | |
2562 | "bbm_block_size is too small"); | |
2563 | } else { | |
2564 | vm->bbm.bb_size = bbm_block_size; | |
2565 | } | |
2566 | } | |
5f1f79bb | 2567 | |
faa45ff4 | 2568 | /* Round up to the next aligned big block */ |
94c89453 DH |
2569 | addr = max_t(uint64_t, vm->addr, pluggable_range.start) + |
2570 | vm->bbm.bb_size - 1; | |
faa45ff4 | 2571 | vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); |
4ba50cd3 | 2572 | vm->bbm.next_bb_id = vm->bbm.first_bb_id; |
5f1f79bb | 2573 | |
500817bf DH |
2574 | /* Make sure we can add two big blocks. */ |
2575 | vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, | |
2576 | vm->offline_threshold); | |
2577 | } | |
98ff9f94 | 2578 | |
5f1f79bb DH |
2579 | dev_info(&vm->vdev->dev, "memory block size: 0x%lx", |
2580 | memory_block_size_bytes()); | |
4ba50cd3 DH |
2581 | if (vm->in_sbm) |
2582 | dev_info(&vm->vdev->dev, "subblock size: 0x%llx", | |
2583 | (unsigned long long)vm->sbm.sb_size); | |
2584 | else | |
2585 | dev_info(&vm->vdev->dev, "big block size: 0x%llx", | |
2586 | (unsigned long long)vm->bbm.bb_size); | |
94300fcf | 2587 | |
84e17e68 DH |
2588 | /* create the parent resource for all memory */ |
2589 | rc = virtio_mem_create_resource(vm); | |
2590 | if (rc) | |
2591 | return rc; | |
2592 | ||
2593 | /* use a single dynamic memory group to cover the whole memory device */ | |
2594 | if (vm->in_sbm) | |
2595 | unit_pages = PHYS_PFN(memory_block_size_bytes()); | |
2596 | else | |
2597 | unit_pages = PHYS_PFN(vm->bbm.bb_size); | |
2598 | rc = memory_group_register_dynamic(vm->nid, unit_pages); | |
2599 | if (rc < 0) | |
2600 | goto out_del_resource; | |
2601 | vm->mgid = rc; | |
2602 | ||
2603 | /* | |
2604 | * If we still have memory plugged, we have to unplug all memory first. | |
2605 | * Registering our parent resource makes sure that this memory isn't | |
2606 | * actually in use (e.g., trying to reload the driver). | |
2607 | */ | |
2608 | if (vm->plugged_size) { | |
2609 | vm->unplug_all_required = true; | |
2610 | dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); | |
2611 | } | |
2612 | ||
2613 | /* register callbacks */ | |
2614 | vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; | |
2615 | rc = register_memory_notifier(&vm->memory_notifier); | |
2616 | if (rc) | |
2617 | goto out_unreg_group; | |
2618 | rc = register_virtio_mem_device(vm); | |
2619 | if (rc) | |
2620 | goto out_unreg_mem; | |
2621 | ||
94300fcf | 2622 | return 0; |
84e17e68 DH |
2623 | out_unreg_mem: |
2624 | unregister_memory_notifier(&vm->memory_notifier); | |
2625 | out_unreg_group: | |
2626 | memory_group_unregister(vm->mgid); | |
2627 | out_del_resource: | |
2628 | virtio_mem_delete_resource(vm); | |
2629 | return rc; | |
94300fcf DH |
2630 | } |
2631 | ||
ce281462 DH |
2632 | #ifdef CONFIG_PROC_VMCORE |
2633 | static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr, | |
2634 | uint64_t size) | |
2635 | { | |
2636 | const uint64_t nb_vm_blocks = size / vm->device_block_size; | |
2637 | const struct virtio_mem_req req = { | |
2638 | .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE), | |
2639 | .u.state.addr = cpu_to_virtio64(vm->vdev, addr), | |
2640 | .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), | |
2641 | }; | |
2642 | int rc = -ENOMEM; | |
2643 | ||
2644 | dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr, | |
2645 | addr + size - 1); | |
2646 | ||
2647 | switch (virtio_mem_send_request(vm, &req)) { | |
2648 | case VIRTIO_MEM_RESP_ACK: | |
2649 | return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state); | |
2650 | case VIRTIO_MEM_RESP_ERROR: | |
2651 | rc = -EINVAL; | |
2652 | break; | |
2653 | default: | |
2654 | break; | |
2655 | } | |
2656 | ||
2657 | dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc); | |
2658 | return rc; | |
2659 | } | |
2660 | ||
2661 | static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, | |
2662 | unsigned long pfn) | |
2663 | { | |
2664 | struct virtio_mem *vm = container_of(cb, struct virtio_mem, | |
2665 | vmcore_cb); | |
2666 | uint64_t addr = PFN_PHYS(pfn); | |
2667 | bool is_ram; | |
2668 | int rc; | |
2669 | ||
2670 | if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE)) | |
2671 | return true; | |
2672 | if (!vm->plugged_size) | |
2673 | return false; | |
2674 | ||
2675 | /* | |
2676 | * We have to serialize device requests and access to the information | |
2677 | * about the block queried last. | |
2678 | */ | |
2679 | mutex_lock(&vm->hotplug_mutex); | |
2680 | ||
2681 | addr = ALIGN_DOWN(addr, vm->device_block_size); | |
2682 | if (addr != vm->last_block_addr) { | |
2683 | rc = virtio_mem_send_state_request(vm, addr, | |
2684 | vm->device_block_size); | |
2685 | /* On any kind of error, we're going to signal !ram. */ | |
2686 | if (rc == VIRTIO_MEM_STATE_PLUGGED) | |
2687 | vm->last_block_plugged = true; | |
2688 | else | |
2689 | vm->last_block_plugged = false; | |
2690 | vm->last_block_addr = addr; | |
2691 | } | |
2692 | ||
2693 | is_ram = vm->last_block_plugged; | |
2694 | mutex_unlock(&vm->hotplug_mutex); | |
2695 | return is_ram; | |
2696 | } | |
2697 | #endif /* CONFIG_PROC_VMCORE */ | |
2698 | ||
2699 | static int virtio_mem_init_kdump(struct virtio_mem *vm) | |
2700 | { | |
2701 | #ifdef CONFIG_PROC_VMCORE | |
2702 | dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); | |
2703 | vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; | |
2704 | register_vmcore_cb(&vm->vmcore_cb); | |
2705 | return 0; | |
2706 | #else /* CONFIG_PROC_VMCORE */ | |
2707 | dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n"); | |
2708 | return -EBUSY; | |
2709 | #endif /* CONFIG_PROC_VMCORE */ | |
2710 | } | |
2711 | ||
94300fcf DH |
2712 | static int virtio_mem_init(struct virtio_mem *vm) |
2713 | { | |
2714 | uint16_t node_id; | |
2715 | ||
2716 | if (!vm->vdev->config->get) { | |
2717 | dev_err(&vm->vdev->dev, "config access disabled\n"); | |
2718 | return -EINVAL; | |
2719 | } | |
2720 | ||
94300fcf DH |
2721 | /* Fetch all properties that can't change. */ |
2722 | virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, | |
2723 | &vm->plugged_size); | |
2724 | virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size, | |
2725 | &vm->device_block_size); | |
2726 | virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id, | |
2727 | &node_id); | |
2728 | vm->nid = virtio_mem_translate_node_id(vm, node_id); | |
2729 | virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); | |
2730 | virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, | |
2731 | &vm->region_size); | |
2732 | ||
2733 | /* Determine the nid for the device based on the lowest address. */ | |
2734 | if (vm->nid == NUMA_NO_NODE) | |
2735 | vm->nid = memory_add_physaddr_to_nid(vm->addr); | |
2736 | ||
2737 | dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); | |
2738 | dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); | |
2739 | dev_info(&vm->vdev->dev, "device block size: 0x%llx", | |
2740 | (unsigned long long)vm->device_block_size); | |
6725f211 | 2741 | if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) |
f2af6d39 | 2742 | dev_info(&vm->vdev->dev, "nid: %d", vm->nid); |
5f1f79bb | 2743 | |
ce281462 DH |
2744 | /* |
2745 | * We don't want to (un)plug or reuse any memory when in kdump. The | |
2746 | * memory is still accessible (but not exposed to Linux). | |
2747 | */ | |
2748 | if (vm->in_kdump) | |
2749 | return virtio_mem_init_kdump(vm); | |
94300fcf | 2750 | return virtio_mem_init_hotplug(vm); |
5f1f79bb DH |
2751 | } |
2752 | ||
ebf71552 DH |
2753 | static int virtio_mem_create_resource(struct virtio_mem *vm) |
2754 | { | |
2755 | /* | |
2756 | * When force-unloading the driver and removing the device, we | |
2757 | * could have a garbage pointer. Duplicate the string. | |
2758 | */ | |
2759 | const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL); | |
2760 | ||
2761 | if (!name) | |
2762 | return -ENOMEM; | |
2763 | ||
2128f4e2 | 2764 | /* Disallow mapping device memory via /dev/mem completely. */ |
ebf71552 | 2765 | vm->parent_resource = __request_mem_region(vm->addr, vm->region_size, |
2128f4e2 DH |
2766 | name, IORESOURCE_SYSTEM_RAM | |
2767 | IORESOURCE_EXCLUSIVE); | |
ebf71552 DH |
2768 | if (!vm->parent_resource) { |
2769 | kfree(name); | |
2770 | dev_warn(&vm->vdev->dev, "could not reserve device region\n"); | |
3c42e198 DH |
2771 | dev_info(&vm->vdev->dev, |
2772 | "reloading the driver is not supported\n"); | |
ebf71552 DH |
2773 | return -EBUSY; |
2774 | } | |
2775 | ||
2776 | /* The memory is not actually busy - make add_memory() work. */ | |
2777 | vm->parent_resource->flags &= ~IORESOURCE_BUSY; | |
2778 | return 0; | |
2779 | } | |
2780 | ||
2781 | static void virtio_mem_delete_resource(struct virtio_mem *vm) | |
2782 | { | |
2783 | const char *name; | |
2784 | ||
2785 | if (!vm->parent_resource) | |
2786 | return; | |
2787 | ||
2788 | name = vm->parent_resource->name; | |
2789 | release_resource(vm->parent_resource); | |
2790 | kfree(vm->parent_resource); | |
2791 | kfree(name); | |
2792 | vm->parent_resource = NULL; | |
2793 | } | |
2794 | ||
989ff825 DH |
/*
 * Resource walk callback used by virtio_mem_has_memory_added(): being called
 * at all is the answer, so unconditionally return 1.
 */
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}
2799 | ||
2800 | static bool virtio_mem_has_memory_added(struct virtio_mem *vm) | |
2801 | { | |
2802 | const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; | |
2803 | ||
2804 | return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, | |
2805 | vm->addr + vm->region_size, NULL, | |
2806 | virtio_mem_range_has_system_ram) == 1; | |
2807 | } | |
2808 | ||
5f1f79bb DH |
2809 | static int virtio_mem_probe(struct virtio_device *vdev) |
2810 | { | |
2811 | struct virtio_mem *vm; | |
b3fb6de7 | 2812 | int rc; |
5f1f79bb | 2813 | |
fce8afd7 DH |
2814 | BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); |
2815 | BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); | |
2816 | ||
5f1f79bb DH |
2817 | vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); |
2818 | if (!vm) | |
2819 | return -ENOMEM; | |
2820 | ||
2821 | init_waitqueue_head(&vm->host_resp); | |
2822 | vm->vdev = vdev; | |
2823 | INIT_WORK(&vm->wq, virtio_mem_run_wq); | |
2824 | mutex_init(&vm->hotplug_mutex); | |
2825 | INIT_LIST_HEAD(&vm->next); | |
2826 | spin_lock_init(&vm->removal_lock); | |
2827 | hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
2828 | vm->retry_timer.function = virtio_mem_timer_expired; | |
23e77b5d | 2829 | vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; |
ce281462 | 2830 | vm->in_kdump = is_kdump_kernel(); |
5f1f79bb DH |
2831 | |
2832 | /* register the virtqueue */ | |
2833 | rc = virtio_mem_init_vq(vm); | |
2834 | if (rc) | |
2835 | goto out_free_vm; | |
2836 | ||
2837 | /* initialize the device by querying the config */ | |
2838 | rc = virtio_mem_init(vm); | |
2839 | if (rc) | |
2840 | goto out_del_vq; | |
2841 | ||
5f1f79bb DH |
2842 | virtio_device_ready(vdev); |
2843 | ||
2844 | /* trigger a config update to start processing the requested_size */ | |
ce281462 DH |
2845 | if (!vm->in_kdump) { |
2846 | atomic_set(&vm->config_changed, 1); | |
2847 | queue_work(system_freezable_wq, &vm->wq); | |
2848 | } | |
5f1f79bb DH |
2849 | |
2850 | return 0; | |
5f1f79bb DH |
2851 | out_del_vq: |
2852 | vdev->config->del_vqs(vdev); | |
2853 | out_free_vm: | |
2854 | kfree(vm); | |
2855 | vdev->priv = NULL; | |
2856 | ||
2857 | return rc; | |
2858 | } | |
2859 | ||
ffc763d0 | 2860 | static void virtio_mem_deinit_hotplug(struct virtio_mem *vm) |
5f1f79bb | 2861 | { |
5f1f79bb DH |
2862 | unsigned long mb_id; |
2863 | int rc; | |
2864 | ||
2865 | /* | |
2866 | * Make sure the workqueue won't be triggered anymore and no memory | |
2867 | * blocks can be onlined/offlined until we're finished here. | |
2868 | */ | |
2869 | mutex_lock(&vm->hotplug_mutex); | |
2870 | spin_lock_irq(&vm->removal_lock); | |
2871 | vm->removing = true; | |
2872 | spin_unlock_irq(&vm->removal_lock); | |
2873 | mutex_unlock(&vm->hotplug_mutex); | |
2874 | ||
2875 | /* wait until the workqueue stopped */ | |
2876 | cancel_work_sync(&vm->wq); | |
2877 | hrtimer_cancel(&vm->retry_timer); | |
2878 | ||
4ba50cd3 DH |
2879 | if (vm->in_sbm) { |
2880 | /* | |
2881 | * After we unregistered our callbacks, user space can online | |
2882 | * partially plugged offline blocks. Make sure to remove them. | |
2883 | */ | |
2884 | virtio_mem_sbm_for_each_mb(vm, mb_id, | |
2885 | VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { | |
2886 | rc = virtio_mem_sbm_remove_mb(vm, mb_id); | |
2887 | BUG_ON(rc); | |
2888 | virtio_mem_sbm_set_mb_state(vm, mb_id, | |
2889 | VIRTIO_MEM_SBM_MB_UNUSED); | |
2890 | } | |
2891 | /* | |
2892 | * After we unregistered our callbacks, user space can no longer | |
2893 | * offline partially plugged online memory blocks. No need to | |
2894 | * worry about them. | |
2895 | */ | |
5f1f79bb DH |
2896 | } |
2897 | ||
2898 | /* unregister callbacks */ | |
2899 | unregister_virtio_mem_device(vm); | |
2900 | unregister_memory_notifier(&vm->memory_notifier); | |
2901 | ||
2902 | /* | |
2903 | * There is no way we could reliably remove all memory we have added to | |
2904 | * the system. And there is no way to stop the driver/device from going | |
2905 | * away. Warn at least. | |
2906 | */ | |
989ff825 | 2907 | if (virtio_mem_has_memory_added(vm)) { |
ffc763d0 DH |
2908 | dev_warn(&vm->vdev->dev, |
2909 | "device still has system memory added\n"); | |
b3562c60 | 2910 | } else { |
ebf71552 | 2911 | virtio_mem_delete_resource(vm); |
b3562c60 | 2912 | kfree_const(vm->resource_name); |
ffaa6ce8 | 2913 | memory_group_unregister(vm->mgid); |
b3562c60 | 2914 | } |
5f1f79bb DH |
2915 | |
2916 | /* remove all tracking data - no locking needed */ | |
4ba50cd3 DH |
2917 | if (vm->in_sbm) { |
2918 | vfree(vm->sbm.mb_states); | |
2919 | vfree(vm->sbm.sb_states); | |
2920 | } else { | |
2921 | vfree(vm->bbm.bb_states); | |
2922 | } | |
ffc763d0 DH |
2923 | } |
2924 | ||
ce281462 DH |
/*
 * Counterpart of virtio_mem_init_kdump(): drop the vmcore callback again.
 * Without CONFIG_PROC_VMCORE there is nothing to undo.
 */
static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	unregister_vmcore_cb(&vm->vmcore_cb);
#endif /* CONFIG_PROC_VMCORE */
}
2931 | ||
ffc763d0 DH |
2932 | static void virtio_mem_remove(struct virtio_device *vdev) |
2933 | { | |
2934 | struct virtio_mem *vm = vdev->priv; | |
2935 | ||
ce281462 DH |
2936 | if (vm->in_kdump) |
2937 | virtio_mem_deinit_kdump(vm); | |
2938 | else | |
2939 | virtio_mem_deinit_hotplug(vm); | |
5f1f79bb DH |
2940 | |
2941 | /* reset the device and cleanup the queues */ | |
d9679d00 | 2942 | virtio_reset_device(vdev); |
5f1f79bb DH |
2943 | vdev->config->del_vqs(vdev); |
2944 | ||
2945 | kfree(vm); | |
2946 | vdev->priv = NULL; | |
2947 | } | |
2948 | ||
2949 | static void virtio_mem_config_changed(struct virtio_device *vdev) | |
2950 | { | |
2951 | struct virtio_mem *vm = vdev->priv; | |
2952 | ||
ce281462 DH |
2953 | if (unlikely(vm->in_kdump)) |
2954 | return; | |
2955 | ||
5f1f79bb DH |
2956 | atomic_set(&vm->config_changed, 1); |
2957 | virtio_mem_retry(vm); | |
2958 | } | |
2959 | ||
2960 | #ifdef CONFIG_PM_SLEEP | |
2961 | static int virtio_mem_freeze(struct virtio_device *vdev) | |
2962 | { | |
2963 | /* | |
2964 | * When restarting the VM, all memory is usually unplugged. Don't | |
2965 | * allow to suspend/hibernate. | |
2966 | */ | |
2967 | dev_err(&vdev->dev, "save/restore not supported.\n"); | |
2968 | return -EPERM; | |
2969 | } | |
2970 | ||
2971 | static int virtio_mem_restore(struct virtio_device *vdev) | |
2972 | { | |
2973 | return -EPERM; | |
2974 | } | |
2975 | #endif | |
2976 | ||
f2af6d39 DH |
2977 | static unsigned int virtio_mem_features[] = { |
2978 | #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) | |
2979 | VIRTIO_MEM_F_ACPI_PXM, | |
2980 | #endif | |
61082ad6 | 2981 | VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, |
f2af6d39 DH |
2982 | }; |
2983 | ||
7ab4de60 | 2984 | static const struct virtio_device_id virtio_mem_id_table[] = { |
5f1f79bb DH |
2985 | { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, |
2986 | { 0 }, | |
2987 | }; | |
2988 | ||
2989 | static struct virtio_driver virtio_mem_driver = { | |
f2af6d39 DH |
2990 | .feature_table = virtio_mem_features, |
2991 | .feature_table_size = ARRAY_SIZE(virtio_mem_features), | |
5f1f79bb DH |
2992 | .driver.name = KBUILD_MODNAME, |
2993 | .driver.owner = THIS_MODULE, | |
2994 | .id_table = virtio_mem_id_table, | |
2995 | .probe = virtio_mem_probe, | |
2996 | .remove = virtio_mem_remove, | |
2997 | .config_changed = virtio_mem_config_changed, | |
2998 | #ifdef CONFIG_PM_SLEEP | |
2999 | .freeze = virtio_mem_freeze, | |
3000 | .restore = virtio_mem_restore, | |
3001 | #endif | |
3002 | }; | |
3003 | ||
3004 | module_virtio_driver(virtio_mem_driver); | |
3005 | MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); | |
3006 | MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); | |
3007 | MODULE_DESCRIPTION("Virtio-mem driver"); | |
3008 | MODULE_LICENSE("GPL"); |