// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list and made available to the page allocator. SECS pages
 * that still precede their children in the input list are left intact.
 *
 * Return 0 when sanitization was successful or the kthread was stopped, and
 * the number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	unsigned long left_dirty = 0;
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return 0;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized. Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
			left_dirty++;
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
	return left_dirty;
}

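/*
 * Test and clear the accessed bit in every mm that maps the page. Return
 * true when no mm has touched the page since the last scan, i.e. the page
 * is "old" and a valid candidate for reclaim.
 */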
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}

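/*
 * Zap all PTEs that map the page and mark it as blocked with EBLOCK, so that
 * no new TLB entries for it can be created before it is written back.
 */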
static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	int ret;

	sgx_zap_enclave_ptes(encl, addr);

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

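/*
 * Write the page's contents and its PCMD metadata out to the backing storage
 * with EWB, storing the page's version in the given VA slot.
 */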
static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
	pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
	set_page_dirty(backing->pcmd);
	set_page_dirty(backing->contents);

	kunmap_local((void *)(unsigned long)(pginfo.metadata -
					     backing->pcmd_offset));
	kunmap_local((void *)(unsigned long)pginfo.contents);

	return ret;
}

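/*
 * The callback itself is a no-op: merely delivering the IPI forces any
 * logical CPU executing inside the enclave to exit it.
 */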
void sgx_ipi_cb(void *info)
{
}

/*
 * Swap a page out to regular memory once it has been transformed to the
 * blocked state with EBLOCK, which means that it can no longer be referenced
 * (no new TLB entries).
 *
 * The first attempt just tries to write the page, assuming that some other
 * thread has already reset the thread count for the enclave with ETRACK and
 * the previous count has drained to zero. The second attempt calls ETRACK
 * before EWB. If that also fails, kick all HW threads out of the enclave and
 * then do EWB, which is then guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave. Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

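/*
 * Write a reclaimed page back to the enclave's backing storage. When the
 * last child page of an initialized enclave is evicted, its SECS page is
 * written out too.
 */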
static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					     &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip the pages that have
 * been accessed since the last scan, moving them to the tail of the active
 * page pool so that pages get scanned in a LRU-like fashion.
 *
 * Batch-process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() does spread
 * the cost a bit among the HW threads with its three-stage EWB pipeline (EWB,
 * ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a
 * time would also be problematic, as it would increase lock contention too
 * much, which would halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}

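/*
 * Reclaim only makes sense when free EPC is below the given watermark and
 * there is actually something on the active page list to reclaim.
 */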
static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}

/*
 * sgx_reclaim_direct() should be called (without the enclave's mutex held)
 * in locations where SGX memory resources might be low and might be needed
 * in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}

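/*
 * ksgxd sanitizes the EPC pages left over from a kexec() and then sleeps
 * until it is woken up to reclaim pages when free EPC runs low.
 */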
static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

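/*
 * Take the first page off the given NUMA node's free list, or return NULL
 * when the node is out of free EPC pages.
 */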
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through the NUMA nodes and reserve a free EPC page for the caller.
 * Start from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	A free EPC page was available.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	if (node_isset(nid_of_current, sgx_numa_mask)) {
		page = __sgx_alloc_epc_page_from_node(nid_of_current);
		if (page)
			return page;
	}

	/* Fall back to the non-local NUMA nodes: */
	while (true) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;

		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;
	}

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through the EPC sections and borrow a free EPC page for the caller.
 * When a page is no longer needed it must be released with
 * sgx_free_epc_page(). If @reclaim is set to true, pages are reclaimed
 * directly when none are free. No mm's can be locked when @reclaim is set to
 * true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning back to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}

/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back on the list of free pages. It's the caller's
 * responsibility to make sure that the page is in the uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
	atomic_long_inc(&sgx_nr_free_pages);
}

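/*
 * Map an EPC section into the kernel virtual address space, allocate an
 * array of page metadata for it, and put every page on the dirty list so
 * that ksgxd sanitizes it before first use.
 */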
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}

bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

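/*
 * Translate a physical EPC address to its struct sgx_epc_page metadata, or
 * NULL when the address does not belong to any EPC section.
 */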
static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If poison was consumed synchronously, send a SIGBUS to
	 * the task. Hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}

/*
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}

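/*
 * Report the amount of EPC attached to each NUMA node via sysfs. The
 * attribute lives in each node's "x86" group, e.g.
 * /sys/devices/system/node/node0/x86/sgx_total_bytes.
 */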
#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	/* Make all x86/ attributes invisible when SGX is not initialized: */
	if (nodes_empty(sgx_numa_mask))
		return 0;

	return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
	&dev_attr_sgx_total_bytes.attr,
	NULL,
};

const struct attribute_group arch_node_dev_group = {
	.name = "x86",
	.attrs = arch_node_dev_attrs,
	.is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
	struct node *node = node_devices[nid];
	int ret;

	ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);

	if (ret)
		pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif

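/*
 * Enumerate the EPC sections from CPUID, set each one up, and bind it to the
 * NUMA node that its physical address range belongs to.
 */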
static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
			sgx_numa_nodes[nid].size = 0;

			/* Make SGX-specific node sysfs files visible: */
			arch_update_sysfs_visibility(nid);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
		sgx_numa_nodes[nid].size += size;

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver must update them to the hash of the enclave's signer
 * before EINIT. KVM needs to update them to the guest's virtual MSR values
 * before doing EINIT on behalf of the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}

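/*
 * /dev/sgx_provision has no file operations of its own: an open file
 * descriptor for it simply acts as a permission token, which
 * sgx_set_attribute() below checks before granting SGX_ATTR_PROVISIONKEY.
 */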
const struct file_operations sgx_provision_fops = {
	.owner			= THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given file descriptor
 * @allowed_attributes:	Pointer to allowed enclave attributes
 * @attribute_fd:	File descriptor for specific attribute
 *
 * Append the enclave attribute indicated by the file descriptor to the
 * allowed attributes. Currently only SGX_ATTR_PROVISIONKEY, indicated by
 * /dev/sgx_provision, is supported.
 *
 * Return:
 * - 0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 * - -EINVAL:	Invalid or unsupported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	struct fd f = fdget(attribute_fd);

	if (!f.file)
		return -EINVAL;

	if (f.file->f_op != &sgx_provision_fops) {
		fdput(f);
		return -EINVAL;
	}

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;

	fdput(f);
	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);

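/*
 * Bring up SGX at boot: enumerate and set up the EPC sections, start the
 * ksgxd reclaimer thread, register /dev/sgx_provision, and initialize the
 * native and KVM (virtual EPC) drivers.
 */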
static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);