// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

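/*
 * EPC pages whose state is unknown, e.g. after kexec(): ksgxd sanitizes
 * them with EREMOVE before they are handed to the page allocator.
 */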
static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list and made available to the page allocator. SECS pages
 * that still precede their child pages in the input list are left intact.
 *
 * Return 0 when sanitization was successful or the kthread was stopped, and
 * the number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
        unsigned long left_dirty = 0;
        struct sgx_epc_page *page;
        LIST_HEAD(dirty);
        int ret;

        /* dirty_page_list is thread-local, no need for a lock: */
        while (!list_empty(dirty_page_list)) {
                if (kthread_should_stop())
                        return 0;

                page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

                /*
                 * Checking page->poison without holding the node->lock
                 * is racy, but losing the race (i.e. poison is set just
                 * after the check) just means __eremove() will be uselessly
                 * called for a page that sgx_free_epc_page() will put onto
                 * the node->sgx_poison_page_list later.
                 */
                if (page->poison) {
                        struct sgx_epc_section *section = &sgx_epc_sections[page->section];
                        struct sgx_numa_node *node = section->node;

                        spin_lock(&node->lock);
                        list_move(&page->list, &node->sgx_poison_page_list);
                        spin_unlock(&node->lock);

                        continue;
                }

                ret = __eremove(sgx_get_epc_virt_addr(page));
                if (!ret) {
                        /*
                         * page is now sanitized. Make it available via the SGX
                         * page allocator:
                         */
                        list_del(&page->list);
                        sgx_free_epc_page(page);
                } else {
                        /* The page is not yet clean - move to the dirty list. */
                        list_move_tail(&page->list, &dirty);
                        left_dirty++;
                }

                cond_resched();
        }

        list_splice(&dirty, dirty_page_list);
        return left_dirty;
}

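/*
 * Note on the two sanitization passes in ksgxd(): EREMOVE fails with
 * SGX_CHILD_PRESENT while a SECS page still has children, so a SECS page
 * can only be sanitized on the second walk, after all of its child pages
 * have been removed on the first.
 */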
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
        struct sgx_encl_page *page = epc_page->owner;
        struct sgx_encl *encl = page->encl;
        struct sgx_encl_mm *encl_mm;
        bool ret = true;
        int idx;

        idx = srcu_read_lock(&encl->srcu);

        list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
                if (!mmget_not_zero(encl_mm->mm))
                        continue;

                mmap_read_lock(encl_mm->mm);
                ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
                mmap_read_unlock(encl_mm->mm);

                mmput_async(encl_mm->mm);

                if (!ret)
                        break;
        }

        srcu_read_unlock(&encl->srcu, idx);

        if (!ret)
                return false;

        return true;
}

static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
        struct sgx_encl_page *page = epc_page->owner;
        unsigned long addr = page->desc & PAGE_MASK;
        struct sgx_encl *encl = page->encl;
        int ret;

        sgx_zap_enclave_ptes(encl, addr);

        mutex_lock(&encl->lock);

        ret = __eblock(sgx_get_epc_virt_addr(epc_page));
        if (encls_failed(ret))
                ENCLS_WARN(ret, "EBLOCK");

        mutex_unlock(&encl->lock);
}

static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
                          struct sgx_backing *backing)
{
        struct sgx_pageinfo pginfo;
        int ret;

        pginfo.addr = 0;
        pginfo.secs = 0;

        pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
        pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
                          backing->pcmd_offset;

        ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
        set_page_dirty(backing->pcmd);
        set_page_dirty(backing->contents);

        kunmap_local((void *)(unsigned long)(pginfo.metadata -
                                             backing->pcmd_offset));
        kunmap_local((void *)(unsigned long)pginfo.contents);

        return ret;
}

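/*
 * The callback is intentionally empty: merely delivering an IPI forces an
 * Asynchronous Enclave Exit (AEX) on a logical processor executing inside
 * an enclave, which is all that the EWB slow path below needs.
 */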
void sgx_ipi_cb(void *info)
{
}

/*
 * Swap the page to regular memory after it has been transformed to the
 * blocked state with EBLOCK, which means that it can no longer be referenced
 * (no new TLB entries).
 *
 * The first trial just tries to write the page assuming that some other thread
 * has reset the count for threads inside the enclave by using ETRACK, and
 * the previous thread count has been zeroed out. The second trial calls ETRACK
 * before EWB. If that fails we kick all the HW threads out, and then do EWB,
 * which is guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
                         struct sgx_backing *backing)
{
        struct sgx_encl_page *encl_page = epc_page->owner;
        struct sgx_encl *encl = encl_page->encl;
        struct sgx_va_page *va_page;
        unsigned int va_offset;
        void *va_slot;
        int ret;

        encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

        va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
                                   list);
        va_offset = sgx_alloc_va_slot(va_page);
        va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
        if (sgx_va_page_full(va_page))
                list_move_tail(&va_page->list, &encl->va_pages);

        ret = __sgx_encl_ewb(epc_page, va_slot, backing);
        if (ret == SGX_NOT_TRACKED) {
                ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
                if (ret) {
                        if (encls_failed(ret))
                                ENCLS_WARN(ret, "ETRACK");
                }

                ret = __sgx_encl_ewb(epc_page, va_slot, backing);
                if (ret == SGX_NOT_TRACKED) {
                        /*
                         * Slow path, send IPIs to kick cpus out of the
                         * enclave. Note, it's imperative that the cpu
                         * mask is generated *after* ETRACK, else we'll
                         * miss cpus that entered the enclave between
                         * generating the mask and incrementing epoch.
                         */
                        on_each_cpu_mask(sgx_encl_cpumask(encl),
                                         sgx_ipi_cb, NULL, 1);
                        ret = __sgx_encl_ewb(epc_page, va_slot, backing);
                }
        }

        if (ret) {
                if (encls_failed(ret))
                        ENCLS_WARN(ret, "EWB");

                sgx_free_va_slot(va_page, va_offset);
        } else {
                encl_page->desc |= va_offset;
                encl_page->va_page = va_page;
        }
}

static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
                                struct sgx_backing *backing)
{
        struct sgx_encl_page *encl_page = epc_page->owner;
        struct sgx_encl *encl = encl_page->encl;
        struct sgx_backing secs_backing;
        int ret;

        mutex_lock(&encl->lock);

        sgx_encl_ewb(epc_page, backing);
        encl_page->epc_page = NULL;
        encl->secs_child_cnt--;
        sgx_encl_put_backing(backing);

        if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
                ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
                                             &secs_backing);
                if (ret)
                        goto out;

                sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

                sgx_encl_free_epc_page(encl->secs.epc_page);
                encl->secs.epc_page = NULL;

                sgx_encl_put_backing(&secs_backing);
        }

out:
        mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip pages that have
 * been accessed since the last scan. Move those pages to the tail of the
 * active page pool so that pages get scanned in an LRU-like fashion.
 *
 * Batch-process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() already
 * spreads the cost among the HW threads with its three-stage EWB pipeline
 * (EWB, ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one
 * page at a time would also be problematic as it would increase the lock
 * contention too much, which would halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
        struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
        struct sgx_backing backing[SGX_NR_TO_SCAN];
        struct sgx_encl_page *encl_page;
        struct sgx_epc_page *epc_page;
        pgoff_t page_index;
        int cnt = 0;
        int ret;
        int i;

        spin_lock(&sgx_reclaimer_lock);
        for (i = 0; i < SGX_NR_TO_SCAN; i++) {
                if (list_empty(&sgx_active_page_list))
                        break;

                epc_page = list_first_entry(&sgx_active_page_list,
                                            struct sgx_epc_page, list);
                list_del_init(&epc_page->list);
                encl_page = epc_page->owner;

                if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
                        chunk[cnt++] = epc_page;
                else
                        /*
                         * The owner is freeing the page. No need to add the
                         * page back to the list of reclaimable pages.
                         */
                        epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
        }
        spin_unlock(&sgx_reclaimer_lock);

        for (i = 0; i < cnt; i++) {
                epc_page = chunk[i];
                encl_page = epc_page->owner;

                if (!sgx_reclaimer_age(epc_page))
                        goto skip;

                page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

                mutex_lock(&encl_page->encl->lock);
                ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
                if (ret) {
                        mutex_unlock(&encl_page->encl->lock);
                        goto skip;
                }

                encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
                mutex_unlock(&encl_page->encl->lock);
                continue;

skip:
                spin_lock(&sgx_reclaimer_lock);
                list_add_tail(&epc_page->list, &sgx_active_page_list);
                spin_unlock(&sgx_reclaimer_lock);

                kref_put(&encl_page->encl->refcount, sgx_encl_release);

                chunk[i] = NULL;
        }

        for (i = 0; i < cnt; i++) {
                epc_page = chunk[i];
                if (epc_page)
                        sgx_reclaimer_block(epc_page);
        }

        for (i = 0; i < cnt; i++) {
                epc_page = chunk[i];
                if (!epc_page)
                        continue;

                encl_page = epc_page->owner;
                sgx_reclaimer_write(epc_page, &backing[i]);

                kref_put(&encl_page->encl->refcount, sgx_encl_release);
                epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

                sgx_free_epc_page(epc_page);
        }
}

static bool sgx_should_reclaim(unsigned long watermark)
{
        return atomic_long_read(&sgx_nr_free_pages) < watermark &&
               !list_empty(&sgx_active_page_list);
}

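/*
 * Watermark summary (both constants live in sgx.h): sgx_alloc_epc_page()
 * wakes ksgxd once the free page count drops below SGX_NR_LOW_PAGES, and
 * ksgxd keeps reclaiming until the count is back above SGX_NR_HIGH_PAGES.
 */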
/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
        if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
                sgx_reclaim_pages();
}

static int ksgxd(void *p)
{
        set_freezable();

        /*
         * Sanitize pages in order to recover from kexec(). The 2nd pass is
         * required for SECS pages, whose child pages blocked EREMOVE.
         */
        __sgx_sanitize_pages(&sgx_dirty_page_list);
        WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;

                wait_event_freezable(ksgxd_waitq,
                                     kthread_should_stop() ||
                                     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

                if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
                        sgx_reclaim_pages();

                cond_resched();
        }

        return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
        struct task_struct *tsk;

        tsk = kthread_run(ksgxd, NULL, "ksgxd");
        if (IS_ERR(tsk))
                return false;

        ksgxd_tsk = tsk;

        return true;
}

bool current_is_ksgxd(void)
{
        return current == ksgxd_tsk;
}

static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
        struct sgx_numa_node *node = &sgx_numa_nodes[nid];
        struct sgx_epc_page *page = NULL;

        spin_lock(&node->lock);

        if (list_empty(&node->free_page_list)) {
                spin_unlock(&node->lock);
                return NULL;
        }

        page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
        list_del_init(&page->list);
        page->flags = 0;

        spin_unlock(&node->lock);
        atomic_long_dec(&sgx_nr_free_pages);

        return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through the NUMA nodes and reserve a free EPC page for the caller.
 * Start from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:		A free EPC page was available.
 * - ERR_PTR(-ENOMEM):		Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
        struct sgx_epc_page *page;
        int nid_of_current = numa_node_id();
        int nid = nid_of_current;

        if (node_isset(nid_of_current, sgx_numa_mask)) {
                page = __sgx_alloc_epc_page_from_node(nid_of_current);
                if (page)
                        return page;
        }

        /* Fall back to the non-local NUMA nodes: */
        while (true) {
                nid = next_node_in(nid, sgx_numa_mask);
                if (nid == nid_of_current)
                        break;

                page = __sgx_alloc_epc_page_from_node(nid);
                if (page)
                        return page;
        }

        return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
        spin_lock(&sgx_reclaimer_lock);
        page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
        list_add_tail(&page->list, &sgx_active_page_list);
        spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
        spin_lock(&sgx_reclaimer_lock);
        if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
                /* The page is being reclaimed. */
                if (list_empty(&page->list)) {
                        spin_unlock(&sgx_reclaimer_lock);
                        return -EBUSY;
                }

                list_del(&page->list);
                page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
        }
        spin_unlock(&sgx_reclaimer_lock);

        return 0;
}

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through the EPC sections and borrow a free EPC page for the caller.
 * When a page is no longer needed it must be released with
 * sgx_free_epc_page(). If @reclaim is set to true, directly reclaim pages when
 * we are out of pages. No mm's can be locked when @reclaim is set to true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
        struct sgx_epc_page *page;

        for ( ; ; ) {
                page = __sgx_alloc_epc_page();
                if (!IS_ERR(page)) {
                        page->owner = owner;
                        break;
                }

                if (list_empty(&sgx_active_page_list))
                        return ERR_PTR(-ENOMEM);

                if (!reclaim) {
                        page = ERR_PTR(-EBUSY);
                        break;
                }

                if (signal_pending(current)) {
                        page = ERR_PTR(-ERESTARTSYS);
                        break;
                }

                sgx_reclaim_pages();
                cond_resched();
        }

        if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
                wake_up(&ksgxd_waitq);

        return page;
}

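/*
 * Illustrative caller sketch (hypothetical, not from this file): a
 * sleepable context that holds no mm locks passes reclaim = true so the
 * allocation can fall back to direct reclaim instead of returning -EBUSY:
 *
 *	epc_page = sgx_alloc_epc_page(encl_page, true);
 *	if (IS_ERR(epc_page))
 *		return PTR_ERR(epc_page);
 */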
/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back on the list of free pages. It's the caller's
 * responsibility to make sure that the page is in the uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
        struct sgx_epc_section *section = &sgx_epc_sections[page->section];
        struct sgx_numa_node *node = section->node;

        spin_lock(&node->lock);

        page->owner = NULL;
        if (page->poison)
                list_add(&page->list, &node->sgx_poison_page_list);
        else
                list_add_tail(&page->list, &node->free_page_list);
        page->flags = SGX_EPC_PAGE_IS_FREE;

        spin_unlock(&node->lock);
        atomic_long_inc(&sgx_nr_free_pages);
}

static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
                                         unsigned long index,
                                         struct sgx_epc_section *section)
{
        unsigned long nr_pages = size >> PAGE_SHIFT;
        unsigned long i;

        section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
        if (!section->virt_addr)
                return false;

        section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
        if (!section->pages) {
                memunmap(section->virt_addr);
                return false;
        }

        section->phys_addr = phys_addr;
        xa_store_range(&sgx_epc_address_space, section->phys_addr,
                       phys_addr + size - 1, section, GFP_KERNEL);

        for (i = 0; i < nr_pages; i++) {
                section->pages[i].section = index;
                section->pages[i].flags = 0;
                section->pages[i].owner = NULL;
                section->pages[i].poison = 0;
                list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
        }

        return true;
}

bool arch_is_platform_page(u64 paddr)
{
        return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
        struct sgx_epc_section *section;

        section = xa_load(&sgx_epc_address_space, paddr);
        if (!section)
                return NULL;

        return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
        struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
        struct sgx_epc_section *section;
        struct sgx_numa_node *node;

        /*
         * mm/memory-failure.c calls this routine for all errors
         * where there isn't a "struct page" for the address. But that
         * includes other address ranges besides SGX.
         */
        if (!page)
                return -ENXIO;

        /*
         * If poison was consumed synchronously, send a SIGBUS to
         * the task. Hardware has already exited the SGX enclave and
         * will not allow re-entry to an enclave that has a memory
         * error. The signal may help the task understand why the
         * enclave is broken.
         */
        if (flags & MF_ACTION_REQUIRED)
                force_sig(SIGBUS);

        section = &sgx_epc_sections[page->section];
        node = section->node;

        spin_lock(&node->lock);

        /* Already poisoned? Nothing more to do */
        if (page->poison)
                goto out;

        page->poison = 1;

        /*
         * If the page is on a free list, move it to the per-node
         * poison page list.
         */
        if (page->flags & SGX_EPC_PAGE_IS_FREE) {
                list_move(&page->list, &node->sgx_poison_page_list);
                goto out;
        }

        /*
         * TBD: Add additional plumbing to enable pre-emptive
         * action for asynchronous poison notification. Until
         * then just hope that the poison:
         * a) is not accessed - sgx_free_epc_page() will deal with it
         *    when the user gives it back
         * b) results in a recoverable machine check rather than
         *    a fatal one
         */
out:
        spin_unlock(&node->lock);
        return 0;
}

/*
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
        return (low & GENMASK_ULL(31, 12)) +
               ((high & GENMASK_ULL(19, 0)) << 32);
}

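/*
 * Worked example (illustrative): low = 0x80001000 and high = 0x2 yield
 * (0x80001000 & GENMASK_ULL(31, 12)) + (0x2 << 32) = 0x280001000.
 */
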
#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
                                         struct attribute *attr, int idx)
{
        /* Make all x86/ attributes invisible when SGX is not initialized: */
        if (nodes_empty(sgx_numa_mask))
                return 0;

        return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
        &dev_attr_sgx_total_bytes.attr,
        NULL,
};

const struct attribute_group arch_node_dev_group = {
        .name = "x86",
        .attrs = arch_node_dev_attrs,
        .is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
        struct node *node = node_devices[nid];
        int ret;

        ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);

        if (ret)
                pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif

static bool __init sgx_page_cache_init(void)
{
        u32 eax, ebx, ecx, edx, type;
        u64 pa, size;
        int nid;
        int i;

        sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
        if (!sgx_numa_nodes)
                return false;

        for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
                cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

                type = eax & SGX_CPUID_EPC_MASK;
                if (type == SGX_CPUID_EPC_INVALID)
                        break;

                if (type != SGX_CPUID_EPC_SECTION) {
                        pr_err_once("Unknown EPC section type: %u\n", type);
                        break;
                }

                pa = sgx_calc_section_metric(eax, ebx);
                size = sgx_calc_section_metric(ecx, edx);

                pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

                if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
                        pr_err("No free memory for an EPC section\n");
                        break;
                }

                nid = numa_map_to_online_node(phys_to_target_node(pa));
                if (nid == NUMA_NO_NODE) {
                        /* The physical address is already printed above. */
                        pr_warn(FW_BUG "Unable to map EPC section to online node. Falling back to NUMA node 0.\n");
                        nid = 0;
                }

                if (!node_isset(nid, sgx_numa_mask)) {
                        spin_lock_init(&sgx_numa_nodes[nid].lock);
                        INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
                        INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
                        node_set(nid, sgx_numa_mask);
                        sgx_numa_nodes[nid].size = 0;

                        /* Make SGX-specific node sysfs files visible: */
                        arch_update_sysfs_visibility(nid);
                }

                sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
                sgx_numa_nodes[nid].size += size;

                sgx_nr_epc_sections++;
        }

        if (!sgx_nr_epc_sections) {
                pr_err("There are zero EPC sections.\n");
                return false;
        }

        return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver must update them to the hash of the enclave's signer
 * before EINIT. KVM needs to update them to the guest's virtual MSR values
 * before doing EINIT from the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
        int i;

        WARN_ON_ONCE(preemptible());

        for (i = 0; i < 4; i++)
                wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}

const struct file_operations sgx_provision_fops = {
        .owner = THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "sgx_provision",
        .nodename = "sgx_provision",
        .fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given file descriptor
 * @allowed_attributes:	Pointer to allowed enclave attributes
 * @attribute_fd:	File descriptor for specific attribute
 *
 * Append the enclave attribute indicated by the file descriptor to the
 * allowed attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
 * /dev/sgx_provision is supported.
 *
 * Return:
 *   0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 *   -EINVAL:	Invalid or unsupported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
                      unsigned int attribute_fd)
{
        struct fd f = fdget(attribute_fd);

        if (!f.file)
                return -EINVAL;

        if (f.file->f_op != &sgx_provision_fops) {
                fdput(f);
                return -EINVAL;
        }

        *allowed_attributes |= SGX_ATTR_PROVISIONKEY;

        fdput(f);
        return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);

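/*
 * Illustrative use (a sketch; the in-tree caller is the enclave provision
 * ioctl): userspace opens /dev/sgx_provision and hands the fd down so the
 * enclave may be granted SGX_ATTR_PROVISIONKEY. "provision_fd" below is a
 * hypothetical variable name:
 *
 *	ret = sgx_set_attribute(&encl->attributes_mask, provision_fd);
 */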
static int __init sgx_init(void)
{
        int ret;
        int i;

        if (!cpu_feature_enabled(X86_FEATURE_SGX))
                return -ENODEV;

        if (!sgx_page_cache_init())
                return -ENOMEM;

        if (!sgx_page_reclaimer_init()) {
                ret = -ENOMEM;
                goto err_page_cache;
        }

        ret = misc_register(&sgx_dev_provision);
        if (ret)
                goto err_kthread;

        /*
         * Always try to initialize the native *and* KVM drivers.
         * The KVM driver is less picky than the native one and
         * can function if the native one is not supported on the
         * current system or fails to initialize.
         *
         * Error out only if both fail to initialize.
         */
        ret = sgx_drv_init();

        if (sgx_vepc_init() && ret)
                goto err_provision;

        return 0;

err_provision:
        misc_deregister(&sgx_dev_provision);

err_kthread:
        kthread_stop(ksgxd_tsk);

err_page_cache:
        for (i = 0; i < sgx_nr_epc_sections; i++) {
                vfree(sgx_epc_sections[i].pages);
                memunmap(sgx_epc_sections[i].virt_addr);
        }

        return ret;
}

device_initcall(sgx_init);