1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/nospec.h>
12 #include <linux/pkeys.h>
13 #include <linux/seq_file.h>
14 #include <linux/proc_fs.h>
15 #include <linux/vmalloc.h>
16
17 #include <asm/fpu/api.h>
18 #include <asm/fpu/regset.h>
19 #include <asm/fpu/signal.h>
20 #include <asm/fpu/xcr.h>
21
22 #include <asm/tlbflush.h>
23 #include <asm/prctl.h>
24 #include <asm/elf.h>
25
26 #include "context.h"
27 #include "internal.h"
28 #include "legacy.h"
29 #include "xstate.h"
30
31 #define for_each_extended_xfeature(bit, mask)                           \
32         (bit) = FIRST_EXTENDED_XFEATURE;                                \
33         for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
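
/*
 * Illustrative note (not part of the original source): for a mask such as
 * XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK the loop body runs for bit 2
 * (YMM) and then bit 5 (OPMASK); the legacy FP/SSE bits are never visited
 * because the search starts at FIRST_EXTENDED_XFEATURE.
 */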
34
35 /*
36  * Although we spell it out in here, the Processor Trace
37  * xfeature is completely unused.  We use other mechanisms
38  * to save/restore PT state in Linux.
39  */
40 static const char *xfeature_names[] =
41 {
42         "x87 floating point registers",
43         "SSE registers",
44         "AVX registers",
45         "MPX bounds registers",
46         "MPX CSR",
47         "AVX-512 opmask",
48         "AVX-512 Hi256",
49         "AVX-512 ZMM_Hi256",
50         "Processor Trace (unused)",
51         "Protection Keys User registers",
52         "PASID state",
53         "Control-flow User registers",
54         "Control-flow Kernel registers (unused)",
55         "unknown xstate feature",
56         "unknown xstate feature",
57         "unknown xstate feature",
58         "unknown xstate feature",
59         "AMX Tile config",
60         "AMX Tile data",
61         "unknown xstate feature",
62 };
63
64 static unsigned short xsave_cpuid_features[] __initdata = {
65         [XFEATURE_FP]                           = X86_FEATURE_FPU,
66         [XFEATURE_SSE]                          = X86_FEATURE_XMM,
67         [XFEATURE_YMM]                          = X86_FEATURE_AVX,
68         [XFEATURE_BNDREGS]                      = X86_FEATURE_MPX,
69         [XFEATURE_BNDCSR]                       = X86_FEATURE_MPX,
70         [XFEATURE_OPMASK]                       = X86_FEATURE_AVX512F,
71         [XFEATURE_ZMM_Hi256]                    = X86_FEATURE_AVX512F,
72         [XFEATURE_Hi16_ZMM]                     = X86_FEATURE_AVX512F,
73         [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]      = X86_FEATURE_INTEL_PT,
74         [XFEATURE_PKRU]                         = X86_FEATURE_OSPKE,
75         [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
76         [XFEATURE_CET_USER]                     = X86_FEATURE_SHSTK,
77         [XFEATURE_XTILE_CFG]                    = X86_FEATURE_AMX_TILE,
78         [XFEATURE_XTILE_DATA]                   = X86_FEATURE_AMX_TILE,
79 };
80
81 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
82         { [ 0 ... XFEATURE_MAX - 1] = -1};
83 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
84         { [ 0 ... XFEATURE_MAX - 1] = -1};
85 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
86
87 #define XSTATE_FLAG_SUPERVISOR  BIT(0)
88 #define XSTATE_FLAG_ALIGNED64   BIT(1)
89
90 /*
91  * Return whether the system supports a given xfeature.
92  *
93  * Also return the name of the (most advanced) feature that the caller requested:
94  */
95 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
96 {
97         u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
98
99         if (unlikely(feature_name)) {
100                 long xfeature_idx, max_idx;
101                 u64 xfeatures_print;
102                 /*
103                  * We use fls64() here to be able to print the most advanced
104                  * feature that was requested but is missing. So if a driver
105                  * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
106                  * missing AVX feature - this is the most informative message
107                  * to users:
108                  */
109                 if (xfeatures_missing)
110                         xfeatures_print = xfeatures_missing;
111                 else
112                         xfeatures_print = xfeatures_needed;
113
114                 xfeature_idx = fls64(xfeatures_print)-1;
115                 max_idx = ARRAY_SIZE(xfeature_names)-1;
116                 xfeature_idx = min(xfeature_idx, max_idx);
117
118                 *feature_name = xfeature_names[xfeature_idx];
119         }
120
121         if (xfeatures_missing)
122                 return 0;
123
124         return 1;
125 }
126 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
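
/*
 * Usage sketch (illustrative, not from the original source): a caller that
 * needs AVX state could check for it with
 *
 *	const char *name;
 *
 *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name))
 *		pr_info("CPU lacks required xfeature '%s'\n", name);
 */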
127
128 static bool xfeature_is_aligned64(int xfeature_nr)
129 {
130         return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
131 }
132
133 static bool xfeature_is_supervisor(int xfeature_nr)
134 {
135         return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
136 }
137
138 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
139 {
140         unsigned int offs, i;
141
142         /*
143          * Non-compacted format and legacy features use the cached fixed
144          * offsets.
145          */
146         if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
147             xfeature <= XFEATURE_SSE)
148                 return xstate_offsets[xfeature];
149
150         /*
151          * Compacted format offsets depend on the actual content of the
152          * compacted xsave area which is determined by the xcomp_bv header
153          * field.
154          */
155         offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
156         for_each_extended_xfeature(i, xcomp_bv) {
157                 if (xfeature_is_aligned64(i))
158                         offs = ALIGN(offs, 64);
159                 if (i == xfeature)
160                         break;
161                 offs += xstate_sizes[i];
162         }
163         return offs;
164 }
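
/*
 * Worked example (illustrative, actual sizes come from CPUID and may differ):
 * with xcomp_bv = YMM | OPMASK, YMM starts right after the legacy area and
 * the header at 512 + 64 = 576 bytes; OPMASK then starts at 576 plus the
 * YMM state size (commonly 256 bytes, i.e. offset 832), rounded up to a
 * 64-byte boundary first if CPUID flags OPMASK as 64-byte aligned.
 */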
165
166 /*
167  * Enable the extended processor state save/restore feature.
168  * Called once per CPU onlining.
169  */
170 void fpu__init_cpu_xstate(void)
171 {
172         if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
173                 return;
174
175         cr4_set_bits(X86_CR4_OSXSAVE);
176
177         /*
178          * Must happen after CR4 setup and before xsetbv() to allow KVM
179          * lazy passthrough.  Write independent of the dynamic state static
180          * key as that does not work on the boot CPU. This also ensures
181          * that any stale state is wiped out from XFD. Reset the per CPU
182          * xfd cache too.
183          */
184         if (cpu_feature_enabled(X86_FEATURE_XFD))
185                 xfd_set_state(init_fpstate.xfd);
186
187         /*
188          * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
189          * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
190          * states can be set here.
191          */
192         xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
193
194         /*
195          * MSR_IA32_XSS sets supervisor states managed by XSAVES.
196          */
197         if (boot_cpu_has(X86_FEATURE_XSAVES)) {
198                 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
199                                      xfeatures_mask_independent());
200         }
201 }
202
203 static bool xfeature_enabled(enum xfeature xfeature)
204 {
205         return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
206 }
207
208 /*
209  * Record the offsets and sizes of various xstates contained
210  * in the XSAVE state memory layout.
211  */
212 static void __init setup_xstate_cache(void)
213 {
214         u32 eax, ebx, ecx, edx, i;
215         /* start at the beginning of the "extended state" */
216         unsigned int last_good_offset = offsetof(struct xregs_state,
217                                                  extended_state_area);
218         /*
219          * The FP xstates and SSE xstates are legacy states. They are always
220          * in the fixed offsets in the xsave area in either compacted form
221          * or standard form.
222          */
223         xstate_offsets[XFEATURE_FP]     = 0;
224         xstate_sizes[XFEATURE_FP]       = offsetof(struct fxregs_state,
225                                                    xmm_space);
226
227         xstate_offsets[XFEATURE_SSE]    = xstate_sizes[XFEATURE_FP];
228         xstate_sizes[XFEATURE_SSE]      = sizeof_field(struct fxregs_state,
229                                                        xmm_space);
230
231         for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
232                 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
233
234                 xstate_sizes[i] = eax;
235                 xstate_flags[i] = ecx;
236
237                 /*
238                  * If an xfeature is supervisor state, the offset in EBX is
239                  * invalid, so leave it as -1.
240                  */
241                 if (xfeature_is_supervisor(i))
242                         continue;
243
244                 xstate_offsets[i] = ebx;
245
246                 /*
247                  * In our xstate size checks, we assume that the highest-numbered
248                  * xstate feature has the highest offset in the buffer.  Ensure
249                  * it does.
250                  */
251                 WARN_ONCE(last_good_offset > xstate_offsets[i],
252                           "x86/fpu: misordered xstate at %d\n", last_good_offset);
253
254                 last_good_offset = xstate_offsets[i];
255         }
256 }
257
258 static void __init print_xstate_feature(u64 xstate_mask)
259 {
260         const char *feature_name;
261
262         if (cpu_has_xfeatures(xstate_mask, &feature_name))
263                 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
264 }
265
266 /*
267  * Print out all the supported xstate features:
268  */
269 static void __init print_xstate_features(void)
270 {
271         print_xstate_feature(XFEATURE_MASK_FP);
272         print_xstate_feature(XFEATURE_MASK_SSE);
273         print_xstate_feature(XFEATURE_MASK_YMM);
274         print_xstate_feature(XFEATURE_MASK_BNDREGS);
275         print_xstate_feature(XFEATURE_MASK_BNDCSR);
276         print_xstate_feature(XFEATURE_MASK_OPMASK);
277         print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
278         print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
279         print_xstate_feature(XFEATURE_MASK_PKRU);
280         print_xstate_feature(XFEATURE_MASK_PASID);
281         print_xstate_feature(XFEATURE_MASK_CET_USER);
282         print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
283         print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
284 }
285
286 /*
287  * This check is important because it is easy to get XSTATE_*
288  * confused with XSTATE_BIT_*.
289  */
290 #define CHECK_XFEATURE(nr) do {         \
291         WARN_ON(nr < FIRST_EXTENDED_XFEATURE);  \
292         WARN_ON(nr >= XFEATURE_MAX);    \
293 } while (0)
294
295 /*
296  * Print out xstate component offsets and sizes
297  */
298 static void __init print_xstate_offset_size(void)
299 {
300         int i;
301
302         for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
303                 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
304                         i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
305                         i, xstate_sizes[i]);
306         }
307 }
308
309 /*
310  * This function is called only during boot time when x86 caps are not set
311  * up and alternative can not be used yet.
312  */
313 static __init void os_xrstor_booting(struct xregs_state *xstate)
314 {
315         u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
316         u32 lmask = mask;
317         u32 hmask = mask >> 32;
318         int err;
319
320         if (cpu_feature_enabled(X86_FEATURE_XSAVES))
321                 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
322         else
323                 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
324
325         /*
326          * We should never fault when copying from a kernel buffer, and the FPU
327          * state we set at boot time should be valid.
328          */
329         WARN_ON_FPU(err);
330 }
331
332 /*
333  * All supported features have either init state all zeros or are
334  * handled in setup_init_fpu_buf() individually. This is an explicit
335  * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
336  * newly added supported features at build time and make people
337  * actually look at the init state for the new feature.
338  */
339 #define XFEATURES_INIT_FPSTATE_HANDLED          \
340         (XFEATURE_MASK_FP |                     \
341          XFEATURE_MASK_SSE |                    \
342          XFEATURE_MASK_YMM |                    \
343          XFEATURE_MASK_OPMASK |                 \
344          XFEATURE_MASK_ZMM_Hi256 |              \
345          XFEATURE_MASK_Hi16_ZMM  |              \
346          XFEATURE_MASK_PKRU |                   \
347          XFEATURE_MASK_BNDREGS |                \
348          XFEATURE_MASK_BNDCSR |                 \
349          XFEATURE_MASK_PASID |                  \
350          XFEATURE_MASK_CET_USER |               \
351          XFEATURE_MASK_XTILE)
352
353 /*
354  * Set up the xstate image representing the init state
355  */
356 static void __init setup_init_fpu_buf(void)
357 {
358         BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
359                       XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
360                      XFEATURES_INIT_FPSTATE_HANDLED);
361
362         if (!boot_cpu_has(X86_FEATURE_XSAVE))
363                 return;
364
365         print_xstate_features();
366
367         xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
368
369         /*
370          * Init all the features state with header.xfeatures being 0x0
371          */
372         os_xrstor_booting(&init_fpstate.regs.xsave);
373
374         /*
375          * All components are now in init state. Read the state back so
376          * that init_fpstate contains all non-zero init state. This only
377          * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
378          * those use the init optimization which skips writing data for
379          * components in init state.
380          *
381          * XSAVE could be used, but that would require to reshuffle the
382          * data when XSAVEC/S is available because XSAVEC/S uses xstate
383          * compaction. But doing so is a pointless exercise because most
384          * components have an all zeros init state except for the legacy
385          * ones (FP and SSE). Those can be saved with FXSAVE into the
386          * legacy area. Adding new features requires to ensure that init
387          * state is all zeroes or if not to add the necessary handling
388          * here.
389          */
390         fxsave(&init_fpstate.regs.fxsave);
391 }
392
393 int xfeature_size(int xfeature_nr)
394 {
395         u32 eax, ebx, ecx, edx;
396
397         CHECK_XFEATURE(xfeature_nr);
398         cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
399         return eax;
400 }
401
402 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
403 static int validate_user_xstate_header(const struct xstate_header *hdr,
404                                        struct fpstate *fpstate)
405 {
406         /* No unknown or supervisor features may be set */
407         if (hdr->xfeatures & ~fpstate->user_xfeatures)
408                 return -EINVAL;
409
410         /* Userspace must use the uncompacted format */
411         if (hdr->xcomp_bv)
412                 return -EINVAL;
413
414         /*
415          * If 'reserved' is shrunk to add a new field, make sure to validate
416          * that new field here!
417          */
418         BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
419
420         /* No reserved bits may be set */
421         if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
422                 return -EINVAL;
423
424         return 0;
425 }
426
427 static void __init __xstate_dump_leaves(void)
428 {
429         int i;
430         u32 eax, ebx, ecx, edx;
431         static int should_dump = 1;
432
433         if (!should_dump)
434                 return;
435         should_dump = 0;
436         /*
437          * Dump out a few leaves past the ones that we support
438          * just in case there are some goodies up there
439          */
440         for (i = 0; i < XFEATURE_MAX + 10; i++) {
441                 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
442                 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
443                         XSTATE_CPUID, i, eax, ebx, ecx, edx);
444         }
445 }
446
447 #define XSTATE_WARN_ON(x, fmt, ...) do {                                        \
448         if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {   \
449                 __xstate_dump_leaves();                                         \
450         }                                                                       \
451 } while (0)
452
453 #define XCHECK_SZ(sz, nr, __struct) ({                                  \
454         if (WARN_ONCE(sz != sizeof(__struct),                           \
455             "[%s]: struct is %zu bytes, cpu state %d bytes\n",          \
456             xfeature_names[nr], sizeof(__struct), sz)) {                \
457                 __xstate_dump_leaves();                                 \
458         }                                                               \
459         true;                                                           \
460 })
461
462
463 /**
464  * check_xtile_data_against_struct - Check tile data state size.
465  *
466  * Calculate the state size by multiplying the size of a single tile, which
467  * is recorded in a C struct, by the number of tiles that the CPU reports.
468  * Compare the provided size with the calculation.
469  *
470  * @size:       The tile data state size
471  *
472  * Returns:     0 on success, -EINVAL on mismatch.
473  */
474 static int __init check_xtile_data_against_struct(int size)
475 {
476         u32 max_palid, palid, state_size;
477         u32 eax, ebx, ecx, edx;
478         u16 max_tile;
479
480         /*
481          * Check the maximum palette id:
482          *   eax: the highest numbered palette subleaf.
483          */
484         cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
485
486         /*
487          * Cross-check each tile size and find the maximum number of
488          * supported tiles.
489          */
490         for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
491                 u16 tile_size, max;
492
493                 /*
494                  * Check the tile size info:
495                  *   eax[31:16]:  bytes per tile
496                  *   ebx[31:16]:  the max names (or max number of tiles)
497                  */
498                 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
499                 tile_size = eax >> 16;
500                 max = ebx >> 16;
501
502                 if (tile_size != sizeof(struct xtile_data)) {
503                         pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
504                                __stringify(XFEATURE_XTILE_DATA),
505                                sizeof(struct xtile_data), tile_size);
506                         __xstate_dump_leaves();
507                         return -EINVAL;
508                 }
509
510                 if (max > max_tile)
511                         max_tile = max;
512         }
513
514         state_size = sizeof(struct xtile_data) * max_tile;
515         if (size != state_size) {
516                 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
517                        __stringify(XFEATURE_XTILE_DATA), state_size, size);
518                 __xstate_dump_leaves();
519                 return -EINVAL;
520         }
521         return 0;
522 }
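
/*
 * Illustrative example (palette values are enumerated by the CPU): the
 * initial AMX palette describes 8 tile registers of 1024 bytes each, so the
 * cross-check above expects an XTILE_DATA state size of 8 * 1024 = 8192
 * bytes, i.e. sizeof(struct xtile_data) * max_tile.
 */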
523
524 /*
525  * We have a C struct for each 'xstate'.  We need to ensure
526  * that our software representation matches what the CPU
527  * tells us about the state's size.
528  */
529 static bool __init check_xstate_against_struct(int nr)
530 {
531         /*
532          * Ask the CPU for the size of the state.
533          */
534         int sz = xfeature_size(nr);
535
536         /*
537          * Match each CPU state with the corresponding software
538          * structure.
539          */
540         switch (nr) {
541         case XFEATURE_YMM:        return XCHECK_SZ(sz, nr, struct ymmh_struct);
542         case XFEATURE_BNDREGS:    return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
543         case XFEATURE_BNDCSR:     return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
544         case XFEATURE_OPMASK:     return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
545         case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
546         case XFEATURE_Hi16_ZMM:   return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
547         case XFEATURE_PKRU:       return XCHECK_SZ(sz, nr, struct pkru_state);
548         case XFEATURE_PASID:      return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
549         case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
550         case XFEATURE_CET_USER:   return XCHECK_SZ(sz, nr, struct cet_user_state);
551         case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
552         default:
553                 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
554                 return false;
555         }
556
557         return true;
558 }
559
560 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
561 {
562         unsigned int topmost = fls64(xfeatures) - 1;
563         unsigned int offset = xstate_offsets[topmost];
564
565         if (topmost <= XFEATURE_SSE)
566                 return sizeof(struct xregs_state);
567
568         if (compacted)
569                 offset = xfeature_get_offset(xfeatures, topmost);
570         return offset + xstate_sizes[topmost];
571 }
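
/*
 * Example (illustrative): if the topmost enabled feature is PKRU, the
 * non-compacted size is xstate_offsets[XFEATURE_PKRU] +
 * xstate_sizes[XFEATURE_PKRU]. With compaction the offset is recomputed
 * from the feature mask via xfeature_get_offset() instead of using the
 * cached CPUID offset.
 */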
572
573 /*
574  * This essentially double-checks what the cpu told us about
575  * how large the XSAVE buffer needs to be.  We are recalculating
576  * it to be safe.
577  *
578  * Independent XSAVE features allocate their own buffers and are not
579  * covered by these checks. Only the size of the buffer for task->fpu
580  * is checked here.
581  */
582 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
583 {
584         bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
585         bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
586         unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
587         int i;
588
589         for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
590                 if (!check_xstate_against_struct(i))
591                         return false;
592                 /*
593                  * Supervisor state components can be managed only by
594                  * XSAVES.
595                  */
596                 if (!xsaves && xfeature_is_supervisor(i)) {
597                         XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
598                         return false;
599                 }
600         }
601         size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
602         XSTATE_WARN_ON(size != kernel_size,
603                        "size %u != kernel_size %u\n", size, kernel_size);
604         return size == kernel_size;
605 }
606
607 /*
608  * Get total size of enabled xstates in XCR0 | IA32_XSS.
609  *
610  * Note the SDM's wording here.  "sub-function 0" only enumerates
611  * the size of the *user* states.  If we use it to size a buffer
612  * that we use 'XSAVES' on, we could potentially overflow the
613  * buffer because 'XSAVES' saves system states too.
614  *
615  * This also takes compaction into account. So this works for
616  * XSAVEC as well.
617  */
618 static unsigned int __init get_compacted_size(void)
619 {
620         unsigned int eax, ebx, ecx, edx;
621         /*
622          * - CPUID function 0DH, sub-function 1:
623          *    EBX enumerates the size (in bytes) required by
624          *    the XSAVES instruction for an XSAVE area
625          *    containing all the state components
626          *    corresponding to bits currently set in
627          *    XCR0 | IA32_XSS.
628          *
629          * When XSAVES is not available but XSAVEC is (virt), then there
630          * are no supervisor states, but XSAVEC still uses compacted
631          * format.
632          */
633         cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
634         return ebx;
635 }
636
637 /*
638  * Get the total size of the enabled xstates without the independent supervisor
639  * features.
640  */
641 static unsigned int __init get_xsave_compacted_size(void)
642 {
643         u64 mask = xfeatures_mask_independent();
644         unsigned int size;
645
646         if (!mask)
647                 return get_compacted_size();
648
649         /* Disable independent features. */
650         wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
651
652         /*
653          * Ask the hardware what size is required of the buffer.
654          * This is the size required for the task->fpu buffer.
655          */
656         size = get_compacted_size();
657
658         /* Re-enable independent features so XSAVES will work on them again. */
659         wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
660
661         return size;
662 }
663
664 static unsigned int __init get_xsave_size_user(void)
665 {
666         unsigned int eax, ebx, ecx, edx;
667         /*
668          * - CPUID function 0DH, sub-function 0:
669          *    EBX enumerates the size (in bytes) required by
670          *    the XSAVE instruction for an XSAVE area
671          *    containing all the *user* state components
672          *    corresponding to bits currently set in XCR0.
673          */
674         cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
675         return ebx;
676 }
677
678 static int __init init_xstate_size(void)
679 {
680         /* Recompute the context size for enabled features: */
681         unsigned int user_size, kernel_size, kernel_default_size;
682         bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
683
684         /* Uncompacted user space size */
685         user_size = get_xsave_size_user();
686
687         /*
688          * XSAVES kernel size includes supervisor states and uses compacted
689          * format. XSAVEC uses compacted format, but does not save
690          * supervisor states.
691          *
692          * XSAVE[OPT] do not support supervisor states so kernel and user
693          * size is identical.
694          */
695         if (compacted)
696                 kernel_size = get_xsave_compacted_size();
697         else
698                 kernel_size = user_size;
699
700         kernel_default_size =
701                 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
702
703         if (!paranoid_xstate_size_valid(kernel_size))
704                 return -EINVAL;
705
706         fpu_kernel_cfg.max_size = kernel_size;
707         fpu_user_cfg.max_size = user_size;
708
709         fpu_kernel_cfg.default_size = kernel_default_size;
710         fpu_user_cfg.default_size =
711                 xstate_calculate_size(fpu_user_cfg.default_features, false);
712
713         return 0;
714 }
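
/*
 * Summary note (illustrative): after init_xstate_size() four sizes are
 * cached - fpu_kernel_cfg.max_size/default_size for the kernel buffer
 * (compacted when XSAVES/XSAVEC is in use) and fpu_user_cfg.max_size/
 * default_size for the always uncompacted UABI layout used by signal
 * frames and ptrace.
 */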
715
716 /*
717  * We enabled the XSAVE hardware, but something went wrong and
718  * we can not use it.  Disable it.
719  */
720 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
721 {
722         fpu_kernel_cfg.max_features = 0;
723         cr4_clear_bits(X86_CR4_OSXSAVE);
724         setup_clear_cpu_cap(X86_FEATURE_XSAVE);
725
726         /* Restore the legacy size. */
727         fpu_kernel_cfg.max_size = legacy_size;
728         fpu_kernel_cfg.default_size = legacy_size;
729         fpu_user_cfg.max_size = legacy_size;
730         fpu_user_cfg.default_size = legacy_size;
731
732         /*
733          * Prevent enabling the static branch which enables writes to the
734          * XFD MSR.
735          */
736         init_fpstate.xfd = 0;
737
738         fpstate_reset(&current->thread.fpu);
739 }
740
741 /*
742  * Enable and initialize the xsave feature.
743  * Called once per system bootup.
744  */
745 void __init fpu__init_system_xstate(unsigned int legacy_size)
746 {
747         unsigned int eax, ebx, ecx, edx;
748         u64 xfeatures;
749         int err;
750         int i;
751
752         if (!boot_cpu_has(X86_FEATURE_FPU)) {
753                 pr_info("x86/fpu: No FPU detected\n");
754                 return;
755         }
756
757         if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
758                 pr_info("x86/fpu: x87 FPU will use %s\n",
759                         boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
760                 return;
761         }
762
763         if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
764                 WARN_ON_FPU(1);
765                 return;
766         }
767
768         /*
769          * Find user xstates supported by the processor.
770          */
771         cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
772         fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
773
774         /*
775          * Find supervisor xstates supported by the processor.
776          */
777         cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
778         fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
779
780         if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
781                 /*
782                  * This indicates that something really unexpected happened
783                  * with the enumeration.  Disable XSAVE and try to continue
784                  * booting without it.  This is too early to BUG().
785                  */
786                 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
787                        fpu_kernel_cfg.max_features);
788                 goto out_disable;
789         }
790
791         /*
792          * Clear XSAVE features that are disabled in the normal CPUID.
793          */
794         for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
795                 unsigned short cid = xsave_cpuid_features[i];
796
797                 /* Careful: X86_FEATURE_FPU is 0! */
798                 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
799                         fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
800         }
801
802         if (!cpu_feature_enabled(X86_FEATURE_XFD))
803                 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
804
805         if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
806                 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
807         else
808                 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
809                                         XFEATURE_MASK_SUPERVISOR_SUPPORTED;
810
811         fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
812         fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
813
814         /* Clean out dynamic features from default */
815         fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
816         fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
817
818         fpu_user_cfg.default_features = fpu_user_cfg.max_features;
819         fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
820
821         /* Store it for paranoia check at the end */
822         xfeatures = fpu_kernel_cfg.max_features;
823
824         /*
825          * Initialize the default XFD state in init_fpstate and enable the
826          * dynamic sizing mechanism if dynamic states are available.  The
827          * static key cannot be enabled here because this runs before
828          * jump_label_init(). This is delayed to an initcall.
829          */
830         init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
831
832         /* Set up compaction feature bit */
833         if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
834             cpu_feature_enabled(X86_FEATURE_XSAVES))
835                 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
836
837         /* Enable xstate instructions to be able to continue with initialization: */
838         fpu__init_cpu_xstate();
839
840         /* Cache size, offset and flags for initialization */
841         setup_xstate_cache();
842
843         err = init_xstate_size();
844         if (err)
845                 goto out_disable;
846
847         /* Reset the state for the current task */
848         fpstate_reset(&current->thread.fpu);
849
850         /*
851          * Update info used for ptrace frames; use standard-format size and no
852          * supervisor xstates:
853          */
854         update_regset_xstate_info(fpu_user_cfg.max_size,
855                                   fpu_user_cfg.max_features);
856
857         /*
858          * init_fpstate excludes dynamic states as they are large but init
859          * state is zero.
860          */
861         init_fpstate.size               = fpu_kernel_cfg.default_size;
862         init_fpstate.xfeatures          = fpu_kernel_cfg.default_features;
863
864         if (init_fpstate.size > sizeof(init_fpstate.regs)) {
865                 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
866                         sizeof(init_fpstate.regs), init_fpstate.size);
867                 goto out_disable;
868         }
869
870         setup_init_fpu_buf();
871
872         /*
873          * Paranoia check whether something in the setup modified the
874          * xfeatures mask.
875          */
876         if (xfeatures != fpu_kernel_cfg.max_features) {
877                 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
878                        xfeatures, fpu_kernel_cfg.max_features);
879                 goto out_disable;
880         }
881
882         /*
883          * CPU capabilities initialization runs before FPU init. So
884          * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
885          * functional, set the feature bit so dependent code works.
886          */
887         setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
888
889         print_xstate_offset_size();
890         pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
891                 fpu_kernel_cfg.max_features,
892                 fpu_kernel_cfg.max_size,
893                 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
894         return;
895
896 out_disable:
897         /* something went wrong, try to boot without any XSAVE support */
898         fpu__init_disable_system_xstate(legacy_size);
899 }
900
901 /*
902  * Restore minimal FPU state after suspend:
903  */
904 void fpu__resume_cpu(void)
905 {
906         /*
907          * Restore XCR0 on xsave capable CPUs:
908          */
909         if (cpu_feature_enabled(X86_FEATURE_XSAVE))
910                 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
911
912         /*
913          * Restore IA32_XSS. The same CPUID bit enumerates support
914          * of XSAVES and MSR_IA32_XSS.
915          */
916         if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
917                 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
918                                      xfeatures_mask_independent());
919         }
920
921         if (fpu_state_size_dynamic())
922                 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
923 }
924
925 /*
926  * Given an xstate feature nr, calculate where in the xsave
927  * buffer the state is.  Callers should ensure that the buffer
928  * is valid.
929  */
930 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
931 {
932         u64 xcomp_bv = xsave->header.xcomp_bv;
933
934         if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
935                 return NULL;
936
937         if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
938                 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
939                         return NULL;
940         }
941
942         return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
943 }
944
945 /*
946  * Given the xsave area and a state inside, this function returns the
947  * address of the state.
948  *
949  * This is the API that is called to get xstate address in either
950  * standard format or compacted format of xsave area.
951  *
952  * Note that if there is no data for the field in the xsave buffer
953  * this will return NULL.
954  *
955  * Inputs:
956  *      xstate: the thread's storage area for all FPU data
957  *      xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
958  *      XFEATURE_SSE, etc...)
959  * Output:
960  *      address of the state in the xsave area, or NULL if the
961  *      field is not present in the xsave buffer.
962  */
963 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
964 {
965         /*
966          * Do we even *have* xsave state?
967          */
968         if (!boot_cpu_has(X86_FEATURE_XSAVE))
969                 return NULL;
970
971         /*
972          * We should not ever be requesting features that we
973          * have not enabled.
974          */
975         if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
976                 return NULL;
977
978         /*
979          * This assumes the last 'xsave*' instruction to
980          * have requested that 'xfeature_nr' be saved.
981          * If it did not, we might be seeing an old value
982          * of the field in the buffer.
983          *
984          * This can happen because the last 'xsave' did not
985          * request that this feature be saved (unlikely)
986          * or because the "init optimization" caused it
987          * to not be saved.
988          */
989         if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
990                 return NULL;
991
992         return __raw_xsave_addr(xsave, xfeature_nr);
993 }
994 EXPORT_SYMBOL_GPL(get_xsave_addr);
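
/*
 * Usage sketch (illustrative, not from the original source):
 *
 *	struct pkru_state *pk = get_xsave_addr(xsave, XFEATURE_PKRU);
 *
 *	if (pk)
 *		val = pk->pkru;
 *
 * Callers must handle the NULL return, which means the component is either
 * not enabled or currently in its init state.
 */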
995
996 #ifdef CONFIG_ARCH_HAS_PKEYS
997
998 /*
999  * This will go out and modify PKRU register to set the access
1000  * rights for @pkey to @init_val.
1001  */
1002 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1003                               unsigned long init_val)
1004 {
1005         u32 old_pkru, new_pkru_bits = 0;
1006         int pkey_shift;
1007
1008         /*
1009          * This check implies XSAVE support.  OSPKE only gets
1010          * set if we enable XSAVE and we enable PKU in XCR0.
1011          */
1012         if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1013                 return -EINVAL;
1014
1015         /*
1016          * This code should only be called with valid 'pkey'
1017          * values originating from in-kernel users.  Complain
1018          * if a bad value is observed.
1019          */
1020         if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1021                 return -EINVAL;
1022
1023         /* Set the bits we need in PKRU:  */
1024         if (init_val & PKEY_DISABLE_ACCESS)
1025                 new_pkru_bits |= PKRU_AD_BIT;
1026         if (init_val & PKEY_DISABLE_WRITE)
1027                 new_pkru_bits |= PKRU_WD_BIT;
1028
1029         /* Shift the bits in to the correct place in PKRU for pkey: */
1030         pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1031         new_pkru_bits <<= pkey_shift;
1032
1033         /* Get old PKRU and mask off any old bits in place: */
1034         old_pkru = read_pkru();
1035         old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1036
1037         /* Write old part along with new part: */
1038         write_pkru(old_pkru | new_pkru_bits);
1039
1040         return 0;
1041 }
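
/*
 * Worked example (illustrative): for pkey 3 with
 * init_val = PKEY_DISABLE_WRITE, pkey_shift is 3 * PKRU_BITS_PER_PKEY = 6,
 * the old AD/WD bits 6-7 are cleared and PKRU_WD_BIT << 6 (bit 7) is
 * written back, leaving all other keys untouched.
 */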
1042 #endif /* CONFIG_ARCH_HAS_PKEYS */
1043
1044 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1045                          void *init_xstate, unsigned int size)
1046 {
1047         membuf_write(to, from_xstate ? xstate : init_xstate, size);
1048 }
1049
1050 /**
1051  * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1052  * @to:         membuf descriptor
1053  * @fpstate:    The fpstate buffer from which to copy
1054  * @xfeatures:  The mask of xfeatures to save (XSAVE mode only)
1055  * @pkru_val:   The PKRU value to store in the PKRU component
1056  * @copy_mode:  The requested copy mode
1057  *
1058  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1059  * format, i.e. from the kernel internal hardware dependent storage format
1060  * to the requested @copy_mode. UABI XSTATE is always uncompacted!
1061  *
1062  * It supports partial copy but @to.pos always starts from zero.
1063  */
1064 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1065                                u64 xfeatures, u32 pkru_val,
1066                                enum xstate_copy_mode copy_mode)
1067 {
1068         const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1069         struct xregs_state *xinit = &init_fpstate.regs.xsave;
1070         struct xregs_state *xsave = &fpstate->regs.xsave;
1071         struct xstate_header header;
1072         unsigned int zerofrom;
1073         u64 mask;
1074         int i;
1075
1076         memset(&header, 0, sizeof(header));
1077         header.xfeatures = xsave->header.xfeatures;
1078
1079         /* Mask out the feature bits depending on copy mode */
1080         switch (copy_mode) {
1081         case XSTATE_COPY_FP:
1082                 header.xfeatures &= XFEATURE_MASK_FP;
1083                 break;
1084
1085         case XSTATE_COPY_FX:
1086                 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1087                 break;
1088
1089         case XSTATE_COPY_XSAVE:
1090                 header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1091                 break;
1092         }
1093
1094         /* Copy FP state up to MXCSR */
1095         copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1096                      &xinit->i387, off_mxcsr);
1097
1098         /* Copy MXCSR when SSE or YMM are set in the feature mask */
1099         copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1100                      &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1101                      MXCSR_AND_FLAGS_SIZE);
1102
1103         /* Copy the remaining FP state */
1104         copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1105                      &to, &xsave->i387.st_space, &xinit->i387.st_space,
1106                      sizeof(xsave->i387.st_space));
1107
1108         /* Copy the SSE state - shared with YMM, but independently managed */
1109         copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1110                      &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1111                      sizeof(xsave->i387.xmm_space));
1112
1113         if (copy_mode != XSTATE_COPY_XSAVE)
1114                 goto out;
1115
1116         /* Zero the padding area */
1117         membuf_zero(&to, sizeof(xsave->i387.padding));
1118
1119         /* Copy xsave->i387.sw_reserved */
1120         membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1121
1122         /* Copy the user space relevant state of @xsave->header */
1123         membuf_write(&to, &header, sizeof(header));
1124
1125         zerofrom = offsetof(struct xregs_state, extended_state_area);
1126
1127         /*
1128          * This 'mask' indicates which states to copy from fpstate.
1129          * Those extended states that are not present in fpstate are
1130          * either disabled or initialized:
1131          *
1132          * In non-compacted format, disabled features still occupy
1133          * state space but there is no state to copy from in the
1134          * compacted init_fpstate. The gap tracking will zero these
1135          * states.
1136          *
1137          * The extended features have an all zeroes init state. Thus,
1138          * remove them from 'mask' to zero those features in the user
1139          * buffer instead of retrieving them from init_fpstate.
1140          */
1141         mask = header.xfeatures;
1142
1143         for_each_extended_xfeature(i, mask) {
1144                 /*
1145                  * If there was a feature or alignment gap, zero the space
1146                  * in the destination buffer.
1147                  */
1148                 if (zerofrom < xstate_offsets[i])
1149                         membuf_zero(&to, xstate_offsets[i] - zerofrom);
1150
1151                 if (i == XFEATURE_PKRU) {
1152                         struct pkru_state pkru = {0};
1153                         /*
1154                          * PKRU is not necessarily up to date in the
1155                          * XSAVE buffer. Use the provided value.
1156                          */
1157                         pkru.pkru = pkru_val;
1158                         membuf_write(&to, &pkru, sizeof(pkru));
1159                 } else {
1160                         membuf_write(&to,
1161                                      __raw_xsave_addr(xsave, i),
1162                                      xstate_sizes[i]);
1163                 }
1164                 /*
1165                  * Keep track of the last copied state in the non-compacted
1166                  * target buffer for gap zeroing.
1167                  */
1168                 zerofrom = xstate_offsets[i] + xstate_sizes[i];
1169         }
1170
1171 out:
1172         if (to.left)
1173                 membuf_zero(&to, to.left);
1174 }
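
/*
 * Gap-zeroing example (illustrative offsets from a typical CPU): if YMM
 * (offset 576, size 256) is copied and the next feature present is PKRU at
 * offset 2688, membuf_zero() fills bytes 832..2687 so the UABI buffer never
 * exposes stale data for disabled or init-state components.
 */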
1175
1176 /**
1177  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1178  * @to:         membuf descriptor
1179  * @tsk:        The task from which to copy the saved xstate
1180  * @copy_mode:  The requested copy mode
1181  *
1182  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1183  * format, i.e. from the kernel internal hardware dependent storage format
1184  * to the requested @copy_mode. UABI XSTATE is always uncompacted!
1185  *
1186  * It supports partial copy but @to.pos always starts from zero.
1187  */
1188 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1189                              enum xstate_copy_mode copy_mode)
1190 {
1191         __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1192                                   tsk->thread.fpu.fpstate->user_xfeatures,
1193                                   tsk->thread.pkru, copy_mode);
1194 }
1195
1196 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1197                             const void *kbuf, const void __user *ubuf)
1198 {
1199         if (kbuf) {
1200                 memcpy(dst, kbuf + offset, size);
1201         } else {
1202                 if (copy_from_user(dst, ubuf + offset, size))
1203                         return -EFAULT;
1204         }
1205         return 0;
1206 }
1207
1208
1209 /**
1210  * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1211  * @fpstate:    The fpstate buffer to copy to
1212  * @kbuf:       The UABI format buffer, if it comes from the kernel
1213  * @ubuf:       The UABI format buffer, if it comes from userspace
1214  * @pkru:       The location to write the PKRU value to
1215  *
1216  * Converts from the UABI format into the kernel internal hardware
1217  * dependent format.
1218  *
1219  * This function ultimately has three different callers with distinct PKRU
1220  * behavior.
1221  * 1.   When called from sigreturn the PKRU register will be restored from
1222  *      @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1223  *      @fpstate is sufficient to cover this case, but the caller will also
1224  *      pass a pointer to the thread_struct's pkru field in @pkru and updating
1225  *      it is harmless.
1226  * 2.   When called from ptrace the PKRU register will be restored from the
1227  *      thread_struct's pkru field. A pointer to that is passed in @pkru.
1228  *      The kernel will restore it manually, so the XRSTOR behavior that resets
1229  *      the PKRU register to the hardware init value (0) if the corresponding
1230  *      xfeatures bit is not set is emulated here.
1231  * 3.   When called from KVM the PKRU register will be restored from the vcpu's
1232  *      pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1233  *      XRSTOR and hasn't had the PKRU resetting behavior described above. To
1234  *      preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1235  *      bit is not set.
1236  */
1237 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1238                                const void __user *ubuf, u32 *pkru)
1239 {
1240         struct xregs_state *xsave = &fpstate->regs.xsave;
1241         unsigned int offset, size;
1242         struct xstate_header hdr;
1243         u64 mask;
1244         int i;
1245
1246         offset = offsetof(struct xregs_state, header);
1247         if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1248                 return -EFAULT;
1249
1250         if (validate_user_xstate_header(&hdr, fpstate))
1251                 return -EINVAL;
1252
1253         /* Validate MXCSR when any of the related features is in use */
1254         mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1255         if (hdr.xfeatures & mask) {
1256                 u32 mxcsr[2];
1257
1258                 offset = offsetof(struct fxregs_state, mxcsr);
1259                 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1260                         return -EFAULT;
1261
1262                 /* Reserved bits in MXCSR must be zero. */
1263                 if (mxcsr[0] & ~mxcsr_feature_mask)
1264                         return -EINVAL;
1265
1266                 /* SSE and YMM require MXCSR even when FP is not in use. */
1267                 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1268                         xsave->i387.mxcsr = mxcsr[0];
1269                         xsave->i387.mxcsr_mask = mxcsr[1];
1270                 }
1271         }
1272
1273         for (i = 0; i < XFEATURE_MAX; i++) {
1274                 mask = BIT_ULL(i);
1275
1276                 if (hdr.xfeatures & mask) {
1277                         void *dst = __raw_xsave_addr(xsave, i);
1278
1279                         offset = xstate_offsets[i];
1280                         size = xstate_sizes[i];
1281
1282                         if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1283                                 return -EFAULT;
1284                 }
1285         }
1286
1287         if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1288                 struct pkru_state *xpkru;
1289
1290                 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1291                 *pkru = xpkru->pkru;
1292         } else {
1293                 /*
1294                  * KVM may pass NULL here to indicate that it does not need
1295                  * PKRU updated.
1296                  */
1297                 if (pkru)
1298                         *pkru = 0;
1299         }
1300
1301         /*
1302          * The state that came in from userspace was user-state only.
1303          * Mask all the user states out of 'xfeatures':
1304          */
1305         xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1306
1307         /*
1308          * Add back in the features that came in from userspace:
1309          */
1310         xsave->header.xfeatures |= hdr.xfeatures;
1311
1312         return 0;
1313 }
1314
1315 /*
1316  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1317  * format and copy to the target thread. Used by ptrace and KVM.
1318  */
1319 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1320 {
1321         return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1322 }
1323
1324 /*
1325  * Convert from a sigreturn standard-format user-space buffer to kernel
1326  * XSAVE[S] format and copy to the target thread. This is called from the
1327  * sigreturn() and rt_sigreturn() system calls.
1328  */
1329 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1330                                       const void __user *ubuf)
1331 {
1332         return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1333 }
1334
1335 static bool validate_independent_components(u64 mask)
1336 {
1337         u64 xchk;
1338
1339         if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1340                 return false;
1341
1342         xchk = ~xfeatures_mask_independent();
1343
1344         if (WARN_ON_ONCE(!mask || mask & xchk))
1345                 return false;
1346
1347         return true;
1348 }
1349
1350 /**
1351  * xsaves - Save selected components to a kernel xstate buffer
1352  * @xstate:     Pointer to the buffer
1353  * @mask:       Feature mask to select the components to save
1354  *
1355  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1356  * XSAVES does not write the full xstate header. Before first use the
1357  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1358  * can #GP.
1359  *
1360  * The feature mask must be a subset of the independent features.
1361  */
1362 void xsaves(struct xregs_state *xstate, u64 mask)
1363 {
1364         int err;
1365
1366         if (!validate_independent_components(mask))
1367                 return;
1368
1369         XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1370         WARN_ON_ONCE(err);
1371 }
1372
1373 /**
1374  * xrstors - Restore selected components from a kernel xstate buffer
1375  * @xstate:     Pointer to the buffer
1376  * @mask:       Feature mask to select the components to restore
1377  *
1378  * The @xstate buffer must be 64 byte aligned and correctly initialized
1379  * otherwise XRSTORS from that buffer can #GP.
1380  *
1381  * Proper usage is to restore the state which was saved with
1382  * xsaves() into @xstate.
1383  *
1384  * The feature mask must be a subset of the independent features.
1385  */
1386 void xrstors(struct xregs_state *xstate, u64 mask)
1387 {
1388         int err;
1389
1390         if (!validate_independent_components(mask))
1391                 return;
1392
1393         XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1394         WARN_ON_ONCE(err);
1395 }
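
/*
 * Usage sketch (illustrative): perf's arch LBR state is such an
 * "independent" component; it is saved and restored with
 *
 *	xsaves(buf, XFEATURE_MASK_LBR);
 *	xrstors(buf, XFEATURE_MASK_LBR);
 *
 * where buf is a dedicated, zeroed, 64-byte aligned xregs_state buffer.
 */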
1396
1397 #if IS_ENABLED(CONFIG_KVM)
1398 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1399 {
1400         void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1401
1402         if (addr)
1403                 memset(addr, 0, xstate_sizes[xfeature]);
1404 }
1405 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1406 #endif
1407
1408 #ifdef CONFIG_X86_64
1409
1410 #ifdef CONFIG_X86_DEBUG_FPU
1411 /*
1412  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1413  * can safely operate on the @fpstate buffer.
1414  */
1415 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1416 {
1417         u64 xfd = __this_cpu_read(xfd_state);
1418
1419         if (fpstate->xfd == xfd)
1420                 return true;
1421
1422          /*
1423           * The XFD MSR does not match fpstate->xfd. That's invalid when
1424           * the passed in fpstate is current's fpstate.
1425           */
1426         if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1427                 return false;
1428
1429         /*
1430          * XRSTOR(S) from init_fpstate are always correct as it will just
1431          * bring all components into init state and not read from the
1432          * buffer. XSAVE(S) raises #PF after init.
1433          */
1434         if (fpstate == &init_fpstate)
1435                 return rstor;
1436
1437         /*
1438          * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1439          * XRSTORS(S): fpu_swap_kvm_fpstate()
1440          */
1441
1442         /*
1443          * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1444          * the buffer area for XFD-disabled state components.
1445          */
1446         mask &= ~xfd;
1447
1448         /*
1449          * Remove features which are valid in fpstate. They
1450          * have space allocated in fpstate.
1451          */
1452         mask &= ~fpstate->xfeatures;
1453
1454         /*
1455          * Any remaining state components in 'mask' might be written
1456          * by XSAVE/XRSTOR. Fail validation if any are found.
1457          */
1458         return !mask;
1459 }
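
/*
 * Worked example of the mask arithmetic above (hypothetical values):
 * assume the per-CPU XFD MSR has XFEATURE_MASK_XTILE_DATA set (tile data
 * is XFD-armed) and @fpstate is a minimally sized buffer whose
 * ->xfeatures contains only FP/SSE.  For
 * mask = XFEATURE_MASK_FPSSE | XFEATURE_MASK_XTILE_DATA:
 *
 *	mask &= ~xfd;			// drops XTILE_DATA: its buffer area is not touched
 *	mask &= ~fpstate->xfeatures;	// drops FP/SSE: space exists in the buffer
 *	return !mask;			// nothing left -> the operation is safe
 *
 * Had XFD been clear for XTILE_DATA while the buffer still lacked space
 * for it, that bit would survive both steps and the validation would fail.
 */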
1460
1461 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1462 {
1463         WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1464 }
1465 #endif /* CONFIG_X86_DEBUG_FPU */
1466
1467 static int __init xfd_update_static_branch(void)
1468 {
1469         /*
1470          * If init_fpstate.xfd has bits set then dynamic features are
1471          * available and the dynamic sizing must be enabled.
1472          */
1473         if (init_fpstate.xfd)
1474                 static_branch_enable(&__fpu_state_size_dynamic);
1475         return 0;
1476 }
1477 arch_initcall(xfd_update_static_branch)
1478
1479 void fpstate_free(struct fpu *fpu)
1480 {
1481         if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1482                 vfree(fpu->fpstate);
1483 }
1484
1485 /**
1486  * fpstate_realloc - Reallocate struct fpstate for the requested new features
1487  *
1488  * @xfeatures:  A bitmap of xstate features which extend the enabled features
1489  *              of that task
1490  * @ksize:      The required size for the kernel buffer
1491  * @usize:      The required size for user space buffers
1492  * @guest_fpu:  Pointer to a guest FPU container. NULL for host allocations
1493  *
1494  * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1495  * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1496  * with large states are likely to live longer.
1497  *
1498  * Returns: 0 on success, -ENOMEM on allocation error.
1499  */
1500 static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1501                            unsigned int usize, struct fpu_guest *guest_fpu)
1502 {
1503         struct fpu *fpu = &current->thread.fpu;
1504         struct fpstate *curfps, *newfps = NULL;
1505         unsigned int fpsize;
1506         bool in_use;
1507
1508         fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1509
1510         newfps = vzalloc(fpsize);
1511         if (!newfps)
1512                 return -ENOMEM;
1513         newfps->size = ksize;
1514         newfps->user_size = usize;
1515         newfps->is_valloc = true;
1516
1517         /*
1518          * When a guest FPU is supplied, use @guest_fpu->fpstate
1519          * as reference, independent of whether it is in use or not.
1520          */
1521         curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1522
1523         /* Determine whether @curfps is the active fpstate */
1524         in_use = fpu->fpstate == curfps;
1525
1526         if (guest_fpu) {
1527                 newfps->is_guest = true;
1528                 newfps->is_confidential = curfps->is_confidential;
1529                 newfps->in_use = curfps->in_use;
1530                 guest_fpu->xfeatures |= xfeatures;
1531                 guest_fpu->uabi_size = usize;
1532         }
1533
1534         fpregs_lock();
1535         /*
1536          * If @curfps is in use, ensure that the current state is in the
1537          * registers before swapping fpstate as that might invalidate it
1538          * due to layout changes.
1539          */
1540         if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1541                 fpregs_restore_userregs();
1542
1543         newfps->xfeatures = curfps->xfeatures | xfeatures;
1544         newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1545         newfps->xfd = curfps->xfd & ~xfeatures;
1546
1547         /* Do the final updates within the locked region */
1548         xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1549
1550         if (guest_fpu) {
1551                 guest_fpu->fpstate = newfps;
1552                 /* If curfps is active, update the FPU fpstate pointer */
1553                 if (in_use)
1554                         fpu->fpstate = newfps;
1555         } else {
1556                 fpu->fpstate = newfps;
1557         }
1558
1559         if (in_use)
1560                 xfd_update_state(fpu->fpstate);
1561         fpregs_unlock();
1562
1563         /* Only free valloc'ed state */
1564         if (curfps && curfps->is_valloc)
1565                 vfree(curfps);
1566
1567         return 0;
1568 }
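
/*
 * Size sketch for the reallocation above (illustrative only): the
 * vzalloc()ed object consists of the struct fpstate header, padded so
 * that ->regs sits on the 64-byte boundary XSAVES/XRSTORS require,
 * followed by @ksize bytes of register save area:
 *
 *	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
 *
 * ->size and ->user_size record only the register area sizes; the user
 * size describes the uncompacted UABI format used for signal frames and
 * ptrace, which is why it is tracked separately from the kernel size.
 */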
1569
1570 static int validate_sigaltstack(unsigned int usize)
1571 {
1572         struct task_struct *thread, *leader = current->group_leader;
1573         unsigned long framesize = get_sigframe_size();
1574
1575         lockdep_assert_held(&current->sighand->siglock);
1576
1577         /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1578         framesize -= fpu_user_cfg.max_size;
1579         framesize += usize;
1580         for_each_thread(leader, thread) {
1581                 if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1582                         return -ENOSPC;
1583         }
1584         return 0;
1585 }
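
/*
 * Worked example (hypothetical numbers): the non-FPU part of the signal
 * frame might be ~2k, so framesize ends up as roughly 2k + @usize.  A
 * thread that registered an 8k sigaltstack while its frame only needed
 * ~4k would fail this check once AMX permission pushes @usize past ~10k,
 * and the permission request is rejected with -ENOSPC instead of
 * corrupting that stack at signal delivery time.
 */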
1586
1587 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1588 {
1589         /*
1590          * This deliberately does not exclude !XSAVES as we still might
1591          * decide to optionally context switch XCR0 or talk the silicon
1592          * vendors into extending XFD for the pre-AMX states, especially
1593          * AVX-512.
1594          */
1595         bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1596         struct fpu *fpu = &current->group_leader->thread.fpu;
1597         struct fpu_state_perm *perm;
1598         unsigned int ksize, usize;
1599         u64 mask;
1600         int ret = 0;
1601
1602         /* Check whether fully enabled */
1603         if ((permitted & requested) == requested)
1604                 return 0;
1605
1606         /* Calculate the resulting kernel state size */
1607         mask = permitted | requested;
1608         /* Take supervisor states into account on the host */
1609         if (!guest)
1610                 mask |= xfeatures_mask_supervisor();
1611         ksize = xstate_calculate_size(mask, compacted);
1612
1613         /* Calculate the resulting user state size */
1614         mask &= XFEATURE_MASK_USER_SUPPORTED;
1615         usize = xstate_calculate_size(mask, false);
1616
1617         if (!guest) {
1618                 ret = validate_sigaltstack(usize);
1619                 if (ret)
1620                         return ret;
1621         }
1622
1623         perm = guest ? &fpu->guest_perm : &fpu->perm;
1624         /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1625         WRITE_ONCE(perm->__state_perm, mask);
1626         /* Protected by sighand lock */
1627         perm->__state_size = ksize;
1628         perm->__user_state_size = usize;
1629         return ret;
1630 }
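
/*
 * Illustrative recap of the two sizes computed above: for a host request
 * of XFEATURE_MASK_XTILE_DATA this boils down to roughly
 *
 *	mask  = permitted | requested | xfeatures_mask_supervisor();
 *	ksize = xstate_calculate_size(mask, compacted);
 *	usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);
 *
 * i.e. the kernel buffer may use the compacted XSAVES layout and include
 * supervisor states, while the user size always describes the uncompacted
 * UABI layout of user-visible components only.
 */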
1631
1632 /*
1633  * Permissions array to map facilities with more than one component
1634  */
1635 static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1636         [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1637 };
1638
1639 static int xstate_request_perm(unsigned long idx, bool guest)
1640 {
1641         u64 permitted, requested;
1642         int ret;
1643
1644         if (idx >= XFEATURE_MAX)
1645                 return -EINVAL;
1646
1647         /*
1648          * Look up the facility mask which can require more than
1649          * one xstate component.
1650          */
1651         idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1652         requested = xstate_prctl_req[idx];
1653         if (!requested)
1654                 return -EOPNOTSUPP;
1655
1656         if ((fpu_user_cfg.max_features & requested) != requested)
1657                 return -EOPNOTSUPP;
1658
1659         /* Lockless quick check */
1660         permitted = xstate_get_group_perm(guest);
1661         if ((permitted & requested) == requested)
1662                 return 0;
1663
1664         /* Protect against concurrent modifications */
1665         spin_lock_irq(&current->sighand->siglock);
1666         permitted = xstate_get_group_perm(guest);
1667
1668         /* First vCPU allocation locks the permissions. */
1669         if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1670                 ret = -EBUSY;
1671         else
1672                 ret = __xstate_request_perm(permitted, requested, guest);
1673         spin_unlock_irq(&current->sighand->siglock);
1674         return ret;
1675 }
1676
1677 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1678 {
1679         u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1680         struct fpu_state_perm *perm;
1681         unsigned int ksize, usize;
1682         struct fpu *fpu;
1683
1684         if (!xfd_event) {
1685                 if (!guest_fpu)
1686                         pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1687                 return 0;
1688         }
1689
1690         /* Protect against concurrent modifications */
1691         spin_lock_irq(&current->sighand->siglock);
1692
1693         /* If not permitted let it die */
1694         if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1695                 spin_unlock_irq(&current->sighand->siglock);
1696                 return -EPERM;
1697         }
1698
1699         fpu = &current->group_leader->thread.fpu;
1700         perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1701         ksize = perm->__state_size;
1702         usize = perm->__user_state_size;
1703
1704         /*
1705          * The feature is permitted and the state size is sufficient. Dropping
1706          * the lock is safe here: even if more features are added from
1707          * another task, the retrieved buffer sizes remain valid for the
1708          * currently requested feature(s).
1709          */
1710         spin_unlock_irq(&current->sighand->siglock);
1711
1712         /*
1713          * Try to allocate a new fpstate. If that fails there is no way
1714          * out.
1715          */
1716         if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1717                 return -EFAULT;
1718         return 0;
1719 }
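
/*
 * Simplified caller sketch (not the exact upstream trap code): when a
 * task touches an XFD-armed component such as AMX tile data, the CPU
 * raises #NM and latches the offending feature bits in MSR_IA32_XFD_ERR.
 * The fault path then does roughly:
 *
 *	rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
 *	wrmsrl(MSR_IA32_XFD_ERR, 0);
 *	err = xfd_enable_feature(xfd_err);
 *	if (err)
 *		...	// no permission or allocation failure: signal the task
 *
 * On success the fpstate has been reallocated and XFD updated, so the
 * faulting instruction can simply be restarted.
 */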
1720
1721 int xfd_enable_feature(u64 xfd_err)
1722 {
1723         return __xfd_enable_feature(xfd_err, NULL);
1724 }
1725
1726 #else /* CONFIG_X86_64 */
1727 static inline int xstate_request_perm(unsigned long idx, bool guest)
1728 {
1729         return -EPERM;
1730 }
1731 #endif  /* !CONFIG_X86_64 */
1732
1733 u64 xstate_get_guest_group_perm(void)
1734 {
1735         return xstate_get_group_perm(true);
1736 }
1737 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1738
1739 /**
1740  * fpu_xstate_prctl - xstate permission operations
1741  * @option:     A subfunction of arch_prctl()
1742  * @arg2:       option argument
1743  * Return:      0 if successful; otherwise, an error code
1744  *
1745  * Option arguments:
1746  *
1747  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1748  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1749  * ARCH_REQ_XCOMP_PERM: Facility number requested
1750  *
1751  * For facilities which require more than one XSTATE component, the request
1752  * must be the highest state component number related to that facility,
1753  * e.g. for AMX, which requires XFEATURE_XTILE_CFG(17) and
1754  * XFEATURE_XTILE_DATA(18), this would be XFEATURE_XTILE_DATA(18).
1755  */
1756 long fpu_xstate_prctl(int option, unsigned long arg2)
1757 {
1758         u64 __user *uptr = (u64 __user *)arg2;
1759         u64 permitted, supported;
1760         unsigned long idx = arg2;
1761         bool guest = false;
1762
1763         switch (option) {
1764         case ARCH_GET_XCOMP_SUPP:
1765                 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1766                 return put_user(supported, uptr);
1767
1768         case ARCH_GET_XCOMP_PERM:
1769                 /*
1770                  * Lockless snapshot as it can also change right after
1771                  * dropping the lock.
1772                  */
1773                 permitted = xstate_get_host_group_perm();
1774                 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1775                 return put_user(permitted, uptr);
1776
1777         case ARCH_GET_XCOMP_GUEST_PERM:
1778                 permitted = xstate_get_guest_group_perm();
1779                 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1780                 return put_user(permitted, uptr);
1781
1782         case ARCH_REQ_XCOMP_GUEST_PERM:
1783                 guest = true;
1784                 fallthrough;
1785
1786         case ARCH_REQ_XCOMP_PERM:
1787                 if (!IS_ENABLED(CONFIG_X86_64))
1788                         return -EOPNOTSUPP;
1789
1790                 return xstate_request_perm(idx, guest);
1791
1792         default:
1793                 return -EINVAL;
1794         }
1795 }
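
/*
 * Userspace usage sketch (illustrative, error handling elided): a process
 * that wants to execute AMX instructions first requests permission for
 * the highest component of the facility, XFEATURE_XTILE_DATA (18), and
 * can query what is supported and permitted:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long long supported, permitted;
 *
 *	syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supported);
 *	syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, 18);
 *	syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &permitted);
 *
 * After a successful ARCH_REQ_XCOMP_PERM, the first actual AMX use in any
 * thread of the process takes the #NM/XFD path above, which allocates the
 * larger fpstate on demand.
 */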
1796
1797 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1798 /*
1799  * Report the amount of time, in milliseconds, elapsed since the last
1800  * AVX512 use by the task.
1801  */
1802 static void avx512_status(struct seq_file *m, struct task_struct *task)
1803 {
1804         unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1805         long delta;
1806
1807         if (!timestamp) {
1808                 /*
1809                  * Report -1 if no AVX512 usage
1810                  */
1811                 delta = -1;
1812         } else {
1813                 delta = (long)(jiffies - timestamp);
1814                 /*
1815                  * Cap to LONG_MAX if time difference > LONG_MAX
1816                  */
1817                 if (delta < 0)
1818                         delta = LONG_MAX;
1819                 delta = jiffies_to_msecs(delta);
1820         }
1821
1822         seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1823         seq_putc(m, '\n');
1824 }
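
/*
 * Example of the resulting /proc/<pid>/arch_status line (value is
 * hypothetical): a task that last used AVX-512 about a second ago shows
 *
 *	AVX512_elapsed_ms:	1024
 *
 * while a task that never executed AVX-512 instructions shows -1.
 * Consumers such as job schedulers can poll this to detect recent
 * AVX-512 usage.
 */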
1825
1826 /*
1827  * Report architecture specific information
1828  */
1829 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1830                         struct pid *pid, struct task_struct *task)
1831 {
1832         /*
1833          * Report AVX512 state if the processor and the build option support it.
1834          */
1835         if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1836                 avx512_status(m, task);
1837
1838         return 0;
1839 }
1840 #endif /* CONFIG_PROC_PID_ARCH_STATUS */