arch/x86/kernel/alternative.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #define pr_fmt(fmt) "SMP alternatives: " fmt
   3
   4 #include <linux/mmu_context.h>
   5 #include <linux/perf_event.h>
   6 #include <linux/vmalloc.h>
   7 #include <linux/memory.h>
   8 #include <linux/execmem.h>
   9
  10 #include <asm/text-patching.h>
  11 #include <asm/insn.h>
  12 #include <asm/ibt.h>
  13 #include <asm/set_memory.h>
  14 #include <asm/nmi.h>
  15
     /*
      * Flag exported (GPL-only) to other kernel code. Nothing in the visible
      * part of this file writes it; presumably it is set once patching has
      * completed -- TODO confirm against the rest of the file.
      */
  16 int __read_mostly alternatives_patched;
  17
  18 EXPORT_SYMBOL_GPL(alternatives_patched);
  19
     /* Size of the on-stack scratch buffer (insn_buff) in apply_alternatives(). */
  20 #define MAX_PATCH_LEN (255-1)
  21
     /*
      * Bits of debug_alternative. DPRINTK()/DUMP_BYTES() only produce output
      * for a given class when the matching DA_<class> bit is set.
      */
  22 #define DA_ALL          (~0)
  23 #define DA_ALT          0x01
  24 #define DA_RET          0x02
  25 #define DA_RETPOLINE    0x04
  26 #define DA_ENDBR        0x08
  27 #define DA_SMP          0x10
  28
     /* Debug class mask; written by debug_alt() from the "debug-alternative" option. */
  29 static unsigned int debug_alternative;
  30
  31 static int __init debug_alt(char *str)
  32 {
  33         if (str && *str == '=')
  34                 str++;
  35
  36         if (!str || kstrtouint(str, 0, &debug_alternative))
  37                 debug_alternative = DA_ALL;
  38
  39         return 1;
  40 }
  41 __setup("debug-alternative", debug_alt);
  42
     /*
      * Set to 1 by the "noreplace-smp" boot option. Its consumers are not in
      * the visible part of this file.
      */
  43 static int noreplace_smp;
  44
     /*
      * Handler for the "noreplace-smp" boot option: the argument string is
      * ignored, the flag is simply raised. Always returns 1.
      */
  45 static int __init setup_noreplace_smp(char *str)
  46 {
  47         noreplace_smp = 1;
  48         return 1;
  49 }
  50 __setup("noreplace-smp", setup_noreplace_smp);
  51
     /*
      * Conditional debug print.
      *
      * @type: debug class suffix (ALT, RET, ...); output is emitted only when
      *        the DA_<type> bit is set in debug_alternative.
      * @fmt:  printk format; it is wrapped in pr_fmt() and a trailing "\n"
      *        is appended, so callers must not add their own newline.
      */
  52 #define DPRINTK(type, fmt, args...)                                     \
  53 do {                                                                    \
  54         if (debug_alternative & DA_##type)                              \
  55                 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);            \
  56 } while (0)
  57
  58 #define DUMP_BYTES(type, buf, len, fmt, args...)                        \
  59 do {                                                                    \
  60         if (unlikely(debug_alternative & DA_##type)) {                  \
  61                 int j;                                                  \
  62                                                                         \
  63                 if (!(len))                                             \
  64                         break;                                          \
  65                                                                         \
  66                 printk(KERN_DEBUG pr_fmt(fmt), ##args);                 \
  67                 for (j = 0; j < (len) - 1; j++)                         \
  68                         printk(KERN_CONT "%02hhx ", buf[j]);            \
  69                 printk(KERN_CONT "%02hhx\n", buf[j]);                   \
  70         }                                                               \
  71 } while (0)
  72
     /*
      * Backing storage for the NOP table: the BYTES_NOP<n> encodings laid out
      * back to back in increasing size order. x86_nops[] indexes into this
      * array, so the order here must not change. NOP9..NOP11 are only
      * present on 64-bit builds.
      */
  73 static const unsigned char x86nops[] =
  74 {
  75         BYTES_NOP1,
  76         BYTES_NOP2,
  77         BYTES_NOP3,
  78         BYTES_NOP4,
  79         BYTES_NOP5,
  80         BYTES_NOP6,
  81         BYTES_NOP7,
  82         BYTES_NOP8,
  83 #ifdef CONFIG_64BIT
  84         BYTES_NOP9,
  85         BYTES_NOP10,
  86         BYTES_NOP11,
  87 #endif
  88 };
  89
     /*
      * Lookup table: x86_nops[n] points at the n-byte NOP inside x86nops[].
      * Entry 0 is NULL (there is no zero-length NOP). Each offset is the sum
      * of the sizes of all shorter NOPs, which assumes BYTES_NOP<n> expands
      * to exactly n bytes -- defined outside this file, TODO confirm.
      */
  90 const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
  91 {
  92         NULL,
  93         x86nops,
  94         x86nops + 1,
  95         x86nops + 1 + 2,
  96         x86nops + 1 + 2 + 3,
  97         x86nops + 1 + 2 + 3 + 4,
  98         x86nops + 1 + 2 + 3 + 4 + 5,
  99         x86nops + 1 + 2 + 3 + 4 + 5 + 6,
 100         x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 101 #ifdef CONFIG_64BIT
 102         x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 103         x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
 104         x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
 105 #endif
 106 };
 107
     /*
      * FineIBT "paranoid" mode switch. In this part of the file it is only
      * read (ITS thunk layout and size); where it is set is not visible here.
      */
 108 #ifdef CONFIG_FINEIBT
 109 static bool cfi_paranoid __ro_after_init;
 110 #endif
 111
 112 #ifdef CONFIG_MITIGATION_ITS
 113
     /*
      * ITS thunk allocator state.
      *
      * its_mod:    module currently being patched, set by its_init_mod() and
      *             cleared by its_fini_mod(); NULL means core kernel.
      * its_page:   page that new thunks are currently carved from, or NULL.
      * its_offset: byte offset of the next free position inside its_page.
      * its_pages:  list of thunk pages allocated on behalf of the core kernel.
      */
 114 #ifdef CONFIG_MODULES
 115 static struct module *its_mod;
 116 #endif
 117 static void *its_page;
 118 static unsigned int its_offset;
 119 struct its_array its_pages;
 120
 121 static void *__its_alloc(struct its_array *pages)
 122 {
 123         void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
 124         if (!page)
 125                 return NULL;
 126
 127         void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
 128                              GFP_KERNEL);
 129         if (!tmp)
 130                 return NULL;
 131
 132         pages->pages = tmp;
 133         pages->pages[pages->num++] = page;
 134
 135         return no_free_ptr(page);
 136 }
 137
 138 /* Initialize a thunk with the "jmp *reg; int3" instructions. */
 139 static void *its_init_thunk(void *thunk, int reg)
 140 {
 141         u8 *bytes = thunk;
 142         int offset = 0;
 143         int i = 0;
 144
 145 #ifdef CONFIG_FINEIBT
 146         if (cfi_paranoid) {
 147                 /*
 148                  * When ITS uses indirect branch thunk the fineibt_paranoid
 149                  * caller sequence doesn't fit in the caller site. So put the
 150                  * remaining part of the sequence (<ea> + JNE) into the ITS
 151                  * thunk.
 152                  */
 153                 bytes[i++] = 0xea; /* invalid instruction */
 154                 bytes[i++] = 0x75; /* JNE */
 155                 bytes[i++] = 0xfd;
 156
 157                 offset = 1;
 158         }
 159 #endif
 160
 161         if (reg >= 8) {
 162                 bytes[i++] = 0x41; /* REX.B prefix */
 163                 reg -= 8;
 164         }
 165         bytes[i++] = 0xff;
 166         bytes[i++] = 0xe0 + reg; /* jmp *reg */
 167         bytes[i++] = 0xcc;
 168
 169         return thunk + offset;
 170 }
 171
 172 static void its_pages_protect(struct its_array *pages)
 173 {
 174         for (int i = 0; i < pages->num; i++) {
 175                 void *page = pages->pages[i];
 176                 execmem_restore_rox(page, PAGE_SIZE);
 177         }
 178 }
 179
 180 static void its_fini_core(void)
 181 {
 182         if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
 183                 its_pages_protect(&its_pages);
 184         kfree(its_pages.pages);
 185 }
 186
 187 #ifdef CONFIG_MODULES
 188 void its_init_mod(struct module *mod)
 189 {
 190         if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 191                 return;
 192
 193         mutex_lock(&text_mutex);
 194         its_mod = mod;
 195         its_page = NULL;
 196 }
 197
 198 void its_fini_mod(struct module *mod)
 199 {
 200         if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 201                 return;
 202
 203         WARN_ON_ONCE(its_mod != mod);
 204
 205         its_mod = NULL;
 206         its_page = NULL;
 207         mutex_unlock(&text_mutex);
 208
 209         if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
 210                 its_pages_protect(&mod->arch.its_pages);
 211 }
 212
 213 void its_free_mod(struct module *mod)
 214 {
 215         if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 216                 return;
 217
 218         for (int i = 0; i < mod->arch.its_pages.num; i++) {
 219                 void *page = mod->arch.its_pages.pages[i];
 220                 execmem_free(page);
 221         }
 222         kfree(mod->arch.its_pages.pages);
 223 }
 224 #endif /* CONFIG_MODULES */
 225
 226 static void *its_alloc(void)
 227 {
 228         struct its_array *pages = &its_pages;
 229         void *page;
 230
 231 #ifdef CONFIG_MODULES
 232         if (its_mod)
 233                 pages = &its_mod->arch.its_pages;
 234 #endif
 235
 236         page = __its_alloc(pages);
 237         if (!page)
 238                 return NULL;
 239
 240         if (pages == &its_pages)
 241                 set_memory_x((unsigned long)page, 1);
 242
 243         return page;
 244 }
 245
 246 static void *its_allocate_thunk(int reg)
 247 {
 248         int size = 3 + (reg / 8);
 249         void *thunk;
 250
 251 #ifdef CONFIG_FINEIBT
 252         /*
 253          * The ITS thunk contains an indirect jump and an int3 instruction so
 254          * its size is 3 or 4 bytes depending on the register used. If CFI
 255          * paranoid is used then 3 extra bytes are added in the ITS thunk to
 256          * complete the fineibt_paranoid caller sequence.
 257          */
 258         if (cfi_paranoid)
 259                 size += 3;
 260 #endif
 261
 262         if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
 263                 its_page = its_alloc();
 264                 if (!its_page) {
 265                         pr_err("ITS page allocation failed\n");
 266                         return NULL;
 267                 }
 268                 memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
 269                 its_offset = 32;
 270         }
 271
 272         /*
 273          * If the indirect branch instruction will be in the lower half
 274          * of a cacheline, then update the offset to reach the upper half.
 275          */
 276         if ((its_offset + size - 1) % 64 < 32)
 277                 its_offset = ((its_offset - 1) | 0x3F) + 33;
 278
 279         thunk = its_page + its_offset;
 280         its_offset += size;
 281
 282         return its_init_thunk(thunk, reg);
 283 }
 284
 285 u8 *its_static_thunk(int reg)
 286 {
 287         u8 *thunk = __x86_indirect_its_thunk_array[reg];
 288
 289 #ifdef CONFIG_FINEIBT
 290         /* Paranoid thunk starts 2 bytes before */
 291         if (cfi_paranoid)
 292                 return thunk - 2;
 293 #endif
 294         return thunk;
 295 }
 296
 297 #else
     /* !CONFIG_MITIGATION_ITS: nothing to finalize, keep callers #ifdef-free. */
 298 static inline void its_fini_core(void) {}
 299 #endif /* CONFIG_MITIGATION_ITS */
 300
 301 /*
 302  * Nomenclature for variable names to simplify and clarify this code and ease
 303  * any potential staring at it:
 304  *
 305  * @instr: source address of the original instructions in the kernel text as
 306  * generated by the compiler.
 307  *
 308  * @buf: temporary buffer on which the patching operates. This buffer is
 309  * eventually text-poked into the kernel image.
 310  *
 311  * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
 312  * in the .altinstr_replacement section.
 313  */
 314
 315 /*
 316  * Fill the buffer with a single effective instruction of size @len.
 317  *
 318  * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
 319  * for every single-byte NOP, try to generate the maximally available NOP of
 320  * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
 321  * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
 322  * *jump* over instead of executing long and daft NOPs.
 323  */
 324 static void add_nop(u8 *buf, unsigned int len)
 325 {
 326         u8 *target = buf + len;
 327
 328         if (!len)
 329                 return;
 330
 331         if (len <= ASM_NOP_MAX) {
 332                 memcpy(buf, x86_nops[len], len);
 333                 return;
 334         }
 335
 336         if (len < 128) {
 337                 __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
 338                 buf += JMP8_INSN_SIZE;
 339         } else {
 340                 __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
 341                 buf += JMP32_INSN_SIZE;
 342         }
 343
 344         for (;buf < target; buf++)
 345                 *buf = INT3_INSN_OPCODE;
 346 }
 347
 348 /*
 349  * Matches NOP and NOPL, not any of the other possible NOPs.
 350  */
 351 static bool insn_is_nop(struct insn *insn)
 352 {
 353         /* Anything NOP, but no REP NOP */
 354         if (insn->opcode.bytes[0] == 0x90 &&
 355             (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
 356                 return true;
 357
 358         /* NOPL */
 359         if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
 360                 return true;
 361
 362         /* TODO: more nops */
 363
 364         return false;
 365 }
 366
 367 /*
 368  * Find the offset of the first non-NOP instruction starting at @offset
 369  * but no further than @len.
 370  */
 371 static int skip_nops(u8 *buf, int offset, int len)
 372 {
 373         struct insn insn;
 374
 375         for (; offset < len; offset += insn.length) {
 376                 if (insn_decode_kernel(&insn, &buf[offset]))
 377                         break;
 378
 379                 if (!insn_is_nop(&insn))
 380                         break;
 381         }
 382
 383         return offset;
 384 }
 385
 386 /*
 387  * "noinline" to cause control flow change and thus invalidate I$ and
 388  * cause refetch after modification.
 389  */
 390 static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
 391 {
 392         for (int next, i = 0; i < len; i = next) {
 393                 struct insn insn;
 394
 395                 if (insn_decode_kernel(&insn, &buf[i]))
 396                         return;
 397
 398                 next = i + insn.length;
 399
 400                 if (insn_is_nop(&insn)) {
 401                         int nop = i;
 402
 403                         /* Has the NOP already been optimized? */
 404                         if (i + insn.length == len)
 405                                 return;
 406
 407                         next = skip_nops(buf, next, len);
 408
 409                         add_nop(buf + nop, next - nop);
 410                         DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
 411                 }
 412         }
 413 }
 414
 415 /*
 416  * In this context, "source" is where the instructions are placed in the
 417  * section .altinstr_replacement, for example during kernel build by the
 418  * toolchain.
 419  * "Destination" is where the instructions are being patched in by this
 420  * machinery.
 421  *
 422  * The source offset is:
 423  *
 424  *   src_imm = target - src_next_ip                  (1)
 425  *
 426  * and the target offset is:
 427  *
 428  *   dst_imm = target - dst_next_ip                  (2)
 429  *
 430  * so rework (1) as an expression for target like:
 431  *
 432  *   target = src_imm + src_next_ip                  (1a)
 433  *
 434  * and substitute in (2) to get:
 435  *
 436  *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
 437  *
 438  * Now, since the instruction stream is 'identical' at src and dst (it
 439  * is being copied after all) it can be stated that:
 440  *
 441  *   src_next_ip = src + ip_offset
 442  *   dst_next_ip = dst + ip_offset                   (4)
 443  *
 444  * Substitute (4) in (3) and observe ip_offset being cancelled out to
 445  * obtain:
 446  *
 447  *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
 448  *           = src_imm + src - dst + ip_offset - ip_offset
 449  *           = src_imm + src - dst                   (5)
 450  *
 451  * IOW, only the relative displacement of the code block matters.
 452  */
 453
     /*
      * Add @d_ to the signed @n_-bit value stored at @p_.
      *
      * @n_ must be a literal 8, 16 or 32: it is token-pasted into the type
      * name s<n_>. The sum is computed in an s32 and BUG()s if it is not
      * representable as a sign-extended @n_-bit value.
      */
 454 #define apply_reloc_n(n_, p_, d_)                               \
 455         do {                                                    \
 456                 s32 v = *(s##n_ *)(p_);                         \
 457                 v += (d_);                                      \
 458                 BUG_ON((v >> 31) != (v >> (n_-1)));             \
 459                 *(s##n_ *)(p_) = (s##n_)v;                      \
 460         } while (0)
 461
 462
 463 static __always_inline
 464 void apply_reloc(int n, void *ptr, uintptr_t diff)
 465 {
 466         switch (n) {
 467         case 1: apply_reloc_n(8, ptr, diff); break;
 468         case 2: apply_reloc_n(16, ptr, diff); break;
 469         case 4: apply_reloc_n(32, ptr, diff); break;
 470         default: BUG();
 471         }
 472 }
 473
 474 static __always_inline
 475 bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
 476 {
 477         u8 *target = src + offset;
 478         /*
 479          * If the target is inside the patched block, it's relative to the
 480          * block itself and does not need relocation.
 481          */
 482         return (target < src || target > src + src_len);
 483 }
 484
     /*
      * Fix up relative references in replacement code that was copied to @buf.
      *
      * @buf:      scratch buffer holding the instructions to be patched in
      * @instr:    address the instructions will finally live at
      * @instrlen: number of bytes of @buf to walk
      * @repl:     address the instructions were copied from
      * @repl_len: length of the source block at @repl
      *
      * @buf is decoded one instruction at a time. For Jcc/JMP/CALL with a
      * relative immediate, and for RIP-relative displacements, a target that
      * lies outside [@repl, @repl + @repl_len] gets its offset adjusted by
      * (@repl - @instr). Decoding failure warns once and stops the walk.
      */
 485 static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
 486 {
 487         for (int next, i = 0; i < instrlen; i = next) {
 488                 struct insn insn;
 489
 490                 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
 491                         return;
 492
 493                 next = i + insn.length;
 494
 495                 switch (insn.opcode.bytes[0]) {
 496                 case 0x0f:
                             /* Two-byte opcode: only 0x0f 0x80..0x8f (Jcc.d32) is of interest. */
 497                         if (insn.opcode.bytes[1] < 0x80 ||
 498                             insn.opcode.bytes[1] > 0x8f)
 499                                 break;
 500
 501                         fallthrough;    /* Jcc.d32 */
 502                 case 0x70 ... 0x7f:     /* Jcc.d8 */
 503                 case JMP8_INSN_OPCODE:
 504                 case JMP32_INSN_OPCODE:
 505                 case CALL_INSN_OPCODE:
 506                         if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
 507                                 apply_reloc(insn.immediate.nbytes,
 508                                             buf + i + insn_offset_immediate(&insn),
 509                                             repl - instr);
 510                         }
 511
 512                         /*
 513                          * Where possible, convert JMP.d32 into JMP.d8.
 514                          */
                             /*
                              * NOTE(review): the candidate d8 offset below always
                              * includes (repl - instr), also when need_reloc() said
                              * no relocation was needed -- confirm that is intended.
                              */
 515                         if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
 516                                 s32 imm = insn.immediate.value;
 517                                 imm += repl - instr;
 518                                 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
 519                                 if ((imm >> 31) == (imm >> 7)) {
 520                                         buf[i+0] = JMP8_INSN_OPCODE;
 521                                         buf[i+1] = (s8)imm;
 522
                                             /* Pad the now unused tail of the old JMP.d32 with INT3. */
 523                                         memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
 524                                 }
 525                         }
 526                         break;
 527                 }
 528
 529                 if (insn_rip_relative(&insn)) {
 530                         if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
 531                                 apply_reloc(insn.displacement.nbytes,
 532                                             buf + i + insn_offset_displacement(&insn),
 533                                             repl - instr);
 534                         }
 535                 }
 536         }
 537 }
 538
     /*
      * Prepare @buf for being patched in at @instr: first relocate relative
      * references (see __apply_relocation() for the parameters), then merge
      * NOP runs in the first @instrlen bytes of @buf.
      */
 539 void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
 540 {
 541         __apply_relocation(buf, instr, instrlen, repl, repl_len);
 542         optimize_nops(instr, buf, instrlen);
 543 }
 544
 545 /* Low-level backend functions usable from alternative code replacements. */
     /* nop_func: asm function with an empty body string, placed in .entry.text. */
 546 DEFINE_ASM_FUNC(nop_func, "", .entry.text);
 547 EXPORT_SYMBOL_GPL(nop_func);
 548
     /*
      * BUG_func: just BUG()s. alt_replace_call() uses its address as the
      * reference target of "call BUG_func" replacements and as the fallback
      * when the indirect call target is NULL.
      */
 549 noinstr void BUG_func(void)
 550 {
 551         BUG();
 552 }
 553 EXPORT_SYMBOL(BUG_func);
 554
     /* First two bytes of the indirect call expected at the patch site (ff 15). */
 555 #define CALL_RIP_REL_OPCODE     0xff
 556 #define CALL_RIP_REL_MODRM      0x15
 557
 558 /*
 559  * Rewrite the "call BUG_func" replacement to point to the target of the
 560  * indirect pv_ops call "call *disp(%ip)".
      *
      * @instr:     original 6-byte indirect call at the patch site
      * @insn_buff: copy of the 5-byte "call BUG_func" replacement; its rel32
      *             is adjusted in place
      * @a:         alt_instr entry describing the site
      *
      * BUG()s if either instruction does not have the expected shape. A NULL
      * call target is replaced by BUG_func.
      *
      * Returns 0 when the target is nop_func, otherwise 5. It never returns
      * a negative value.
 561  */
 562 static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
 563 {
 564         void *target, *bug = &BUG_func;
 565         s32 disp;
 566
 567         if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
 568                 pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
 569                 BUG();
 570         }
 571
 572         if (a->instrlen != 6 ||
 573             instr[0] != CALL_RIP_REL_OPCODE ||
 574             instr[1] != CALL_RIP_REL_MODRM) {
 575                 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
 576                 BUG();
 577         }
 578
 579         /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
 580         disp = *(s32 *)(instr + 2);
 581 #ifdef CONFIG_X86_64
 582         /* ff 15 00 00 00 00   call   *0x0(%rip) */
 583         /* target address is stored at "next instruction + disp". */
 584         target = *(void **)(instr + a->instrlen + disp);
 585 #else
 586         /* ff 15 00 00 00 00   call   *0x0 */
 587         /* target address is stored at disp. */
 588         target = *(void **)disp;
 589 #endif
 590         if (!target)
 591                 target = bug;
 592
 593         /* (BUG_func - .) + (target - BUG_func) := target - . */
 594         *(s32 *)(insn_buff + 1) += target - bug;
 595
             /* Length 0 makes the caller pad the whole site with NOPs. */
 596         if (target == &nop_func)
 597                 return 0;
 598
 599         return 5;
 600 }
 601
 602 static inline u8 * instr_va(struct alt_instr *i)
 603 {
 604         return (u8 *)&i->instr_offset + i->instr_offset;
 605 }
 606
 607 /*
 608  * Replace instructions with better alternatives for this CPU type. This runs
 609  * before SMP is initialized to avoid SMP problems with self modifying code.
 610  * This implies that asymmetric systems where APs have less capabilities than
 611  * the boot processor are not handled. Tough. Make sure you disable such
 612  * features by hand.
 613  *
 614  * Marked "noinline" to cause control flow change and thus insn cache
 615  * to refetch changed I$ lines.
      *
      * @start: first alt_instr entry to process
      * @end:   one past the last entry
      *
      * Each site is assembled in an on-stack buffer and then written to the
      * instruction address with text_poke_early().
 616  */
 617 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 618                                                   struct alt_instr *end)
 619 {
 620         u8 insn_buff[MAX_PATCH_LEN];
 621         u8 *instr, *replacement;
 622         struct alt_instr *a, *b;
 623
 624         DPRINTK(ALT, "alt table %px, -> %px", start, end);
 625
 626         /*
 627          * KASAN_SHADOW_START is defined using
 628          * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
 629          * During the process, KASAN becomes confused seeing partial LA57
 630          * conversion and triggers a false-positive out-of-bound report.
 631          *
 632          * Disable KASAN until the patching is complete.
 633          */
 634         kasan_disable_current();
 635
 636         /*
 637          * The scan order should be from start to end. A later scanned
 638          * alternative code can overwrite previously scanned alternative code.
 639          * Some kernel functions (e.g. memcpy, memset, etc) use this order to
 640          * patch code.
 641          *
 642          * So be careful if you want to change the scan order to any other
 643          * order.
 644          */
 645         for (a = start; a < end; a++) {
 646                 int insn_buff_sz = 0;
 647
 648                 /*
 649                  * In case of nested ALTERNATIVE()s the outer alternative might
 650                  * add more padding. To ensure consistent patching find the max
 651                  * padding for all alt_instr entries for this site (nested
 652                  * alternatives result in consecutive entries).
 653                  */
 654                 for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
 655                         u8 len = max(a->instrlen, b->instrlen);
 656                         a->instrlen = b->instrlen = len;
 657                 }
 658
 659                 instr = instr_va(a);
                     /* repl_offset is relative to the address of the field itself. */
 660                 replacement = (u8 *)&a->repl_offset + a->repl_offset;
                     /*
                      * NOTE(review): only instrlen is checked against the buffer
                      * size; replacementlen, used for the memcpy() further down,
                      * is not checked here -- presumably guaranteed at build time.
                      */
 661                 BUG_ON(a->instrlen > sizeof(insn_buff));
 662                 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
 663
 664                 /*
 665                  * Patch if either:
 666                  * - feature is present
 667                  * - feature not present but ALT_FLAG_NOT is set to mean,
 668                  *   patch if feature is *NOT* present.
 669                  */
                     /* Not patching this entry: only tidy up the NOPs of the original code. */
 670                 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
 671                         memcpy(insn_buff, instr, a->instrlen);
 672                         optimize_nops(instr, insn_buff, a->instrlen);
 673                         text_poke_early(instr, insn_buff, a->instrlen);
 674                         continue;
 675                 }
 676
 677                 DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
 678                         a->cpuid >> 5,
 679                         a->cpuid & 0x1f,
 680                         instr, instr, a->instrlen,
 681                         replacement, a->replacementlen, a->flags);
 682
 683                 memcpy(insn_buff, replacement, a->replacementlen);
 684                 insn_buff_sz = a->replacementlen;
 685
 686                 if (a->flags & ALT_FLAG_DIRECT_CALL) {
 687                         insn_buff_sz = alt_replace_call(instr, insn_buff, a);
 688                         if (insn_buff_sz < 0)
 689                                 continue;
 690                 }
 691
                     /* Pad the replacement with single-byte NOPs up to the site length. */
 692                 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 693                         insn_buff[insn_buff_sz] = 0x90;
 694
 695                 text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
 696
 697                 DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
 698                 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 699                 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 700
 701                 text_poke_early(instr, insn_buff, insn_buff_sz);
 702         }
 703
 704         kasan_enable_current();
 705 }
 706
 707 static inline bool is_jcc32(struct insn *insn)
 708 {
 709         /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
 710         return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
 711 }
 712
 713 #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
 714
 715 /*
 716  * CALL/JMP *%\reg
 717  */
 718 static int emit_indirect(int op, int reg, u8 *bytes)
 719 {
 720         int i = 0;
 721         u8 modrm;
 722
 723         switch (op) {
 724         case CALL_INSN_OPCODE:
 725                 modrm = 0x10; /* Reg = 2; CALL r/m */
 726                 break;
 727
 728         case JMP32_INSN_OPCODE:
 729                 modrm = 0x20; /* Reg = 4; JMP r/m */
 730                 break;
 731
 732         default:
 733                 WARN_ON_ONCE(1);
 734                 return -1;
 735         }
 736
 737         if (reg >= 8) {
 738                 bytes[i++] = 0x41; /* REX.B prefix */
 739                 reg -= 8;
 740         }
 741
 742         modrm |= 0xc0; /* Mod = 3 */
 743         modrm += reg;
 744
 745         bytes[i++] = 0xff; /* opcode */
 746         bytes[i++] = modrm;
 747
 748         return i;
 749 }
 750
     /*
      * Re-emit the direct CALL/JMP/Jcc.d32 described by @insn so that it
      * targets a different thunk.
      *
      * @addr:      address of the instruction being rewritten
      * @insn:      decoded original instruction
      * @bytes:     output buffer for the new encoding
      * @call_dest: new target when the instruction is a CALL
      * @jmp_dest:  new target when it is a JMP or a Jcc.d32
      *
      * Returns the number of bytes written, or -1 (with a warning) for any
      * other opcode. Warns once if the emitted length differs from the
      * original instruction length, but still returns the emitted length.
      */
 751 static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
 752                              void *call_dest, void *jmp_dest)
 753 {
 754         u8 op = insn->opcode.bytes[0];
 755         int i = 0;
 756
 757         /*
 758          * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
 759          * tail-calls. Deal with them.
 760          */
 761         if (is_jcc32(insn)) {
                     /* Keep the 0x0f escape byte; the second opcode byte becomes @op. */
 762                 bytes[i++] = op;
 763                 op = insn->opcode.bytes[1];
 764                 goto clang_jcc;
 765         }
 766
             /* A 6-byte CALL/JMP carries a one-byte prefix; re-emit it. */
 767         if (insn->length == 6)
 768                 bytes[i++] = 0x2e; /* CS-prefix */
 769
 770         switch (op) {
 771         case CALL_INSN_OPCODE:
 772                 __text_gen_insn(bytes+i, op, addr+i,
 773                                 call_dest,
 774                                 CALL_INSN_SIZE);
 775                 i += CALL_INSN_SIZE;
 776                 break;
 777
 778         case JMP32_INSN_OPCODE:
 779 clang_jcc:
 780                 __text_gen_insn(bytes+i, op, addr+i,
 781                                 jmp_dest,
 782                                 JMP32_INSN_SIZE);
 783                 i += JMP32_INSN_SIZE;
 784                 break;
 785
 786         default:
 787                 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
 788                 return -1;
 789         }
 790
 791         WARN_ON_ONCE(i != insn->length);
 792
 793         return i;
 794 }
 795
 796 static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
 797 {
 798         return __emit_trampoline(addr, insn, bytes,
 799                                  __x86_indirect_call_thunk_array[reg],
 800                                  __x86_indirect_jump_thunk_array[reg]);
 801 }
 802
 803 #ifdef CONFIG_MITIGATION_ITS
 804 static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
 805 {
 806         u8 *thunk = __x86_indirect_its_thunk_array[reg];
 807         u8 *tmp = its_allocate_thunk(reg);
 808
 809         if (tmp)
 810                 thunk = tmp;
 811
 812         return __emit_trampoline(addr, insn, bytes, thunk, thunk);
 813 }
 814
 815 /* Check if an indirect branch is at ITS-unsafe address */
 816 static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
 817 {
 818         if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 819                 return false;
 820
 821         /* Indirect branch opcode is 2 or 3 bytes depending on reg */
 822         addr += 1 + reg / 8;
 823
 824         /* Lower-half of the cacheline? */
 825         return !(addr & 0x20);
 826 }
 827 #else /* CONFIG_MITIGATION_ITS */
 828
 829 #ifdef CONFIG_FINEIBT
     /*
      * !CONFIG_MITIGATION_ITS stub, only built with CONFIG_FINEIBT: no
      * address ever needs an ITS thunk.
      */
 830 static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
 831 {
 832         return false;
 833 }
 834 #endif
 835
 836 #endif /* CONFIG_MITIGATION_ITS */
 837
 838 /*
 839  * Rewrite the compiler generated retpoline thunk calls.
 840  *
 841  * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 842  * indirect instructions, avoiding the extra indirection.
 843  *
 844  * For example, convert:
 845  *
 846  *   CALL __x86_indirect_thunk_\reg
 847  *
 848  * into:
 849  *
 850  *   CALL *%\reg
 851  *
 852  * It also tries to inline spectre_v2=retpoline,lfence when size permits.
      *
      * @addr:  address of the thunk call/jump being rewritten
      * @insn:  decoded instruction at @addr
      * @bytes: output buffer for the replacement encoding
      *
      * Returns the number of bytes placed in @bytes, or -1 when no replacement
      * was produced (how the caller reacts is not visible in this part of
      * the file).
 853  */
 854 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
 855 {
 856         retpoline_thunk_t *target;
 857         int reg, ret, i = 0;
 858         u8 op, cc;
 859
             /* The register number is the thunk's index in __x86_indirect_thunk_array. */
 860         target = addr + insn->length + insn->immediate.value;
 861         reg = target - __x86_indirect_thunk_array;
 862
 863         if (WARN_ON_ONCE(reg & ~0xf))
 864                 return -1;
 865
 866         /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
 867         BUG_ON(reg == 4);
 868
             /*
              * Plain retpoline (no LFENCE variant): the site is only rewritten
              * when call depth tracking is enabled; otherwise report -1.
              */
 869         if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
 870             !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 871                 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
 872                         return emit_call_track_retpoline(addr, insn, reg, bytes);
 873
 874                 return -1;
 875         }
 876
 877         op = insn->opcode.bytes[0];
 878
 879         /*
 880          * Convert:
 881          *
 882          *   Jcc.d32 __x86_indirect_thunk_\reg
 883          *
 884          * into:
 885          *
 886          *   Jncc.d8 1f
 887          *   [ LFENCE ]
 888          *   JMP *%\reg
 889          *   [ NOP ]
 890          * 1:
 891          */
 892         if (is_jcc32(insn)) {
 893                 cc = insn->opcode.bytes[1] & 0xf;
 894                 cc ^= 1; /* invert condition */
 895
 896                 bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
 897                 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
 898
 899                 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
 900                 op = JMP32_INSN_OPCODE;
 901         }
 902
 903         /*
 904          * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
 905          */
 906         if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 907                 bytes[i++] = 0x0f;
 908                 bytes[i++] = 0xae;
 909                 bytes[i++] = 0xe8; /* LFENCE */
 910         }
 911
 912 #ifdef CONFIG_MITIGATION_ITS
 913         /*
 914          * Check if the address of last byte of emitted-indirect is in
 915          * lower-half of the cacheline. Such branches need ITS mitigation.
 916          */
             /* The trampoline is emitted from offset 0, overwriting bytes written above. */
 917         if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
 918                 return emit_its_trampoline(addr, insn, reg, bytes);
 919 #endif
 920
 921         ret = emit_indirect(op, reg, bytes + i);
 922         if (ret < 0)
 923                 return ret;
 924         i += ret;
 925
 926         /*
 927          * The compiler is supposed to EMIT an INT3 after every unconditional
 928          * JMP instruction due to AMD BTC. However, if the compiler is too old
 929          * or MITIGATION_SLS isn't enabled, we still need an INT3 after
 930          * indirect JMPs even on Intel.
 931          */
 932         if (op == JMP32_INSN_OPCODE && i < insn->length)
 933                 bytes[i++] = INT3_INSN_OPCODE;
 934
             /* Pad up to the original instruction length with single-byte NOPs. */
 935         for (; i < insn->length;)
 936                 bytes[i++] = BYTES_NOP1;
 937
 938         return i;
 939 }
 940
 941 /*
 942  * Generated by 'objtool --retpoline'.
      *
      * Walk the site table [@start, @end). Every s32 entry is an offset relative
      * to the entry's own address and locates one branch instruction. Sites that
      * decode to CALL, JMP.d32 or a 0x0f-escaped opcode in 0x80..0x8f are handed
      * to patch_retpoline(); the result is written back only when it covers the
      * original instruction exactly.
 943  */
 944 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 945 {
 946         s32 *s;
 947
 948         for (s = start; s < end; s++) {
 949                 void *addr = (void *)s + *s;
 950                 struct insn insn;
 951                 int len, ret;
 952                 u8 bytes[16];
 953                 u8 op1, op2;
 954                 u8 *dest;
 955
                     /* Sites that fail to decode are skipped after a one-time warning. */
 956                 ret = insn_decode_kernel(&insn, addr);
 957                 if (WARN_ON_ONCE(ret < 0))
 958                         continue;
 959
 960                 op1 = insn.opcode.bytes[0];
 961                 op2 = insn.opcode.bytes[1];
 962
 963                 switch (op1) {
 964                 case 0x70 ... 0x7f:     /* Jcc.d8 */
 965                         /* See cfi_paranoid. */
                             /* Never patched; only expected when FineIBT is active. */
 966                         WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
 967                         continue;
 968
 969                 case CALL_INSN_OPCODE:
 970                 case JMP32_INSN_OPCODE:
 971                         /* Check for cfi_paranoid + ITS */
                             /*
                              * A target that is a 0x7x opcode byte preceded by 0xea
                              * is left alone (again only expected under FineIBT).
                              */
 972                         dest = addr + insn.length + insn.immediate.value;
 973                         if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
 974                                 WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
 975                                 continue;
 976                         }
 977                         break;
 978
 979                 case 0x0f: /* escape */
                             /* Second opcode byte 0x80..0x8f: patch_retpoline() handles Jcc.d32. */
 980                         if (op2 >= 0x80 && op2 <= 0x8f)
 981                                 break;
 982                         fallthrough;
 983                 default:
 984                         WARN_ON_ONCE(1);
 985                         continue;
 986                 }
 987
 988                 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
 989                         addr, addr, insn.length,
 990                         addr + insn.length + insn.immediate.value);
 991
                     /* Any other length (including errors) leaves the site untouched. */
 992                 len = patch_retpoline(addr, &insn, bytes);
 993                 if (len == insn.length) {
 994                         optimize_nops(addr, bytes, len);
 995                         DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
 996                         DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
 997                         text_poke_early(addr, bytes, len);
 998                 }
 999         }
1000 }
1001
1002 #ifdef CONFIG_MITIGATION_RETHUNK
1003
1004 bool cpu_wants_rethunk(void)
1005 {
1006         return cpu_feature_enabled(X86_FEATURE_RETHUNK) ? true : false;
1007 }
1008
1009 bool cpu_wants_rethunk_at(void *addr)
1010 {
1011         unsigned long site = (unsigned long)addr;
1012
1013         if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
1014                 return false;
1015         /* Only its_return_thunk is address sensitive: it needs bit 5 clear. */
1016         return x86_return_thunk != its_return_thunk || !(site & 0x20);
1017 }
1018
1019 /*
1020  * Rewrite the compiler generated return thunk tail-calls.
1021  *
1022  * For example, convert:
1023  *
1024  *   JMP __x86_return_thunk
1025  *
1026  * into:
1027  *
1028  *   RET
1029  */
1030 static int patch_return(void *addr, struct insn *insn, u8 *bytes)
1031 {
1032         int len = 1;
1033
1034         if (!cpu_wants_rethunk_at(addr)) {
1035                 /* No thunk wanted at this address: a bare RET will do. */
1036                 bytes[0] = RET_INSN_OPCODE;
1037         } else {
1038                 /* Otherwise emit a JMP.d32 to whatever x86_return_thunk is. */
1039                 len = JMP32_INSN_SIZE;
1040                 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, len);
1041         }
1042         /* Fill the rest of the original instruction with INT3. */
1043         while (len < insn->length)
1044                 bytes[len++] = INT3_INSN_OPCODE;
1045         return len;
1046 }
1047
1048 void __init_or_module noinline apply_returns(s32 *start, s32 *end)
1049 {
             /*
              * Walk the site table [@start, @end); every s32 entry is an offset
              * relative to its own address and locates one instruction. Sites that
              * are a JMP.d32 to __x86_return_thunk are rewritten by patch_return().
              */
1050         s32 *s;
1051
1052         if (cpu_wants_rethunk())
1053                 static_call_force_reinit();
1054
1055         for (s = start; s < end; s++) {
1056                 void *dest = NULL, *addr = (void *)s + *s;
1057                 struct insn insn;
1058                 int len, ret;
1059                 u8 bytes[16];
1060                 u8 op;
1061
1062                 ret = insn_decode_kernel(&insn, addr);
1063                 if (WARN_ON_ONCE(ret < 0))
1064                         continue;
1065
                     /* dest stays NULL for anything that is not a JMP.d32. */
1066                 op = insn.opcode.bytes[0];
1067                 if (op == JMP32_INSN_OPCODE)
1068                         dest = addr + insn.length + insn.immediate.value;
1069
                     /*
                      * Skip the site when __static_call_fixup() reports it took care
                      * of it, or (with a one-time warning) when it does not target
                      * __x86_return_thunk.
                      */
1070                 if (__static_call_fixup(addr, op, dest) ||
1071                     WARN_ONCE(dest != &__x86_return_thunk,
1072                               "missing return thunk: %pS-%pS: %*ph",
1073                               addr, dest, 5, addr))
1074                         continue;
1075
1076                 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
1077                         addr, addr, insn.length,
1078                         addr + insn.length + insn.immediate.value);
1079
                     /* Write back only a replacement that matches the original length. */
1080                 len = patch_return(addr, &insn, bytes);
1081                 if (len == insn.length) {
1082                         DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
1083                         DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
1084                         text_poke_early(addr, bytes, len);
1085                 }
1086         }
1087 }
1088 #else /* !CONFIG_MITIGATION_RETHUNK: */
1089 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } /* no-op: CONFIG_MITIGATION_RETHUNK is off */
1090 #endif /* !CONFIG_MITIGATION_RETHUNK */
1091
1092 #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1093
1094 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } /* no-op: nothing to rewrite in this configuration */
1095 void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } /* no-op: nothing to rewrite in this configuration */
1096
1097 #endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
1098
1099 #ifdef CONFIG_X86_KERNEL_IBT
1100
1101 __noendbr bool is_endbr(u32 *val)
1102 {
1103         u32 word;
1104
1105         __get_kernel_nofault(&word, val, u32, unreadable);
1106         if (__is_endbr(word))
1107                 return true;
1108 unreadable:      /* the nofault read of @val also bails out to here */
1109         return false;
1110 }
1111
1112 #ifdef CONFIG_FINEIBT
1113
1114 static __noendbr bool exact_endbr(u32 *val)
1115 {
1116         u32 word;
1117
1118         __get_kernel_nofault(&word, val, u32, unreadable);
1119         if (gen_endbr() == word)
1120                 return true;
1121 unreadable:      /* the nofault read of @val also bails out to here */
1122         return false;
1123 }
1124
1125 #endif
1126
1127 static void poison_cfi(void *addr);
1128
1129 static void __init_or_module poison_endbr(void *addr)
1130 {
1131         u32 seal = gen_endbr_poison();
1132         u8 *from = addr, *to = (u8 *)&seal;
1133
1134         /* Refuse (warning once) when @addr does not hold an ENDBR. */
1135         if (WARN_ON_ONCE(!is_endbr(addr)))
1136                 return;
1137
1138         DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
1139
1140         /* With IBT, the now missing ENDBR makes the hardware raise #CP. */
1141         DUMP_BYTES(ENDBR, from, 4, "%px: orig: ", addr);
1142         DUMP_BYTES(ENDBR, to, 4, "%px: repl: ", addr);
1143         text_poke_early(addr, &seal, 4);
1144 }
1145
1146 /*
1147  * Generated by: objtool --ibt
1148  *
1149  * Seal the functions for indirect calls by clobbering the ENDBR instructions
1150  * and the kCFI hash value.
1151  */
1152 void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
1153 {
1154         s32 *site = start;
1155
1156         for (; site < end; site++) {
1157                 void *func = (void *)site + *site; /* self-relative entry */
1158
1159                 poison_endbr(func);
1160                 if (IS_ENABLED(CONFIG_FINEIBT))
1161                         poison_cfi(func - 16);
1162         }
1163 }
1164
1165 #else /* !CONFIG_X86_KERNEL_IBT: */
1166
1167 void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } /* no-op: CONFIG_X86_KERNEL_IBT is off */
1168
1169 #endif /* !CONFIG_X86_KERNEL_IBT */
1170
1171 #ifdef CONFIG_CFI_AUTO_DEFAULT
1172 # define __CFI_DEFAULT CFI_AUTO
1173 #elif defined(CONFIG_CFI_CLANG)
1174 # define __CFI_DEFAULT CFI_KCFI
1175 #else
1176 # define __CFI_DEFAULT CFI_OFF
1177 #endif
1178
     /*
      * CFI scheme in effect, starting from the build-time default chosen above.
      * With CONFIG_FINEIBT the "cfi=" parameter can override it, and
      * __apply_fineibt() resolves CFI_AUTO to a concrete scheme.
      */
1179 enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
1180
1181 #ifdef CONFIG_FINEIBT_BHI
     /* Set by "cfi=bhi", which is honoured only once cfi_mode is CFI_FINEIBT. */
1182 bool cfi_bhi __ro_after_init = false;
1183 #endif
1184
1185 #ifdef CONFIG_CFI_CLANG
1186 u32 cfi_get_func_hash(void *func)
1187 {
1188         void *preamble = func - cfi_get_offset();
1189         int imm_off;
1190         u32 hash;
1191
1192         /*
1193          * The hash is an immediate whose offset depends on the CFI scheme.
1194          */
1195         if (cfi_mode == CFI_FINEIBT)
1196                 imm_off = 7;    /* ENDBR64 (4) + SUB opcode bytes (3) */
1197         else if (cfi_mode == CFI_KCFI)
1198                 imm_off = 1;    /* MOVL opcode byte */
1199         else
1200                 return 0;       /* any other mode: no hash */
1201
1202         if (get_kernel_nofault(hash, preamble + imm_off))
1203                 return 0;
1204
1205         return hash;
1206 }
1207
1208 int cfi_get_func_arity(void *func)
1209 {
             /*
              * Read the 32-bit displacement stored in the four bytes before
              * @func, resolve it relative to @func and return the position of the
              * resulting address within __bhi_args[]. Returns 0 when the early
              * exit below applies or the displacement cannot be read.
              *
              * The displacement is presumably the rel32 of the CALL that
              * cfi_fineibt_bhi_preamble() places at the end of the preamble --
              * TODO confirm.
              *
              * NOTE(review): the early exit only triggers when cfi_mode is not
              * CFI_FINEIBT *and* cfi_bhi is false, so a FineIBT kernel without
              * BHI still decodes the bytes at func - 4 -- confirm that callers
              * ignore the result in that configuration.
              */
1210         bhi_thunk *target;
1211         s32 disp;
1212
1213         if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
1214                 return 0;
1215
1216         if (get_kernel_nofault(disp, func - 4))
1217                 return 0;
1218
1219         target = func + disp;
1220         return target - __bhi_args;
1221 }
1222 #endif
1223
1224 #ifdef CONFIG_FINEIBT
1225
1226 static bool cfi_rand __ro_after_init = true; /* rehash with cfi_seed; cleared by cfi=off and cfi=norand */
1227 static u32  cfi_seed __ro_after_init; /* XORed into every hash by cfi_rehash(); set in __apply_fineibt() */
1228
1229 /*
1230  * Re-hash the CFI hash with a boot-time seed while making sure the result is
1231  * not a valid ENDBR instruction.
1232  */
1233 static u32 cfi_rehash(u32 hash)
1234 {
1235         u32 h = hash ^ cfi_seed;
1236         /*
1237          * Keep stepping while h or -h would read as ENDBR: shift right and,
1238          * if a set bit fell off the end, XOR in 0x80200003.
1239          */
1240         while (unlikely(__is_endbr(h) || __is_endbr(-h)))
1241                 h = (h >> 1) ^ ((h & 1) ? 0x80200003 : 0);
1242         return h;
1243 }
1244
1245 static __init int cfi_parse_cmdline(char *str)
1246 {
1247         if (!str)
1248                 return -EINVAL;
1249
1250         while (str) {
1251                 char *next = strchr(str, ',');
1252                 if (next) {
1253                         *next = 0;
1254                         next++;
1255                 }
1256
1257                 if (!strcmp(str, "auto")) {
1258                         cfi_mode = CFI_AUTO;
1259                 } else if (!strcmp(str, "off")) {
1260                         cfi_mode = CFI_OFF;
1261                         cfi_rand = false;
1262                 } else if (!strcmp(str, "kcfi")) {
1263                         cfi_mode = CFI_KCFI;
1264                 } else if (!strcmp(str, "fineibt")) {
1265                         cfi_mode = CFI_FINEIBT;
1266                 } else if (!strcmp(str, "norand")) {
1267                         cfi_rand = false;
1268                 } else if (!strcmp(str, "warn")) {
1269                         pr_alert("CFI mismatch non-fatal!\n");
1270                         cfi_warn = true;
1271                 } else if (!strcmp(str, "paranoid")) {
1272                         if (cfi_mode == CFI_FINEIBT) {
1273                                 cfi_paranoid = true;
1274                         } else {
1275                                 pr_err("Ignoring paranoid; depends on fineibt.\n");
1276                         }
1277                 } else if (!strcmp(str, "bhi")) {
1278 #ifdef CONFIG_FINEIBT_BHI
1279                         if (cfi_mode == CFI_FINEIBT) {
1280                                 cfi_bhi = true;
1281                         } else {
1282                                 pr_err("Ignoring bhi; depends on fineibt.\n");
1283                         }
1284 #else
1285                         pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n");
1286 #endif
1287                 } else {
1288                         pr_err("Ignoring unknown cfi option (%s).", str);
1289                 }
1290
1291                 str = next;
1292         }
1293
1294         return 0;
1295 }
1296 early_param("cfi", cfi_parse_cmdline);
1297
1298 /*
1299  * kCFI                                         FineIBT
1300  *
1301  * __cfi_\func:                                 __cfi_\func:
1302  *      movl   $0x12345678,%eax         // 5         endbr64                    // 4
1303  *      nop                                          subl   $0x12345678,%r10d   // 7
1304  *      nop                                          jne    __cfi_\func+6       // 2
1305  *      nop                                          nop3                       // 3
1306  *      nop
1307  *      nop
1308  *      nop
1309  *      nop
1310  *      nop
1311  *      nop
1312  *      nop
1313  *      nop
1314  *
1315  *
1316  * caller:                                      caller:
1317  *      movl    $(-0x12345678),%r10d     // 6        movl   $0x12345678,%r10d   // 6
1318  *      addl    -15(%r11),%r10d          // 4        lea    -0x10(%r11),%r11    // 4
1319  *      je      1f                       // 2        nop4                       // 4
1320  *      ud2                              // 2
1321  * 1:   cs call __x86_indirect_thunk_r11 // 6        call   *%r11; nop3;        // 6
1322  *
1323  */
1324
1325 /*
1326  * <fineibt_preamble_start>:
1327  *  0:   f3 0f 1e fa             endbr64
1328  *  4:   41 81 <ea> 78 56 34 12  sub    $0x12345678, %r10d
1329  *  b:   75 f9                   jne    6 <fineibt_preamble_start+0x6>
1330  *  d:   0f 1f 00                nopl   (%rax)
1331  *
1332  * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
1333  * (bad) on x86_64 and raises #UD.
1334  */
1335 asm(    ".pushsection .rodata                           \n"
1336         "fineibt_preamble_start:                        \n"
1337         "       endbr64                                 \n"
1338         "       subl    $0x12345678, %r10d              \n"
1339         "fineibt_preamble_bhi:                          \n"
1340         "       jne     fineibt_preamble_start+6        \n"
1341         ASM_NOP3
1342         "fineibt_preamble_end:                          \n"
1343         ".popsection\n"
1344 );
1345
1346 extern u8 fineibt_preamble_start[];
1347 extern u8 fineibt_preamble_bhi[];
1348 extern u8 fineibt_preamble_end[];
1349
     /*
      * Sizes and offsets within the template above. Note that from here on
      * 'fineibt_preamble_bhi' is a macro yielding the offset of the JNE; inside
      * its own expansion the name still refers to the extern symbol.
      */
1350 #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1351 #define fineibt_preamble_bhi  (fineibt_preamble_bhi - fineibt_preamble_start)
     /* Offset of the 0xEA byte inside the SUB that the JNE branches to. */
1352 #define fineibt_preamble_ud   6
     /* Offset of the SUB's 32-bit immediate, i.e. the hash placeholder. */
1353 #define fineibt_preamble_hash 7
1354
1355 /*
1356  * <fineibt_caller_start>:
1357  *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
1358  *  6:   4d 8d 5b f0             lea    -0x10(%r11), %r11
1359  *  a:   0f 1f 40 00             nopl   0x0(%rax)
1360  */
1361 asm(    ".pushsection .rodata                   \n"
1362         "fineibt_caller_start:                  \n"
1363         "       movl    $0x12345678, %r10d      \n"
1364         "       lea     -0x10(%r11), %r11       \n"
1365         ASM_NOP4
1366         "fineibt_caller_end:                    \n"
1367         ".popsection                            \n"
1368 );
1369
1370 extern u8 fineibt_caller_start[];
1371 extern u8 fineibt_caller_end[];
1372
1373 #define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
     /* Offset of the MOV's 32-bit immediate, i.e. the hash placeholder. */
1374 #define fineibt_caller_hash 2
1375
     /*
      * Displacement for a two-byte JMP.d8 placed at the start of the sequence so
      * that it lands right behind it; see cfi_disable_callers().
      */
1376 #define fineibt_caller_jmp (fineibt_caller_size - 2)
1377
1378 /*
1379  * Since FineIBT does hash validation on the callee side it is prone to
1380  * circumvention attacks where a 'naked' ENDBR instruction exists that
1381  * is not part of the fineibt_preamble sequence.
1382  *
1383  * Notably the x86 entry points must be ENDBR and equally cannot be
1384  * fineibt_preamble.
1385  *
1386  * The fineibt_paranoid caller sequence adds additional caller side
1387  * hash validation. This stops such circumvention attacks dead, but at the cost
1388  * of adding a load.
1389  *
1390  * <fineibt_paranoid_start>:
1391  *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
1392  *  6:   45 3b 53 f7             cmp    -0x9(%r11), %r10d
1393  *  a:   4d 8d 5b <f0>           lea    -0x10(%r11), %r11
1394  *  e:   75 fd                   jne    d <fineibt_paranoid_start+0xd>
1395  * 10:   41 ff d3                call   *%r11
1396  * 13:   90                      nop
1397  *
1398  * Notably LEA does not modify flags and can be reordered with the CMP,
1399  * avoiding a dependency. Again, using a non-taken (backwards) branch
1400  * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
1401  * Jcc.d8, causing #UD.
1402  */
1403 asm(    ".pushsection .rodata                           \n"
1404         "fineibt_paranoid_start:                        \n"
1405         "       movl    $0x12345678, %r10d              \n"
1406         "       cmpl    -9(%r11), %r10d                 \n"
1407         "       lea     -0x10(%r11), %r11               \n"
1408         "       jne     fineibt_paranoid_start+0xd      \n"
1409         "fineibt_paranoid_ind:                          \n"
1410         "       call    *%r11                           \n"
1411         "       nop                                     \n"
1412         "fineibt_paranoid_end:                          \n"
1413         ".popsection                                    \n"
1414 );
1415
1416 extern u8 fineibt_paranoid_start[];
1417 extern u8 fineibt_paranoid_ind[];
1418 extern u8 fineibt_paranoid_end[];
1419
     /*
      * Sizes and offsets within the template above. From here on
      * 'fineibt_paranoid_ind' is a macro yielding the offset of the indirect
      * CALL; inside its own expansion the name still refers to the extern symbol.
      */
1420 #define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
1421 #define fineibt_paranoid_ind  (fineibt_paranoid_ind - fineibt_paranoid_start)
     /* Offset of the LEA's 0xf0 displacement byte that the JNE branches to. */
1422 #define fineibt_paranoid_ud   0xd
1423
1424 static u32 decode_preamble_hash(void *addr, int *reg)
1425 {
1426         const u8 opcode = *(u8 *)addr;
1427
1428         /* Anything but "b8+reg imm32" (movl $imm32,\reg) carries no hash. */
1429         if (opcode < 0xb8 || opcode >= 0xc0)
1430                 return 0; /* invalid hash value */
1431
1432         /* The register number is the distance of the opcode from 0xb8. */
1433         if (reg)
1434                 *reg = opcode - 0xb8;
1435         return *(u32 *)(addr + 1);
1436 }
1437
1438 static u32 decode_caller_hash(void *addr)
1439 {
1440         const u8 *insn = addr;
1441         /* 41 ba 88 a9 cb ed       mov    $(-0x12345678),%r10d */
1442         bool is_mov = insn[0] == 0x41 && insn[1] == 0xba;
1443         /* eb 0c 88 a9 cb ed       jmp.d8  +12 (see cfi_disable_callers()) */
1444         bool is_jmp = insn[0] == JMP8_INSN_OPCODE &&
1445                       insn[1] == fineibt_caller_jmp;
1446
1447         /* Both forms keep the negated hash in bytes 2..5. */
1448         if (is_mov || is_jmp)
1449                 return -*(u32 *)(addr + 2);
1450         return 0; /* invalid hash value */
1451 }
1452
1453 /* .retpoline_sites */
1454 static int cfi_disable_callers(s32 *start, s32 *end)
1455 {
1456         /*
1457          * Overwrite the first two bytes of every kCFI caller sequence with a
1458          * JMP.d8 over the remainder, which switches the check off. The hash
1459          * immediate stays intact for later use by decode_caller_hash() and
1460          * cfi_rewrite_callers().
1461          */
1462         const u8 skip[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1463         s32 *site;
1464
1465         for (site = start; site < end; site++) {
1466                 /* The sequence ends where the recorded site begins. */
1467                 void *seq = (void *)site + *site - fineibt_caller_size;
1468                 u32 hash = decode_caller_hash(seq);
1469
1470                 /* A zero hash means a nocfi caller: nothing to patch. */
1471                 if (hash)
1472                         text_poke_early(seq, skip, 2);
1473         }
1474
1475         /* Always succeeds. */
1476         return 0;
1477 }
1478
1479 static int cfi_enable_callers(s32 *start, s32 *end)
1480 {
1481         /*
1482          * Undo cfi_disable_callers(): put the two MOV opcode bytes back in
1483          * front of the preserved hash immediate, which re-arms the kCFI
1484          * check.
1485          */
1486         const u8 mov_opcode[] = { 0x41, 0xba };
1487         s32 *site;
1488
1489         for (site = start; site < end; site++) {
1490                 /* The sequence ends where the recorded site begins. */
1491                 void *seq = (void *)site + *site - fineibt_caller_size;
1492                 u32 hash = decode_caller_hash(seq);
1493
1494                 /* A zero hash means a nocfi caller: leave it alone. */
1495                 if (hash)
1496                         text_poke_early(seq, mov_opcode, 2);
1497         }
1498
1499         return 0;
1500 }
1501
1502 /* .cfi_sites */
1503 static int cfi_rand_preamble(s32 *start, s32 *end)
1504 {
1505         s32 *site;
1506
1507         for (site = start; site < end; site++) {
1508                 void *preamble = (void *)site + *site;
1509                 u32 hash = decode_preamble_hash(preamble, NULL);
1510
1511                 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1512                          preamble, preamble, 5, preamble))
1513                         return -EINVAL;
1514
1515                 /* Store the reseeded hash over the immediate at byte 1. */
1516                 hash = cfi_rehash(hash);
1517                 text_poke_early(preamble + 1, &hash, 4);
1518         }
1519
1520         return 0;
1521 }
1522
1523 static void cfi_fineibt_bhi_preamble(void *addr, int arity)
1524 {
             /*
              * Patch the tail of the FineIBT preamble at @addr (from offset
              * fineibt_preamble_bhi on) according to @arity: nothing for arity 0,
              * an inline 9-byte sequence for arity 1 unless cfi_warn is set, and a
              * CALL to __bhi_args[arity] otherwise.
              */
1525         if (!arity)
1526                 return;
1527
1528         if (!cfi_warn && arity == 1) {
1529                 /*
1530                  * Crazy scheme to allow arity-1 inline:
1531                  *
1532                  * __cfi_foo:
1533                  *  0: f3 0f 1e fa             endbr64
1534                  *  4: 41 81 <ea> 78 56 34 12  sub     $0x12345678, %r10d
1535                  *  b: 49 0f 45 fa             cmovne  %r10, %rdi
1536                  *  f: 75 f5                   jne     __cfi_foo+6
1537                  * 11: 0f 1f 00                nopl    (%rax)
1538                  *
1539                  * Code that direct calls to foo()+0, decodes the tail end as:
1540                  *
1541                  * foo:
1542                  *  0: f5                      cmc
1543                  *  1: 0f 1f 00                nopl    (%rax)
1544                  *
1545                  * which clobbers CF, but does not affect anything ABI
1546                  * wise.
1547                  *
1548                  * Notably, this scheme is incompatible with permissive CFI
1549                  * because the CMOVcc is unconditional and RDI will have been
1550                  * clobbered.
1551                  */
                     /* CMOVNE (4 bytes) + JNE (2 bytes) + NOP3, as listed above. */
1552                 const u8 magic[9] = {
1553                         0x49, 0x0f, 0x45, 0xfa,
1554                         0x75, 0xf5,
1555                         BYTES_NOP3,
1556                 };
1557
1558                 text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
1559
1560                 return;
1561         }
1562
             /* General case: CALL the __bhi_args[] entry selected by @arity. */
1563         text_poke_early(addr + fineibt_preamble_bhi,
1564                         text_gen_insn(CALL_INSN_OPCODE,
1565                                       addr + fineibt_preamble_bhi,
1566                                       __bhi_args[arity]),
1567                         CALL_INSN_SIZE);
1568 }
1569
1570 static int cfi_rewrite_preamble(s32 *start, s32 *end)
1571 {
             /*
              * For every entry in [@start, @end) (an offset relative to the entry
              * itself, locating a preamble) replace the kCFI preamble with the
              * FineIBT template and carry the original hash over into it.
              *
              * Returns 0 on success, -EINVAL as soon as a preamble without a
              * decodable hash is found (earlier entries stay rewritten).
              */
1572         s32 *s;
1573
1574         for (s = start; s < end; s++) {
1575                 void *addr = (void *)s + *s;
1576                 int arity;
1577                 u32 hash;
1578
1579                 /*
1580                  * When the function doesn't start with ENDBR the compiler will
1581                  * have determined there are no indirect calls to it and we
1582                  * don't need CFI either.
1583                  */
1584                 if (!is_endbr(addr + 16))
1585                         continue;
1586
                     /* arity receives the register number encoded in the MOV opcode. */
1587                 hash = decode_preamble_hash(addr, &arity);
1588                 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1589                          addr, addr, 5, addr))
1590                         return -EINVAL;
1591
                     /* Copy the template, check its placeholder, then patch in the hash. */
1592                 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1593                 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1594                 text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
1595
1596                 WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
1597                           "kCFI preamble has wrong register at: %pS %*ph\n",
1598                           addr, 5, addr);
1599
1600                 if (cfi_bhi)
1601                         cfi_fineibt_bhi_preamble(addr, arity);
1602         }
1603
1604         return 0;
1605 }
1606
1607 static void cfi_rewrite_endbr(s32 *start, s32 *end)
1608 {
1609         s32 *site;
1610
1611         for (site = start; site < end; site++) {
1612                 /* The function proper starts 16 bytes past its preamble. */
1613                 void *func = (void *)site + *site + 16;
1614
1615                 /* Seal it only if it still begins with the exact ENDBR. */
1616                 if (exact_endbr(func))
1617                         poison_endbr(func);
1618         }
1619 }
1620
1621 /* .retpoline_sites */
1622 static int cfi_rand_callers(s32 *start, s32 *end)
1623 {
1624         s32 *site;
1625
1626         for (site = start; site < end; site++) {
1627                 void *seq = (void *)site + *site - fineibt_caller_size;
1628                 u32 old = decode_caller_hash(seq);
1629                 u32 imm;
1630
1631                 if (!old) /* nocfi callers */
1632                         continue;
1633                 /* Callers hold the negated hash, two bytes into the sequence. */
1634                 imm = -cfi_rehash(old);
1635                 text_poke_early(seq + 2, &imm, 4);
1636         }
1637
1638         return 0;
1639 }
1640
1641 static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
1642 {
1643         u8 *dest = NULL;
1644
1645 #ifdef CONFIG_MITIGATION_ITS
1646         dest = its_allocate_thunk(reg);
1647 #endif
1648         /* No allocated thunk: use the static one, entered two bytes early. */
1649         if (!dest)
1650                 dest = (void *)__x86_indirect_its_thunk_array[reg] - 2;
1651         return __emit_trampoline(addr, insn, bytes, dest, dest);
1652 }
1653
1654 static int cfi_rewrite_callers(s32 *start, s32 *end)
1655 {
             /*
              * Rewrite the caller sequence in front of every site in
              * [@start, @end) (entries are offsets relative to themselves) to its
              * FineIBT form, keeping the hash already stored there. With
              * cfi_paranoid set, the longer paranoid sequence is written instead;
              * it also covers the instruction at the site itself.
              *
              * Always returns 0; sites that cannot be handled are skipped.
              */
1656         s32 *s;
1657
             /* The on-stack buffer below is sized for exactly this template. */
1658         BUG_ON(fineibt_paranoid_size != 20);
1659
1660         for (s = start; s < end; s++) {
1661                 void *addr = (void *)s + *s;
1662                 struct insn insn;
1663                 u8 bytes[20];
1664                 u32 hash;
1665                 int ret;
1666                 u8 op;
1667
                     /* Step back to the start of the caller sequence; 0 == nocfi caller. */
1668                 addr -= fineibt_caller_size;
1669                 hash = decode_caller_hash(addr);
1670                 if (!hash)
1671                         continue;
1672
1673                 if (!cfi_paranoid) {
                             /* Copy the template, check its placeholder, patch in the hash. */
1674                         text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1675                         WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1676                         text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1677                         /* rely on apply_retpolines() */
1678                         continue;
1679                 }
1680
1681                 /* cfi_paranoid */
                     /* The instruction right behind the sequence must be a CALL or JMP.d32. */
1682                 ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
1683                 if (WARN_ON_ONCE(ret < 0))
1684                         continue;
1685
1686                 op = insn.opcode.bytes[0];
1687                 if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
1688                         WARN_ON_ONCE(1);
1689                         continue;
1690                 }
1691
                     /* Build the replacement in the buffer; the MOV immediate sits at the same offset. */
1692                 memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
1693                 memcpy(bytes + fineibt_caller_hash, &hash, 4);
1694
                     /*
                      * NOTE(review): the return value of emit_paranoid_trampoline()
                      * is not checked here -- confirm it cannot fail.
                      */
1695                 if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
1696                         emit_paranoid_trampoline(addr + fineibt_caller_size,
1697                                                  &insn, 11, bytes + fineibt_caller_size);
1698                 } else {
                             /* Indirect CALL/JMP through r11; must be exactly 3 bytes. */
1699                         ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
1700                         if (WARN_ON_ONCE(ret != 3))
1701                                 continue;
1702                 }
1703
1704                 text_poke_early(addr, bytes, fineibt_paranoid_size);
1705         }
1706
1707         return 0;
1708 }
1709
1710 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1711                             s32 *start_cfi, s32 *end_cfi, bool builtin)
1712 {
             /*
              * Bring the CFI preambles listed in [@start_cfi, @end_cfi) and the
              * caller sequences in front of the sites listed in
              * [@start_retpoline, @end_retpoline) in line with cfi_mode.
              *
              * @builtin: when true a fresh cfi_seed is drawn (if cfi_rand), the BPF
              *           hashes are rehashed and the chosen mode is logged.
              *           Presumably true for the core kernel and false for modules
              *           -- TODO confirm against the callers.
              *
              * Failures are only reported through pr_err(); nothing is rolled back.
              */
1713         int ret;
1714
1715         if (WARN_ONCE(fineibt_preamble_size != 16,
1716                       "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1717                 return;
1718
             /* Resolve "auto": FineIBT when IBT is usable, kCFI otherwise. */
1719         if (cfi_mode == CFI_AUTO) {
1720                 cfi_mode = CFI_KCFI;
1721                 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
1722                         /*
1723                          * FRED has much saner context on exception entry and
1724                          * is less easy to take advantage of.
1725                          */
1726                         if (!cpu_feature_enabled(X86_FEATURE_FRED))
1727                                 cfi_paranoid = true;
1728                         cfi_mode = CFI_FINEIBT;
1729                 }
1730         }
1731
1732         /*
1733          * Rewrite the callers to not use the __cfi_ stubs, such that we might
1734          * rewrite them. This disables all CFI. If this succeeds but any of the
1735          * later stages fails, we're without CFI.
1736          */
1737         ret = cfi_disable_callers(start_retpoline, end_retpoline);
1738         if (ret)
1739                 goto err;
1740
             /* Mix the seed into the hashes on both the callee and the caller side. */
1741         if (cfi_rand) {
1742                 if (builtin) {
1743                         cfi_seed = get_random_u32();
1744                         cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1745                         cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1746                 }
1747
1748                 ret = cfi_rand_preamble(start_cfi, end_cfi);
1749                 if (ret)
1750                         goto err;
1751
1752                 ret = cfi_rand_callers(start_retpoline, end_retpoline);
1753                 if (ret)
1754                         goto err;
1755         }
1756
1757         switch (cfi_mode) {
1758         case CFI_OFF:
                     /* Callers stay in the disabled state set up above. */
1759                 if (builtin)
1760                         pr_info("Disabling CFI\n");
1761                 return;
1762
1763         case CFI_KCFI:
1764                 ret = cfi_enable_callers(start_retpoline, end_retpoline);
1765                 if (ret)
1766                         goto err;
1767
1768                 if (builtin)
1769                         pr_info("Using kCFI\n");
1770                 return;
1771
1772         case CFI_FINEIBT:
1773                 /* place the FineIBT preamble at func()-16 */
1774                 ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1775                 if (ret)
1776                         goto err;
1777
1778                 /* rewrite the callers to target func()-16 */
1779                 ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1780                 if (ret)
1781                         goto err;
1782
1783                 /* now that nobody targets func()+0, remove ENDBR there */
1784                 cfi_rewrite_endbr(start_cfi, end_cfi);
1785
1786                 if (builtin) {
1787                         pr_info("Using %sFineIBT%s CFI\n",
1788                                 cfi_paranoid ? "paranoid " : "",
1789                                 cfi_bhi ? "+BHI" : "");
1790                 }
1791                 return;
1792
1793         default:
                     /* Any other mode value ends up in the error report below. */
1794                 break;
1795         }
1796
1797 err:
1798         pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1799 }
1800
1801 static inline void poison_hash(void *addr)
1802 {
1803         *(u32 *)addr = 0;
1804 }
1805
1806 static void poison_cfi(void *addr)
1807 {
1808         /*
1809          * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
1810          * some (static) functions for which they can determine the address
1811          * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
1812          *
1813          * As such, these functions will get sealed, but we need to be careful
1814          * to not unconditionally scribble the previous function.
1815          */
1816         switch (cfi_mode) {
1817         case CFI_FINEIBT:
1818                 /*
1819                  * FineIBT prefix should start with an ENDBR.
1820                  */
1821                 if (!is_endbr(addr))
1822                         break;
1823
1824                 /*
1825                  * __cfi_\func:
1826                  *      osp nopl (%rax)
1827                  *      subl    $0, %r10d
1828                  *      jz      1f
1829                  *      ud2
1830                  * 1:   nop
1831                  */
1832                 poison_endbr(addr);
1833                 poison_hash(addr + fineibt_preamble_hash);
1834                 break;
1835
1836         case CFI_KCFI:
1837                 /*
1838                  * kCFI prefix should start with a valid hash.
1839                  */
1840                 if (!decode_preamble_hash(addr, NULL))
1841                         break;
1842
1843                 /*
1844                  * __cfi_\func:
1845                  *      movl    $0, %eax
1846                  *      .skip   11, 0x90
1847                  */
1848                 poison_hash(addr + 1);
1849                 break;
1850
1851         default:
1852                 break;
1853         }
1854 }
1855
1856 /*
1857  * When regs->ip points to a 0xEA byte in the FineIBT preamble,
1858  * return true and fill out target and type.
1859  *
1860  * We check the preamble by checking for the ENDBR instruction relative to the
1861  * 0xEA instruction.
1862  */
1863 static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
1864 {
1865         unsigned long addr = regs->ip - fineibt_preamble_ud;
1866         u32 hash;
1867
1868         if (!exact_endbr((void *)addr))
1869                 return false;
1870
1871         *target = addr + fineibt_preamble_size;
1872
1873         __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1874         *type = (u32)regs->r10 + hash;
1875
1876         /*
1877          * Since regs->ip points to the middle of an instruction; it cannot
1878          * continue with the normal fixup.
1879          */
1880         regs->ip = *target;
1881
1882         return true;
1883
1884 Efault:
1885         return false;
1886 }
1887
1888 /*
1889  * regs->ip points to one of the UD2 in __bhi_args[].
1890  */
1891 static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
1892 {
1893         unsigned long addr;
1894         u32 hash;
1895
1896         if (!cfi_bhi)
1897                 return false;
1898
1899         if (regs->ip < (unsigned long)__bhi_args ||
1900             regs->ip >= (unsigned long)__bhi_args_end)
1901                 return false;
1902
1903         /*
1904          * Fetch the return address from the stack, this points to the
1905          * FineIBT preamble. Since the CALL instruction is in the 5 last
1906          * bytes of the preamble, the return address is in fact the target
1907          * address.
1908          */
1909         __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
1910         *target = addr;
1911
1912         addr -= fineibt_preamble_size;
1913         if (!exact_endbr((void *)addr))
1914                 return false;
1915
1916         __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
1917         *type = (u32)regs->r10 + hash;
1918
1919         /*
1920          * The UD2 sites are constructed with a RET immediately following,
1921          * as such the non-fatal case can use the regular fixup.
1922          */
1923         return true;
1924
1925 Efault:
1926         return false;
1927 }
1928
1929 static bool is_paranoid_thunk(unsigned long addr)
1930 {
1931         u32 thunk;
1932
1933         __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
1934         return (thunk & 0x00FFFFFF) == 0xfd75ea;
1935
1936 Efault:
1937         return false;
1938 }
1939
1940 /*
1941  * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
1942  * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
1943  * thunk.
1944  */
1945 static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
1946 {
1947         unsigned long addr = regs->ip - fineibt_paranoid_ud;
1948
1949         if (!cfi_paranoid)
1950                 return false;
1951
1952         if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
1953                 *target = regs->r11 + fineibt_preamble_size;
1954                 *type = regs->r10;
1955
1956                 /*
1957                  * Since the trapping instruction is the exact, but LOCK prefixed,
1958                  * Jcc.d8 that got us here, the normal fixup will work.
1959                  */
1960                 return true;
1961         }
1962
1963         /*
1964          * The cfi_paranoid + ITS thunk combination results in:
1965          *
1966          *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
1967          *  6:   45 3b 53 f7             cmp    -0x9(%r11), %r10d
1968          *  a:   4d 8d 5b f0             lea    -0x10(%r11), %r11
1969          *  e:   2e e8 XX XX XX XX       cs call __x86_indirect_paranoid_thunk_r11
1970          *
1971          * Where the paranoid_thunk looks like:
1972          *
1973          *  1d:  <ea>                    (bad)
1974          *  __x86_indirect_paranoid_thunk_r11:
1975          *  1e:  75 fd                   jne 1d
1976          *  __x86_indirect_its_thunk_r11:
1977          *  20:  41 ff eb                jmp *%r11
1978          *  23:  cc                      int3
1979          *
1980          */
1981         if (is_paranoid_thunk(regs->ip)) {
1982                 *target = regs->r11 + fineibt_preamble_size;
1983                 *type = regs->r10;
1984
1985                 regs->ip = *target;
1986                 return true;
1987         }
1988
1989         return false;
1990 }
1991
1992 bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
1993 {
1994         if (decode_fineibt_paranoid(regs, target, type))
1995                 return true;
1996
1997         if (decode_fineibt_bhi(regs, target, type))
1998                 return true;
1999
2000         return decode_fineibt_preamble(regs, target, type);
2001 }
2002
2003 #else /* !CONFIG_FINEIBT: */
2004
/*
 * Stub used when CONFIG_FINEIBT is not set: takes the same arguments as the
 * real implementation and does nothing.
 */
2005 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2006                             s32 *start_cfi, s32 *end_cfi, bool builtin)
2007 {
2008 }
2009
2010 #ifdef CONFIG_X86_KERNEL_IBT
/* No-op variant for !CONFIG_FINEIBT; only provided when CONFIG_X86_KERNEL_IBT is set. */
2011 static void poison_cfi(void *addr) { }
2012 #endif
2013
2014 #endif /* !CONFIG_FINEIBT */
2015
2016 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
2017                    s32 *start_cfi, s32 *end_cfi)
2018 {
2019         return __apply_fineibt(start_retpoline, end_retpoline,
2020                                start_cfi, end_cfi,
2021                                /* .builtin = */ false);
2022 }
2023
2024 #ifdef CONFIG_SMP
2025 static void alternatives_smp_lock(const s32 *start, const s32 *end,
2026                                   u8 *text, u8 *text_end)
2027 {
2028         const s32 *poff;
2029
2030         for (poff = start; poff < end; poff++) {
2031                 u8 *ptr = (u8 *)poff + *poff;
2032
2033                 if (!*poff || ptr < text || ptr >= text_end)
2034                         continue;
2035                 /* turn DS segment override prefix into lock prefix */
2036                 if (*ptr == 0x3e)
2037                         text_poke(ptr, ((unsigned char []){0xf0}), 1);
2038         }
2039 }
2040
2041 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
2042                                     u8 *text, u8 *text_end)
2043 {
2044         const s32 *poff;
2045
2046         for (poff = start; poff < end; poff++) {
2047                 u8 *ptr = (u8 *)poff + *poff;
2048
2049                 if (!*poff || ptr < text || ptr >= text_end)
2050                         continue;
2051                 /* turn lock prefix into DS segment override prefix */
2052                 if (*ptr == 0xf0)
2053                         text_poke(ptr, ((unsigned char []){0x3E}), 1);
2054         }
2055 }
2056
/*
 * One entry per registered set of lock-prefix sites, linked on
 * smp_alt_modules. Entries are created by alternatives_smp_module_add() and
 * freed by alternatives_smp_module_del().
 */
2057 struct smp_alt_module {
2058         /* owner used as lookup key on removal; the core kernel registers with NULL */
2059         struct module   *mod;
2060         char            *name;
2061
2062         /* ptrs to lock prefixes */
2063         const s32       *locks;
2064         const s32       *locks_end;
2065
2066         /* .text segment, needed to avoid patching init code ;) */
2067         u8              *text;
2068         u8              *text_end;
2069
2070         struct list_head next;
2071 };
2072 static LIST_HEAD(smp_alt_modules);
2073 static bool uniproc_patched = false;    /* protected by text_mutex */
2074
2075 void __init_or_module alternatives_smp_module_add(struct module *mod,
2076                                                   char *name,
2077                                                   void *locks, void *locks_end,
2078                                                   void *text,  void *text_end)
2079 {
2080         struct smp_alt_module *smp;
2081
2082         mutex_lock(&text_mutex);
2083         if (!uniproc_patched)
2084                 goto unlock;
2085
2086         if (num_possible_cpus() == 1)
2087                 /* Don't bother remembering, we'll never have to undo it. */
2088                 goto smp_unlock;
2089
2090         smp = kzalloc(sizeof(*smp), GFP_KERNEL);
2091         if (NULL == smp)
2092                 /* we'll run the (safe but slow) SMP code then ... */
2093                 goto unlock;
2094
2095         smp->mod        = mod;
2096         smp->name       = name;
2097         smp->locks      = locks;
2098         smp->locks_end  = locks_end;
2099         smp->text       = text;
2100         smp->text_end   = text_end;
2101         DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
2102                 smp->locks, smp->locks_end,
2103                 smp->text, smp->text_end, smp->name);
2104
2105         list_add_tail(&smp->next, &smp_alt_modules);
2106 smp_unlock:
2107         alternatives_smp_unlock(locks, locks_end, text, text_end);
2108 unlock:
2109         mutex_unlock(&text_mutex);
2110 }
2111
2112 void __init_or_module alternatives_smp_module_del(struct module *mod)
2113 {
2114         struct smp_alt_module *item;
2115
2116         mutex_lock(&text_mutex);
2117         list_for_each_entry(item, &smp_alt_modules, next) {
2118                 if (mod != item->mod)
2119                         continue;
2120                 list_del(&item->next);
2121                 kfree(item);
2122                 break;
2123         }
2124         mutex_unlock(&text_mutex);
2125 }
2126
2127 void alternatives_enable_smp(void)
2128 {
2129         struct smp_alt_module *mod;
2130
2131         /* Why bother if there are no other CPUs? */
2132         BUG_ON(num_possible_cpus() == 1);
2133
2134         mutex_lock(&text_mutex);
2135
2136         if (uniproc_patched) {
2137                 pr_info("switching to SMP code\n");
2138                 BUG_ON(num_online_cpus() != 1);
2139                 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
2140                 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
2141                 list_for_each_entry(mod, &smp_alt_modules, next)
2142                         alternatives_smp_lock(mod->locks, mod->locks_end,
2143                                               mod->text, mod->text_end);
2144                 uniproc_patched = false;
2145         }
2146         mutex_unlock(&text_mutex);
2147 }
2148
2149 /*
2150  * Return 1 if the address range is reserved for SMP-alternatives.
2151  * Must hold text_mutex.
2152  */
2153 int alternatives_text_reserved(void *start, void *end)
2154 {
2155         struct smp_alt_module *mod;
2156         const s32 *poff;
2157         u8 *text_start = start;
2158         u8 *text_end = end;
2159
2160         lockdep_assert_held(&text_mutex);
2161
2162         list_for_each_entry(mod, &smp_alt_modules, next) {
2163                 if (mod->text > text_end || mod->text_end < text_start)
2164                         continue;
2165                 for (poff = mod->locks; poff < mod->locks_end; poff++) {
2166                         const u8 *ptr = (const u8 *)poff + *poff;
2167
2168                         if (text_start <= ptr && text_end > ptr)
2169                                 return 1;
2170                 }
2171         }
2172
2173         return 0;
2174 }
2175 #endif /* CONFIG_SMP */
2176
2177 /*
2178  * Self-test for the INT3 based CALL emulation code.
2179  *
2180  * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
2181  * properly and that there is a stack gap between the INT3 frame and the
2182  * previous context. Without this gap doing a virtual PUSH on the interrupted
2183  * stack would corrupt the INT3 IRET frame.
2184  *
2185  * See entry_{32,64}.S for more details.
2186  */
2187
2188 /*
2189  * We define the int3_magic() function in assembly to control the calling
2190  * convention such that we can 'call' it from assembly.
2191  */
2192
2193 extern void int3_magic(unsigned int *ptr); /* defined in asm */
2194
/*
 * int3_magic(ptr): emitted into .init.text. It stores 1 through the pointer
 * held in the first argument register (_ASM_ARG1) and returns via ASM_RET.
 */
2195 asm (
2196 "       .pushsection    .init.text, \"ax\", @progbits\n"
2197 "       .type           int3_magic, @function\n"
2198 "int3_magic:\n"
2199         ANNOTATE_NOENDBR
2200 "       movl    $1, (%" _ASM_ARG1 ")\n"
2201         ASM_RET
2202 "       .size           int3_magic, .-int3_magic\n"
2203 "       .popsection\n"
2204 );
2205
2206 extern void int3_selftest_ip(void); /* defined in asm below */
2207
2208 static int __init
2209 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
2210 {
2211         unsigned long selftest = (unsigned long)&int3_selftest_ip;
2212         struct die_args *args = data;
2213         struct pt_regs *regs = args->regs;
2214
2215         OPTIMIZER_HIDE_VAR(selftest);
2216
2217         if (!regs || user_mode(regs))
2218                 return NOTIFY_DONE;
2219
2220         if (val != DIE_INT3)
2221                 return NOTIFY_DONE;
2222
2223         if (regs->ip - INT3_INSN_SIZE != selftest)
2224                 return NOTIFY_DONE;
2225
2226         int3_emulate_call(regs, (unsigned long)&int3_magic);
2227         return NOTIFY_STOP;
2228 }
2229
2230 /* Must be noinline to ensure uniqueness of int3_selftest_ip. */
2231 static noinline void __init int3_selftest(void)
2232 {
2233         static __initdata struct notifier_block int3_exception_nb = {
2234                 .notifier_call  = int3_exception_notify,
2235                 .priority       = INT_MAX-1, /* last */
2236         };
2237         unsigned int val = 0;
2238
2239         BUG_ON(register_die_notifier(&int3_exception_nb));
2240
2241         /*
2242          * Basically: int3_magic(&val); but really complicated :-)
2243          *
2244          * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
2245          * notifier above will emulate CALL for us.
2246          */
2247         asm volatile ("int3_selftest_ip:\n\t"
2248                       ANNOTATE_NOENDBR
2249                       "    int3; nop; nop; nop; nop\n\t"
2250                       : ASM_CALL_CONSTRAINT
2251                       : __ASM_SEL_RAW(a, D) (&val)
2252                       : "memory");
2253
2254         BUG_ON(val != 1);
2255
2256         unregister_die_notifier(&int3_exception_nb);
2257 }
2258
/* Object whose address is handed to __alt_reloc_selftest() by the self-test. */
2259 static __initdata int __alt_reloc_selftest_addr;
2260
/*
 * Target of the CALL in alt_reloc_selftest(): warns unless @arg is the
 * address of __alt_reloc_selftest_addr.
 */
2261 extern void __init __alt_reloc_selftest(void *arg);
2262 __visible noinline void __init __alt_reloc_selftest(void *arg)
2263 {
2264         WARN_ON(arg != &__alt_reloc_selftest_addr);
2265 }
2266
/*
 * Self-test: an ALTERNATIVE keyed on X86_FEATURE_ALWAYS whose replacement
 * loads the address of __alt_reloc_selftest_addr into the first argument
 * register and calls __alt_reloc_selftest().
 */
2267 static noinline void __init alt_reloc_selftest(void)
2268 {
2269         /*
2270          * Tests text_poke_apply_relocation().
2271          *
2272          * This has a relative immediate (CALL) in a place other than the first
2273          * instruction and additionally on x86_64 we get a RIP-relative LEA:
2274          *
2275          *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
2276          *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
2277          *
2278          * Getting this wrong will either crash and burn or tickle the WARN
2279          * above.
2280          */
2281         asm_inline volatile (
2282                 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
2283                 : ASM_CALL_CONSTRAINT
2284                 : [mem] "m" (__alt_reloc_selftest_addr)
2285                 : _ASM_ARG1
2286         );
2287 }
2288
2289 void __init alternative_instructions(void)
2290 {
2291         u64 ibt;
2292
2293         int3_selftest();
2294
2295         /*
2296          * The patching is not fully atomic, so try to avoid local
2297          * interruptions that might execute the to be patched code.
2298          * Other CPUs are not running.
2299          */
2300         stop_nmi();
2301
2302         /*
2303          * Don't stop machine check exceptions while patching.
2304          * MCEs only happen when something got corrupted and in this
2305          * case we must do something about the corruption.
2306          * Ignoring it is worse than an unlikely patching race.
2307          * Also machine checks tend to be broadcast and if one CPU
2308          * goes into machine check the others follow quickly, so we don't
2309          * expect a machine check to cause undue problems during to code
2310          * patching.
2311          */
2312
2313         /*
2314          * Make sure to set (artificial) features depending on used paravirt
2315          * functions which can later influence alternative patching.
2316          */
2317         paravirt_set_cap();
2318
2319         /* Keep CET-IBT disabled until caller/callee are patched */
2320         ibt = ibt_save(/*disable*/ true);
2321
2322         __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
2323                         __cfi_sites, __cfi_sites_end, true);
2324
2325         /*
2326          * Rewrite the retpolines, must be done before alternatives since
2327          * those can rewrite the retpoline thunks.
2328          */
2329         apply_retpolines(__retpoline_sites, __retpoline_sites_end);
2330         apply_returns(__return_sites, __return_sites_end);
2331
2332         its_fini_core();
2333
2334         /*
2335          * Adjust all CALL instructions to point to func()-10, including
2336          * those in .altinstr_replacement.
2337          */
2338         callthunks_patch_builtin_calls();
2339
2340         apply_alternatives(__alt_instructions, __alt_instructions_end);
2341
2342         /*
2343          * Seal all functions that do not have their address taken.
2344          */
2345         apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
2346
2347         ibt_restore(ibt);
2348
2349 #ifdef CONFIG_SMP
2350         /* Patch to UP if other cpus not imminent. */
2351         if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
2352                 uniproc_patched = true;
2353                 alternatives_smp_module_add(NULL, "core kernel",
2354                                             __smp_locks, __smp_locks_end,
2355                                             _text, _etext);
2356         }
2357
2358         if (!uniproc_patched || num_possible_cpus() == 1) {
2359                 free_init_pages("SMP alternatives",
2360                                 (unsigned long)__smp_locks,
2361                                 (unsigned long)__smp_locks_end);
2362         }
2363 #endif
2364
2365         restart_nmi();
2366         alternatives_patched = 1;
2367
2368         alt_reloc_selftest();
2369 }
2370
2371 /**
2372  * text_poke_early - Update instructions on a live kernel at boot time
2373  * @addr: address to modify
2374  * @opcode: source of the copy
2375  * @len: length to copy
2376  *
2377  * When you use this code to patch more than one byte of an instruction
2378  * you need to make sure that other CPUs cannot execute this code in parallel.
2379  * Also no thread must be currently preempted in the middle of these
2380  * instructions. And on the local CPU you need to be protected against NMI or
2381  * MCE handlers seeing an inconsistent instruction while you patch.
2382  */
2383 void __init_or_module text_poke_early(void *addr, const void *opcode,
2384                                       size_t len)
2385 {
2386         unsigned long flags;
2387
2388         if (boot_cpu_has(X86_FEATURE_NX) &&
2389             is_module_text_address((unsigned long)addr)) {
2390                 /*
2391                  * Modules text is marked initially as non-executable, so the
2392                  * code cannot be running and speculative code-fetches are
2393                  * prevented. Just change the code.
2394                  */
2395                 memcpy(addr, opcode, len);
2396         } else {
2397                 local_irq_save(flags);
2398                 memcpy(addr, opcode, len);
2399                 sync_core();
2400                 local_irq_restore(flags);
2401
2402                 /*
2403                  * Could also do a CLFLUSH here to speed up CPU recovery; but
2404                  * that causes hangs on some VIA CPUs.
2405                  */
2406         }
2407 }
2408
/*
 * The mm and the virtual address inside it that __text_poke() uses for its
 * temporary writable mapping of the page(s) being patched.
 */
2409 __ro_after_init struct mm_struct *text_poke_mm;
2410 __ro_after_init unsigned long text_poke_mm_addr;
2411
/* text_poke_f callback: copy @len bytes from @src to @dst. */
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
        (void)memcpy(dst, src, len);
}
2416
/*
 * text_poke_f callback: fill @len bytes at @dst with the byte value held in
 * the int that @src points to.
 */
static void text_poke_memset(void *dst, const void *src, size_t len)
{
        const int *fill = src;

        memset(dst, *fill, len);
}

/* Signature shared by the write callbacks handed to __text_poke(). */
typedef void text_poke_f(void *dst, const void *src, size_t len);
2425
/*
 * __text_poke - write to kernel text through a temporary alias mapping
 * @func: callback performing the write (this file passes text_poke_memcpy or
 *        text_poke_memset)
 * @addr: text address to modify
 * @src:  source argument handed to @func
 * @len:  number of bytes to write
 *
 * The page backing @addr (plus the following page when the write crosses a
 * page boundary) is mapped at text_poke_mm_addr inside text_poke_mm, that mm
 * is loaded temporarily, @func writes through the alias, the PTEs are
 * cleared again, the previous mm is restored and the TLB range is flushed.
 * When @func is text_poke_memcpy the result is verified against @src.
 *
 * Only two PTEs are ever set up, so [@addr, @addr + @len) must not extend
 * past the page following the one containing @addr.
 *
 * Returns @addr. Failures are not recoverable and end in BUG().
 */
2426 static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
2427 {
2428         bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
2429         struct page *pages[2] = {NULL};
2430         struct mm_struct *prev_mm;
2431         unsigned long flags;
2432         pte_t pte, *ptep;
2433         spinlock_t *ptl;
2434         pgprot_t pgprot;
2435
2436         /*
2437          * While boot memory allocator is running we cannot use struct pages as
2438          * they are not yet initialized. There is no way to recover.
2439          */
2440         BUG_ON(!after_bootmem);
2441
2442         if (!core_kernel_text((unsigned long)addr)) {
2443                 pages[0] = vmalloc_to_page(addr);
2444                 if (cross_page_boundary)
2445                         pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
2446         } else {
2447                 pages[0] = virt_to_page(addr);
2448                 WARN_ON(!PageReserved(pages[0]));
2449                 if (cross_page_boundary)
2450                         pages[1] = virt_to_page(addr + PAGE_SIZE);
2451         }
2452         /*
2453          * If something went wrong, crash and burn since recovery paths are not
2454          * implemented.
2455          */
2456         BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
2457
2458         /*
2459          * Map the page without the global bit, as TLB flushing is done with
2460          * flush_tlb_mm_range(), which is intended for non-global PTEs.
2461          */
2462         pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
2463
2464         /*
2465          * The lock is not really needed, but this allows to avoid open-coding.
2466          */
2467         ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
2468
2469         /*
2470          * This must not fail; preallocated in poking_init().
2471          */
2472         VM_BUG_ON(!ptep);
2473
2474         local_irq_save(flags);
2475
2476         pte = mk_pte(pages[0], pgprot);
2477         set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
2478
2479         if (cross_page_boundary) {
2480                 pte = mk_pte(pages[1], pgprot);
2481                 set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
2482         }
2483
2484         /*
2485          * Loading the temporary mm behaves as a compiler barrier, which
2486          * guarantees that the PTE will be set at the time memcpy() is done.
2487          */
2488         prev_mm = use_temporary_mm(text_poke_mm);
2489
2490         kasan_disable_current();
2491         func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
2492         kasan_enable_current();
2493
2494         /*
2495          * Ensure that the PTE is only cleared after the instructions of memcpy
2496          * were issued by using a compiler barrier.
2497          */
2498         barrier();
2499
2500         pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
2501         if (cross_page_boundary)
2502                 pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
2503
2504         /*
2505          * Loading the previous page-table hierarchy requires a serializing
2506          * instruction that already allows the core to see the updated version.
2507          * Xen-PV is assumed to serialize execution in a similar manner.
2508          */
2509         unuse_temporary_mm(prev_mm);
2510
2511         /*
2512          * Flushing the TLB might involve IPIs, which would require enabled
2513          * IRQs, but not if the mm is not used, as it is in this point.
2514          */
2515         flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
2516                            (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
2517                            PAGE_SHIFT, false);
2518
2519         if (func == text_poke_memcpy) {
2520                 /*
2521                  * If the text does not match what we just wrote then something is
2522                  * fundamentally screwy; there's nothing we can really do about that.
2523                  */
2524                 BUG_ON(memcmp(addr, src, len));
2525         }
2526
2527         local_irq_restore(flags);
2528         pte_unmap_unlock(ptep, ptl);
2529         return addr;
2530 }
2531
2532 /**
2533  * text_poke - Update instructions on a live kernel
2534  * @addr: address to modify
2535  * @opcode: source of the copy
2536  * @len: length to copy
2537  *
2538  * Only atomic text poke/set should be allowed when not doing early patching.
2539  * It means the size must be writable atomically and the address must be aligned
2540  * in a way that permits an atomic write. It also makes sure we fit on a single
2541  * page.
2542  *
2543  * Note that the caller must ensure that if the modified code is part of a
2544  * module, the module would not be removed during poking. This can be achieved
2545  * by registering a module notifier, and ordering module removal and patching
2546  * through a mutex.
2547  */
2548 void *text_poke(void *addr, const void *opcode, size_t len)
2549 {
2550         lockdep_assert_held(&text_mutex);
2551
2552         return __text_poke(text_poke_memcpy, addr, opcode, len);
2553 }
2554
2555 /**
2556  * text_poke_kgdb - Update instructions on a live kernel by kgdb
2557  * @addr: address to modify
2558  * @opcode: source of the copy
2559  * @len: length to copy
2560  *
2561  * Only atomic text poke/set should be allowed when not doing early patching.
2562  * It means the size must be writable atomically and the address must be aligned
2563  * in a way that permits an atomic write. It also makes sure we fit on a single
2564  * page.
2565  *
2566  * Context: should only be used by kgdb, which ensures no other core is running,
2567  *          despite the fact it does not hold the text_mutex.
2568  */
2569 void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2570 {
2571         return __text_poke(text_poke_memcpy, addr, opcode, len);
2572 }
2573
2574 void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2575                             bool core_ok)
2576 {
2577         unsigned long start = (unsigned long)addr;
2578         size_t patched = 0;
2579
2580         if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2581                 return NULL;
2582
2583         while (patched < len) {
2584                 unsigned long ptr = start + patched;
2585                 size_t s;
2586
2587                 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2588
2589                 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
2590                 patched += s;
2591         }
2592         return addr;
2593 }
2594
2595 /**
2596  * text_poke_copy - Copy instructions into (an unused part of) RX memory
2597  * @addr: address to modify
2598  * @opcode: source of the copy
2599  * @len: length to copy, could be more than 2x PAGE_SIZE
2600  *
2601  * Not safe against concurrent execution; useful for JITs to dump
2602  * new code blocks into unused regions of RX memory. Can be used in
2603  * conjunction with synchronize_rcu_tasks() to wait for existing
2604  * execution to quiesce after having made sure no existing functions
2605  * pointers are live.
2606  */
2607 void *text_poke_copy(void *addr, const void *opcode, size_t len)
2608 {
2609         mutex_lock(&text_mutex);
2610         addr = text_poke_copy_locked(addr, opcode, len, false);
2611         mutex_unlock(&text_mutex);
2612         return addr;
2613 }
2614
2615 /**
2616  * text_poke_set - memset into (an unused part of) RX memory
2617  * @addr: address to modify
2618  * @c: the byte to fill the area with
2619  * @len: length to copy, could be more than 2x PAGE_SIZE
2620  *
2621  * This is useful to overwrite unused regions of RX memory with illegal
2622  * instructions.
2623  */
2624 void *text_poke_set(void *addr, int c, size_t len)
2625 {
2626         unsigned long start = (unsigned long)addr;
2627         size_t patched = 0;
2628
2629         if (WARN_ON_ONCE(core_kernel_text(start)))
2630                 return NULL;
2631
2632         mutex_lock(&text_mutex);
2633         while (patched < len) {
2634                 unsigned long ptr = start + patched;
2635                 size_t s;
2636
2637                 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2638
2639                 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
2640                 patched += s;
2641         }
2642         mutex_unlock(&text_mutex);
2643         return addr;
2644 }
2645
/* Callback for on_each_cpu(): runs sync_core(); @info is unused. */
2646 static void do_sync_core(void *info)
2647 {
2648         sync_core();
2649 }
2650
/* Run do_sync_core() on every CPU via on_each_cpu(). */
2651 void smp_text_poke_sync_each_cpu(void)
2652 {
2653         on_each_cpu(do_sync_core, NULL, 1);
2654 }
2655
2656 /*
2657  * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
2658  * this thing. When len == 6 everything is prefixed with 0x0f and we map
2659  * opcode to Jcc.d8, using len to distinguish.
2660  */
2661 struct smp_text_poke_loc {
2662         /* addr := _stext + rel_addr */
2663         s32 rel_addr;
2664         s32 disp;
2665         u8 len;
2666         u8 opcode;
2667         const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
2668         /* see smp_text_poke_batch_finish() */
2669         u8 old;
2670 };
2671
2672 #define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
2673
2674 static struct smp_text_poke_array {
2675         struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
2676         int nr_entries;
2677 } text_poke_array;
2678
2679 static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
2680
2681 /*
2682  * These four __always_inline annotations imply noinstr, necessary
2683  * due to smp_text_poke_int3_handler() being noinstr:
2684  */
2685
2686 static __always_inline bool try_get_text_poke_array(void)
2687 {
2688         atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
2689
2690         if (!raw_atomic_inc_not_zero(refs))
2691                 return false;
2692
2693         return true;
2694 }
2695
2696 static __always_inline void put_text_poke_array(void)
2697 {
2698         atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
2699
2700         smp_mb__before_atomic();
2701         raw_atomic_dec(refs);
2702 }
2703
2704 static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
2705 {
2706         return _stext + tpl->rel_addr;
2707 }
2708
2709 static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
2710 {
2711         if (tpl_a < text_poke_addr(tpl_b))
2712                 return -1;
2713         if (tpl_a > text_poke_addr(tpl_b))
2714                 return 1;
2715         return 0;
2716 }
2717
2718 noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
2719 {
2720         struct smp_text_poke_loc *tpl;
2721         int ret = 0;
2722         void *ip;
2723
2724         if (user_mode(regs))
2725                 return 0;
2726
2727         /*
2728          * Having observed our INT3 instruction, we now must observe
2729          * text_poke_array with non-zero refcount:
2730          *
2731          *      text_poke_array_refs = 1                INT3
2732          *      WMB                     RMB
2733          *      write INT3              if (text_poke_array_refs != 0)
2734          */
2735         smp_rmb();
2736
2737         if (!try_get_text_poke_array())
2738                 return 0;
2739
2740         /*
2741          * Discount the INT3. See smp_text_poke_batch_finish().
2742          */
2743         ip = (void *) regs->ip - INT3_INSN_SIZE;
2744
2745         /*
2746          * Skip the binary search if there is a single member in the vector.
2747          */
2748         if (unlikely(text_poke_array.nr_entries > 1)) {
2749                 tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
2750                                       sizeof(struct smp_text_poke_loc),
2751                                       patch_cmp);
2752                 if (!tpl)
2753                         goto out_put;
2754         } else {
2755                 tpl = text_poke_array.vec;
2756                 if (text_poke_addr(tpl) != ip)
2757                         goto out_put;
2758         }
2759
2760         ip += tpl->len;
2761
2762         switch (tpl->opcode) {
2763         case INT3_INSN_OPCODE:
2764                 /*
2765                  * Someone poked an explicit INT3, they'll want to handle it,
2766                  * do not consume.
2767                  */
2768                 goto out_put;
2769
2770         case RET_INSN_OPCODE:
2771                 int3_emulate_ret(regs);
2772                 break;
2773
2774         case CALL_INSN_OPCODE:
2775                 int3_emulate_call(regs, (long)ip + tpl->disp);
2776                 break;
2777
2778         case JMP32_INSN_OPCODE:
2779         case JMP8_INSN_OPCODE:
2780                 int3_emulate_jmp(regs, (long)ip + tpl->disp);
2781                 break;
2782
2783         case 0x70 ... 0x7f: /* Jcc */
2784                 int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
2785                 break;
2786
2787         default:
2788                 BUG();
2789         }
2790
2791         ret = 1;
2792
2793 out_put:
2794         put_text_poke_array();
2795         return ret;
2796 }
2797
2798 /**
2799  * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
2800  *
2801  * Input state:
2802  *  text_poke_array.vec: vector of instructions to patch
2803  *  text_poke_array.nr_entries: number of entries in the vector
2804  *
2805  * Modify multi-byte instructions by using INT3 breakpoints on SMP.
2806  * We completely avoid using stop_machine() here, and achieve the
2807  * synchronization using INT3 breakpoints and SMP cross-calls.
2808  *
2809  * The way it is done:
2810  *      - For each entry in the vector:
2811  *              - add an INT3 trap to the address that will be patched
2812  *      - SMP sync all CPUs
2813  *      - For each entry in the vector:
2814  *              - update all but the first byte of the patched range
2815  *      - SMP sync all CPUs
2816  *      - For each entry in the vector:
2817  *              - replace the first byte (INT3) by the first byte of the
2818  *                replacing opcode
2819  *      - SMP sync all CPUs
2820  */
2821 void smp_text_poke_batch_finish(void)
2822 {
2823         unsigned char int3 = INT3_INSN_OPCODE;
2824         unsigned int i;
2825         int do_sync;
2826
2827         if (!text_poke_array.nr_entries)
2828                 return;
2829
2830         lockdep_assert_held(&text_mutex);
2831
2832         /*
2833          * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
2834          * ensure reading a non-zero refcount provides up to date text_poke_array data.
2835          */
2836         for_each_possible_cpu(i)
2837                 atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
2838
2839         /*
2840          * Function tracing can enable thousands of places that need to be
2841          * updated. This can take quite some time, and with full kernel debugging
2842          * enabled, this could cause the softlockup watchdog to trigger.
2843          * This function gets called every 256 entries added to be patched.
2844          * Call cond_resched() here to make sure that other tasks can get scheduled
2845          * while processing all the functions being patched.
2846          */
2847         cond_resched();
2848
2849         /*
2850          * Corresponding read barrier in INT3 notifier for making sure the
2851          * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
2852          */
2853         smp_wmb();
2854
2855         /*
2856          * First step: add a INT3 trap to the address that will be patched.
2857          */
2858         for (i = 0; i < text_poke_array.nr_entries; i++) {
2859                 text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
2860                 text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
2861         }
2862
2863         smp_text_poke_sync_each_cpu();
2864
2865         /*
2866          * Second step: update all but the first byte of the patched range.
2867          */
2868         for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
2869                 u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
2870                 u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
2871                 const u8 *new = text_poke_array.vec[i].text;
2872                 int len = text_poke_array.vec[i].len;
2873
2874                 if (len - INT3_INSN_SIZE > 0) {
2875                         memcpy(old + INT3_INSN_SIZE,
2876                                text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
2877                                len - INT3_INSN_SIZE);
2878
2879                         if (len == 6) {
2880                                 _new[0] = 0x0f;
2881                                 memcpy(_new + 1, new, 5);
2882                                 new = _new;
2883                         }
2884
2885                         text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
2886                                   new + INT3_INSN_SIZE,
2887                                   len - INT3_INSN_SIZE);
2888
2889                         do_sync++;
2890                 }
2891
2892                 /*
2893                  * Emit a perf event to record the text poke, primarily to
2894                  * support Intel PT decoding which must walk the executable code
2895                  * to reconstruct the trace. The flow up to here is:
2896                  *   - write INT3 byte
2897                  *   - IPI-SYNC
2898                  *   - write instruction tail
2899                  * At this point the actual control flow will be through the
2900                  * INT3 and handler and not hit the old or new instruction.
2901                  * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2902                  * can still be decoded. Subsequently:
2903                  *   - emit RECORD_TEXT_POKE with the new instruction
2904                  *   - IPI-SYNC
2905                  *   - write first byte
2906                  *   - IPI-SYNC
2907                  * So before the text poke event timestamp, the decoder will see
2908                  * either the old instruction flow or FUP/TIP of INT3. After the
2909                  * text poke event timestamp, the decoder will see either the
2910                  * new instruction flow or FUP/TIP of INT3. Thus decoders can
2911                  * use the timestamp as the point at which to modify the
2912                  * executable code.
2913                  * The old instruction is recorded so that the event can be
2914                  * processed forwards or backwards.
2915                  */
2916                 perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
2917         }
2918
2919         if (do_sync) {
2920                 /*
2921                  * According to Intel, this core syncing is very likely
2922                  * not necessary and we'd be safe even without it. But
2923                  * better safe than sorry (plus there's not only Intel).
2924                  */
2925                 smp_text_poke_sync_each_cpu();
2926         }
2927
2928         /*
2929          * Third step: replace the first byte (INT3) by the first byte of the
2930          * replacing opcode.
2931          */
2932         for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
2933                 u8 byte = text_poke_array.vec[i].text[0];
2934
2935                 if (text_poke_array.vec[i].len == 6)
2936                         byte = 0x0f;
2937
2938                 if (byte == INT3_INSN_OPCODE)
2939                         continue;
2940
2941                 text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
2942                 do_sync++;
2943         }
2944
2945         if (do_sync)
2946                 smp_text_poke_sync_each_cpu();
2947
2948         /*
2949          * Remove and wait for refs to be zero.
2950          *
2951          * Notably, if after step-3 above the INT3 got removed, then the
2952          * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
2953          * handlers and the below spin-wait will not happen.
2954          *
2955          * IOW. unless the replacement instruction is INT3, this case goes
2956          * unused.
2957          */
2958         for_each_possible_cpu(i) {
2959                 atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
2960
2961                 if (unlikely(!atomic_dec_and_test(refs)))
2962                         atomic_cond_read_acquire(refs, !VAL);
2963         }
2964
2965         /* They are all completed: */
2966         text_poke_array.nr_entries = 0;
2967 }
2968
2969 static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
2970 {
2971         struct smp_text_poke_loc *tpl;
2972         struct insn insn;
2973         int ret, i = 0;
2974
2975         tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
2976
2977         if (len == 6)
2978                 i = 1;
2979         memcpy((void *)tpl->text, opcode+i, len-i);
2980         if (!emulate)
2981                 emulate = opcode;
2982
2983         ret = insn_decode_kernel(&insn, emulate);
2984         BUG_ON(ret < 0);
2985
2986         tpl->rel_addr = addr - (void *)_stext;
2987         tpl->len = len;
2988         tpl->opcode = insn.opcode.bytes[0];
2989
2990         if (is_jcc32(&insn)) {
2991                 /*
2992                  * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
2993                  */
2994                 tpl->opcode = insn.opcode.bytes[1] - 0x10;
2995         }
2996
2997         switch (tpl->opcode) {
2998         case RET_INSN_OPCODE:
2999         case JMP32_INSN_OPCODE:
3000         case JMP8_INSN_OPCODE:
3001                 /*
3002                  * Control flow instructions without implied execution of the
3003                  * next instruction can be padded with INT3.
3004                  */
3005                 for (i = insn.length; i < len; i++)
3006                         BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
3007                 break;
3008
3009         default:
3010                 BUG_ON(len != insn.length);
3011         }
3012
3013         switch (tpl->opcode) {
3014         case INT3_INSN_OPCODE:
3015         case RET_INSN_OPCODE:
3016                 break;
3017
3018         case CALL_INSN_OPCODE:
3019         case JMP32_INSN_OPCODE:
3020         case JMP8_INSN_OPCODE:
3021         case 0x70 ... 0x7f: /* Jcc */
3022                 tpl->disp = insn.immediate.value;
3023                 break;
3024
3025         default: /* assume NOP */
3026                 switch (len) {
3027                 case 2: /* NOP2 -- emulate as JMP8+0 */
3028                         BUG_ON(memcmp(emulate, x86_nops[len], len));
3029                         tpl->opcode = JMP8_INSN_OPCODE;
3030                         tpl->disp = 0;
3031                         break;
3032
3033                 case 5: /* NOP5 -- emulate as JMP32+0 */
3034                         BUG_ON(memcmp(emulate, x86_nops[len], len));
3035                         tpl->opcode = JMP32_INSN_OPCODE;
3036                         tpl->disp = 0;
3037                         break;
3038
3039                 default: /* unknown instruction */
3040                         BUG();
3041                 }
3042                 break;
3043         }
3044 }
3045
3046 /*
3047  * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing
3048  * early if needed.
3049  */
3050 static bool text_poke_addr_ordered(void *addr)
3051 {
3052         WARN_ON_ONCE(!addr);
3053
3054         if (!text_poke_array.nr_entries)
3055                 return true;
3056
3057         /*
3058          * If the last current entry's address is higher than the
3059          * new entry's address we'd like to add, then ordering
3060          * is violated and we must first flush all pending patching
3061          * requests:
3062          */
3063         if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
3064                 return false;
3065
3066         return true;
3067 }
3068
3069 /**
3070  * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
3071  * @addr:       address to patch
3072  * @opcode:     opcode of new instruction
3073  * @len:        length to copy
3074  * @emulate:    instruction to be emulated
3075  *
3076  * Add a new instruction to the current queue of to-be-patched instructions
3077  * the kernel maintains. The patching request will not be executed immediately,
3078  * but becomes part of an array of patching requests, optimized for batched
3079  * execution. All pending patching requests will be executed on the next
3080  * smp_text_poke_batch_finish() call.
3081  */
3082 void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
3083 {
3084         if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
3085                 smp_text_poke_batch_finish();
3086         __smp_text_poke_batch_add(addr, opcode, len, emulate);
3087 }
3088
3089 /**
3090  * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
3091  * @addr:       address to patch
3092  * @opcode:     opcode of new instruction
3093  * @len:        length to copy
3094  * @emulate:    instruction to be emulated
3095  *
3096  * Update a single instruction with the vector in the stack, avoiding
3097  * dynamically allocated memory. This function should be used when it is
3098  * not possible to allocate memory for a vector. The single instruction
3099  * is patched in immediately.
3100  */
3101 void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
3102 {
3103         smp_text_poke_batch_add(addr, opcode, len, emulate);
3104         smp_text_poke_batch_finish();
3105 }