Merge branch 'x86-paravirt-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Jul 2019 00:34:44 +0000 (17:34 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Jul 2019 00:34:44 +0000 (17:34 -0700)
Pull x86 paravirt updates from Ingo Molnar:
 "A handful of paravirt patching code enhancements to make it more
  robust against patching failures, plus related (and not so related)
  cleanups - by Thomas Gleixner and myself"

* 'x86-paravirt-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/paravirt: Rename paravirt_patch_site::instrtype to paravirt_patch_site::type
  x86/paravirt: Standardize 'insn_buff' variable names
  x86/paravirt: Match paravirt patchlet field definition ordering to initialization ordering
  x86/paravirt: Replace the paravirt patch asm magic
  x86/paravirt: Unify the 32/64 bit paravirt patching code
  x86/paravirt: Detect over-sized patching bugs in paravirt_patch_call()
  x86/paravirt: Detect over-sized patching bugs in paravirt_patch_insns()
  x86/paravirt: Remove bogus extern declarations
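
The two "Detect over-sized patching bugs" entries above amount to refusing to patch when a replacement sequence does not fit the patch site. As a rough sketch only (not the exact upstream code), a check of that shape looks like the helper below; the function name patch_insns_checked() is made up for this illustration, while the insn_buff/len/start/end naming follows the diffs further down.

	#include <linux/bug.h>		/* BUG_ON() */
	#include <linux/string.h>	/* memcpy() */

	/*
	 * Sketch: copy a replacement instruction sequence into a patch-site
	 * buffer, refusing loudly if it cannot fit.  Illustrative only.
	 */
	static unsigned int patch_insns_checked(void *insn_buff, unsigned int len,
						const char *start, const char *end)
	{
		unsigned int insn_len = end - start;

		/*
		 * An over-sized patchlet is a bug in the patch-site definition:
		 * stop here instead of silently overwriting the code that
		 * follows the patch site.
		 */
		BUG_ON(insn_len > len);

		memcpy(insn_buff, start, insn_len);
		return insn_len;
	}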

arch/x86/events/intel/ds.c
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/kprobes/opt.c
arch/x86/kernel/paravirt.c
arch/x86/tools/insn_decoder_test.c
arch/x86/tools/insn_sanity.c

diff --combined arch/x86/events/intel/ds.c
index 505c73dc6a730ee87cf9db820dd9b5a6c5848180,50f647e131bc028936b3ccef52aa6ad3158a8bfe..2c8db2c19328bcbf95e8c14b4f58e0d2d6d7d7ba
@@@ -337,7 -337,7 +337,7 @@@ static int alloc_pebs_buffer(int cpu
        struct debug_store *ds = hwev->ds;
        size_t bsiz = x86_pmu.pebs_buffer_size;
        int max, node = cpu_to_node(cpu);
-       void *buffer, *ibuffer, *cea;
+       void *buffer, *insn_buff, *cea;
  
        if (!x86_pmu.pebs)
                return 0;
         * buffer then.
         */
        if (x86_pmu.intel_cap.pebs_format < 2) {
-               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
-               if (!ibuffer) {
+               insn_buff = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+               if (!insn_buff) {
                        dsfree_pages(buffer, bsiz);
                        return -ENOMEM;
                }
-               per_cpu(insn_buffer, cpu) = ibuffer;
+               per_cpu(insn_buffer, cpu) = insn_buff;
        }
        hwev->ds_pebs_vaddr = buffer;
        /* Update the cpu entry area mapping */
@@@ -684,7 -684,7 +684,7 @@@ struct event_constraint intel_core2_peb
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
        EVENT_CONSTRAINT_END
  };
  
@@@ -693,7 -693,7 +693,7 @@@ struct event_constraint intel_atom_pebs
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
        /* Allow all events as PEBS with no flags */
        INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
        EVENT_CONSTRAINT_END
  
  struct event_constraint intel_slm_pebs_event_constraints[] = {
        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x1),
        /* Allow all events as PEBS with no flags */
        INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
        EVENT_CONSTRAINT_END
@@@ -726,7 -726,7 +726,7 @@@ struct event_constraint intel_nehalem_p
        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
        INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
        EVENT_CONSTRAINT_END
  };
  
@@@ -743,7 -743,7 +743,7 @@@ struct event_constraint intel_westmere_
        INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
        INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
        /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
        EVENT_CONSTRAINT_END
  };
  
@@@ -752,7 -752,7 +752,7 @@@ struct event_constraint intel_snb_pebs_
        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
        INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
          INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
          INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
          INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@@ -767,9 -767,9 +767,9 @@@ struct event_constraint intel_ivb_pebs_
          INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
        INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@@ -783,9 -783,9 +783,9 @@@ struct event_constraint intel_hsw_pebs_
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
        INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@@ -806,9 -806,9 +806,9 @@@ struct event_constraint intel_bdw_pebs_
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
        INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
        /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
  struct event_constraint intel_skl_pebs_event_constraints[] = {
        INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
        /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
        /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
 -      INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
        INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
        INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
        EVENT_CONSTRAINT_END
  };
  
 +struct event_constraint intel_icl_pebs_event_constraints[] = {
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),   /* INST_RETIRED.PREC_DIST */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x400000000ULL),  /* SLOTS */
 +
 +      INTEL_PLD_CONSTRAINT(0x1cd, 0xff),                      /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),    /* MEM_INST_RETIRED.LOAD */
 +      INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),    /* MEM_INST_RETIRED.STORE */
 +
 +      INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
 +
 +      INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),                /* MEM_INST_RETIRED.* */
 +
 +      /*
 +       * Everything else is handled by PMU_FL_PEBS_ALL, because we
 +       * need the full constraints from the main table.
 +       */
 +
 +      EVENT_CONSTRAINT_END
 +};
 +
  struct event_constraint *intel_pebs_constraints(struct perf_event *event)
  {
        struct event_constraint *c;
  
        if (x86_pmu.pebs_constraints) {
                for_each_event_constraint(c, x86_pmu.pebs_constraints) {
 -                      if ((event->hw.config & c->cmask) == c->code) {
 +                      if (constraint_match(c, event->hw.config)) {
                                event->hw.flags |= c->flags;
                                return c;
                        }
@@@ -926,87 -906,17 +926,87 @@@ static inline void pebs_update_threshol
  
        if (cpuc->n_pebs == cpuc->n_large_pebs) {
                threshold = ds->pebs_absolute_maximum -
 -                      reserved * x86_pmu.pebs_record_size;
 +                      reserved * cpuc->pebs_record_size;
        } else {
 -              threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
 +              threshold = ds->pebs_buffer_base + cpuc->pebs_record_size;
        }
  
        ds->pebs_interrupt_threshold = threshold;
  }
  
 +static void adaptive_pebs_record_size_update(void)
 +{
 +      struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 +      u64 pebs_data_cfg = cpuc->pebs_data_cfg;
 +      int sz = sizeof(struct pebs_basic);
 +
 +      if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
 +              sz += sizeof(struct pebs_meminfo);
 +      if (pebs_data_cfg & PEBS_DATACFG_GP)
 +              sz += sizeof(struct pebs_gprs);
 +      if (pebs_data_cfg & PEBS_DATACFG_XMMS)
 +              sz += sizeof(struct pebs_xmm);
 +      if (pebs_data_cfg & PEBS_DATACFG_LBRS)
 +              sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry);
 +
 +      cpuc->pebs_record_size = sz;
 +}
 +
 +#define PERF_PEBS_MEMINFO_TYPE        (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
 +                              PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
 +                              PERF_SAMPLE_TRANSACTION)
 +
 +static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 +{
 +      struct perf_event_attr *attr = &event->attr;
 +      u64 sample_type = attr->sample_type;
 +      u64 pebs_data_cfg = 0;
 +      bool gprs, tsx_weight;
 +
 +      if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) &&
 +          attr->precise_ip > 1)
 +              return pebs_data_cfg;
 +
 +      if (sample_type & PERF_PEBS_MEMINFO_TYPE)
 +              pebs_data_cfg |= PEBS_DATACFG_MEMINFO;
 +
 +      /*
 +       * We need GPRs when:
 +       * + user requested them
 +       * + precise_ip < 2 for the non event IP
 +       * + For RTM TSX weight we need GPRs for the abort code.
 +       */
 +      gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
 +             (attr->sample_regs_intr & PEBS_GP_REGS);
 +
 +      tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
 +                   ((attr->config & INTEL_ARCH_EVENT_MASK) ==
 +                    x86_pmu.rtm_abort_event);
 +
 +      if (gprs || (attr->precise_ip < 2) || tsx_weight)
 +              pebs_data_cfg |= PEBS_DATACFG_GP;
 +
 +      if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
 +          (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
 +              pebs_data_cfg |= PEBS_DATACFG_XMMS;
 +
 +      if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
 +              /*
 +               * For now always log all LBRs. Could configure this
 +               * later.
 +               */
 +              pebs_data_cfg |= PEBS_DATACFG_LBRS |
 +                      ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT);
 +      }
 +
 +      return pebs_data_cfg;
 +}
 +
  static void
 -pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
 +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
 +                struct perf_event *event, bool add)
  {
 +      struct pmu *pmu = event->ctx->pmu;
        /*
         * Make sure we get updated with the first PEBS
         * event. It will trigger also during removal, but
                update = true;
        }
  
 +      /*
 +       * The PEBS record doesn't shrink on pmu::del(). Doing so would require
 +       * iterating all remaining PEBS events to reconstruct the config.
 +       */
 +      if (x86_pmu.intel_cap.pebs_baseline && add) {
 +              u64 pebs_data_cfg;
 +
 +              /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */
 +              if (cpuc->n_pebs == 1) {
 +                      cpuc->pebs_data_cfg = 0;
 +                      cpuc->pebs_record_size = sizeof(struct pebs_basic);
 +              }
 +
 +              pebs_data_cfg = pebs_update_adaptive_cfg(event);
 +
 +              /* Update pebs_record_size if new event requires more data. */
 +              if (pebs_data_cfg & ~cpuc->pebs_data_cfg) {
 +                      cpuc->pebs_data_cfg |= pebs_data_cfg;
 +                      adaptive_pebs_record_size_update();
 +                      update = true;
 +              }
 +      }
 +
        if (update)
                pebs_update_threshold(cpuc);
  }
@@@ -1060,7 -947,7 +1060,7 @@@ void intel_pmu_pebs_add(struct perf_eve
        if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
                cpuc->n_large_pebs++;
  
 -      pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
 +      pebs_update_state(needed_cb, cpuc, event, true);
  }
  
  void intel_pmu_pebs_enable(struct perf_event *event)
  
        cpuc->pebs_enabled |= 1ULL << hwc->idx;
  
 -      if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
 +      if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5))
                cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
        else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
                cpuc->pebs_enabled |= 1ULL << 63;
  
 +      if (x86_pmu.intel_cap.pebs_baseline) {
 +              hwc->config |= ICL_EVENTSEL_ADAPTIVE;
 +              if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) {
 +                      wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg);
 +                      cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg;
 +              }
 +      }
 +
        /*
         * Use auto-reload if possible to save a MSR write in the PMI.
         * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
@@@ -1112,7 -991,7 +1112,7 @@@ void intel_pmu_pebs_del(struct perf_eve
        if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
                cpuc->n_large_pebs--;
  
 -      pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
 +      pebs_update_state(needed_cb, cpuc, event, false);
  }
  
  void intel_pmu_pebs_disable(struct perf_event *event)
  
        cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
  
 -      if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
 +      if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) &&
 +          (x86_pmu.version < 5))
                cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
        else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
                cpuc->pebs_enabled &= ~(1ULL << 63);
@@@ -1247,57 -1125,34 +1247,57 @@@ static int intel_pmu_pebs_fixup_ip(stru
        return 0;
  }
  
 -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
 +static inline u64 intel_get_tsx_weight(u64 tsx_tuning)
  {
 -      if (pebs->tsx_tuning) {
 -              union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
 +      if (tsx_tuning) {
 +              union hsw_tsx_tuning tsx = { .value = tsx_tuning };
                return tsx.cycles_last_block;
        }
        return 0;
  }
  
 -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
 +static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax)
  {
 -      u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
 +      u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
  
        /* For RTM XABORTs also log the abort code from AX */
 -      if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
 -              txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
 +      if ((txn & PERF_TXN_TRANSACTION) && (ax & 1))
 +              txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
        return txn;
  }
  
 -static void setup_pebs_sample_data(struct perf_event *event,
 -                                 struct pt_regs *iregs, void *__pebs,
 -                                 struct perf_sample_data *data,
 -                                 struct pt_regs *regs)
 +static inline u64 get_pebs_status(void *n)
  {
 +      if (x86_pmu.intel_cap.pebs_format < 4)
 +              return ((struct pebs_record_nhm *)n)->status;
 +      return ((struct pebs_basic *)n)->applicable_counters;
 +}
 +
  #define PERF_X86_EVENT_PEBS_HSW_PREC \
                (PERF_X86_EVENT_PEBS_ST_HSW | \
                 PERF_X86_EVENT_PEBS_LD_HSW | \
                 PERF_X86_EVENT_PEBS_NA_HSW)
 +
 +static u64 get_data_src(struct perf_event *event, u64 aux)
 +{
 +      u64 val = PERF_MEM_NA;
 +      int fl = event->hw.flags;
 +      bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
 +
 +      if (fl & PERF_X86_EVENT_PEBS_LDLAT)
 +              val = load_latency_data(aux);
 +      else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
 +              val = precise_datala_hsw(event, aux);
 +      else if (fst)
 +              val = precise_store_data(aux);
 +      return val;
 +}
 +
 +static void setup_pebs_fixed_sample_data(struct perf_event *event,
 +                                 struct pt_regs *iregs, void *__pebs,
 +                                 struct perf_sample_data *data,
 +                                 struct pt_regs *regs)
 +{
        /*
         * We cast to the biggest pebs_record but are careful not to
         * unconditionally access the 'extra' entries.
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct pebs_record_skl *pebs = __pebs;
        u64 sample_type;
 -      int fll, fst, dsrc;
 -      int fl = event->hw.flags;
 +      int fll;
  
        if (pebs == NULL)
                return;
  
        sample_type = event->attr.sample_type;
 -      dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
 -
 -      fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
 -      fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
 +      fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
  
        perf_sample_data_init(data, 0, event->hw.last_period);
  
        /*
         * data.data_src encodes the data source
         */
 -      if (dsrc) {
 -              u64 val = PERF_MEM_NA;
 -              if (fll)
 -                      val = load_latency_data(pebs->dse);
 -              else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
 -                      val = precise_datala_hsw(event, pebs->dse);
 -              else if (fst)
 -                      val = precise_store_data(pebs->dse);
 -              data->data_src.val = val;
 -      }
 +      if (sample_type & PERF_SAMPLE_DATA_SRC)
 +              data->data_src.val = get_data_src(event, pebs->dse);
  
        /*
         * We must however always use iregs for the unwinder to stay sane; the
        if (x86_pmu.intel_cap.pebs_format >= 2) {
                /* Only set the TSX weight when no memory weight. */
                if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
 -                      data->weight = intel_hsw_weight(pebs);
 +                      data->weight = intel_get_tsx_weight(pebs->tsx_tuning);
  
                if (sample_type & PERF_SAMPLE_TRANSACTION)
 -                      data->txn = intel_hsw_transaction(pebs);
 +                      data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
 +                                                            pebs->ax);
        }
  
        /*
                data->br_stack = &cpuc->lbr_stack;
  }
  
 +static void adaptive_pebs_save_regs(struct pt_regs *regs,
 +                                  struct pebs_gprs *gprs)
 +{
 +      regs->ax = gprs->ax;
 +      regs->bx = gprs->bx;
 +      regs->cx = gprs->cx;
 +      regs->dx = gprs->dx;
 +      regs->si = gprs->si;
 +      regs->di = gprs->di;
 +      regs->bp = gprs->bp;
 +      regs->sp = gprs->sp;
 +#ifndef CONFIG_X86_32
 +      regs->r8 = gprs->r8;
 +      regs->r9 = gprs->r9;
 +      regs->r10 = gprs->r10;
 +      regs->r11 = gprs->r11;
 +      regs->r12 = gprs->r12;
 +      regs->r13 = gprs->r13;
 +      regs->r14 = gprs->r14;
 +      regs->r15 = gprs->r15;
 +#endif
 +}
 +
 +/*
 + * With adaptive PEBS the layout depends on what fields are configured.
 + */
 +
 +static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 +                                          struct pt_regs *iregs, void *__pebs,
 +                                          struct perf_sample_data *data,
 +                                          struct pt_regs *regs)
 +{
 +      struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 +      struct pebs_basic *basic = __pebs;
 +      void *next_record = basic + 1;
 +      u64 sample_type;
 +      u64 format_size;
 +      struct pebs_meminfo *meminfo = NULL;
 +      struct pebs_gprs *gprs = NULL;
 +      struct x86_perf_regs *perf_regs;
 +
 +      if (basic == NULL)
 +              return;
 +
 +      perf_regs = container_of(regs, struct x86_perf_regs, regs);
 +      perf_regs->xmm_regs = NULL;
 +
 +      sample_type = event->attr.sample_type;
 +      format_size = basic->format_size;
 +      perf_sample_data_init(data, 0, event->hw.last_period);
 +      data->period = event->hw.last_period;
 +
 +      if (event->attr.use_clockid == 0)
 +              data->time = native_sched_clock_from_tsc(basic->tsc);
 +
 +      /*
 +       * We must however always use iregs for the unwinder to stay sane; the
 +       * record BP,SP,IP can point into thin air when the record is from a
 +       * previous PMI context or an (I)RET happened between the record and
 +       * PMI.
 +       */
 +      if (sample_type & PERF_SAMPLE_CALLCHAIN)
 +              data->callchain = perf_callchain(event, iregs);
 +
 +      *regs = *iregs;
 +      /* The ip in basic is EventingIP */
 +      set_linear_ip(regs, basic->ip);
 +      regs->flags = PERF_EFLAGS_EXACT;
 +
 +      /*
 +       * The record for MEMINFO is in front of GP
 +       * But PERF_SAMPLE_TRANSACTION needs gprs->ax.
 +       * Save the pointer here but process later.
 +       */
 +      if (format_size & PEBS_DATACFG_MEMINFO) {
 +              meminfo = next_record;
 +              next_record = meminfo + 1;
 +      }
 +
 +      if (format_size & PEBS_DATACFG_GP) {
 +              gprs = next_record;
 +              next_record = gprs + 1;
 +
 +              if (event->attr.precise_ip < 2) {
 +                      set_linear_ip(regs, gprs->ip);
 +                      regs->flags &= ~PERF_EFLAGS_EXACT;
 +              }
 +
 +              if (sample_type & PERF_SAMPLE_REGS_INTR)
 +                      adaptive_pebs_save_regs(regs, gprs);
 +      }
 +
 +      if (format_size & PEBS_DATACFG_MEMINFO) {
 +              if (sample_type & PERF_SAMPLE_WEIGHT)
 +                      data->weight = meminfo->latency ?:
 +                              intel_get_tsx_weight(meminfo->tsx_tuning);
 +
 +              if (sample_type & PERF_SAMPLE_DATA_SRC)
 +                      data->data_src.val = get_data_src(event, meminfo->aux);
 +
 +              if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
 +                      data->addr = meminfo->address;
 +
 +              if (sample_type & PERF_SAMPLE_TRANSACTION)
 +                      data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning,
 +                                                        gprs ? gprs->ax : 0);
 +      }
 +
 +      if (format_size & PEBS_DATACFG_XMMS) {
 +              struct pebs_xmm *xmm = next_record;
 +
 +              next_record = xmm + 1;
 +              perf_regs->xmm_regs = xmm->xmm;
 +      }
 +
 +      if (format_size & PEBS_DATACFG_LBRS) {
 +              struct pebs_lbr *lbr = next_record;
 +              int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
 +                                      & 0xff) + 1;
 +              next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry);
 +
 +              if (has_branch_stack(event)) {
 +                      intel_pmu_store_pebs_lbrs(lbr);
 +                      data->br_stack = &cpuc->lbr_stack;
 +              }
 +      }
 +
 +      WARN_ONCE(next_record != __pebs + (format_size >> 48),
 +                      "PEBS record size %llu, expected %llu, config %llx\n",
 +                      format_size >> 48,
 +                      (u64)(next_record - __pebs),
 +                      basic->format_size);
 +}
 +
  static inline void *
  get_next_pebs_record_by_bit(void *base, void *top, int bit)
  {
        if (base == NULL)
                return NULL;
  
 -      for (at = base; at < top; at += x86_pmu.pebs_record_size) {
 -              struct pebs_record_nhm *p = at;
 +      for (at = base; at < top; at += cpuc->pebs_record_size) {
 +              unsigned long status = get_pebs_status(at);
  
 -              if (test_bit(bit, (unsigned long *)&p->status)) {
 +              if (test_bit(bit, (unsigned long *)&status)) {
                        /* PEBS v3 has accurate status bits */
                        if (x86_pmu.intel_cap.pebs_format >= 3)
                                return at;
  
 -                      if (p->status == (1 << bit))
 +                      if (status == (1 << bit))
                                return at;
  
                        /* clear non-PEBS bit and re-check */
 -                      pebs_status = p->status & cpuc->pebs_enabled;
 +                      pebs_status = status & cpuc->pebs_enabled;
                        pebs_status &= PEBS_COUNTER_MASK;
                        if (pebs_status == (1 << bit))
                                return at;
@@@ -1678,18 -1410,11 +1678,18 @@@ intel_pmu_save_and_restart_reload(struc
  static void __intel_pmu_pebs_event(struct perf_event *event,
                                   struct pt_regs *iregs,
                                   void *base, void *top,
 -                                 int bit, int count)
 +                                 int bit, int count,
 +                                 void (*setup_sample)(struct perf_event *,
 +                                              struct pt_regs *,
 +                                              void *,
 +                                              struct perf_sample_data *,
 +                                              struct pt_regs *))
  {
 +      struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct hw_perf_event *hwc = &event->hw;
        struct perf_sample_data data;
 -      struct pt_regs regs;
 +      struct x86_perf_regs perf_regs;
 +      struct pt_regs *regs = &perf_regs.regs;
        void *at = get_next_pebs_record_by_bit(base, top, bit);
  
        if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
                return;
  
        while (count > 1) {
 -              setup_pebs_sample_data(event, iregs, at, &data, &regs);
 -              perf_event_output(event, &data, &regs);
 -              at += x86_pmu.pebs_record_size;
 +              setup_sample(event, iregs, at, &data, regs);
 +              perf_event_output(event, &data, regs);
 +              at += cpuc->pebs_record_size;
                at = get_next_pebs_record_by_bit(at, top, bit);
                count--;
        }
  
 -      setup_pebs_sample_data(event, iregs, at, &data, &regs);
 +      setup_sample(event, iregs, at, &data, regs);
  
        /*
         * All but the last records are processed.
         * The last one is left to be able to call the overflow handler.
         */
 -      if (perf_event_overflow(event, &data, &regs)) {
 +      if (perf_event_overflow(event, &data, regs)) {
                x86_pmu_stop(event, 0);
                return;
        }
@@@ -1758,27 -1483,7 +1758,27 @@@ static void intel_pmu_drain_pebs_core(s
                return;
        }
  
 -      __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
 +      __intel_pmu_pebs_event(event, iregs, at, top, 0, n,
 +                             setup_pebs_fixed_sample_data);
 +}
 +
 +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
 +{
 +      struct perf_event *event;
 +      int bit;
 +
 +      /*
 +       * The drain_pebs() could be called twice in a short period
 +       * for auto-reload event in pmu::read(). There are no
 +       * overflows have happened in between.
 +       * It needs to call intel_pmu_save_and_restart_reload() to
 +       * update the event->count for this case.
 +       */
 +      for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) {
 +              event = cpuc->events[bit];
 +              if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +                      intel_pmu_save_and_restart_reload(event, 0);
 +      }
  }
  
  static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
        }
  
        if (unlikely(base >= top)) {
 -              /*
 -               * The drain_pebs() could be called twice in a short period
 -               * for auto-reload event in pmu::read(). There are no
 -               * overflows have happened in between.
 -               * It needs to call intel_pmu_save_and_restart_reload() to
 -               * update the event->count for this case.
 -               */
 -              for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
 -                               size) {
 -                      event = cpuc->events[bit];
 -                      if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 -                              intel_pmu_save_and_restart_reload(event, 0);
 -              }
 +              intel_pmu_pebs_event_update_no_drain(cpuc, size);
                return;
        }
  
  
                /* PEBS v3 has more accurate status bits */
                if (x86_pmu.intel_cap.pebs_format >= 3) {
 -                      for_each_set_bit(bit, (unsigned long *)&pebs_status,
 -                                       size)
 +                      for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
                                counts[bit]++;
  
                        continue;
                 * If collision happened, the record will be dropped.
                 */
                if (p->status != (1ULL << bit)) {
 -                      for_each_set_bit(i, (unsigned long *)&pebs_status,
 -                                       x86_pmu.max_pebs_events)
 +                      for_each_set_bit(i, (unsigned long *)&pebs_status, size)
                                error[i]++;
                        continue;
                }
                counts[bit]++;
        }
  
 -      for (bit = 0; bit < size; bit++) {
 +      for_each_set_bit(bit, (unsigned long *)&mask, size) {
                if ((counts[bit] == 0) && (error[bit] == 0))
                        continue;
  
  
                if (counts[bit]) {
                        __intel_pmu_pebs_event(event, iregs, base,
 -                                             top, bit, counts[bit]);
 +                                             top, bit, counts[bit],
 +                                             setup_pebs_fixed_sample_data);
                }
        }
  }
  
 +static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
 +{
 +      short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
 +      struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 +      struct debug_store *ds = cpuc->ds;
 +      struct perf_event *event;
 +      void *base, *at, *top;
 +      int bit, size;
 +      u64 mask;
 +
 +      if (!x86_pmu.pebs_active)
 +              return;
 +
 +      base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base;
 +      top = (struct pebs_basic *)(unsigned long)ds->pebs_index;
 +
 +      ds->pebs_index = ds->pebs_buffer_base;
 +
 +      mask = ((1ULL << x86_pmu.max_pebs_events) - 1) |
 +             (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
 +      size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
 +
 +      if (unlikely(base >= top)) {
 +              intel_pmu_pebs_event_update_no_drain(cpuc, size);
 +              return;
 +      }
 +
 +      for (at = base; at < top; at += cpuc->pebs_record_size) {
 +              u64 pebs_status;
 +
 +              pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
 +              pebs_status &= mask;
 +
 +              for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
 +                      counts[bit]++;
 +      }
 +
 +      for_each_set_bit(bit, (unsigned long *)&mask, size) {
 +              if (counts[bit] == 0)
 +                      continue;
 +
 +              event = cpuc->events[bit];
 +              if (WARN_ON_ONCE(!event))
 +                      continue;
 +
 +              if (WARN_ON_ONCE(!event->attr.precise_ip))
 +                      continue;
 +
 +              __intel_pmu_pebs_event(event, iregs, base,
 +                                     top, bit, counts[bit],
 +                                     setup_pebs_adaptive_sample_data);
 +      }
 +}
 +
  /*
   * BTS, PEBS probe and setup
   */
@@@ -1966,15 -1630,10 +1966,15 @@@ void __init intel_ds_init(void
        x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
        if (x86_pmu.version <= 4)
                x86_pmu.pebs_no_isolation = 1;
 +
        if (x86_pmu.pebs) {
                char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
 +              char *pebs_qual = "";
                int format = x86_pmu.intel_cap.pebs_format;
  
 +              if (format < 4)
 +                      x86_pmu.intel_cap.pebs_baseline = 0;
 +
                switch (format) {
                case 0:
                        pr_cont("PEBS fmt0%c, ", pebs_type);
                        x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
                        break;
  
 +              case 4:
 +                      x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
 +                      x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
 +                      if (x86_pmu.intel_cap.pebs_baseline) {
 +                              x86_pmu.large_pebs_flags |=
 +                                      PERF_SAMPLE_BRANCH_STACK |
 +                                      PERF_SAMPLE_TIME;
 +                              x86_pmu.flags |= PMU_FL_PEBS_ALL;
 +                              pebs_qual = "-baseline";
 +                              x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
 +                      } else {
 +                              /* Only basic record supported */
 +                              x86_pmu.large_pebs_flags &=
 +                                      ~(PERF_SAMPLE_ADDR |
 +                                        PERF_SAMPLE_TIME |
 +                                        PERF_SAMPLE_DATA_SRC |
 +                                        PERF_SAMPLE_TRANSACTION |
 +                                        PERF_SAMPLE_REGS_USER |
 +                                        PERF_SAMPLE_REGS_INTR);
 +                      }
 +                      pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
 +                      break;
 +
                default:
                        pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
                        x86_pmu.pebs = 0;
diff --combined arch/x86/kernel/Makefile
index ce1b5cc360a27ba13160856ddeffd4040ebeffca,62e78a3fd31e02bf3f900f0f4f475dfe10795f51..3578ad248bc98319e3a0f5769615c72e191d5569
@@@ -30,7 -30,7 +30,7 @@@ KASAN_SANITIZE_paravirt.o                             := 
  
  OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o   := y
  OBJECT_FILES_NON_STANDARD_test_nx.o                   := y
- OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o    := y
+ OBJECT_FILES_NON_STANDARD_paravirt_patch.o            := y
  
  ifdef CONFIG_FRAME_POINTER
  OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o            := y
@@@ -42,7 -42,7 +42,7 @@@ endi
  # non-deterministic coverage.
  KCOV_INSTRUMENT               := n
  
 -CFLAGS_irq.o := -I$(src)/../include/asm/trace
 +CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
  
  obj-y                 := process_$(BITS).o signal.o
  obj-$(CONFIG_COMPAT)  += signal_compat.o
@@@ -112,7 -112,7 +112,7 @@@ obj-$(CONFIG_AMD_NB)               += amd_nb.
  obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
  
  obj-$(CONFIG_KVM_GUEST)               += kvm.o kvmclock.o
- obj-$(CONFIG_PARAVIRT)                += paravirt.o paravirt_patch_$(BITS).o
+ obj-$(CONFIG_PARAVIRT)                += paravirt.o paravirt_patch.o
  obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
  obj-$(CONFIG_PARAVIRT_CLOCK)  += pvclock.o
  obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
diff --combined arch/x86/kernel/alternative.c
index c3468b5242fdeec3edb0f81704049ed1bc658fc7,7ea5a3764fccf50f49e8b71cfe0399a06c9f3c69..99ef8b6f9a1a5abf2e89c37a79da906d2103b66c
@@@ -1,4 -1,3 +1,4 @@@
 +// SPDX-License-Identifier: GPL-2.0-only
  #define pr_fmt(fmt) "SMP alternatives: " fmt
  
  #include <linux/module.h>
@@@ -13,8 -12,6 +13,8 @@@
  #include <linux/slab.h>
  #include <linux/kdebug.h>
  #include <linux/kprobes.h>
 +#include <linux/mmu_context.h>
 +#include <linux/bsearch.h>
  #include <asm/text-patching.h>
  #include <asm/alternative.h>
  #include <asm/sections.h>
@@@ -267,7 -264,7 +267,7 @@@ static void __init_or_module add_nops(v
  
  extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
  extern s32 __smp_locks[], __smp_locks_end[];
 -void *text_poke_early(void *addr, const void *opcode, size_t len);
 +void text_poke_early(void *addr, const void *opcode, size_t len);
  
  /*
   * Are we looking at a near JMP with a 1 or 4-byte displacement.
@@@ -278,7 -275,7 +278,7 @@@ static inline bool is_jmp(const u8 opco
  }
  
  static void __init_or_module
- recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
  {
        u8 *next_rip, *tgt_rip;
        s32 n_dspl, o_dspl;
        if (a->replacementlen != 5)
                return;
  
-       o_dspl = *(s32 *)(insnbuf + 1);
+       o_dspl = *(s32 *)(insn_buff + 1);
  
        /* next_rip of the replacement JMP */
        next_rip = repl_insn + a->replacementlen;
  two_byte_jmp:
        n_dspl -= 2;
  
-       insnbuf[0] = 0xeb;
-       insnbuf[1] = (s8)n_dspl;
-       add_nops(insnbuf + 2, 3);
+       insn_buff[0] = 0xeb;
+       insn_buff[1] = (s8)n_dspl;
+       add_nops(insn_buff + 2, 3);
  
        repl_len = 2;
        goto done;
  five_byte_jmp:
        n_dspl -= 5;
  
-       insnbuf[0] = 0xe9;
-       *(s32 *)&insnbuf[1] = n_dspl;
+       insn_buff[0] = 0xe9;
+       *(s32 *)&insn_buff[1] = n_dspl;
  
        repl_len = 5;
  
@@@ -371,7 -368,7 +371,7 @@@ void __init_or_module noinline apply_al
  {
        struct alt_instr *a;
        u8 *instr, *replacement;
-       u8 insnbuf[MAX_PATCH_LEN];
+       u8 insn_buff[MAX_PATCH_LEN];
  
        DPRINTK("alt table %px, -> %px", start, end);
        /*
         * order.
         */
        for (a = start; a < end; a++) {
-               int insnbuf_sz = 0;
+               int insn_buff_sz = 0;
  
                instr = (u8 *)&a->instr_offset + a->instr_offset;
                replacement = (u8 *)&a->repl_offset + a->repl_offset;
-               BUG_ON(a->instrlen > sizeof(insnbuf));
+               BUG_ON(a->instrlen > sizeof(insn_buff));
                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
                if (!boot_cpu_has(a->cpuid)) {
                        if (a->padlen > 1)
                DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
                DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
  
-               memcpy(insnbuf, replacement, a->replacementlen);
-               insnbuf_sz = a->replacementlen;
+               memcpy(insn_buff, replacement, a->replacementlen);
+               insn_buff_sz = a->replacementlen;
  
                /*
                 * 0xe8 is a relative jump; fix the offset.
                 * Instruction length is checked before the opcode to avoid
                 * accessing uninitialized bytes for zero-length replacements.
                 */
-               if (a->replacementlen == 5 && *insnbuf == 0xe8) {
-                       *(s32 *)(insnbuf + 1) += replacement - instr;
+               if (a->replacementlen == 5 && *insn_buff == 0xe8) {
+                       *(s32 *)(insn_buff + 1) += replacement - instr;
                        DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
-                               *(s32 *)(insnbuf + 1),
-                               (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+                               *(s32 *)(insn_buff + 1),
+                               (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
                }
  
                if (a->replacementlen && is_jmp(replacement[0]))
-                       recompute_jump(a, instr, replacement, insnbuf);
+                       recompute_jump(a, instr, replacement, insn_buff);
  
                if (a->instrlen > a->replacementlen) {
-                       add_nops(insnbuf + a->replacementlen,
+                       add_nops(insn_buff + a->replacementlen,
                                 a->instrlen - a->replacementlen);
-                       insnbuf_sz += a->instrlen - a->replacementlen;
+                       insn_buff_sz += a->instrlen - a->replacementlen;
                }
-               DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
+               DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
  
-               text_poke_early(instr, insnbuf, insnbuf_sz);
+               text_poke_early(instr, insn_buff, insn_buff_sz);
        }
  }
  
@@@ -594,105 -591,32 +594,104 @@@ void __init_or_module apply_paravirt(st
                                     struct paravirt_patch_site *end)
  {
        struct paravirt_patch_site *p;
-       char insnbuf[MAX_PATCH_LEN];
+       char insn_buff[MAX_PATCH_LEN];
  
        for (p = start; p < end; p++) {
                unsigned int used;
  
                BUG_ON(p->len > MAX_PATCH_LEN);
                /* prep the buffer with the original instructions */
-               memcpy(insnbuf, p->instr, p->len);
-               used = pv_ops.init.patch(p->instrtype, insnbuf,
-                                        (unsigned long)p->instr, p->len);
+               memcpy(insn_buff, p->instr, p->len);
+               used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
  
                BUG_ON(used > p->len);
  
                /* Pad the rest with nops */
-               add_nops(insnbuf + used, p->len - used);
-               text_poke_early(p->instr, insnbuf, p->len);
+               add_nops(insn_buff + used, p->len - used);
+               text_poke_early(p->instr, insn_buff, p->len);
        }
  }
  extern struct paravirt_patch_site __start_parainstructions[],
        __stop_parainstructions[];
  #endif        /* CONFIG_PARAVIRT */
  
 +/*
 + * Self-test for the INT3 based CALL emulation code.
 + *
 + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 + * properly and that there is a stack gap between the INT3 frame and the
 + * previous context. Without this gap doing a virtual PUSH on the interrupted
 + * stack would corrupt the INT3 IRET frame.
 + *
 + * See entry_{32,64}.S for more details.
 + */
 +static void __init int3_magic(unsigned int *ptr)
 +{
 +      *ptr = 1;
 +}
 +
 +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
 +
 +static int __init
 +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
 +{
 +      struct die_args *args = data;
 +      struct pt_regs *regs = args->regs;
 +
 +      if (!regs || user_mode(regs))
 +              return NOTIFY_DONE;
 +
 +      if (val != DIE_INT3)
 +              return NOTIFY_DONE;
 +
 +      if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
 +              return NOTIFY_DONE;
 +
 +      int3_emulate_call(regs, (unsigned long)&int3_magic);
 +      return NOTIFY_STOP;
 +}
 +
 +static void __init int3_selftest(void)
 +{
 +      static __initdata struct notifier_block int3_exception_nb = {
 +              .notifier_call  = int3_exception_notify,
 +              .priority       = INT_MAX-1, /* last */
 +      };
 +      unsigned int val = 0;
 +
 +      BUG_ON(register_die_notifier(&int3_exception_nb));
 +
 +      /*
 +       * Basically: int3_magic(&val); but really complicated :-)
 +       *
 +       * Stick the address of the INT3 instruction into int3_selftest_ip,
 +       * then trigger the INT3, padded with NOPs to match a CALL instruction
 +       * length.
 +       */
 +      asm volatile ("1: int3; nop; nop; nop; nop\n\t"
 +                    ".pushsection .init.data,\"aw\"\n\t"
 +                    ".align " __ASM_SEL(4, 8) "\n\t"
 +                    ".type int3_selftest_ip, @object\n\t"
 +                    ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
 +                    "int3_selftest_ip:\n\t"
 +                    __ASM_SEL(.long, .quad) " 1b\n\t"
 +                    ".popsection\n\t"
 +                    : : __ASM_SEL_RAW(a, D) (&val) : "memory");
 +
 +      BUG_ON(val != 1);
 +
 +      unregister_die_notifier(&int3_exception_nb);
 +}
 +
  void __init alternative_instructions(void)
  {
 -      /* The patching is not fully atomic, so try to avoid local interruptions
 -         that might execute the to be patched code.
 -         Other CPUs are not running. */
 +      int3_selftest();
 +
 +      /*
 +       * The patching is not fully atomic, so try to avoid local
 +       * interruptions that might execute the to be patched code.
 +       * Other CPUs are not running.
 +       */
        stop_nmi();
  
        /*
                                            _text, _etext);
        }
  
 -      if (!uniproc_patched || num_possible_cpus() == 1)
 +      if (!uniproc_patched || num_possible_cpus() == 1) {
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
 +      }
  #endif
  
        apply_paravirt(__parainstructions, __parainstructions_end);
   * instructions. And on the local CPU you need to be protected again NMI or MCE
   * handlers seeing an inconsistent instruction while you patch.
   */
 -void *__init_or_module text_poke_early(void *addr, const void *opcode,
 -                                            size_t len)
 +void __init_or_module text_poke_early(void *addr, const void *opcode,
 +                                    size_t len)
 +{
 +      unsigned long flags;
 +
 +      if (boot_cpu_has(X86_FEATURE_NX) &&
 +          is_module_text_address((unsigned long)addr)) {
 +              /*
 +               * Modules text is marked initially as non-executable, so the
 +               * code cannot be running and speculative code-fetches are
 +               * prevented. Just change the code.
 +               */
 +              memcpy(addr, opcode, len);
 +      } else {
 +              local_irq_save(flags);
 +              memcpy(addr, opcode, len);
 +              local_irq_restore(flags);
 +              sync_core();
 +
 +              /*
 +               * Could also do a CLFLUSH here to speed up CPU recovery; but
 +               * that causes hangs on some VIA CPUs.
 +               */
 +      }
 +}
 +
 +__ro_after_init struct mm_struct *poking_mm;
 +__ro_after_init unsigned long poking_addr;
 +
 +static void *__text_poke(void *addr, const void *opcode, size_t len)
  {
 +      bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
 +      struct page *pages[2] = {NULL};
 +      temp_mm_state_t prev;
        unsigned long flags;
 +      pte_t pte, *ptep;
 +      spinlock_t *ptl;
 +      pgprot_t pgprot;
 +
 +      /*
 +       * While boot memory allocator is running we cannot use struct pages as
 +       * they are not yet initialized. There is no way to recover.
 +       */
 +      BUG_ON(!after_bootmem);
 +
 +      if (!core_kernel_text((unsigned long)addr)) {
 +              pages[0] = vmalloc_to_page(addr);
 +              if (cross_page_boundary)
 +                      pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
 +      } else {
 +              pages[0] = virt_to_page(addr);
 +              WARN_ON(!PageReserved(pages[0]));
 +              if (cross_page_boundary)
 +                      pages[1] = virt_to_page(addr + PAGE_SIZE);
 +      }
 +      /*
 +       * If something went wrong, crash and burn since recovery paths are not
 +       * implemented.
 +       */
 +      BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
 +
        local_irq_save(flags);
 -      memcpy(addr, opcode, len);
 +
 +      /*
 +       * Map the page without the global bit, as TLB flushing is done with
 +       * flush_tlb_mm_range(), which is intended for non-global PTEs.
 +       */
 +      pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
 +
 +      /*
 +       * The lock is not really needed, but this allows to avoid open-coding.
 +       */
 +      ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
 +
 +      /*
 +       * This must not fail; preallocated in poking_init().
 +       */
 +      VM_BUG_ON(!ptep);
 +
 +      pte = mk_pte(pages[0], pgprot);
 +      set_pte_at(poking_mm, poking_addr, ptep, pte);
 +
 +      if (cross_page_boundary) {
 +              pte = mk_pte(pages[1], pgprot);
 +              set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
 +      }
 +
 +      /*
 +       * Loading the temporary mm behaves as a compiler barrier, which
 +       * guarantees that the PTE will be set at the time memcpy() is done.
 +       */
 +      prev = use_temporary_mm(poking_mm);
 +
 +      kasan_disable_current();
 +      memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
 +      kasan_enable_current();
 +
 +      /*
 +       * Ensure that the PTE is only cleared after the instructions of memcpy
 +       * were issued by using a compiler barrier.
 +       */
 +      barrier();
 +
 +      pte_clear(poking_mm, poking_addr, ptep);
 +      if (cross_page_boundary)
 +              pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
 +
 +      /*
 +       * Loading the previous page-table hierarchy requires a serializing
 +       * instruction that already allows the core to see the updated version.
 +       * Xen-PV is assumed to serialize execution in a similar manner.
 +       */
 +      unuse_temporary_mm(prev);
 +
 +      /*
 +       * Flushing the TLB might involve IPIs, which would require enabled
 +       * IRQs, but not if the mm is not used, as it is in this point.
 +       */
 +      flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
 +                         (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
 +                         PAGE_SHIFT, false);
 +
 +      /*
 +       * If the text does not match what we just wrote then something is
 +       * fundamentally screwy; there's nothing we can really do about that.
 +       */
 +      BUG_ON(memcmp(addr, opcode, len));
 +
 +      pte_unmap_unlock(ptep, ptl);
        local_irq_restore(flags);
 -      sync_core();
 -      /* Could also do a CLFLUSH here to speed up CPU recovery; but
 -         that causes hangs on some VIA CPUs. */
        return addr;
  }
  
   * It means the size must be writable atomically and the address must be aligned
   * in a way that permits an atomic write. It also makes sure we fit on a single
   * page.
 + *
 + * Note that the caller must ensure that if the modified code is part of a
 + * module, the module would not be removed during poking. This can be achieved
 + * by registering a module notifier, and ordering module removal and patching
 + * trough a mutex.
   */
  void *text_poke(void *addr, const void *opcode, size_t len)
  {
 -      unsigned long flags;
 -      char *vaddr;
 -      struct page *pages[2];
 -      int i;
 -
 -      /*
 -       * While boot memory allocator is runnig we cannot use struct
 -       * pages as they are not yet initialized.
 -       */
 -      BUG_ON(!after_bootmem);
 -
        lockdep_assert_held(&text_mutex);
  
 -      if (!core_kernel_text((unsigned long)addr)) {
 -              pages[0] = vmalloc_to_page(addr);
 -              pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
 -      } else {
 -              pages[0] = virt_to_page(addr);
 -              WARN_ON(!PageReserved(pages[0]));
 -              pages[1] = virt_to_page(addr + PAGE_SIZE);
 -      }
 -      BUG_ON(!pages[0]);
 -      local_irq_save(flags);
 -      set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
 -      if (pages[1])
 -              set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
 -      vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
 -      memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
 -      clear_fixmap(FIX_TEXT_POKE0);
 -      if (pages[1])
 -              clear_fixmap(FIX_TEXT_POKE1);
 -      local_flush_tlb();
 -      sync_core();
 -      /* Could also do a CLFLUSH here to speed up CPU recovery; but
 -         that causes hangs on some VIA CPUs. */
 -      for (i = 0; i < len; i++)
 -              BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
 -      local_irq_restore(flags);
 -      return addr;
 +      return __text_poke(addr, opcode, len);
 +}
 +
 +/**
 + * text_poke_kgdb - Update instructions on a live kernel by kgdb
 + * @addr: address to modify
 + * @opcode: source of the copy
 + * @len: length to copy
 + *
 + * Only atomic text poke/set should be allowed when not doing early patching.
 + * It means the size must be writable atomically and the address must be aligned
 + * in a way that permits an atomic write. It also makes sure we fit on a single
 + * page.
 + *
 + * Context: should only be used by kgdb, which ensures no other core is running,
 + *        despite the fact it does not hold the text_mutex.
 + */
 +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
 +{
 +      return __text_poke(addr, opcode, len);
  }
  
  static void do_sync_core(void *info)
        sync_core();
  }
  
 -static bool bp_patching_in_progress;
 -static void *bp_int3_handler, *bp_int3_addr;
 +static struct bp_patching_desc {
 +      struct text_poke_loc *vec;
 +      int nr_entries;
 +} bp_patching;
 +
 +static int patch_cmp(const void *key, const void *elt)
 +{
 +      struct text_poke_loc *tp = (struct text_poke_loc *) elt;
 +
 +      if (key < tp->addr)
 +              return -1;
 +      if (key > tp->addr)
 +              return 1;
 +      return 0;
 +}
 +NOKPROBE_SYMBOL(patch_cmp);
  
  int poke_int3_handler(struct pt_regs *regs)
  {
 +      struct text_poke_loc *tp;
 +      unsigned char int3 = 0xcc;
 +      void *ip;
 +
        /*
         * Having observed our INT3 instruction, we now must observe
 -       * bp_patching_in_progress.
 +       * bp_patching.nr_entries.
         *
 -       *      in_progress = TRUE              INT3
 +       *      nr_entries != 0                 INT3
         *      WMB                             RMB
 -       *      write INT3                      if (in_progress)
 +       *      write INT3                      if (nr_entries)
         *
 -       * Idem for bp_int3_handler.
 +       * Idem for other elements in bp_patching.
         */
        smp_rmb();
  
 -      if (likely(!bp_patching_in_progress))
 +      if (likely(!bp_patching.nr_entries))
                return 0;
  
 -      if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
 +      if (user_mode(regs))
                return 0;
  
 -      /* set up the specified breakpoint handler */
 -      regs->ip = (unsigned long) bp_int3_handler;
 +      /*
 +       * Discount the sizeof(int3). See text_poke_bp_batch().
 +       */
 +      ip = (void *) regs->ip - sizeof(int3);
 +
 +      /*
 +       * Skip the binary search if there is a single member in the vector.
 +       */
 +      if (unlikely(bp_patching.nr_entries > 1)) {
 +              tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
 +                           sizeof(struct text_poke_loc),
 +                           patch_cmp);
 +              if (!tp)
 +                      return 0;
 +      } else {
 +              tp = bp_patching.vec;
 +              if (tp->addr != ip)
 +                      return 0;
 +      }
 +
 +      /* set up the specified breakpoint detour */
 +      regs->ip = (unsigned long) tp->detour;
  
        return 1;
  }
  NOKPROBE_SYMBOL(poke_int3_handler);
  
  /**
 - * text_poke_bp() -- update instructions on live kernel on SMP
 - * @addr:     address to patch
 - * @opcode:   opcode of new instruction
 - * @len:      length to copy
 - * @handler:  address to jump to when the temporary breakpoint is hit
 + * text_poke_bp_batch() -- update instructions on live kernel on SMP
 + * @tp:                       vector of instructions to patch
 + * @nr_entries:               number of entries in the vector
   *
   * Modify multi-byte instruction by using int3 breakpoint on SMP.
   * We completely avoid stop_machine() here, and achieve the
   * synchronization using int3 breakpoint.
   *
   * The way it is done:
 - *    - add a int3 trap to the address that will be patched
 + *    - For each entry in the vector:
 + *            - add an int3 trap to the address that will be patched
   *    - sync cores
 - *    - update all but the first byte of the patched range
 + *    - For each entry in the vector:
 + *            - update all but the first byte of the patched range
   *    - sync cores
 - *    - replace the first byte (int3) by the first byte of
 - *      replacing opcode
 + *    - For each entry in the vector:
 + *            - replace the first byte (int3) with the first byte of
 + *              the replacement opcode
   *    - sync cores
   */
 -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 +void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
  {
 +      int patched_all_but_first = 0;
        unsigned char int3 = 0xcc;
 -
 -      bp_int3_handler = handler;
 -      bp_int3_addr = (u8 *)addr + sizeof(int3);
 -      bp_patching_in_progress = true;
 +      unsigned int i;
  
        lockdep_assert_held(&text_mutex);
  
 +      bp_patching.vec = tp;
 +      bp_patching.nr_entries = nr_entries;
 +
        /*
         * Corresponding read barrier in int3 notifier for making sure the
 -       * in_progress and handler are correctly ordered wrt. patching.
 +       * nr_entries and handler are correctly ordered wrt. patching.
         */
        smp_wmb();
  
 -      text_poke(addr, &int3, sizeof(int3));
 +      /*
 +       * First step: add an int3 trap to the address that will be patched.
 +       */
 +      for (i = 0; i < nr_entries; i++)
 +              text_poke(tp[i].addr, &int3, sizeof(int3));
  
        on_each_cpu(do_sync_core, NULL, 1);
  
 -      if (len - sizeof(int3) > 0) {
 -              /* patch all but the first byte */
 -              text_poke((char *)addr + sizeof(int3),
 -                        (const char *) opcode + sizeof(int3),
 -                        len - sizeof(int3));
 +      /*
 +       * Second step: update all but the first byte of the patched range.
 +       */
 +      for (i = 0; i < nr_entries; i++) {
 +              if (tp[i].len - sizeof(int3) > 0) {
 +                      text_poke((char *)tp[i].addr + sizeof(int3),
 +                                (const char *)tp[i].opcode + sizeof(int3),
 +                                tp[i].len - sizeof(int3));
 +                      patched_all_but_first++;
 +              }
 +      }
 +
 +      if (patched_all_but_first) {
                /*
                 * According to Intel, this core syncing is very likely
                 * not necessary and we'd be safe even without it. But
                on_each_cpu(do_sync_core, NULL, 1);
        }
  
 -      /* patch the first byte */
 -      text_poke(addr, opcode, sizeof(int3));
 +      /*
 +       * Third step: replace the first byte (int3) with the first byte of
 +       * the replacement opcode.
 +       */
 +      for (i = 0; i < nr_entries; i++)
 +              text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
  
        on_each_cpu(do_sync_core, NULL, 1);
        /*
         * sync_core() implies an smp_mb() and orders this store against
         * the writing of the new instruction.
         */
 -      bp_patching_in_progress = false;
 -
 -      return addr;
 +      bp_patching.vec = NULL;
 +      bp_patching.nr_entries = 0;
  }
  
 +/**
 + * text_poke_bp() -- update instructions on live kernel on SMP
 + * @addr:     address to patch
 + * @opcode:   opcode of new instruction
 + * @len:      length to copy
 + * @handler:  address to jump to when the temporary breakpoint is hit
 + *
 + * Update a single instruction using a vector allocated on the stack,
 + * avoiding dynamically allocated memory. This function should be used
 + * when it is not possible to allocate memory.
 + */
 +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 +{
 +      struct text_poke_loc tp = {
 +              .detour = handler,
 +              .addr = addr,
 +              .len = len,
 +      };
 +
 +      if (len > POKE_MAX_OPCODE_SIZE) {
 +              WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
 +              return;
 +      }
 +
 +      memcpy((void *)tp.opcode, opcode, len);
 +
 +      text_poke_bp_batch(&tp, 1);
 +}
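
For context, a minimal caller sketch of the new batch interface follows. It is illustrative only: the example_batch_poke() name, the two patch sites, the 5-byte patch length and the shared detour are assumptions, and the declarations are assumed to come from asm/text-patching.h. Entries are filled in ascending address order so the bsearch() in poke_int3_handler() can locate them, and text_mutex is held across the call because text_poke() asserts it.

	#include <linux/memory.h>		/* text_mutex */
	#include <linux/mutex.h>
	#include <linux/string.h>
	#include <asm/text-patching.h>		/* struct text_poke_loc, text_poke_bp_batch() */

	static struct text_poke_loc tp_vec[2];

	/* Hypothetical caller: patch two 5-byte sites with one int3 round trip. */
	static void example_batch_poke(void *addr0, const void *insn0,
				       void *addr1, const void *insn1,
				       void *detour)
	{
		/* Caller guarantees addr0 < addr1, keeping the vector bsearch()-able. */
		tp_vec[0].addr   = addr0;
		tp_vec[0].len    = 5;
		tp_vec[0].detour = detour;
		memcpy((void *)tp_vec[0].opcode, insn0, 5);

		tp_vec[1].addr   = addr1;
		tp_vec[1].len    = 5;
		tp_vec[1].detour = detour;
		memcpy((void *)tp_vec[1].opcode, insn1, 5);

		mutex_lock(&text_mutex);
		text_poke_bp_batch(tp_vec, 2);	/* one int3 round trip for both sites */
		mutex_unlock(&text_mutex);
	}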
index 282b4eb67e30366167102fb1ecd268f59e6a769b,e77a895a9ecc3a1a161b08c567f95627a988b5be..9d4aedece363cc28c696be0dbfa1af8deaba9fa3
@@@ -1,7 -1,20 +1,7 @@@
 +// SPDX-License-Identifier: GPL-2.0-or-later
  /*
   *  Kernel Probes Jump Optimization (Optprobes)
   *
 - * This program is free software; you can redistribute it and/or modify
 - * it under the terms of the GNU General Public License as published by
 - * the Free Software Foundation; either version 2 of the License, or
 - * (at your option) any later version.
 - *
 - * This program is distributed in the hope that it will be useful,
 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 - * GNU General Public License for more details.
 - *
 - * You should have received a copy of the GNU General Public License
 - * along with this program; if not, write to the Free Software
 - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 - *
   * Copyright (C) IBM Corporation, 2002, 2004
   * Copyright (C) Hitachi Ltd., 2012
   */
@@@ -102,15 -115,14 +102,15 @@@ asm 
                        "optprobe_template_call:\n"
                        ASM_NOP5
                        /* Move flags to rsp */
 -                      "       movq 144(%rsp), %rdx\n"
 -                      "       movq %rdx, 152(%rsp)\n"
 +                      "       movq 18*8(%rsp), %rdx\n"
 +                      "       movq %rdx, 19*8(%rsp)\n"
                        RESTORE_REGS_STRING
                        /* Skip flags entry */
                        "       addq $8, %rsp\n"
                        "       popfq\n"
  #else /* CONFIG_X86_32 */
 -                      "       pushf\n"
 +                      "       pushl %esp\n"
 +                      "       pushfl\n"
                        SAVE_REGS_STRING
                        "       movl %esp, %edx\n"
                        ".global optprobe_template_val\n"
                        ".global optprobe_template_call\n"
                        "optprobe_template_call:\n"
                        ASM_NOP5
 +                      /* Move flags into esp */
 +                      "       movl 14*4(%esp), %edx\n"
 +                      "       movl %edx, 15*4(%esp)\n"
                        RESTORE_REGS_STRING
 -                      "       addl $4, %esp\n"        /* skip cs */
 -                      "       popf\n"
 +                      /* Skip flags entry */
 +                      "       addl $4, %esp\n"
 +                      "       popfl\n"
  #endif
                        ".global optprobe_template_end\n"
                        "optprobe_template_end:\n"
@@@ -157,9 -165,10 +157,9 @@@ optimized_callback(struct optimized_kpr
        } else {
                struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
                /* Save skipped registers */
 -#ifdef CONFIG_X86_64
                regs->cs = __KERNEL_CS;
 -#else
 -              regs->cs = __KERNEL_CS | get_kernel_rpl();
 +#ifdef CONFIG_X86_32
 +              regs->cs |= get_kernel_rpl();
                regs->gs = 0;
  #endif
                regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
@@@ -422,7 -431,7 +422,7 @@@ err
  void arch_optimize_kprobes(struct list_head *oplist)
  {
        struct optimized_kprobe *op, *tmp;
-       u8 insn_buf[RELATIVEJUMP_SIZE];
+       u8 insn_buff[RELATIVEJUMP_SIZE];
  
        list_for_each_entry_safe(op, tmp, oplist, list) {
                s32 rel = (s32)((long)op->optinsn.insn -
                memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
                       RELATIVE_ADDR_SIZE);
  
-               insn_buf[0] = RELATIVEJUMP_OPCODE;
-               *(s32 *)(&insn_buf[1]) = rel;
+               insn_buff[0] = RELATIVEJUMP_OPCODE;
+               *(s32 *)(&insn_buff[1]) = rel;
  
-               text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+               text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
                             op->optinsn.insn);
  
                list_del_init(&op->list);
  /* Replace a relative jump with a breakpoint (int3).  */
  void arch_unoptimize_kprobe(struct optimized_kprobe *op)
  {
-       u8 insn_buf[RELATIVEJUMP_SIZE];
+       u8 insn_buff[RELATIVEJUMP_SIZE];
  
        /* Set int3 to first byte for kprobes */
-       insn_buf[0] = BREAKPOINT_INSTRUCTION;
-       memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-       text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+       insn_buff[0] = BREAKPOINT_INSTRUCTION;
+       memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+       text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
                     op->optinsn.insn);
  }
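
As a side note on what those insn_buff bytes encode, here is a hedged sketch of the 5-byte relative jump both call sites above assemble; the make_reljump() helper is hypothetical, while RELATIVEJUMP_OPCODE and RELATIVEJUMP_SIZE are the constants the file already uses.

	#include <linux/types.h>
	#include <asm/kprobes.h>	/* RELATIVEJUMP_OPCODE, RELATIVEJUMP_SIZE */

	/* Encode 'jmp rel32' from 'from' to 'to'; the displacement is relative
	 * to the byte following the 5-byte instruction, matching 'rel' above. */
	static void make_reljump(u8 *insn_buff, unsigned long from, unsigned long to)
	{
		s32 rel = (s32)(to - (from + RELATIVEJUMP_SIZE));

		insn_buff[0] = RELATIVEJUMP_OPCODE;	/* 0xe9 */
		*(s32 *)(&insn_buff[1]) = rel;
	}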
  
index 06f6bb48d01891a31c668730697d00a093cd51e8,b7d22912e20b8100f067cbd92ff7373e9fdb594a..98039d7fb998c96152854d5488de834dc3537b3b
@@@ -1,7 -1,19 +1,7 @@@
 +// SPDX-License-Identifier: GPL-2.0-or-later
  /*  Paravirtualization interfaces
      Copyright (C) 2006 Rusty Russell IBM Corporation
  
 -    This program is free software; you can redistribute it and/or modify
 -    it under the terms of the GNU General Public License as published by
 -    the Free Software Foundation; either version 2 of the License, or
 -    (at your option) any later version.
 -
 -    This program is distributed in the hope that it will be useful,
 -    but WITHOUT ANY WARRANTY; without even the implied warranty of
 -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 -    GNU General Public License for more details.
 -
 -    You should have received a copy of the GNU General Public License
 -    along with this program; if not, write to the Free Software
 -    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  
      2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
  */
@@@ -58,24 -70,24 +58,24 @@@ struct branch 
        u32 delta;
  } __attribute__((packed));
  
- static unsigned paravirt_patch_call(void *insnbuf, const void *target,
+ static unsigned paravirt_patch_call(void *insn_buff, const void *target,
                                    unsigned long addr, unsigned len)
  {
-       struct branch *b = insnbuf;
-       unsigned long delta = (unsigned long)target - (addr+5);
-       if (len < 5) {
- #ifdef CONFIG_RETPOLINE
-               WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
- #endif
-               return len;     /* call too long for patch site */
+       const int call_len = 5;
+       struct branch *b = insn_buff;
+       unsigned long delta = (unsigned long)target - (addr+call_len);
+       if (len < call_len) {
+               pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr);
+               /* Kernel might not be viable if patching fails, bail out: */
+               BUG_ON(1);
        }
  
        b->opcode = 0xe8; /* call */
        b->delta = delta;
-       BUILD_BUG_ON(sizeof(*b) != 5);
+       BUILD_BUG_ON(sizeof(*b) != call_len);
  
-       return 5;
+       return call_len;
  }
  
  #ifdef CONFIG_PARAVIRT_XXL
@@@ -85,10 -97,10 +85,10 @@@ u64 notrace _paravirt_ident_64(u64 x
        return x;
  }
  
- static unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
                                   unsigned long addr, unsigned len)
  {
-       struct branch *b = insnbuf;
+       struct branch *b = insn_buff;
        unsigned long delta = (unsigned long)target - (addr+5);
  
        if (len < 5) {
@@@ -109,11 -121,11 +109,11 @@@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_k
  
  void __init native_pv_lock_init(void)
  {
 -      if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
 +      if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
                static_branch_disable(&virt_spin_lock_key);
  }
  
- unsigned paravirt_patch_default(u8 type, void *insnbuf,
+ unsigned paravirt_patch_default(u8 type, void *insn_buff,
                                unsigned long addr, unsigned len)
  {
        /*
  
        if (opfunc == NULL)
                /* If there's no function, patch it with a ud2a (BUG) */
-               ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
+               ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
        else if (opfunc == _paravirt_nop)
                ret = 0;
  
  #ifdef CONFIG_PARAVIRT_XXL
        /* identity functions just return their single argument */
        else if (opfunc == _paravirt_ident_64)
-               ret = paravirt_patch_ident_64(insnbuf, len);
+               ret = paravirt_patch_ident_64(insn_buff, len);
  
        else if (type == PARAVIRT_PATCH(cpu.iret) ||
                 type == PARAVIRT_PATCH(cpu.usergs_sysret64))
                /* If operation requires a jmp, then jmp */
-               ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
+               ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
  #endif
        else
                /* Otherwise call the function. */
-               ret = paravirt_patch_call(insnbuf, opfunc, addr, len);
+               ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
  
        return ret;
  }
  
- unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
                              const char *start, const char *end)
  {
        unsigned insn_len = end - start;
  
-       if (insn_len > len || start == NULL)
-               insn_len = len;
-       else
-               memcpy(insnbuf, start, insn_len);
+       /* Alternative instruction is too large for the patch site and we cannot continue: */
+       BUG_ON(insn_len > len || start == NULL);
+       memcpy(insn_buff, start, insn_len);
  
        return insn_len;
  }
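
To make the emitted patchlet concrete, here is an illustrative sketch of the 5 bytes paravirt_patch_call() writes into insn_buff; the emit_pv_call() name is an assumption, and the snippet relies on the file-local struct branch (packed opcode/delta pair) defined earlier in this file.

	/* A 'call rel32' (0xe8) whose 32-bit delta is taken relative to the
	 * end of the 5-byte instruction located at 'addr'. */
	static void emit_pv_call(void *insn_buff, const void *target, unsigned long addr)
	{
		struct branch *b = insn_buff;

		b->opcode = 0xe8;					/* call */
		b->delta  = (u32)((unsigned long)target - (addr + 5));
	}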
index e455349e0ab5e5a44b0041f9efac90b4727173bf,34c2b3691f4f868d85bcb8b00657ca450fae1ef7..34eda63c124b19cc1592d853f8b1862ddc269087
@@@ -1,5 -1,13 +1,5 @@@
 +// SPDX-License-Identifier: GPL-2.0-or-later
  /*
 - * This program is free software; you can redistribute it and/or modify
 - * it under the terms of the GNU General Public License as published by
 - * the Free Software Foundation; either version 2 of the License, or
 - * (at your option) any later version.
 - *
 - * This program is distributed in the hope that it will be useful,
 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 - * GNU General Public License for more details.
   *
   * Copyright (C) IBM Corporation, 2009
   */
@@@ -111,7 -119,7 +111,7 @@@ static void parse_args(int argc, char *
  int main(int argc, char **argv)
  {
        char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
-       unsigned char insn_buf[16];
+       unsigned char insn_buff[16];
        struct insn insn;
        int insns = 0;
        int warnings = 0;
                }
  
                insns++;
-               memset(insn_buf, 0, 16);
+               memset(insn_buff, 0, 16);
                strcpy(copy, line);
                tab1 = strchr(copy, '\t');
                if (!tab1)
                *tab2 = '\0';   /* Characters beyond tab2 aren't examined */
                while (s < tab2) {
                        if (sscanf(s, "%x", &b) == 1) {
-                               insn_buf[nb++] = (unsigned char) b;
+                               insn_buff[nb++] = (unsigned char) b;
                                s += 3;
                        } else
                                break;
                }
                /* Decode an instruction */
-               insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64);
+               insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
                insn_get_length(&insn);
                if (insn.length != nb) {
                        warnings++;
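
For reference, a standalone sketch of the decode step the test performs with the renamed buffer; the decode_one() helper and the example call bytes are assumptions, while insn_init() and insn_get_length() are the same decoder calls the tool already makes with the headers it already includes.

	/* Decode one instruction from a zero-padded buffer and return its
	 * length; the bytes below encode a 5-byte 'call rel32' as an example. */
	static int decode_one(int x86_64)
	{
		unsigned char insn_buff[16] = { 0xe8, 0x00, 0x00, 0x00, 0x00 };
		struct insn insn;

		insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
		insn_get_length(&insn);

		return insn.length;	/* 5 for the example above */
	}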
index 14cf07916081182935e8e13c19f5e8dc5d24e587,7adec7b490fd806a6e4da45dc70a700eae9014d3..185ceba9d289bd5de91019cebc25502c9a385a2f
@@@ -1,7 -1,20 +1,7 @@@
 +// SPDX-License-Identifier: GPL-2.0-or-later
  /*
   * x86 decoder sanity test - based on test_get_insn.c
   *
 - * This program is free software; you can redistribute it and/or modify
 - * it under the terms of the GNU General Public License as published by
 - * the Free Software Foundation; either version 2 of the License, or
 - * (at your option) any later version.
 - *
 - * This program is distributed in the hope that it will be useful,
 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 - * GNU General Public License for more details.
 - *
 - * You should have received a copy of the GNU General Public License
 - * along with this program; if not, write to the Free Software
 - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 - *
   * Copyright (C) IBM Corporation, 2009
   * Copyright (C) Hitachi, Ltd., 2011
   */
@@@ -83,7 -96,7 +83,7 @@@ static void dump_insn(FILE *fp, struct 
  }
  
  static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter,
-                       unsigned char *insn_buf, struct insn *insn)
+                       unsigned char *insn_buff, struct insn *insn)
  {
        int i;
  
        /* Input a decoded instruction sequence directly */
        fprintf(fp, " $ echo ");
        for (i = 0; i < MAX_INSN_SIZE; i++)
-               fprintf(fp, " %02x", insn_buf[i]);
+               fprintf(fp, " %02x", insn_buff[i]);
        fprintf(fp, " | %s -i -\n", prog);
  
        if (!input_file) {
@@@ -124,7 -137,7 +124,7 @@@ fail
  }
  
  /* Read given instruction sequence from the input file */
- static int read_next_insn(unsigned char *insn_buf)
+ static int read_next_insn(unsigned char *insn_buff)
  {
        char buf[256]  = "", *tmp;
        int i;
                return 0;
  
        for (i = 0; i < MAX_INSN_SIZE; i++) {
-               insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16);
+               insn_buff[i] = (unsigned char)strtoul(tmp, &tmp, 16);
                if (*tmp != ' ')
                        break;
        }
        return i;
  }
  
- static int generate_insn(unsigned char *insn_buf)
+ static int generate_insn(unsigned char *insn_buff)
  {
        int i;
  
        if (input_file)
-               return read_next_insn(insn_buf);
+               return read_next_insn(insn_buff);
  
        /* Fills buffer with random binary up to MAX_INSN_SIZE */
        for (i = 0; i < MAX_INSN_SIZE - 1; i += 2)
-               *(unsigned short *)(&insn_buf[i]) = random() & 0xffff;
+               *(unsigned short *)(&insn_buff[i]) = random() & 0xffff;
  
        while (i < MAX_INSN_SIZE)
-               insn_buf[i++] = random() & 0xff;
+               insn_buff[i++] = random() & 0xff;
  
        return i;
  }
@@@ -226,31 -239,31 +226,31 @@@ int main(int argc, char **argv
        int insns = 0;
        int errors = 0;
        unsigned long i;
-       unsigned char insn_buf[MAX_INSN_SIZE * 2];
+       unsigned char insn_buff[MAX_INSN_SIZE * 2];
  
        parse_args(argc, argv);
  
        /* Prepare stop bytes with NOPs */
-       memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
+       memset(insn_buff + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
  
        for (i = 0; i < iter_end; i++) {
-               if (generate_insn(insn_buf) <= 0)
+               if (generate_insn(insn_buff) <= 0)
                        break;
  
                if (i < iter_start)     /* Skip to given iteration number */
                        continue;
  
                /* Decode an instruction */
-               insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64);
+               insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
                insn_get_length(&insn);
  
                if (insn.next_byte <= insn.kaddr ||
                    insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
                        /* Access out-of-range memory */
-                       dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn);
+                       dump_stream(stderr, "Error: Found an access violation", i, insn_buff, &insn);
                        errors++;
                } else if (verbose && !insn_complete(&insn))
-                       dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn);
+                       dump_stream(stdout, "Info: Found an undecodable input", i, insn_buff, &insn);
                else if (verbose >= 2)
                        dump_insn(stdout, &insn);
                insns++;