perf/x86/intel: Make the HT bug workaround conditional on HT enabled
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138b1fff29659e81813c77e70ff1f58..6ea61a572fb0ee36afaccaa1af7c0d94ef577971 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/watchdog.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
        INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
        INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
        INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
        EVENT_CONSTRAINT_END
 };
 
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
        INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
        INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
        INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       /*
-        * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
-        * siblings; disable these events because they can corrupt unrelated
-        * counters.
-        */
-       INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
-       INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
        EVENT_CONSTRAINT_END
 };
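
The functional change in these constraint tables is easy to miss: the MEM_* events used to get an empty counter mask (0x0), i.e. they were disabled outright (see the removed IvyBridge lines above), while INTEL_EXCLEVT_CONSTRAINT restores the full generic-counter mask (0xf) and instead tags the event for cross-HT exclusive scheduling. A minimal user-space sketch of what the change amounts to, with struct fields simplified (this is not the kernel's struct event_constraint):

    #include <stdint.h>
    #include <stdio.h>

    /* toy model of the kernel's struct event_constraint */
    struct constraint {
            uint64_t code;    /* event select code */
            uint64_t idxmsk;  /* candidate counter bitmask */
            int      excl;    /* models the PERF_X86_EVENT_EXCL flag */
    };

    int main(void)
    {
            /* old workaround: event 0xd0 may use no counter at all */
            struct constraint old = { 0xd0, 0x0, 0 };
            /* new workaround: all four generic counters, cross-HT exclusive */
            struct constraint cur = { 0xd0, 0xf, 1 };

            printf("old: mask=%#llx excl=%d\n",
                   (unsigned long long)old.idxmsk, old.excl);
            printf("new: mask=%#llx excl=%d\n",
                   (unsigned long long)cur.idxmsk, cur.excl);
            return 0;
    }
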
 
@@ -212,11 +216,26 @@ static struct event_constraint intel_hsw_event_constraints[] = {
        INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
        INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
        /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x08a3, 0x4),
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
        /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4),
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
        /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_EVENT_CONSTRAINT(0x04a3, 0xf),
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_EVENT_CONSTRAINT(0xa3, 0x4),      /* CYCLE_ACTIVITY.* */
        EVENT_CONSTRAINT_END
 };
 
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
+#define HSW_DEMAND_RFO                 BIT_ULL(1)
+#define HSW_ANY_RESPONSE               BIT_ULL(16)
+#define HSW_SUPPLIER_NONE              BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
+#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE                 BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define HSW_SNOOP_MISS                 BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define HSW_SNOOP_HITM                 BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
+#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
+                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
+                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL              BIT_ULL(26)
+#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
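
These request/response bits are OR-ed together to form the OFFCORE_RESPONSE (config1) filter values used in hsw_hw_cache_extra_regs below. A stand-alone sketch of the composition, assuming only the bit positions defined above (BIT_ULL is reproduced locally; the kernel gets it from its own headers):

    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n)              (1ULL << (n))
    /* a subset of the bits defined above */
    #define HSW_DEMAND_DATA_RD      BIT_ULL(0)
    #define HSW_L3_MISS_LOCAL_DRAM  BIT_ULL(22)
    #define HSW_SNOOP_MISS          BIT_ULL(33)

    int main(void)
    {
            /* demand data reads that missed L3 to local DRAM and missed the snoop */
            uint64_t rsp = HSW_DEMAND_DATA_RD | HSW_L3_MISS_LOCAL_DRAM |
                           HSW_SNOOP_MISS;

            printf("offcore_rsp filter: %#llx\n", (unsigned long long)rsp);
            return 0;
    }
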
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,20 +1244,6 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
-       /* user explicitly requested branch sampling */
-       if (has_branch_stack(event))
-               return true;
-
-       /* implicit branch sampling to correct PEBS skid */
-       if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
-           x86_pmu.intel_cap.pebs_format < 2)
-               return true;
-
-       return false;
-}
-
 static void intel_pmu_disable_all(void)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1051,6 +1252,8 @@ static void intel_pmu_disable_all(void)
 
        if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
                intel_pmu_disable_bts();
+       else
+               intel_bts_disable_local();
 
        intel_pmu_pebs_disable_all();
        intel_pmu_lbr_disable_all();
@@ -1073,7 +1276,8 @@ static void intel_pmu_enable_all(int added)
                        return;
 
                intel_pmu_enable_bts(event->hw.config);
-       }
+       } else
+               intel_bts_enable_local();
 }
 
 /*
@@ -1207,7 +1411,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
         * must be disabled before any actual event
         * because any event may be combined with LBR
         */
-       if (intel_pmu_needs_lbr_smpl(event))
+       if (needs_branch_stack(event))
                intel_pmu_lbr_disable(event);
 
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1472,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
         * must be enabled before any actual event
         * because any event may be combined with LBR
         */
-       if (intel_pmu_needs_lbr_smpl(event))
+       if (needs_branch_stack(event))
                intel_pmu_lbr_enable(event);
 
        if (event->attr.exclude_host)
@@ -1359,6 +1563,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        intel_pmu_disable_all();
        handled = intel_pmu_drain_bts_buffer();
+       handled += intel_bts_interrupt();
        status = intel_pmu_get_status();
        if (!status)
                goto done;
@@ -1398,6 +1603,14 @@ again:
                x86_pmu.drain_pebs(regs);
        }
 
+       /*
+        * Intel PT
+        */
+       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+               handled++;
+               intel_pt_interrupt();
+       }
+
        /*
         * Checkpointed counters can lead to 'spurious' PMIs because the
         * rollback caused by the PMI will have cleared the overflow status
@@ -1464,7 +1677,7 @@ intel_bts_constraints(struct perf_event *event)
 
 static int intel_alt_er(int idx)
 {
-       if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
                return idx;
 
        if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1837,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
 }
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
 {
        struct event_constraint *c;
 
@@ -1641,7 +1855,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
 {
        struct event_constraint *c;
 
@@ -1649,15 +1864,286 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
        if (c)
                return c;
 
-       c = intel_pebs_constraints(event);
+       c = intel_shared_regs_constraints(cpuc, event);
        if (c)
                return c;
 
-       c = intel_shared_regs_constraints(cpuc, event);
+       c = intel_pebs_constraints(event);
        if (c)
                return c;
 
-       return x86_get_event_constraints(cpuc, event);
+       return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl, *xlo;
+       int tid = cpuc->excl_thread_id;
+       int o_tid = 1 - tid; /* sibling thread */
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       /*
+        * no exclusion needed
+        */
+       if (!excl_cntrs)
+               return;
+
+       xlo = &excl_cntrs->states[o_tid];
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = true;
+       xl->num_alloc_cntrs = 0;
+       /*
+        * lock shared state until we are done scheduling
+        * in intel_stop_scheduling(); this makes
+        * scheduling appear as a transaction
+        */
+       WARN_ON_ONCE(!irqs_disabled());
+       raw_spin_lock(&excl_cntrs->lock);
+
+       /*
+        * save initial state of sibling thread
+        */
+       memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl, *xlo;
+       int tid = cpuc->excl_thread_id;
+       int o_tid = 1 - tid; /* sibling thread */
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+       /*
+        * no exclusion needed
+        */
+       if (!excl_cntrs)
+               return;
+
+       xlo = &excl_cntrs->states[o_tid];
+       xl = &excl_cntrs->states[tid];
+
+       /*
+        * make new sibling thread state visible
+        */
+       memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+       xl->sched_started = false;
+       /*
+        * release shared state lock (acquired in intel_start_scheduling())
+        */
+       raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                          int idx, struct event_constraint *c)
+{
+       struct event_constraint *cx;
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl, *xlo;
+       int is_excl, i;
+       int tid = cpuc->excl_thread_id;
+       int o_tid = 1 - tid; /* alternate */
+
+       /*
+        * validating a group does not require
+        * enforcing cross-thread exclusion
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return c;
+
+       /*
+        * no exclusion needed
+        */
+       if (!excl_cntrs)
+               return c;
+       /*
+        * event requires exclusive counter access
+        * across HT threads
+        */
+       is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+       /*
+        * xl = state of current HT
+        * xlo = state of sibling HT
+        */
+       xl = &excl_cntrs->states[tid];
+       xlo = &excl_cntrs->states[o_tid];
+
+       /*
+        * do not allow scheduling of more than max_alloc_cntrs
+        * which is set to half the available generic counters.
+        * this helps avoid counter starvation of sibling thread
+        * by ensuring at most half the counters can be in
+        * exclusive mode. There are no designated counters for
+        * these limits; any N/2 counters can be used. This helps
+        * with events that have specific counter constraints.
+        */
+       if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+               return &emptyconstraint;
+
+       cx = c;
+
+       /*
+        * because we modify the constraint, we need
+        * to make a copy. Static constraints come
+        * from static const tables.
+        *
+        * only needed when constraint has not yet
+        * been cloned (marked dynamic)
+        */
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+               /* sanity check */
+               if (idx < 0)
+                       return &emptyconstraint;
+
+               /*
+                * grab pre-allocated constraint entry
+                */
+               cx = &cpuc->constraint_list[idx];
+
+               /*
+                * initialize dynamic constraint
+                * with static constraint
+                */
+               memcpy(cx, c, sizeof(*cx));
+
+               /*
+                * mark constraint as dynamic, so we
+                * can free it later on
+                */
+               cx->flags |= PERF_X86_EVENT_DYNAMIC;
+       }
+
+       /*
+        * From here on, the constraint is dynamic.
+        * Either it was just allocated above, or it
+        * was allocated during an earlier invocation
+        * of this function
+        */
+
+       /*
+        * Modify static constraint with current dynamic
+        * state of thread
+        *
+        * EXCLUSIVE: sibling counter measuring exclusive event
+        * SHARED   : sibling counter measuring non-exclusive event
+        * UNUSED   : sibling counter unused
+        */
+       for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+               /*
+                * exclusive event in sibling counter
+                * our corresponding counter cannot be used
+                * regardless of our event
+                */
+               if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+                       __clear_bit(i, cx->idxmsk);
+               /*
+                * if measuring an exclusive event, sibling
+                * measuring non-exclusive, then counter cannot
+                * be used
+                */
+               if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+                       __clear_bit(i, cx->idxmsk);
+       }
+
+       /*
+        * recompute actual bit weight for scheduling algorithm
+        */
+       cx->weight = hweight64(cx->idxmsk64);
+
+       /*
+        * if we return an empty mask, then switch
+        * back to static empty constraint to avoid
+        * the cost of freeing later on
+        */
+       if (cx->weight == 0)
+               cx = &emptyconstraint;
+
+       return cx;
+}
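
The filtering loop above is the heart of the workaround: a candidate counter is dropped whenever the sibling thread's recorded state for that counter conflicts with the event being scheduled. A reduced user-space sketch of just that rule (function and variable names are hypothetical, not the kernel's API):

    #include <stdio.h>

    enum excl_state { UNUSED, SHARED, EXCLUSIVE };

    /* mirrors the mask filtering in intel_get_excl_constraints() */
    static unsigned int filter_idxmsk(unsigned int idxmsk,
                                      const enum excl_state *sibling,
                                      int nr_cntrs, int is_excl)
    {
            for (int i = 0; i < nr_cntrs; i++) {
                    /* sibling holds this counter exclusively: unusable for anyone */
                    if (sibling[i] == EXCLUSIVE)
                            idxmsk &= ~(1u << i);
                    /* we need exclusivity but the sibling shares the counter */
                    if (is_excl && sibling[i] == SHARED)
                            idxmsk &= ~(1u << i);
            }
            return idxmsk;
    }

    int main(void)
    {
            enum excl_state sib[4] = { EXCLUSIVE, SHARED, UNUSED, UNUSED };

            printf("%#x\n", filter_idxmsk(0xf, sib, 4, 0)); /* 0xe: ctr 0 lost */
            printf("%#x\n", filter_idxmsk(0xf, sib, 4, 1)); /* 0xc: ctrs 0,1 lost */
            return 0;
    }
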
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c1 = event->hw.constraint;
+       struct event_constraint *c2;
+
+       /*
+        * first time only
+        * - static constraint: no change across incremental scheduling calls
+        * - dynamic constraint: handled by intel_get_excl_constraints()
+        */
+       c2 = __intel_get_event_constraints(cpuc, idx, event);
+       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+               c1->weight = c2->weight;
+               c2 = c1;
+       }
+
+       if (cpuc->excl_cntrs)
+               return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+       return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+               struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xlo, *xl;
+       unsigned long flags = 0; /* keep compiler happy */
+       int tid = cpuc->excl_thread_id;
+       int o_tid = 1 - tid;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake)
+               return;
+
+       WARN_ON_ONCE(!excl_cntrs);
+
+       if (!excl_cntrs)
+               return;
+
+       xl = &excl_cntrs->states[tid];
+       xlo = &excl_cntrs->states[o_tid];
+
+       /*
+        * put_constraint() may be called from x86_schedule_events(),
+        * which already holds the lock, so make locking
+        * conditional here
+        */
+       if (!xl->sched_started)
+               raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+       /*
+        * if event was actually assigned, then mark the
+        * counter state as unused now
+        */
+       if (hwc->idx >= 0)
+               xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+       if (!xl->sched_started)
+               raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
 }
 
 static void
@@ -1678,7 +2164,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
                                        struct perf_event *event)
 {
+       struct event_constraint *c = event->hw.constraint;
+
        intel_put_shared_regs_event_constraints(cpuc, event);
+
+       /*
+        * if the PMU has exclusive counter restrictions, then
+        * all events are subject to them and must call the
+        * put_excl_constraints() routine
+        */
+       if (c && cpuc->excl_cntrs)
+               intel_put_excl_constraints(cpuc, event);
+
+       /* cleanup dynamic constraint */
+       if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+               event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+                                   struct perf_event *event, int cntr)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct event_constraint *c = event->hw.constraint;
+       struct intel_excl_states *xlo, *xl;
+       int tid = cpuc->excl_thread_id;
+       int o_tid = 1 - tid;
+       int is_excl;
+
+       if (cpuc->is_fake || !c)
+               return;
+
+       is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+               return;
+
+       WARN_ON_ONCE(!excl_cntrs);
+
+       if (!excl_cntrs)
+               return;
+
+       xl = &excl_cntrs->states[tid];
+       xlo = &excl_cntrs->states[o_tid];
+
+       WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+       if (cntr >= 0) {
+               if (is_excl)
+                       xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+               else
+                       xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+       }
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2283,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
        if (event->attr.precise_ip && x86_pmu.pebs_aliases)
                x86_pmu.pebs_aliases(event);
 
-       if (intel_pmu_needs_lbr_smpl(event)) {
+       if (needs_branch_stack(event)) {
                ret = intel_pmu_setup_lbr_filter(event);
                if (ret)
                        return ret;
+
+               /*
+                * BTS is set up earlier in this path, so don't account twice
+                */
+               if (!intel_pmu_has_bts(event)) {
+                       /* disallow lbr if conflicting events are present */
+                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                               return -EBUSY;
+
+                       event->destroy = hw_perf_lbr_event_destroy;
+               }
        }
 
        if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2438,12 @@ static struct event_constraint counter2_constraint =
                        EVENT_CONSTRAINT(0, 0x4, 0);
 
 static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
 {
-       struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+       struct event_constraint *c;
+
+       c = intel_get_event_constraints(cpuc, idx, event);
 
        /* Handle special quirk on in_tx_checkpointed only in counter 2 */
        if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2455,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
        return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
+               if (left < 128)
+                       left = 128;
+               left &= ~0x3fu;
+       }
+       return left;
+}
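
To make the arithmetic concrete: a requested period of 100 becomes 128, 200 becomes 192, and anything that is already a multiple of 64 (and >= 128) passes through unchanged. A stand-alone spot check of the same clamp-and-truncate step (bdw_round is a hypothetical stand-in, not kernel code):

    #include <assert.h>

    /* same rounding that bdw_limit_period() applies to INST_RETIRED.ALL */
    static unsigned int bdw_round(unsigned int left)
    {
            if (left < 128)
                    left = 128;
            return left & ~0x3fu;
    }

    int main(void)
    {
            assert(bdw_round(100)  == 128);  /* below the minimum: clamped up (BDM11) */
            assert(bdw_round(200)  == 192);  /* low 6 bits cleared (BDM55) */
            assert(bdw_round(4096) == 4096); /* already a multiple of 64 */
            return 0;
    }
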
+
 PMU_FORMAT_ATTR(event, "config:0-7"    );
 PMU_FORMAT_ATTR(umask, "config:8-15"   );
 PMU_FORMAT_ATTR(edge,  "config:18"     );
@@ -1979,16 +2555,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
        return regs;
 }
 
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+       struct intel_excl_cntrs *c;
+       int i;
+
+       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+                        GFP_KERNEL, cpu_to_node(cpu));
+       if (c) {
+               raw_spin_lock_init(&c->lock);
+               for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+                       c->states[0].state[i] = INTEL_EXCL_UNUSED;
+                       c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+                       c->states[1].state[i] = INTEL_EXCL_UNUSED;
+                       c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+               }
+               c->core_id = -1;
+       }
+       return c;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
        struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-       if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
-               return NOTIFY_OK;
+       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       return NOTIFY_BAD;
+       }
 
-       cpuc->shared_regs = allocate_shared_regs(cpu);
-       if (!cpuc->shared_regs)
-               return NOTIFY_BAD;
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+               if (!cpuc->constraint_list)
+                       return NOTIFY_BAD;
+
+               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+               if (!cpuc->excl_cntrs) {
+                       kfree(cpuc->constraint_list);
+                       kfree(cpuc->shared_regs);
+                       return NOTIFY_BAD;
+               }
+               cpuc->excl_thread_id = 0;
+       }
 
        return NOTIFY_OK;
 }
@@ -2010,13 +2622,15 @@ static void intel_pmu_cpu_starting(int cpu)
        if (!cpuc->shared_regs)
                return;
 
-       if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+               void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
                for_each_cpu(i, topology_thread_cpumask(cpu)) {
                        struct intel_shared_regs *pc;
 
                        pc = per_cpu(cpu_hw_events, i).shared_regs;
                        if (pc && pc->core_id == core_id) {
-                               cpuc->kfree_on_online = cpuc->shared_regs;
+                               *onln = cpuc->shared_regs;
                                cpuc->shared_regs = pc;
                                break;
                        }
@@ -2027,6 +2641,44 @@ static void intel_pmu_cpu_starting(int cpu)
 
        if (x86_pmu.lbr_sel_map)
                cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               int h = x86_pmu.num_counters >> 1;
+
+               for_each_cpu(i, topology_thread_cpumask(cpu)) {
+                       struct intel_excl_cntrs *c;
+
+                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
+                       if (c && c->core_id == core_id) {
+                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+                               cpuc->excl_cntrs = c;
+                               cpuc->excl_thread_id = 1;
+                               break;
+                       }
+               }
+               cpuc->excl_cntrs->core_id = core_id;
+               cpuc->excl_cntrs->refcnt++;
+               /*
+                * set hard limit to half the number of generic counters
+                */
+               cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+               cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
+       }
+}
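
The sibling handshake above mirrors the existing shared_regs logic: the second HT thread to come online adopts the first thread's per-core structure, queues its own allocation for later freeing via kfree_on_online[], and becomes excl_thread_id 1. A reduced sketch of that adoption step (types and names are hypothetical, not the kernel's):

    #include <stdio.h>

    struct excl_cntrs_model { int core_id; int refcnt; };

    /* keep our allocation, or adopt the sibling's if one exists for this core */
    static struct excl_cntrs_model *
    share_per_core(struct excl_cntrs_model *mine,
                   struct excl_cntrs_model *sibling, int core_id,
                   struct excl_cntrs_model **free_later, int *thread_id)
    {
            if (sibling && sibling->core_id == core_id) {
                    *free_later = mine;   /* kfree_on_online[] in the kernel */
                    mine = sibling;
                    *thread_id = 1;       /* we are the second HT thread */
            }
            mine->core_id = core_id;
            mine->refcnt++;
            return mine;
    }

    int main(void)
    {
            struct excl_cntrs_model a = { -1, 0 }, b = { -1, 0 }, *stale = NULL;
            int tid0 = 0, tid1 = 0;
            struct excl_cntrs_model *cpu0 = share_per_core(&a, NULL, 3, &stale, &tid0);
            struct excl_cntrs_model *cpu4 = share_per_core(&b, cpu0, 3, &stale, &tid1);

            printf("shared=%d refcnt=%d tid1=%d\n", cpu0 == cpu4, cpu4->refcnt, tid1);
            return 0;
    }
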
+
+static void free_excl_cntrs(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_excl_cntrs *c;
+
+       c = cpuc->excl_cntrs;
+       if (c) {
+               if (c->core_id == -1 || --c->refcnt == 0)
+                       kfree(c);
+               cpuc->excl_cntrs = NULL;
+               kfree(cpuc->constraint_list);
+               cpuc->constraint_list = NULL;
+       }
 }
 
 static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2693,9 @@ static void intel_pmu_cpu_dying(int cpu)
                cpuc->shared_regs = NULL;
        }
 
-       fini_debug_store_on_cpu(cpu);
-}
+       free_excl_cntrs(cpu);
 
-static void intel_pmu_flush_branch_stack(void)
-{
-       /*
-        * Intel LBR does not tag entries with the
-        * PID of the current task, then we need to
-        * flush it on ctxsw
-        * For now, we simply reset it
-        */
-       if (x86_pmu.lbr_nr)
-               intel_pmu_lbr_reset();
+       fini_debug_store_on_cpu(cpu);
 }
 
 PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2107,7 +2749,7 @@ static __initconst const struct x86_pmu intel_pmu = {
        .cpu_starting           = intel_pmu_cpu_starting,
        .cpu_dying              = intel_pmu_cpu_dying,
        .guest_get_msrs         = intel_guest_get_msrs,
-       .flush_branch_stack     = intel_pmu_flush_branch_stack,
+       .sched_task             = intel_pmu_lbr_sched_task,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2906,27 @@ static __init void intel_nehalem_quirk(void)
        }
 }
 
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However, detecting whether
+ * HT is enabled is difficult (it is model specific), so instead
+ * we enable the workaround during early boot and verify whether
+ * it is needed in a later initcall phase, once we have valid
+ * topology information to check if HT is actually enabled.
+ */
+static __init void intel_ht_bug(void)
+{
+       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+       x86_pmu.commit_scheduling = intel_commit_scheduling;
+       x86_pmu.start_scheduling = intel_start_scheduling;
+       x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
 EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
 
@@ -2443,7 +3106,7 @@ __init int intel_pmu_init(void)
                x86_pmu.event_constraints = intel_slm_event_constraints;
                x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
                x86_pmu.extra_regs = intel_slm_extra_regs;
-               x86_pmu.er_flags |= ERF_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
                pr_cont("Silvermont events, ");
                break;
 
@@ -2461,7 +3124,7 @@ __init int intel_pmu_init(void)
                x86_pmu.enable_all = intel_pmu_nhm_enable_all;
                x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
                x86_pmu.extra_regs = intel_westmere_extra_regs;
-               x86_pmu.er_flags |= ERF_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 
                x86_pmu.cpu_events = nhm_events_attrs;
 
@@ -2478,6 +3141,7 @@ __init int intel_pmu_init(void)
        case 42: /* 32nm SandyBridge         */
        case 45: /* 32nm SandyBridge-E/EN/EP */
                x86_add_quirk(intel_sandybridge_quirk);
+               x86_add_quirk(intel_ht_bug);
                memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
                memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3156,11 @@ __init int intel_pmu_init(void)
                        x86_pmu.extra_regs = intel_snbep_extra_regs;
                else
                        x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
                /* all extra regs are per-cpu when HT is on */
-               x86_pmu.er_flags |= ERF_HAS_RSP_1;
-               x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
                x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2510,6 +3176,7 @@ __init int intel_pmu_init(void)
 
        case 58: /* 22nm IvyBridge       */
        case 62: /* 22nm IvyBridge-EP/EX */
+               x86_add_quirk(intel_ht_bug);
                memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
                /* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3195,8 @@ __init int intel_pmu_init(void)
                else
                        x86_pmu.extra_regs = intel_snb_extra_regs;
                /* all extra regs are per-cpu when HT is on */
-               x86_pmu.er_flags |= ERF_HAS_RSP_1;
-               x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
                x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2545,19 +3212,20 @@ __init int intel_pmu_init(void)
        case 63: /* 22nm Haswell Server */
        case 69: /* 22nm Haswell ULT */
        case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+               x86_add_quirk(intel_ht_bug);
                x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
-               intel_pmu_lbr_init_snb();
+               intel_pmu_lbr_init_hsw();
 
                x86_pmu.event_constraints = intel_hsw_event_constraints;
                x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
                x86_pmu.extra_regs = intel_snbep_extra_regs;
                x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
                /* all extra regs are per-cpu when HT is on */
-               x86_pmu.er_flags |= ERF_HAS_RSP_1;
-               x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
                x86_pmu.hw_config = hsw_hw_config;
                x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3234,39 @@ __init int intel_pmu_init(void)
                pr_cont("Haswell events, ");
                break;
 
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+                                                                         HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_bdw_event_constraints;
+               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.limit_period = bdw_limit_period;
+               pr_cont("Broadwell events, ");
+               break;
+
        default:
                switch (x86_pmu.version) {
                case 1:
@@ -2651,3 +3352,47 @@ __init int intel_pmu_init(void)
 
        return 0;
 }
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled.
+ * If HT is off, then we disable the workaround.
+ */
+static __init int fixup_ht_bug(void)
+{
+       int cpu = smp_processor_id();
+       int w, c;
+       /*
+        * problem not present on this CPU model, nothing to do
+        */
+       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+               return 0;
+
+       w = cpumask_weight(topology_thread_cpumask(cpu));
+       if (w > 1) {
+               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+               return 0;
+       }
+
+       watchdog_nmi_disable_all();
+
+       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+       x86_pmu.commit_scheduling = NULL;
+       x86_pmu.start_scheduling = NULL;
+       x86_pmu.stop_scheduling = NULL;
+
+       watchdog_nmi_enable_all();
+
+       get_online_cpus();
+
+       for_each_online_cpu(c) {
+               free_excl_cntrs(c);
+       }
+
+       put_online_cpus();
+       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+       return 0;
+}
+subsys_initcall(fixup_ht_bug);