Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 15 Mar 2016 16:32:27 +0000 (09:32 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 15 Mar 2016 16:32:27 +0000 (09:32 -0700)
Pull x86 asm updates from Ingo Molnar:
 "This is another big update. Main changes are:

   - lots of x86 system call (and other traps/exceptions) entry code
     enhancements.  In particular the complex parts of the 64-bit entry
     code have been migrated to C code as well, and a number of dusty
     corners have been refreshed.  (Andy Lutomirski)

   - vDSO special mapping robustification and general cleanups (Andy
     Lutomirski)

   - cpufeature refactoring, cleanups and speedups (Borislav Petkov)

   - lots of other changes ..."

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  x86/cpufeature: Enable new AVX-512 features
  x86/entry/traps: Show unhandled signal for i386 in do_trap()
  x86/entry: Call enter_from_user_mode() with IRQs off
  x86/entry/32: Change INT80 to be an interrupt gate
  x86/entry: Improve system call entry comments
  x86/entry: Remove TIF_SINGLESTEP entry work
  x86/entry/32: Add and check a stack canary for the SYSENTER stack
  x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup
  x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed
  x86/entry: Vastly simplify SYSENTER TF (single-step) handling
  x86/entry/traps: Clear DR6 early in do_debug() and improve the comment
  x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions
  x86/entry/32: Restore FLAGS on SYSEXIT
  x86/entry/32: Filter NT and speed up AC filtering in SYSENTER
  x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test
  selftests/x86: In syscall_nt, test NT|TF as well
  x86/asm-offsets: Remove PARAVIRT_enabled
  x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled
  uprobes: __create_xol_area() must nullify xol_mapping.fault
  x86/cpufeature: Create a new synthetic cpu capability for machine check recovery
  ...

647 files changed:
Documentation/devicetree/bindings/arm/omap/omap.txt
Documentation/kernel-parameters.txt
Documentation/sysctl/kernel.txt
Documentation/virtual/kvm/mmu.txt
Documentation/x86/exception-tables.txt
MAINTAINERS
Makefile
arch/arm/boot/dts/armada-xp-axpwifiap.dts
arch/arm/boot/dts/armada-xp-db.dts
arch/arm/boot/dts/armada-xp-gp.dts
arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts
arch/arm/boot/dts/armada-xp-linksys-mamba.dts
arch/arm/boot/dts/armada-xp-matrix.dts
arch/arm/boot/dts/armada-xp-netgear-rn2120.dts
arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts
arch/arm/boot/dts/armada-xp-synology-ds414.dts
arch/arm/boot/dts/dra7.dtsi
arch/arm/include/asm/cacheflush.h
arch/arm/kernel/setup.c
arch/arm/kvm/arm.c
arch/arm/kvm/psci.c
arch/arm/mach-lpc32xx/phy3250.c
arch/arm/mach-netx/fb.c
arch/arm/mach-nspire/clcd.c
arch/arm/mach-omap2/omap_hwmod.c
arch/arm/mach-omap2/omap_hwmod.h
arch/arm/plat-samsung/pm-check.c
arch/arm/vdso/vdso.S
arch/arm64/include/asm/cacheflush.h
arch/arm64/include/asm/pgtable.h
arch/arm64/kernel/setup.c
arch/arm64/kernel/sleep.S
arch/arm64/mm/hugetlbpage.c
arch/avr32/kernel/setup.c
arch/c6x/kernel/setup.c
arch/ia64/kernel/efi.c
arch/ia64/kernel/setup.c
arch/m32r/kernel/setup.c
arch/microblaze/kernel/setup.c
arch/mips/Kconfig
arch/mips/boot/compressed/uart-16550.c
arch/mips/kernel/setup.c
arch/mips/kernel/smp.c
arch/mips/kvm/mips.c
arch/parisc/include/asm/cache.h
arch/parisc/include/asm/cacheflush.h
arch/parisc/mm/init.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kernel/setup_32.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/mm/mem.c
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/pgalloc.h
arch/s390/kernel/early.c
arch/s390/kernel/head64.S
arch/s390/kernel/setup.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/score/kernel/setup.c
arch/sh/kernel/setup.c
arch/sparc/kernel/head_64.S
arch/sparc/mm/init_64.c
arch/tile/kernel/setup.c
arch/unicore32/kernel/setup.c
arch/x86/Kbuild
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/entry/vdso/vdso2c.h
arch/x86/events/Makefile [new file with mode: 0644]
arch/x86/events/amd/core.c [new file with mode: 0644]
arch/x86/events/amd/ibs.c [new file with mode: 0644]
arch/x86/events/amd/iommu.c [new file with mode: 0644]
arch/x86/events/amd/iommu.h [new file with mode: 0644]
arch/x86/events/amd/uncore.c [new file with mode: 0644]
arch/x86/events/core.c [new file with mode: 0644]
arch/x86/events/intel/bts.c [new file with mode: 0644]
arch/x86/events/intel/core.c [new file with mode: 0644]
arch/x86/events/intel/cqm.c [new file with mode: 0644]
arch/x86/events/intel/cstate.c [new file with mode: 0644]
arch/x86/events/intel/ds.c [new file with mode: 0644]
arch/x86/events/intel/knc.c [new file with mode: 0644]
arch/x86/events/intel/lbr.c [new file with mode: 0644]
arch/x86/events/intel/p4.c [new file with mode: 0644]
arch/x86/events/intel/p6.c [new file with mode: 0644]
arch/x86/events/intel/pt.c [new file with mode: 0644]
arch/x86/events/intel/pt.h [new file with mode: 0644]
arch/x86/events/intel/rapl.c [new file with mode: 0644]
arch/x86/events/intel/uncore.c [new file with mode: 0644]
arch/x86/events/intel/uncore.h [new file with mode: 0644]
arch/x86/events/intel/uncore_nhmex.c [new file with mode: 0644]
arch/x86/events/intel/uncore_snb.c [new file with mode: 0644]
arch/x86/events/intel/uncore_snbep.c [new file with mode: 0644]
arch/x86/events/msr.c [new file with mode: 0644]
arch/x86/events/perf_event.h [new file with mode: 0644]
arch/x86/include/asm/amd_nb.h
arch/x86/include/asm/asm.h
arch/x86/include/asm/barrier.h
arch/x86/include/asm/cacheflush.h
arch/x86/include/asm/elf.h
arch/x86/include/asm/fpu/xstate.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/sections.h
arch/x86/include/asm/string_64.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/uaccess.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/bugs_64.c
arch/x86/kernel/cpu/centaur.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cyrix.c
arch/x86/kernel/cpu/hypervisor.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/intel_pt.h [deleted file]
arch/x86/kernel/cpu/mcheck/mce-inject.c
arch/x86/kernel/cpu/mcheck/mce-severity.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/p5.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/mcheck/threshold.c
arch/x86/kernel/cpu/mcheck/winchip.c
arch/x86/kernel/cpu/microcode/amd.c
arch/x86/kernel/cpu/mshyperv.c
arch/x86/kernel/cpu/mtrr/centaur.c
arch/x86/kernel/cpu/mtrr/cleanup.c
arch/x86/kernel/cpu/mtrr/generic.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/perf_event.c [deleted file]
arch/x86/kernel/cpu/perf_event.h [deleted file]
arch/x86/kernel/cpu/perf_event_amd.c [deleted file]
arch/x86/kernel/cpu/perf_event_amd_ibs.c [deleted file]
arch/x86/kernel/cpu/perf_event_amd_iommu.c [deleted file]
arch/x86/kernel/cpu/perf_event_amd_iommu.h [deleted file]
arch/x86/kernel/cpu/perf_event_amd_uncore.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_bts.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_cqm.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_cstate.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_ds.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_lbr.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_pt.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_rapl.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore.h [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c [deleted file]
arch/x86/kernel/cpu/perf_event_knc.c [deleted file]
arch/x86/kernel/cpu/perf_event_msr.c [deleted file]
arch/x86/kernel/cpu/perf_event_p4.c [deleted file]
arch/x86/kernel/cpu/perf_event_p6.c [deleted file]
arch/x86/kernel/cpu/rdrand.c
arch/x86/kernel/cpu/topology.c
arch/x86/kernel/cpu/transmeta.c
arch/x86/kernel/cpu/vmware.c
arch/x86/kernel/crash.c
arch/x86/kernel/e820.c
arch/x86/kernel/fpu/core.c
arch/x86/kernel/fpu/init.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/nmi.c
arch/x86/kernel/pmem.c
arch/x86/kernel/process.c
arch/x86/kernel/setup.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/test_nx.c
arch/x86/kernel/test_rodata.c
arch/x86/kernel/traps.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/lguest/boot.c
arch/x86/lib/delay.c
arch/x86/lib/memcpy_64.S
arch/x86/mm/extable.c
arch/x86/mm/fault.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/pageattr.c
arch/x86/platform/efi/quirks.c
arch/x86/xen/enlighten.c
arch/x86/xen/pmu.c
drivers/acpi/acpi_platform.c
drivers/acpi/acpica/psargs.c
drivers/acpi/apei/einj.c
drivers/base/property.c
drivers/dma/at_xdmac.c
drivers/dma/fsldma.c
drivers/dma/iop-adma.c
drivers/dma/mv_xor.c
drivers/dma/qcom_bam_dma.c
drivers/edac/mce_amd.c
drivers/edac/sb_edac.c
drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
drivers/gpu/drm/amd/amdgpu/atombios_dp.c
drivers/gpu/drm/drm_gem_cma_helper.c
drivers/gpu/drm/etnaviv/etnaviv_gpu.c
drivers/gpu/drm/i2c/tda998x_drv.c
drivers/gpu/drm/i915/intel_audio.c
drivers/gpu/drm/i915/intel_ddi.c
drivers/gpu/drm/i915/intel_dp.c
drivers/gpu/drm/i915/intel_hdmi.c
drivers/gpu/drm/i915/intel_i2c.c
drivers/gpu/drm/imx/ipuv3-crtc.c
drivers/gpu/drm/imx/ipuv3-plane.c
drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
drivers/gpu/drm/omapdrm/omap_gem.c
drivers/gpu/drm/radeon/atombios_dp.c
drivers/gpu/drm/radeon/radeon_device.c
drivers/gpu/drm/radeon/radeon_display.c
drivers/gpu/drm/radeon/radeon_pm.c
drivers/gpu/drm/sti/sti_cursor.c
drivers/gpu/drm/sti/sti_gdp.c
drivers/gpu/drm/sti/sti_hqvdp.c
drivers/gpu/drm/tegra/gem.c
drivers/gpu/drm/vc4/vc4_bo.c
drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
drivers/gpu/host1x/cdma.c
drivers/gpu/host1x/job.c
drivers/gpu/ipu-v3/ipu-common.c
drivers/media/media-device.c
drivers/media/platform/coda/coda-bit.c
drivers/misc/lkdtm.c
drivers/mtd/tests/mtd_nandecctest.c
drivers/net/can/spi/mcp251x.c
drivers/net/can/usb/gs_usb.c
drivers/net/ethernet/3com/3c59x.c
drivers/net/ethernet/altera/altera_tse_main.c
drivers/net/ethernet/aurora/nb8800.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/brocade/bna/bna_tx_rx.c
drivers/net/ethernet/cavium/thunder/nic.h
drivers/net/ethernet/cavium/thunder/nic_main.c
drivers/net/ethernet/cavium/thunder/nic_reg.h
drivers/net/ethernet/emulex/benet/be.h
drivers/net/ethernet/emulex/benet/be_cmds.h
drivers/net/ethernet/emulex/benet/be_main.c
drivers/net/ethernet/ethoc.c
drivers/net/ethernet/freescale/fman/fman.c
drivers/net/ethernet/freescale/gianfar.c
drivers/net/ethernet/hisilicon/Kconfig
drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h
drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h
drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
drivers/net/ethernet/ibm/ibmveth.c
drivers/net/ethernet/ibm/ibmvnic.c
drivers/net/ethernet/ibm/ibmvnic.h
drivers/net/ethernet/jme.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/main.c
drivers/net/ethernet/mellanox/mlx4/port.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
drivers/net/ethernet/mellanox/mlxsw/pci.c
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/moxa/moxart_ether.c
drivers/net/ethernet/qualcomm/qca_spi.c
drivers/net/ethernet/realtek/r8169.c
drivers/net/ethernet/renesas/ravb_main.c
drivers/net/ethernet/renesas/sh_eth.c
drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
drivers/net/ethernet/synopsys/dwc_eth_qos.c
drivers/net/phy/micrel.c
drivers/net/ppp/ppp_generic.c
drivers/net/usb/ax88172a.c
drivers/net/usb/cdc_ncm.c
drivers/net/usb/qmi_wwan.c
drivers/net/usb/usbnet.c
drivers/net/vmxnet3/vmxnet3_drv.c
drivers/net/vrf.c
drivers/net/vxlan.c
drivers/net/wireless/intel/iwlwifi/mvm/fw.c
drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
drivers/net/wireless/intel/iwlwifi/mvm/ops.c
drivers/net/wireless/intel/iwlwifi/mvm/tx.c
drivers/nvdimm/e820.c
drivers/of/of_mdio.c
drivers/parisc/eisa_enumerator.c
drivers/pci/pci.c
drivers/rapidio/rio.c
drivers/s390/block/dasd_diag.c
drivers/sh/superhyway/superhyway.c
drivers/spi/spi-imx.c
drivers/spi/spi-rockchip.c
drivers/ssb/Kconfig
drivers/target/target_core_tmr.c
drivers/video/fbdev/acornfb.c
drivers/video/fbdev/amba-clcd-versatile.c
drivers/video/fbdev/amba-clcd.c
drivers/video/fbdev/atmel_lcdfb.c
drivers/video/fbdev/ep93xx-fb.c
drivers/video/fbdev/gbefb.c
drivers/video/fbdev/imxfb.c
drivers/video/fbdev/mx3fb.c
drivers/video/fbdev/nuc900fb.c
drivers/video/fbdev/omap/lcdc.c
drivers/video/fbdev/pxa168fb.c
drivers/video/fbdev/pxafb.c
drivers/video/fbdev/s3c-fb.c
drivers/video/fbdev/s3c2410fb.c
drivers/video/fbdev/sa1100fb.c
drivers/xen/balloon.c
fs/dax.c
fs/ext4/move_extent.c
fs/jffs2/dir.c
fs/ncpfs/dir.c
fs/ocfs2/mmap.c
fs/overlayfs/dir.c
fs/overlayfs/inode.c
fs/overlayfs/super.c
fs/xfs/xfs_log_recover.c
include/asm-generic/qspinlock.h
include/asm-generic/qspinlock_types.h
include/asm-generic/vmlinux.lds.h
include/linux/atomic.h
include/linux/bio.h
include/linux/cache.h
include/linux/compiler.h
include/linux/dma-mapping.h
include/linux/ftrace.h
include/linux/init.h
include/linux/ioport.h
include/linux/kasan.h
include/linux/kvm_host.h
include/linux/latencytop.h
include/linux/list.h
include/linux/lockdep.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mm.h
include/linux/perf_event.h
include/linux/posix-timers.h
include/linux/sched.h
include/linux/sched/sysctl.h
include/linux/skbuff.h
include/linux/stmmac.h
include/linux/swait.h [new file with mode: 0644]
include/linux/tick.h
include/linux/tracepoint.h
include/linux/wait.h
include/net/iw_handler.h
include/trace/events/asoc.h
include/trace/events/timer.h
include/uapi/linux/bpf.h
include/uapi/linux/media.h
init/main.c
kernel/debug/kdb/kdb_bp.c
kernel/events/core.c
kernel/futex.c
kernel/kexec_core.c
kernel/kexec_file.c
kernel/latencytop.c
kernel/locking/lockdep.c
kernel/locking/mcs_spinlock.h
kernel/locking/mutex.c
kernel/locking/qspinlock.c
kernel/locking/qspinlock_paravirt.h
kernel/locking/qspinlock_stat.h
kernel/memremap.c
kernel/profile.c
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_plugin.h
kernel/resource.c
kernel/sched/Makefile
kernel/sched/clock.c
kernel/sched/core.c
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stats.h
kernel/sched/swait.c [new file with mode: 0644]
kernel/smp.c
kernel/softirq.c
kernel/sysctl.c
kernel/time/posix-cpu-timers.c
kernel/time/tick-sched.c
kernel/time/tick-sched.h
kernel/trace/trace_kprobe.c
kernel/trace/trace_syscalls.c
kernel/tsacct.c
lib/cpumask.c
lib/list_debug.c
lib/test_static_keys.c
mm/filemap.c
mm/hugetlb.c
mm/kasan/kasan.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
net/bridge/br_fdb.c
net/core/filter.c
net/core/rtnetlink.c
net/core/skbuff.c
net/ipv4/igmp.c
net/ipv4/ip_output.c
net/ipv4/ip_tunnel.c
net/ipv4/tcp_metrics.c
net/ipv4/tcp_minisocks.c
net/ipv4/udp_tunnel.c
net/ipv6/exthdrs_core.c
net/ipv6/ip6_gre.c
net/ipv6/ip6_tunnel.c
net/ipv6/mcast.c
net/ipv6/udp.c
net/mac80211/agg-rx.c
net/mac80211/ieee80211_i.h
net/mac80211/rc80211_minstrel.c
net/mac80211/rc80211_minstrel_ht.c
net/mac80211/rx.c
net/sched/act_ipt.c
net/sctp/ipv6.c
net/sctp/proc.c
net/switchdev/switchdev.c
net/tipc/socket.c
net/tipc/subscr.c
net/wireless/core.c
net/wireless/nl80211.c
net/wireless/sme.c
net/wireless/wext-core.c
scripts/ld-version.sh
scripts/sortextable.c
sound/arm/pxa2xx-pcm-lib.c
sound/soc/codecs/ab8500-codec.c
sound/soc/codecs/adau17x1.h
sound/soc/codecs/cs42l51.c
sound/soc/codecs/da732x.c
sound/soc/codecs/max98088.c
sound/soc/codecs/max98095.c
sound/soc/codecs/tlv320dac33.c
sound/soc/codecs/wl1273.c
sound/soc/codecs/wm8753.c
sound/soc/codecs/wm8904.c
sound/soc/codecs/wm8958-dsp2.c
sound/soc/codecs/wm8983.c
sound/soc/codecs/wm8985.c
sound/soc/codecs/wm8994.c
sound/soc/codecs/wm8996.c
sound/soc/codecs/wm9081.c
sound/soc/codecs/wm9713.c
sound/soc/codecs/wm_adsp.c
sound/soc/fsl/fsl_ssi.c
sound/soc/fsl/imx-pcm-fiq.c
sound/soc/intel/boards/cht_bsw_rt5645.c
sound/soc/intel/boards/mfld_machine.c
sound/soc/intel/skylake/skl-topology.c
sound/soc/nuc900/nuc900-pcm.c
sound/soc/omap/n810.c
sound/soc/omap/omap-pcm.c
sound/soc/omap/rx51.c
sound/soc/pxa/corgi.c
sound/soc/pxa/magician.c
sound/soc/pxa/poodle.c
sound/soc/pxa/spitz.c
sound/soc/pxa/tosa.c
sound/soc/qcom/lpass-cpu.c
sound/soc/samsung/i2s.c
sound/soc/soc-dapm.c
tools/build/Makefile.build
tools/build/Makefile.feature
tools/build/feature/Makefile
tools/build/feature/test-all.c
tools/build/feature/test-compile.c
tools/build/feature/test-libcrypto.c [new file with mode: 0644]
tools/lib/api/Build
tools/lib/api/Makefile
tools/lib/api/debug-internal.h [new file with mode: 0644]
tools/lib/api/debug.c [new file with mode: 0644]
tools/lib/api/debug.h [new file with mode: 0644]
tools/lib/api/fs/fs.c
tools/lib/api/fs/fs.h
tools/lib/bpf/libbpf.c
tools/lib/lockdep/Makefile
tools/lib/lockdep/common.c
tools/lib/lockdep/include/liblockdep/common.h
tools/lib/lockdep/lockdep.c
tools/lib/lockdep/preload.c
tools/lib/lockdep/tests/AA.c
tools/lib/lockdep/tests/ABA.c [new file with mode: 0644]
tools/lib/lockdep/tests/ABBA_2threads.c [new file with mode: 0644]
tools/lib/lockdep/uinclude/linux/compiler.h
tools/lib/traceevent/event-parse.c
tools/lib/traceevent/event-parse.h
tools/perf/Documentation/perf-config.txt
tools/perf/Documentation/perf-inject.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/perf-top.txt
tools/perf/Documentation/perfconfig.example
tools/perf/Documentation/tips.txt
tools/perf/Makefile
tools/perf/Makefile.perf
tools/perf/arch/arm/Makefile
tools/perf/arch/arm64/Makefile
tools/perf/arch/powerpc/Makefile
tools/perf/arch/powerpc/util/Build
tools/perf/arch/powerpc/util/book3s_hcalls.h [new file with mode: 0644]
tools/perf/arch/powerpc/util/book3s_hv_exits.h [new file with mode: 0644]
tools/perf/arch/powerpc/util/kvm-stat.c [new file with mode: 0644]
tools/perf/arch/s390/util/kvm-stat.c
tools/perf/arch/x86/Makefile
tools/perf/arch/x86/tests/rdpmc.c
tools/perf/arch/x86/util/intel-bts.c
tools/perf/arch/x86/util/intel-pt.c
tools/perf/arch/x86/util/kvm-stat.c
tools/perf/bench/mem-memcpy-x86-64-asm.S
tools/perf/builtin-annotate.c
tools/perf/builtin-buildid-cache.c
tools/perf/builtin-config.c
tools/perf/builtin-diff.c
tools/perf/builtin-help.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-kvm.c
tools/perf/builtin-mem.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/config/Makefile
tools/perf/jvmti/Makefile [new file with mode: 0644]
tools/perf/jvmti/jvmti_agent.c [new file with mode: 0644]
tools/perf/jvmti/jvmti_agent.h [new file with mode: 0644]
tools/perf/jvmti/libjvmti.c [new file with mode: 0644]
tools/perf/perf.c
tools/perf/perf.h
tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
tools/perf/tests/.gitignore
tools/perf/tests/Build
tools/perf/tests/bp_signal.c
tools/perf/tests/bpf-script-test-relocation.c [new file with mode: 0644]
tools/perf/tests/bpf.c
tools/perf/tests/code-reading.c
tools/perf/tests/hists_cumulate.c
tools/perf/tests/hists_filter.c
tools/perf/tests/hists_output.c
tools/perf/tests/llvm.c
tools/perf/tests/llvm.h
tools/perf/tests/make
tools/perf/tests/parse-events.c
tools/perf/tests/vmlinux-kallsyms.c
tools/perf/ui/browser.c
tools/perf/ui/browser.h
tools/perf/ui/browsers/annotate.c
tools/perf/ui/browsers/hists.c
tools/perf/ui/gtk/hists.c
tools/perf/ui/hist.c
tools/perf/ui/stdio/hist.c
tools/perf/util/Build
tools/perf/util/auxtrace.c
tools/perf/util/auxtrace.h
tools/perf/util/bpf-loader.c
tools/perf/util/bpf-loader.h
tools/perf/util/build-id.c
tools/perf/util/build-id.h
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/color.c
tools/perf/util/config.c
tools/perf/util/cpumap.c
tools/perf/util/cpumap.h
tools/perf/util/ctype.c
tools/perf/util/data-convert-bt.c
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/demangle-java.c [new file with mode: 0644]
tools/perf/util/demangle-java.h [new file with mode: 0644]
tools/perf/util/dso.c
tools/perf/util/env.c
tools/perf/util/env.h
tools/perf/util/event.c
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/genelf.c [new file with mode: 0644]
tools/perf/util/genelf.h [new file with mode: 0644]
tools/perf/util/genelf_debug.c [new file with mode: 0644]
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/help-unknown-cmd.c
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/jit.h [new file with mode: 0644]
tools/perf/util/jitdump.c [new file with mode: 0644]
tools/perf/util/jitdump.h [new file with mode: 0644]
tools/perf/util/kvm-stat.h
tools/perf/util/machine.h
tools/perf/util/mem-events.c [new file with mode: 0644]
tools/perf/util/mem-events.h [new file with mode: 0644]
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.l
tools/perf/util/parse-events.y
tools/perf/util/pmu.c
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/setup.py
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/stat-shadow.c
tools/perf/util/stat.c
tools/perf/util/stat.h
tools/perf/util/strbuf.c
tools/perf/util/strbuf.h
tools/perf/util/symbol-elf.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/trace-event.c
tools/perf/util/tsc.c
tools/perf/util/util.c
tools/perf/util/util.h
tools/power/x86/turbostat/turbostat.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index a2bd593881cab361fa739d9a12e63ee7ba63ef50..66422d6631841d9e6eba9be63e453176ebd43f2e 100644 (file)
@@ -23,6 +23,7 @@ Optional properties:
   during suspend.
 - ti,no-reset-on-init: When present, the module should not be reset at init
 - ti,no-idle-on-init: When present, the module should not be idled at init
+- ti,no-idle: When present, the module is never allowed to idle.
 
 Example:
 
index 0d27a9adabf849f0f95645cbcb361e9f8e789762..084775f7b052de0f9998ffa7bfa2e48080f1aec1 100644 (file)
@@ -3491,6 +3491,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        ro              [KNL] Mount root device read-only on boot
 
+       rodata=         [KNL]
+               on      Mark read-only kernel memory as read-only (default).
+               off     Leave read-only kernel memory writable for debugging.
+
        root=           [KNL] Root filesystem
                        See name_to_dev_t comment in init/do_mounts.c.
 
@@ -3528,6 +3532,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        sched_debug     [KNL] Enables verbose scheduler debug messages.
 
+       schedstats=     [KNL,X86] Enable or disable scheduler statistics.
+                       Allowed values are enable and disable. This feature
+                       incurs a small amount of overhead in the scheduler
+                       but is useful for debugging and performance tuning.
+
        skew_tick=      [KNL] Offset the periodic timer tick per cpu to mitigate
                        xtime_lock contention on larger systems, and/or RCU lock
                        contention on all systems with CONFIG_MAXSMP set.
index a93b414672a71ac6fa9bac1e848215804bde139c..f4444c94ff285db493961c54a1527a189da20fd2 100644 (file)
@@ -58,6 +58,8 @@ show up in /proc/sys/kernel:
 - panic_on_stackoverflow
 - panic_on_unrecovered_nmi
 - panic_on_warn
+- perf_cpu_time_max_percent
+- perf_event_paranoid
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -639,6 +641,17 @@ allowed to execute.
 
 ==============================================================
 
+perf_event_paranoid:
+
+Controls use of the performance events system by unprivileged
+users (without CAP_SYS_ADMIN).  The default value is 1.
+
+ -1: Allow use of (almost) all events by all users
+>=0: Disallow raw tracepoint access by users without CAP_IPC_LOCK
+>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
+>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+
+==============================================================
 
 pid_max:
 
@@ -760,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued.
 
 ==============================================================
 
+sched_schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
 sg-big-buff:
 
 This file shows the size of the generic SCSI (sg) buffer.
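
The perf_event_paranoid levels added in the hunk above can be probed from userspace before calling perf_event_open(). The following standalone C program is a sketch only (not part of this merge): the sysctl path and the raw perf_event_open() syscall are the documented interfaces, while the program structure itself is invented for illustration.

/*
 * Hedged illustration only -- reads the sysctl described above and opens a
 * per-task hardware counter, excluding kernel events when the paranoia
 * level would forbid kernel profiling for unprivileged users.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int read_paranoid_level(void)
{
        FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
        int level = 1;                  /* documented default */

        if (f) {
                if (fscanf(f, "%d", &level) != 1)
                        level = 1;
                fclose(f);
        }
        return level;
}

int main(void)
{
        struct perf_event_attr attr;
        int level = read_paranoid_level();
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.exclude_kernel = (level >= 2);     /* >=2: no kernel profiling */

        /* pid = 0 (self), cpu = -1: per-task counting */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        printf("perf_event_paranoid=%d, counter fd=%d\n", level, fd);
        close(fd);
        return 0;
}
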
index daf9c0f742d22e882637391539e90ea31eb788f3..c81731096a4338bcec4d43b8049a26118d10f260 100644 (file)
@@ -358,7 +358,8 @@ In the first case there are two additional complications:
 - if CR4.SMEP is enabled: since we've turned the page into a kernel page,
   the kernel may now execute it.  We handle this by also setting spte.nx.
   If we get a user fetch or read fault, we'll change spte.u=1 and
-  spte.nx=gpte.nx back.
+  spte.nx=gpte.nx back.  For this to work, KVM forces EFER.NX to 1 when
+  shadow paging is in use.
 - if CR4.SMAP is disabled: since the page has been changed to a kernel
   page, it can not be reused when CR4.SMAP is enabled. We set
   CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note,
index 32901aa36f0a99f7c087aa7c510bbceb0ec14a89..e396bcd8d830428a30231986fd6f1a2ee9bdd921 100644 (file)
@@ -290,3 +290,38 @@ Due to the way that the exception table is built and needs to be ordered,
 only use exceptions for code in the .text section.  Any other section
 will cause the exception table to not be sorted correctly, and the
 exceptions will fail.
+
+Things changed when 64-bit support was added to x86 Linux. Rather than
+double the size of the exception table by expanding the two entries
+from 32 bits to 64 bits, a clever trick was used to store addresses
+as relative offsets from the table itself. The assembly code changed
+from:
+       .long 1b,3b
+to:
+        .long (from) - .
+        .long (to) - .
+
+and the C-code that uses these values converts back to absolute addresses
+like this:
+
+       ex_insn_addr(const struct exception_table_entry *x)
+       {
+               return (unsigned long)&x->insn + x->insn;
+       }
+
+In v4.6 the exception table entry was expanded with a new field "handler".
+This is also 32 bits wide and contains a third relative function
+pointer which points to one of:
+
+1) int ex_handler_default(const struct exception_table_entry *fixup)
+   This is the legacy case that just jumps to the fixup code
+2) int ex_handler_fault(const struct exception_table_entry *fixup)
+   This case provides the fault number of the trap that occurred at
+   entry->insn. It is used to distinguish page faults from machine
+   checks.
+3) int ex_handler_ext(const struct exception_table_entry *fixup)
+   This case is used for uaccess_err ... we need to set a flag
+   in the task structure. Before the handler functions existed this
+   case was handled by adding a large offset to the fixup to tag
+   it as special.
+More functions can easily be added.
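
To make the relative-offset scheme above concrete, here is a sketch (not kernel source; the struct layout and the extra helpers follow the simplified prototypes quoted in the text, so treat any name beyond ex_insn_addr() as an assumption) of how all three 32-bit fields decode back to absolute addresses:

/* Sketch only: layout and helper names follow the text above, not the tree. */
struct exception_table_entry {
        int insn;       /* offset of the faulting instruction, relative to this field */
        int fixup;      /* offset of the fixup code, relative to this field */
        int handler;    /* offset of the handler function, relative to this field */
};

static inline unsigned long
ex_insn_addr(const struct exception_table_entry *x)
{
        return (unsigned long)&x->insn + x->insn;
}

static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
        return (unsigned long)&x->fixup + x->fixup;
}

/* Simplified handler prototype as quoted in the documentation above. */
typedef int (*ex_handler_t)(const struct exception_table_entry *fixup);

static inline ex_handler_t
ex_fixup_handler(const struct exception_table_entry *x)
{
        return (ex_handler_t)((unsigned long)&x->handler + x->handler);
}

Because each field stores a signed offset from its own address, the table stays valid wherever the kernel image is loaded, which is the point of the trick described above.
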
index f5e6a535bc3479d37f1e76a8ed93209cb21d132b..2061ea77667c36260a72f92ba12c3ecdb625d3c6 100644 (file)
@@ -4554,6 +4554,15 @@ S:       Maintained
 F:     drivers/net/ethernet/freescale/fs_enet/
 F:     include/linux/fs_enet_pd.h
 
+FREESCALE IMX / MXC FEC DRIVER
+M:     Fugang Duan <fugang.duan@nxp.com>
+L:     netdev@vger.kernel.org
+S:     Maintained
+F:     drivers/net/ethernet/freescale/fec_main.c
+F:     drivers/net/ethernet/freescale/fec_ptp.c
+F:     drivers/net/ethernet/freescale/fec.h
+F:     Documentation/devicetree/bindings/net/fsl-fec.txt
+
 FREESCALE QUICC ENGINE LIBRARY
 L:     linuxppc-dev@lists.ozlabs.org
 S:     Orphan
@@ -6770,6 +6779,7 @@ S:        Maintained
 F:     Documentation/networking/mac80211-injection.txt
 F:     include/net/mac80211.h
 F:     net/mac80211/
+F:     drivers/net/wireless/mac80211_hwsim.[ch]
 
 MACVLAN DRIVER
 M:     Patrick McHardy <kaber@trash.net>
@@ -7389,6 +7399,17 @@ W:       https://www.myricom.com/support/downloads/myri10ge.html
 S:     Supported
 F:     drivers/net/ethernet/myricom/myri10ge/
 
+NAND FLASH SUBSYSTEM
+M:     Boris Brezillon <boris.brezillon@free-electrons.com>
+R:     Richard Weinberger <richard@nod.at>
+L:     linux-mtd@lists.infradead.org
+W:     http://www.linux-mtd.infradead.org/
+Q:     http://patchwork.ozlabs.org/project/linux-mtd/list/
+T:     git git://github.com/linux-nand/linux.git
+S:     Maintained
+F:     drivers/mtd/nand/
+F:     include/linux/mtd/nand*.h
+
 NATSEMI ETHERNET DRIVER (DP8381x)
 S:     Orphan
 F:     drivers/net/ethernet/natsemi/natsemi.c
@@ -8454,6 +8475,7 @@ PERFORMANCE EVENTS SUBSYSTEM
 M:     Peter Zijlstra <peterz@infradead.org>
 M:     Ingo Molnar <mingo@redhat.com>
 M:     Arnaldo Carvalho de Melo <acme@kernel.org>
+R:     Alexander Shishkin <alexander.shishkin@linux.intel.com>
 L:     linux-kernel@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
 S:     Supported
index 2d519d2fb3a9be1a64100c85141b824a37924602..7b3ecdcdc6c1815ca6241a72b1967593a0335b41 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 5
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Blurry Fish Butt
 
 # *DOCUMENTATION*
index 23fc670c0427710168ce41ef012f9e37b8218eb6..5c21b236721fcb9d6e2b5055cbc89ae9527df7f1 100644 (file)
@@ -70,8 +70,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index f774101416a5522841c475252d0a86842e475b29..ebe1d267406df5ab30e3a3189b669733eb8fcaa4 100644 (file)
@@ -76,8 +76,8 @@
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
                          MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x1000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                devbus-bootcs {
                        status = "okay";
index 4878d7353069fc2720e15d76af307618daced15c..5730b875c4f51a1aa2743d8881b18d6dbc0b27cd 100644 (file)
@@ -95,8 +95,8 @@
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
                          MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x1000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                devbus-bootcs {
                        status = "okay";
index fb9e1bbf23385b85b0b82ddb153b922b2d2e0178..8af463f26ea1e162360952dbc03c0a9bda807cd5 100644 (file)
@@ -65,8 +65,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                        MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                       MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                       MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                       MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                       MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index 6e9820e141f8de5a850edcd7e74f0ea2a1acd1ed..b89e6cf1271a1370aead2a000e729cefad376ab0 100644 (file)
@@ -70,8 +70,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index 6ab33837a2b6d0a5aaf4e48763bb838451a346bd..6522b04f4a8e7c7a759100153d7d990f1fb160d3 100644 (file)
@@ -68,8 +68,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                internal-regs {
                        serial@12000 {
index 62175a8848bc2234343a8627d7cdf0ababfee851..d19f44c709257d9a35644d71cb49a04e354c42d8 100644 (file)
@@ -64,8 +64,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index a5db17782e085662d01a22683c5c364be5c25c8c..853bd392a4fe20155ed1469a23175213814ca9dc 100644 (file)
@@ -65,9 +65,9 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x8000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x01, 0x2f) 0 0 0xe8000000 0x8000000
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                devbus-bootcs {
                        status = "okay";
index 2391b11dc546b859dd3ab23a700be5a8a63ea83b..d17dab0a6f513e9c04ff2675f5f9d44d7f6953b3 100644 (file)
@@ -78,8 +78,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index c4d9175b90dceab5aa64ca1973505652bf325bee..f82aa44c3ceed540fef7e291c571c2b49c5b2e1f 100644 (file)
                               0x48485200 0x2E00>;
                        #address-cells = <1>;
                        #size-cells = <1>;
+
+                       /*
+                        * Do not allow gating of cpsw clock as workaround
+                        * for errata i877. Keeping internal clock disabled
+                        * causes the device switching characteristics
+                        * to degrade over time and eventually fail to meet
+                        * the data manual delay time/skew specs.
+                        */
+                       ti,no-idle;
+
                        /*
                         * rx_thresh_pend
                         * rx_pend
index d5525bfc7e3e61879d278ae08b446e185c206982..9156fc303afd8d278671c6d4d277d866fdd2ad7b 100644 (file)
@@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
 #ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 #else
index 7d0cba6f1cc5efadff31fd0cde2aca3279e120da..139791ed473d5264682c004ea2ea7af8ddd28f5d 100644 (file)
@@ -176,13 +176,13 @@ static struct resource mem_res[] = {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        }
 };
 
@@ -851,7 +851,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
                res->name  = "System RAM";
                res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
                request_resource(&iomem_resource, res);
 
index dda1959f0ddeb947e8a8020d7da0b02bb19f89cc..08e49c423c24147a2f2c40b011866c8d2d144de9 100644 (file)
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
                vcpu->arch.pause = false;
-               wake_up_interruptible(wq);
+               swake_up(wq);
        }
 }
 
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
 {
-       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
-       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
                                       (!vcpu->arch.pause)));
 }
 
index a9b3b905e661dec55672e459f1119b3eb466b373..c2b131527a643f6e6808b78f8c53e9751d0f71a9 100644 (file)
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 {
        struct kvm *kvm = source_vcpu->kvm;
        struct kvm_vcpu *vcpu = NULL;
-       wait_queue_head_t *wq;
+       struct swait_queue_head *wq;
        unsigned long cpu_id;
        unsigned long context_id;
        phys_addr_t target_pc;
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
        smp_mb();               /* Make sure the above is visible */
 
        wq = kvm_arch_vcpu_wq(vcpu);
-       wake_up_interruptible(wq);
+       swake_up(wq);
 
        return PSCI_RET_SUCCESS;
 }
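
The two hunks above (kvm_arm_resume_guest() and kvm_psci_vcpu_on()) switch KVM/ARM from wait_queue_head_t to the simple waitqueues added by this merge (include/linux/swait.h, kernel/sched/swait.c). Below is a minimal sketch of that pattern; only swait_event_interruptible(), swake_up() and the smp_mb() ordering are taken from the diff, while DECLARE_SWAIT_QUEUE_HEAD() is assumed from the new header and the surrounding names are made up.

/* Minimal swait sketch; not taken verbatim from the KVM code above. */
#include <linux/swait.h>
#include <linux/types.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_ready;

/* Sleeper side: block (interruptibly) until the condition becomes true. */
static int demo_wait(void)
{
        return swait_event_interruptible(demo_wq, demo_ready);
}

/* Waker side: publish the condition, then wake a waiter. */
static void demo_wake(void)
{
        demo_ready = true;
        smp_mb();       /* make the store visible first, as in the PSCI hunk */
        swake_up(&demo_wq);
}
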
index 77d6b1bab278292e31a6435294e824a6a76648b9..ee06fabdf60e69e014317c9703787da5f31dc79f 100644 (file)
@@ -86,8 +86,8 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
 {
        dma_addr_t dma;
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
-               PANEL_SIZE, &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, PANEL_SIZE, &dma,
+                                         GFP_KERNEL);
        if (!fb->fb.screen_base) {
                printk(KERN_ERR "CLCD: unable to map framebuffer\n");
                return -ENOMEM;
@@ -116,15 +116,14 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
 
 static int lpc32xx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-               fb->fb.screen_base, fb->fb.fix.smem_start,
-               fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 static void lpc32xx_clcd_remove(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-               fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
 
 /*
index d122ee6ab9913aa8e8f2896ea0fee44a01832068..8814ee5e98fd52ff75412fd863542954037dafb8 100644 (file)
@@ -42,8 +42,8 @@ int netx_clcd_setup(struct clcd_fb *fb)
 
        fb->panel = netx_panel;
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, 1024*1024,
-                                                   &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, 1024 * 1024, &dma,
+                                         GFP_KERNEL);
        if (!fb->fb.screen_base) {
                printk(KERN_ERR "CLCD: unable to map framebuffer\n");
                return -ENOMEM;
@@ -57,16 +57,14 @@ int netx_clcd_setup(struct clcd_fb *fb)
 
 int netx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-                                    fb->fb.screen_base,
-                                    fb->fb.fix.smem_start,
-                                    fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 void netx_clcd_remove(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-                             fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
 
 static AMBA_AHB_DEVICE(fb, "fb", 0, 0x00104000, { NETX_IRQ_LCD }, NULL);
index abea12617b173e2b34f794c4bdf98b8b5d518c48..ea0e5b2ca1cd49e6328faccbb467c1d7b669430f 100644 (file)
@@ -90,8 +90,8 @@ int nspire_clcd_setup(struct clcd_fb *fb)
        panel_size = ((panel->mode.xres * panel->mode.yres) * panel->bpp) / 8;
        panel_size = ALIGN(panel_size, PAGE_SIZE);
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
-               panel_size, &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, panel_size, &dma,
+                                         GFP_KERNEL);
 
        if (!fb->fb.screen_base) {
                pr_err("CLCD: unable to map framebuffer\n");
@@ -107,13 +107,12 @@ int nspire_clcd_setup(struct clcd_fb *fb)
 
 int nspire_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-               fb->fb.screen_base, fb->fb.fix.smem_start,
-               fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 void nspire_clcd_remove(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-               fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
index e9f65fec55c0b9beacfdf694424e601b4673327e..b6d62e4cdfddaf53f8d3aa7c1768b5e83255673e 100644 (file)
@@ -2200,6 +2200,11 @@ static int _enable(struct omap_hwmod *oh)
  */
 static int _idle(struct omap_hwmod *oh)
 {
+       if (oh->flags & HWMOD_NO_IDLE) {
+               oh->_int_flags |= _HWMOD_SKIP_ENABLE;
+               return 0;
+       }
+
        pr_debug("omap_hwmod: %s: idling\n", oh->name);
 
        if (oh->_state != _HWMOD_STATE_ENABLED) {
@@ -2504,6 +2509,8 @@ static int __init _init(struct omap_hwmod *oh, void *data)
                        oh->flags |= HWMOD_INIT_NO_RESET;
                if (of_find_property(np, "ti,no-idle-on-init", NULL))
                        oh->flags |= HWMOD_INIT_NO_IDLE;
+               if (of_find_property(np, "ti,no-idle", NULL))
+                       oh->flags |= HWMOD_NO_IDLE;
        }
 
        oh->_state = _HWMOD_STATE_INITIALIZED;
@@ -2630,7 +2637,7 @@ static void __init _setup_postsetup(struct omap_hwmod *oh)
         * XXX HWMOD_INIT_NO_IDLE does not belong in hwmod data -
         * it should be set by the core code as a runtime flag during startup
         */
-       if ((oh->flags & HWMOD_INIT_NO_IDLE) &&
+       if ((oh->flags & (HWMOD_INIT_NO_IDLE | HWMOD_NO_IDLE)) &&
            (postsetup_state == _HWMOD_STATE_IDLE)) {
                oh->_int_flags |= _HWMOD_SKIP_ENABLE;
                postsetup_state = _HWMOD_STATE_ENABLED;
index 76bce11c85a40c477a5b87aab919d5b8dfded5e8..7c7a31169475b72bc8fbbe2f0dabf015b395f257 100644 (file)
@@ -525,6 +525,8 @@ struct omap_hwmod_omap4_prcm {
  *     or idled.
  * HWMOD_OPT_CLKS_NEEDED: The optional clocks are needed for the module to
  *     operate and they need to be handled at the same time as the main_clk.
+ * HWMOD_NO_IDLE: Do not idle the hwmod at all. Useful to handle certain
+ *     IPs like CPSW on DRA7, where clocks to this module cannot be disabled.
  */
 #define HWMOD_SWSUP_SIDLE                      (1 << 0)
 #define HWMOD_SWSUP_MSTANDBY                   (1 << 1)
@@ -541,6 +543,7 @@ struct omap_hwmod_omap4_prcm {
 #define HWMOD_SWSUP_SIDLE_ACT                  (1 << 12)
 #define HWMOD_RECONFIG_IO_CHAIN                        (1 << 13)
 #define HWMOD_OPT_CLKS_NEEDED                  (1 << 14)
+#define HWMOD_NO_IDLE                          (1 << 15)
 
 /*
  * omap_hwmod._int_flags definitions
index 04aff2c31b4607a99265d038d3ba0fd3b9a908b3..70f2f699bed3efe0802def3dc879cdd8e8857a05 100644 (file)
@@ -53,8 +53,8 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg)
                if (ptr->child != NULL)
                        s3c_pm_run_res(ptr->child, fn, arg);
 
-               if ((ptr->flags & IORESOURCE_MEM) &&
-                   strcmp(ptr->name, "System RAM") == 0) {
+               if ((ptr->flags & IORESOURCE_SYSTEM_RAM)
+                               == IORESOURCE_SYSTEM_RAM) {
                        S3C_PMDBG("Found system RAM at %08lx..%08lx\n",
                                  (unsigned long)ptr->start,
                                  (unsigned long)ptr->end);
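
The pm-check.c hunk above replaces the old "System RAM" name-string comparison with a flag test, and the arch setup.c hunks in this merge mark RAM resources with IORESOURCE_SYSRAM / IORESOURCE_SYSTEM_RAM on the producer side. A sketch of both sides of the new convention follows; the helper names are invented, and IORESOURCE_SYSTEM_RAM is assumed to be IORESOURCE_MEM | IORESOURCE_SYSRAM, which is what the flag test above relies on.

/* Sketch of the new flag convention; helper names are invented. */
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/slab.h>
#include <linux/types.h>

static int register_ram_region(resource_size_t start, resource_size_t end)
{
        struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);

        if (!res)
                return -ENOMEM;

        res->name  = "System RAM";
        res->start = start;
        res->end   = end;
        res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

        return request_resource(&iomem_resource, res);
}

/* Consumers test the flag instead of comparing the resource name string. */
static bool is_system_ram(const struct resource *res)
{
        return (res->flags & IORESOURCE_SYSTEM_RAM) == IORESOURCE_SYSTEM_RAM;
}
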
index b2b97e3e7babbbb37072b4185faa27a0e47760c8..a62a7b64f49c52706b8e133b1f786dc9dc34842d 100644 (file)
@@ -23,9 +23,8 @@
 #include <linux/const.h>
 #include <asm/page.h>
 
-       __PAGE_ALIGNED_DATA
-
        .globl vdso_start, vdso_end
+       .section .data..ro_after_init
        .balign PAGE_SIZE
 vdso_start:
        .incbin "arch/arm/vdso/vdso.so"
index 7fc294c3bc5baab31b13b1e23753d17835ce3b27..22dda613f9c91bd3bcfe3a8aad435f7384795401 100644 (file)
@@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages);
 int set_memory_x(unsigned long addr, int numpages);
 int set_memory_nx(unsigned long addr, int numpages);
 
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
 #endif
index f5060867458024a5a61ccfd4a77330c728e13010..819aff5d593f701d1101d4cc42dbec8cedbc80aa 100644 (file)
@@ -40,7 +40,7 @@
  * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
  *     fixed mappings and modules
  */
-#define VMEMMAP_SIZE           ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE)
+#define VMEMMAP_SIZE           ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
 
 #ifndef CONFIG_KASAN
 #define VMALLOC_START          (VA_START)
@@ -52,7 +52,8 @@
 #define VMALLOC_END            (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
 
 #define VMEMMAP_START          (VMALLOC_END + SZ_64K)
-#define vmemmap                        ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
+#define vmemmap                        ((struct page *)VMEMMAP_START - \
+                                SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT))
 
 #define FIRST_USER_ADDRESS     0UL
 
index 8119479147db147c33800f76aa0d07c6072e8559..450987d99b9b9e497411a79f34781a3cb242df6e 100644 (file)
@@ -73,13 +73,13 @@ static struct resource mem_res[] = {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        }
 };
 
@@ -210,7 +210,7 @@ static void __init request_standard_resources(void)
                res->name  = "System RAM";
                res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
                request_resource(&iomem_resource, res);
 
index e33fe33876ab3804f2c6dcd6c5458e576596ef24..fd10eb6638689c1262c34cca7de533453cb383ae 100644 (file)
@@ -145,6 +145,10 @@ ENTRY(cpu_resume_mmu)
 ENDPROC(cpu_resume_mmu)
        .popsection
 cpu_resume_after_mmu:
+#ifdef CONFIG_KASAN
+       mov     x0, sp
+       bl      kasan_unpoison_remaining_stack
+#endif
        mov     x0, #0                  // return zero on success
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
index 82d607c3614ed8454d19f2da5d44a6cc914ee0b3..da30529bb1f65c9e3d5408b2e28ab31bc2283211 100644 (file)
@@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt)
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
-       } else if (ps == (PAGE_SIZE * CONT_PTES)) {
-               hugetlb_add_hstate(CONT_PTE_SHIFT);
-       } else if (ps == (PMD_SIZE * CONT_PMDS)) {
-               hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT);
        } else {
                pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
                return 0;
@@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt)
        return 1;
 }
 __setup("hugepagesz=", setup_hugepagesz);
-
-#ifdef CONFIG_ARM64_64K_PAGES
-static __init int add_default_hugepagesz(void)
-{
-       if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
-               hugetlb_add_hstate(CONT_PMD_SHIFT);
-       return 0;
-}
-arch_initcall(add_default_hugepagesz);
-#endif
index 209ae5ad34958044ae3d6addcafa090f890a8be7..e6928896da2a4a2879828df92d940a2ed321a4ea 100644 (file)
@@ -49,13 +49,13 @@ static struct resource __initdata kernel_data = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_MEM,
+       .flags  = IORESOURCE_SYSTEM_RAM,
 };
 static struct resource __initdata kernel_code = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_MEM,
+       .flags  = IORESOURCE_SYSTEM_RAM,
        .sibling = &kernel_data,
 };
 
@@ -134,7 +134,7 @@ add_physical_memory(resource_size_t start, resource_size_t end)
        new->start = start;
        new->end = end;
        new->name = "System RAM";
-       new->flags = IORESOURCE_MEM;
+       new->flags = IORESOURCE_SYSTEM_RAM;
 
        *pprev = new;
 }
index 72e17f7ebd6ff0fba193ac07190d42b41e106e76..786e36e2f61de91c8f446cc941cff895cdb91578 100644 (file)
@@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr)
         */
        set_ist(_vectors_start);
 
-       lockdep_init();
-
        /*
         * dtb is passed in from bootloader.
         * fdt is linked in blob.
index caae3f4e4341a40d86f4014871c1ebe929a937ae..300dac3702f11a13460d1954843bfcbd1ddc2034 100644 (file)
@@ -1178,7 +1178,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
        efi_memory_desc_t *md;
        u64 efi_desc_size;
        char *name;
-       unsigned long flags;
+       unsigned long flags, desc;
 
        efi_map_start = __va(ia64_boot_param->efi_memmap);
        efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
@@ -1193,6 +1193,8 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                        continue;
 
                flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               desc = IORES_DESC_NONE;
+
                switch (md->type) {
 
                        case EFI_MEMORY_MAPPED_IO:
@@ -1207,14 +1209,17 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                                if (md->attribute & EFI_MEMORY_WP) {
                                        name = "System ROM";
                                        flags |= IORESOURCE_READONLY;
-                               } else if (md->attribute == EFI_MEMORY_UC)
+                               } else if (md->attribute == EFI_MEMORY_UC) {
                                        name = "Uncached RAM";
-                               else
+                               } else {
                                        name = "System RAM";
+                                       flags |= IORESOURCE_SYSRAM;
+                               }
                                break;
 
                        case EFI_ACPI_MEMORY_NVS:
                                name = "ACPI Non-volatile Storage";
+                               desc = IORES_DESC_ACPI_NV_STORAGE;
                                break;
 
                        case EFI_UNUSABLE_MEMORY:
@@ -1224,6 +1229,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 
                        case EFI_PERSISTENT_MEMORY:
                                name = "Persistent Memory";
+                               desc = IORES_DESC_PERSISTENT_MEMORY;
                                break;
 
                        case EFI_RESERVED_TYPE:
@@ -1246,6 +1252,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                res->start = md->phys_addr;
                res->end = md->phys_addr + efi_md_size(md) - 1;
                res->flags = flags;
+               res->desc = desc;
 
                if (insert_resource(&iomem_resource, res) < 0)
                        kfree(res);
index 4f118b0d30915f4f5927719522414a66efe17f08..2029a38a72aeee89793d53ee60418567586d67ff 100644 (file)
@@ -80,17 +80,17 @@ unsigned long vga_console_membase;
 
 static struct resource data_resource = {
        .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 unsigned long ia64_max_cacheline_size;
index a5ecef7188baa39e6f21d7a5a7c5ed6cca8f040f..136c69f1fb8ab8b73c23e2a752ea371c6e484512 100644 (file)
@@ -70,14 +70,14 @@ static struct resource data_resource = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 unsigned long memory_start;
index 89a2a93949274b16b3508b314d0cbdf6db5946e6..f31ebb5dc26c21922f240545d038f0d6b5699300 100644 (file)
@@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram,
        memset(__bss_start, 0, __bss_stop-__bss_start);
        memset(_ssbss, 0, _esbss-_ssbss);
 
-       lockdep_init();
-
 /* initialize device tree for usage in early_printk */
        early_init_devtree(_fdt_start);
 
index 74a3db92da1b52edc15165ac06d3b1558b8b78c0..d3da79dda629114e936af3e811348171c62a9bcd 100644 (file)
@@ -2169,7 +2169,7 @@ config MIPS_MT_SMP
        select CPU_MIPSR2_IRQ_VI
        select CPU_MIPSR2_IRQ_EI
        select SYNC_R4K
-       select MIPS_GIC_IPI
+       select MIPS_GIC_IPI if MIPS_GIC
        select MIPS_MT
        select SMP
        select SMP_UP
@@ -2267,7 +2267,7 @@ config MIPS_VPE_APSP_API_MT
 config MIPS_CMP
        bool "MIPS CMP framework support (DEPRECATED)"
        depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6
-       select MIPS_GIC_IPI
+       select MIPS_GIC_IPI if MIPS_GIC
        select SMP
        select SYNC_R4K
        select SYS_SUPPORTS_SMP
@@ -2287,7 +2287,7 @@ config MIPS_CPS
        select MIPS_CM
        select MIPS_CPC
        select MIPS_CPS_PM if HOTPLUG_CPU
-       select MIPS_GIC_IPI
+       select MIPS_GIC_IPI if MIPS_GIC
        select SMP
        select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
        select SYS_SUPPORTS_HOTPLUG_CPU
@@ -2306,6 +2306,7 @@ config MIPS_CPS_PM
        bool
 
 config MIPS_GIC_IPI
+       depends on MIPS_GIC
        bool
 
 config MIPS_CM
index 408799a839b42f2d9d7d1232bd7ea4efda0cf2d4..f7521142deda55fb439fa0857a2383b4dad7cc83 100644 (file)
@@ -17,7 +17,7 @@
 #define PORT(offset) (CKSEG1ADDR(AR7_REGS_UART0) + (4 * offset))
 #endif
 
-#ifdef CONFIG_MACH_JZ4740
+#if defined(CONFIG_MACH_JZ4740) || defined(CONFIG_MACH_JZ4780)
 #include <asm/mach-jz4740/base.h>
 #define PORT(offset) (CKSEG1ADDR(JZ4740_UART0_BASE_ADDR) + (4 * offset))
 #endif
index 5fdaf8bdcd2ebeed8e31eb7825edd958bfc1c251..4f607341a7938867efc6f0f0646c218bf168421b 100644 (file)
@@ -732,21 +732,23 @@ static void __init resource_init(void)
                        end = HIGHMEM_START - 1;
 
                res = alloc_bootmem(sizeof(struct resource));
+
+               res->start = start;
+               res->end = end;
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
                switch (boot_mem_map.map[i].type) {
                case BOOT_MEM_RAM:
                case BOOT_MEM_INIT_RAM:
                case BOOT_MEM_ROM_DATA:
                        res->name = "System RAM";
+                       res->flags |= IORESOURCE_SYSRAM;
                        break;
                case BOOT_MEM_RESERVED:
                default:
                        res->name = "reserved";
                }
 
-               res->start = start;
-               res->end = end;
-
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
 
                /*
index bd4385a8e6e86f7fbb9ca6d988f5eee155b9a8c7..2b521e07b8601a2c80e888b3064009779effa78b 100644 (file)
@@ -121,6 +121,7 @@ static inline void calculate_cpu_foreign_map(void)
        cpumask_t temp_foreign_map;
 
        /* Re-calculate the mask */
+       cpumask_clear(&temp_foreign_map);
        for_each_online_cpu(i) {
                core_present = 0;
                for_each_cpu(k, &temp_foreign_map)
index 3110447ab1e9bffc37fa8572f5ce62d44d4a5fd9..70ef1a43c1148343101f2603a019fcefc1f90bce 100644 (file)
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
        dvcpu->arch.wait = 0;
 
-       if (waitqueue_active(&dvcpu->wq))
-               wake_up_interruptible(&dvcpu->wq);
+       if (swait_active(&dvcpu->wq))
+               swake_up(&dvcpu->wq);
 
        return 0;
 }
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
        kvm_mips_callbacks->queue_timer_int(vcpu);
 
        vcpu->arch.wait = 0;
-       if (waitqueue_active(&vcpu->wq))
-               wake_up_interruptible(&vcpu->wq);
+       if (swait_active(&vcpu->wq))
+               swake_up(&vcpu->wq);
 }
 
 /* low level hrtimer wake routine */
index 3d0e17bcc8e905ece06053ae15b3ff5443e6b033..df0f52bd18b457f9efa1a0b9fc470b0803f50632 100644 (file)
@@ -22,6 +22,9 @@
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
+/* Read-only memory is marked before mark_rodata_ro() is called. */
+#define __ro_after_init        __read_mostly
+
 void parisc_cache_init(void);  /* initializes cache-flushing */
 void disable_sr_hashing_asm(int); /* low level support for above */
 void disable_sr_hashing(void);   /* turns off space register hashing */
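
__ro_after_init, which also shows up later in this merge (e.g. in the vdso image generator), marks data that is written during boot and is meant to become read-only once mark_rodata_ro() has run; parisc simply maps it to __read_mostly here. A hedged usage sketch, with the made-up names example_limit and example_setup() and the standard early_initcall() hook:

    #include <linux/cache.h>
    #include <linux/init.h>

    /* Written once during early boot, treated as read-only afterwards. */
    static unsigned long example_limit __ro_after_init;

    static int __init example_setup(void)
    {
            example_limit = 128;    /* final value, fixed before mark_rodata_ro() */
            return 0;
    }
    early_initcall(example_setup);
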
index 845272ce9cc587222e835131bb5ed96be0be03d4..7bd69bd43a018577d099f373346383900b1f0121 100644 (file)
@@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
        }
 }
 
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
 #include <asm/kmap_types.h>
 
 #define ARCH_HAS_KMAP
index 1b366c47768722081efe57a9db39eaba2a7dcde9..3c07d6b968772bc6de99bada8d83578ffe172c3d 100644 (file)
@@ -55,12 +55,12 @@ signed char pfnnid_map[PFNNID_MAP_MAX] __read_mostly;
 
 static struct resource data_resource = {
        .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource pdcdata_resource = {
@@ -201,7 +201,7 @@ static void __init setup_bootmem(void)
                res->name = "System RAM";
                res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT;
                res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
        }
 
index 9d08d8cbed1a1ec0e5893679c5e1fd9cb69dce09..c98afa538b3aeca91901e858c02884e820ca3dfa 100644 (file)
@@ -289,7 +289,7 @@ struct kvmppc_vcore {
        struct list_head runnable_threads;
        struct list_head preempt_list;
        spinlock_t lock;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
@@ -629,7 +629,7 @@ struct kvm_vcpu_arch {
        u8 prodded;
        u32 last_inst;
 
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index ad8c9db61237223c3b807e3a9afed1487c44af1e..d544fa31175766bf48790f75bcd5458d6a505080 100644 (file)
@@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
 
 notrace void __init machine_init(u64 dt_ptr)
 {
-       lockdep_init();
-
        /* Enable early debugging if any specified (see udbg.h) */
        udbg_early_init();
 
index 5c03a6a9b0542fac3d042f2481f367aae9178d38..f98be8383a39994868ba41d0f8492e99f48eb1fe 100644 (file)
@@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr)
        setup_paca(&boot_paca);
        fixup_boot_paca();
 
-       /* Initialize lockdep early or else spinlocks will blow */
-       lockdep_init();
-
        /* -------- printk is now safe to use ------- */
 
        /* Enable early debugging if any specified (see udbg.h) */
index baeddb06811d738a6ea8be4ce920e5ea39a9645a..f1187bb6dd4d7f5960e57aea111bd1c12021408d 100644 (file)
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
        int cpu;
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
-       if (waitqueue_active(wqp)) {
-               wake_up_interruptible(wqp);
+       if (swait_active(wqp)) {
+               swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
 
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
-                       if (waitqueue_active(&vcpu->wq)) {
-                               wake_up_interruptible(&vcpu->wq);
+                       if (swait_active(&vcpu->wq)) {
+                               swake_up(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
@@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
-       init_waitqueue_head(&vcore->wq);
+       init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
@@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
        struct kvm_vcpu *vcpu;
        int do_sleep = 1;
+       DECLARE_SWAITQUEUE(wait);
 
-       DEFINE_WAIT(wait);
-
-       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 
        /*
         * Check one last time for pending exceptions and ceded state after
@@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        }
 
        if (!do_sleep) {
-               finish_wait(&vc->wq, &wait);
+               finish_swait(&vc->wq, &wait);
                return;
        }
 
@@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
-       finish_wait(&vc->wq, &wait);
+       finish_swait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
@@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
-                       wake_up(&vc->wq);
+                       swake_up(&vc->wq);
                }
 
        }
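
The wait-queue changes in this file, and in the MIPS and s390 KVM hunks earlier, all follow one conversion: wait_queue_head_t becomes struct swait_queue_head and the helpers move to the simple-waitqueue API. A self-contained sketch of that pattern, using only the calls visible in the hunks above and hypothetical example_* names:

    #include <linux/sched.h>
    #include <linux/swait.h>

    static struct swait_queue_head example_wq;
    static bool example_cond;

    static void example_init(void)
    {
            init_swait_queue_head(&example_wq);
    }

    static void example_wait(void)
    {
            DECLARE_SWAITQUEUE(wait);

            prepare_to_swait(&example_wq, &wait, TASK_INTERRUPTIBLE);
            if (!example_cond)
                    schedule();
            finish_swait(&example_wq, &wait);
    }

    static void example_wake(void)
    {
            example_cond = true;
            if (swait_active(&example_wq))
                    swake_up(&example_wq);
    }
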
index 6ee26de9a1ded02e0e08f376fd612049eedee0b4..25ae2c9913c39c2fae34fb60dd7ee655ba821b6e 100644 (file)
@@ -1370,6 +1370,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        std     r6, VCPU_ACOP(r9)
        stw     r7, VCPU_GUEST_PID(r9)
        std     r8, VCPU_WORT(r9)
+       /*
+        * Restore various registers to 0, where non-zero values
+        * set by the guest could disrupt the host.
+        */
+       li      r0, 0
+       mtspr   SPRN_IAMR, r0
+       mtspr   SPRN_CIABR, r0
+       mtspr   SPRN_DAWRX, r0
+       mtspr   SPRN_TCSCR, r0
+       mtspr   SPRN_WORT, r0
+       /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
+       li      r0, 1
+       sldi    r0, r0, 31
+       mtspr   SPRN_MMCRS, r0
 8:
 
        /* Save and reset AMR and UAMOR before turning on the MMU */
index d0f0a514b04ed66da35d1ae6b27b964285ef2aec..f078a1f94fc267de0e4de530cd413608c2d93e5f 100644 (file)
@@ -541,7 +541,7 @@ static int __init add_system_ram_resources(void)
                        res->name = "System RAM";
                        res->start = base;
                        res->end = base + size - 1;
-                       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+                       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                        WARN_ON(request_resource(&iomem_resource, res) < 0);
                }
        }
index 8959ebb6d2c9eb35538b835a39522585321f9af4..b0c8ad0799c7f0c09607420441ea87735c88c6d9 100644 (file)
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
 struct kvm_s390_local_interrupt {
        spinlock_t lock;
        struct kvm_s390_float_interrupt *float_int;
-       wait_queue_head_t *wq;
+       struct swait_queue_head *wq;
        atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
index fb1b93ea3e3fead0efde9f30e819c55eecb88da9..e485817f7b1a97683e64ab04d1f7444e1aae2e1d 100644 (file)
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       spin_lock_init(&mm->context.list_lock);
+       INIT_LIST_HEAD(&mm->context.pgtable_list);
+       INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
        atomic_set(&mm->context.attach_count, 0);
        mm->context.flush_mm = 0;
-       mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
-       mm->context.asce_bits |= _ASCE_TYPE_REGION3;
 #ifdef CONFIG_PGSTE
        mm->context.alloc_pgste = page_table_allocate_pgste;
        mm->context.has_pgste = 0;
        mm->context.use_skey = 0;
 #endif
-       mm->context.asce_limit = STACK_TOP_MAX;
+       if (mm->context.asce_limit == 0) {
+               /* context created by exec, set asce limit to 4TB */
+               mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+                       _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+               mm->context.asce_limit = STACK_TOP_MAX;
+       } else if (mm->context.asce_limit == (1UL << 31)) {
+               mm_inc_nr_pmds(mm);
+       }
        crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
        return 0;
 }
@@ -111,8 +119,6 @@ static inline void activate_mm(struct mm_struct *prev,
 static inline void arch_dup_mmap(struct mm_struct *oldmm,
                                 struct mm_struct *mm)
 {
-       if (oldmm->context.asce_limit < mm->context.asce_limit)
-               crst_table_downgrade(mm, oldmm->context.asce_limit);
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
index 7b7858f158b4574b549ab99648f977a3e249068c..d7cc79fb6191117a0ee0646ae6bbdf3a92a58132 100644 (file)
@@ -100,12 +100,26 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-       spin_lock_init(&mm->context.list_lock);
-       INIT_LIST_HEAD(&mm->context.pgtable_list);
-       INIT_LIST_HEAD(&mm->context.gmap_list);
-       return (pgd_t *) crst_table_alloc(mm);
+       unsigned long *table = crst_table_alloc(mm);
+
+       if (!table)
+               return NULL;
+       if (mm->context.asce_limit == (1UL << 31)) {
+               /* Forking a compat process with 2 page table levels */
+               if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+                       crst_table_free(mm, table);
+                       return NULL;
+               }
+       }
+       return (pgd_t *) table;
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       if (mm->context.asce_limit == (1UL << 31))
+               pgtable_pmd_page_dtor(virt_to_page(pgd));
+       crst_table_free(mm, (unsigned long *) pgd);
 }
-#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd)
 
 static inline void pmd_populate(struct mm_struct *mm,
                                pmd_t *pmd, pgtable_t pte)
index c55576bbaa1f7555479994d6848c9aa5b60f189b..a0684de5a93b99ae199f9bf6906327f4513f662e 100644 (file)
@@ -448,7 +448,6 @@ void __init startup_init(void)
        rescue_initrd();
        clear_bss_section();
        init_kernel_storage_key();
-       lockdep_init();
        lockdep_off();
        setup_lowcore_early();
        setup_facility_list();
index c5febe84eba633385cd7a8c09bee87e484c2e05c..03c2b469c4725172e4c6750cb8bcb0fb4edea075 100644 (file)
@@ -16,7 +16,7 @@
 
 __HEAD
 ENTRY(startup_continue)
-       tm      __LC_STFLE_FAC_LIST+6,0x80      # LPP available ?
+       tm      __LC_STFLE_FAC_LIST+5,0x80      # LPP available ?
        jz      0f
        xc      __LC_LPP+1(7,0),__LC_LPP+1      # clear lpp and current_pid
        mvi     __LC_LPP,0x80                   #   and set LPP_MAGIC
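
The one-byte fix above is facility-bit arithmetic: STFLE stores facility bits most-significant-bit first, so a facility tested with mask 0x80 at byte offset 5 corresponds to facility bit 40 (assumed here to be LPP, which is consistent with the "LPP available ?" comment). A tiny standalone illustration of that calculation:

    #include <stdio.h>

    int main(void)
    {
            unsigned int facility = 40;     /* assumed LPP facility bit */

            printf("byte offset %u, mask 0x%02x\n",
                   facility / 8, 0x80u >> (facility % 8));   /* -> byte 5, mask 0x80 */
            return 0;
    }
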
index 9220db5c996aa5250ca4d904361c51cec33d7827..cedb0198675f7577c7237bb61c006acaf0e984d4 100644 (file)
@@ -374,17 +374,17 @@ static void __init setup_lowcore(void)
 
 static struct resource code_resource = {
        .name  = "Kernel code",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource data_resource = {
        .name = "Kernel data",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource bss_resource = {
        .name = "Kernel bss",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource __initdata *standard_resources[] = {
@@ -408,7 +408,7 @@ static void __init setup_resources(void)
 
        for_each_memblock(memory, reg) {
                res = alloc_bootmem_low(sizeof(*res));
-               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
 
                res->name = "System RAM";
                res->start = reg->base;
index f88ca72c3a77a52e05e65cfa79206cd8e41aa786..9ffc7322179213f031939fa184bc6c93545af559 100644 (file)
@@ -966,13 +966,13 @@ no_timer:
 
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 {
-       if (waitqueue_active(&vcpu->wq)) {
+       if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
                 * yield-candidate.
                 */
                vcpu->preempted = true;
-               wake_up_interruptible(&vcpu->wq);
+               swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
 }
index 4af21c771f9b3925bf860d0dc86cebcc803cf965..03dfe9c667f4eb944705787e54ff7e6ac3c08afb 100644 (file)
@@ -2381,7 +2381,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 
        /* manually convert vector registers if necessary */
        if (MACHINE_HAS_VX) {
-               convert_vx_to_fp(fprs, current->thread.fpu.vxrs);
+               convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
                rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
                                     fprs, 128);
        } else {
index b48459afefddb835582e548f5b8d7b4ffd026e0d..f3a0649ab5217ac5f87b83580723dfda40fff3de 100644 (file)
@@ -101,7 +101,7 @@ static void __init resource_init(void)
        res->name = "System RAM";
        res->start = MEMORY_START;
        res->end = MEMORY_START + MEMORY_SIZE - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        request_resource(&iomem_resource, res);
 
        request_resource(res, &code_resource);
index de19cfa768f208708406e3350b5e9aecc92a6266..3f1c18b28e8af5fc414c13ebeb29ccbf04cd63a5 100644 (file)
@@ -78,17 +78,17 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0, };
 
 static struct resource code_resource = {
        .name = "Kernel code",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource data_resource = {
        .name = "Kernel data",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 unsigned long memory_start;
@@ -202,7 +202,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
        res->name = "System RAM";
        res->start = start;
        res->end = end - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
        if (request_resource(&iomem_resource, res)) {
                pr_err("unable to request memory_resource 0x%lx 0x%lx\n",
index f2d30cab5b3f388fa9b446cf5b12afda92fa0a9d..cd1f592cd3479f8c94c599bfc589087184d4d180 100644 (file)
@@ -696,14 +696,6 @@ tlb_fixup_done:
        call    __bzero
         sub    %o1, %o0, %o1
 
-#ifdef CONFIG_LOCKDEP
-       /* We have this call this super early, as even prom_init can grab
-        * spinlocks and thus call into the lockdep code.
-        */
-       call    lockdep_init
-        nop
-#endif
-
        call    prom_init
         mov    %l7, %o0                        ! OpenPROM cif handler
 
index 6f216853f2724f8395bdd36ab18c4eadbeae0366..1cfe6aab7a11572d54f6848fe4656b7b40af8cf2 100644 (file)
@@ -2863,17 +2863,17 @@ void hugetlb_setup(struct pt_regs *regs)
 
 static struct resource code_resource = {
        .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource data_resource = {
        .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static inline resource_size_t compute_kern_paddr(void *addr)
@@ -2909,7 +2909,7 @@ static int __init report_memory(void)
                res->name = "System RAM";
                res->start = pavail[i].phys_addr;
                res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
-               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
 
                if (insert_resource(&iomem_resource, res) < 0) {
                        pr_warn("Resource insertion failed.\n");
index bbb855de6569841200c991bf610b090c4e94c47b..a992238e9b58260fdfa067c4cc2c0d348ceda41f 100644 (file)
@@ -1632,14 +1632,14 @@ static struct resource data_resource = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 /*
@@ -1673,10 +1673,15 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
                kzalloc(sizeof(struct resource), GFP_ATOMIC);
        if (!res)
                return NULL;
-       res->name = reserved ? "Reserved" : "System RAM";
        res->start = start_pfn << PAGE_SHIFT;
        res->end = (end_pfn << PAGE_SHIFT) - 1;
        res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       if (reserved) {
+               res->name = "Reserved";
+       } else {
+               res->name = "System RAM";
+               res->flags |= IORESOURCE_SYSRAM;
+       }
        if (insert_resource(&iomem_resource, res)) {
                kfree(res);
                return NULL;
index 3fa317f96122130d30cfc7be41d4e1450fc56a02..c2bffa5614a48102a1d66a60865a2e3474a49597 100644 (file)
@@ -72,13 +72,13 @@ static struct resource mem_res[] = {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        }
 };
 
@@ -211,7 +211,7 @@ request_standard_resources(struct meminfo *mi)
                res->name  = "System RAM";
                res->start = mi->bank[i].start;
                res->end   = mi->bank[i].start + mi->bank[i].size - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
                request_resource(&iomem_resource, res);
 
index 1538562cc720e78132d0eb31c38116b029d56ce0..eb3abf8ac44eb33f333ed29727dfd49ba6bfbf38 100644 (file)
@@ -1,6 +1,7 @@
-
 obj-y += entry/
 
+obj-$(CONFIG_PERF_EVENTS) += events/
+
 obj-$(CONFIG_KVM) += kvm/
 
 # Xen paravirtualization support
index c46662f64c392050d1c45b722e3de7c01f87e250..b1051057e5b014b6a2b11b5c2cdbef866633e0c0 100644 (file)
@@ -303,6 +303,9 @@ config ARCH_SUPPORTS_UPROBES
 config FIX_EARLYCON_MEM
        def_bool y
 
+config DEBUG_RODATA
+       def_bool y
+
 config PGTABLE_LEVELS
        int
        default 4 if X86_64
index 68a2d1f0a6832bdbfafd9ed4122f370252f5e4d1..67eec55093a5dc97634aea899c4be86ff9f18f00 100644 (file)
@@ -74,28 +74,16 @@ config EFI_PGT_DUMP
          issues with the mapping of the EFI runtime regions into that
          table.
 
-config DEBUG_RODATA
-       bool "Write protect kernel read-only data structures"
-       default y
-       depends on DEBUG_KERNEL
-       ---help---
-         Mark the kernel read-only data as write-protected in the pagetables,
-         in order to catch accidental (and incorrect) writes to such const
-         data. This is recommended so that we can catch kernel bugs sooner.
-         If in doubt, say "Y".
-
 config DEBUG_RODATA_TEST
-       bool "Testcase for the DEBUG_RODATA feature"
-       depends on DEBUG_RODATA
+       bool "Testcase for marking rodata read-only"
        default y
        ---help---
-         This option enables a testcase for the DEBUG_RODATA
-         feature as well as for the change_page_attr() infrastructure.
+         This option enables a testcase for setting rodata read-only
+         as well as for the change_page_attr() infrastructure.
          If in doubt, say "N"
 
 config DEBUG_WX
        bool "Warn on W+X mappings at boot"
-       depends on DEBUG_RODATA
        select X86_PTDUMP_CORE
        ---help---
          Generate a warning if any W+X mappings are found at boot.
index abe961c7c71c7464e9707afa01bfe14efc89ba5f..63a03bb91497e63d56e1ba329f9da7b80e751e1d 100644 (file)
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
        fprintf(outfile, "#include <asm/vdso.h>\n");
        fprintf(outfile, "\n");
        fprintf(outfile,
-               "static unsigned char raw_data[%lu] __page_aligned_data = {",
+               "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
                mapping_size);
        for (j = 0; j < stripped_len; j++) {
                if (j % 10 == 0)
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
new file mode 100644 (file)
index 0000000..fdfea15
--- /dev/null
@@ -0,0 +1,13 @@
+obj-y                  += core.o
+
+obj-$(CONFIG_CPU_SUP_AMD)               += amd/core.o amd/uncore.o
+obj-$(CONFIG_X86_LOCAL_APIC)            += amd/ibs.o msr.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD)               += amd/iommu.o
+endif
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/core.o intel/bts.o intel/cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/cstate.o intel/ds.o intel/knc.o 
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/rapl.o msr.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
new file mode 100644 (file)
index 0000000..049ada8
--- /dev/null
@@ -0,0 +1,731 @@
+#include <linux/perf_event.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/apicdef.h>
+
+#include "../perf_event.h"
+
+static __initconst const u64 amd_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
+               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+               [ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]                   = 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]                 = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]             = 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]                 = 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]          = 0x00c2,
+  [PERF_COUNT_HW_BRANCH_MISSES]                        = 0x00c3,
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]      = 0x00d0, /* "Decoder empty" event */
+  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]       = 0x00d1, /* "Dispatch stalls" event */
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+       return amd_perfmon_event_map[hw_event];
+}
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+       int offset;
+
+       if (!index)
+               return index;
+
+       if (eventsel)
+               offset = event_offsets[index];
+       else
+               offset = count_offsets[index];
+
+       if (offset)
+               return offset;
+
+       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               offset = index;
+       else
+               offset = index << 1;
+
+       if (eventsel)
+               event_offsets[index] = offset;
+       else
+               count_offsets[index] = offset;
+
+       return offset;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
+       if (event->attr.exclude_host && event->attr.exclude_guest)
+               /*
+                * When HO == GO == 1 the hardware treats that as GO == HO == 0
+                * and will count in both modes. We don't want to count in that
+                * case so we emulate no-counting by setting US = OS = 0.
+                */
+               event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+                                     ARCH_PERFMON_EVENTSEL_OS);
+       else if (event->attr.exclude_host)
+               event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
+       else if (event->attr.exclude_guest)
+               event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
+
+       return 0;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+       return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+       return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+       struct amd_nb *nb = cpuc->amd_nb;
+
+       return nb && nb->nb_id != -1;
+}
+
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+       int ret;
+
+       /* pass precise event sampling to ibs: */
+       if (event->attr.precise_ip && get_ibs_caps())
+               return -ENOENT;
+
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
+       ret = x86_pmu_hw_config(event);
+       if (ret)
+               return ret;
+
+       if (event->attr.type == PERF_TYPE_RAW)
+               event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+       return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+                                          struct perf_event *event)
+{
+       struct amd_nb *nb = cpuc->amd_nb;
+       int i;
+
+       /*
+        * need to scan whole list because event may not have
+        * been assigned during scheduling
+        *
+        * no race condition possible because event can only
+        * be removed on one CPU at a time AND PMU is disabled
+        * when we come here
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (cmpxchg(nb->owners + i, event, NULL) == event)
+                       break;
+       }
+}
+
+ /*
+  * AMD64 NorthBridge events need special treatment because
+  * counter access needs to be synchronized across all cores
+  * of a package. Refer to BKDG section 3.12
+  *
+  * NB events are events measuring L3 cache, Hypertransport
+  * traffic. They are identified by an event code >= 0xe00.
+  * They measure events on the NorthBridge which is shared
+  * by all cores on a package. NB events are counted on a
+  * shared set of counters. When a NB event is programmed
+  * in a counter, the data actually comes from a shared
+  * counter. Thus, access to those counters needs to be
+  * synchronized.
+  *
+  * We implement the synchronization such that no two cores
+  * can be measuring NB events using the same counters. Thus,
+  * we maintain a per-NB allocation table. The available slot
+  * is propagated using the event_constraint structure.
+  *
+  * We provide only one choice for each NB event based on
+  * the fact that only NB events have restrictions. Consequently,
+  * if a counter is available, there is a guarantee the NB event
+  * will be assigned to it. If no slot is available, an empty
+  * constraint is returned and scheduling will eventually fail
+  * for this event.
+  *
+  * Note that all cores attached to the same NB compete for the same
+  * counters to host NB events, this is why we use atomic ops. Some
+  * multi-chip CPUs may have more than one NB.
+  *
+  * Given that resources are allocated (cmpxchg), they must be
+  * eventually freed for others to use. This is accomplished by
+  * calling __amd_put_nb_event_constraints()
+  *
+  * Non NB events are not impacted by this restriction.
+  */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                              struct event_constraint *c)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct amd_nb *nb = cpuc->amd_nb;
+       struct perf_event *old;
+       int idx, new = -1;
+
+       if (!c)
+               c = &unconstrained;
+
+       if (cpuc->is_fake)
+               return c;
+
+       /*
+        * detect if already present, if so reuse
+        *
+        * cannot merge with actual allocation
+        * because of possible holes
+        *
+        * event can already be present yet not assigned (in hwc->idx)
+        * because of successive calls to x86_schedule_events() from
+        * hw_perf_group_sched_in() without hw_perf_enable()
+        */
+       for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+               if (new == -1 || hwc->idx == idx)
+                       /* assign free slot, prefer hwc->idx */
+                       old = cmpxchg(nb->owners + idx, NULL, event);
+               else if (nb->owners[idx] == event)
+                       /* event already present */
+                       old = event;
+               else
+                       continue;
+
+               if (old && old != event)
+                       continue;
+
+               /* reassign to this slot */
+               if (new != -1)
+                       cmpxchg(nb->owners + new, event, NULL);
+               new = idx;
+
+               /* already present, reuse */
+               if (old == event)
+                       break;
+       }
+
+       if (new == -1)
+               return &emptyconstraint;
+
+       return &nb->event_constraints[new];
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+       struct amd_nb *nb;
+       int i;
+
+       nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+       if (!nb)
+               return NULL;
+
+       nb->nb_id = -1;
+
+       /*
+        * initialize all possible NB constraints
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               __set_bit(i, nb->event_constraints[i].idxmsk);
+               nb->event_constraints[i].weight = 1;
+       }
+       return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       WARN_ON_ONCE(cpuc->amd_nb);
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return NOTIFY_OK;
+
+       cpuc->amd_nb = amd_alloc_nb(cpu);
+       if (!cpuc->amd_nb)
+               return NOTIFY_BAD;
+
+       return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+       struct amd_nb *nb;
+       int i, nb_id;
+
+       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return;
+
+       nb_id = amd_get_nb_id(cpu);
+       WARN_ON_ONCE(nb_id == BAD_APICID);
+
+       for_each_online_cpu(i) {
+               nb = per_cpu(cpu_hw_events, i).amd_nb;
+               if (WARN_ON_ONCE(!nb))
+                       continue;
+
+               if (nb->nb_id == nb_id) {
+                       *onln = cpuc->amd_nb;
+                       cpuc->amd_nb = nb;
+                       break;
+               }
+       }
+
+       cpuc->amd_nb->nb_id = nb_id;
+       cpuc->amd_nb->refcnt++;
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+       struct cpu_hw_events *cpuhw;
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return;
+
+       cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+       if (cpuhw->amd_nb) {
+               struct amd_nb *nb = cpuhw->amd_nb;
+
+               if (nb->nb_id == -1 || --nb->refcnt == 0)
+                       kfree(nb);
+
+               cpuhw->amd_nb = NULL;
+       }
+}
+
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       /*
+        * if not NB event or no NB, then no constraints
+        */
+       if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+               return &unconstrained;
+
+       return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+                                     struct perf_event *event)
+{
+       if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+               __amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *amd_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK    0x000000F0ULL
+
+#define AMD_EVENT_FP           0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS           0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC           0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU           0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE                0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS                0x000000C0ULL
+#define AMD_EVENT_DE           0x000000D0ULL
+#define AMD_EVENT_NB           0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000       FP      PERF_CTL[5:3]
+ * 0x010       FP      PERF_CTL[5:3]
+ * 0x020       LS      PERF_CTL[5:0]
+ * 0x030       LS      PERF_CTL[5:0]
+ * 0x040       DC      PERF_CTL[5:0]
+ * 0x050       DC      PERF_CTL[5:0]
+ * 0x060       CU      PERF_CTL[2:0]
+ * 0x070       CU      PERF_CTL[2:0]
+ * 0x080       IC/DE   PERF_CTL[2:0]
+ * 0x090       IC/DE   PERF_CTL[2:0]
+ * 0x0A0       ---
+ * 0x0B0       ---
+ * 0x0C0       EX/LS   PERF_CTL[5:0]
+ * 0x0D0       DE      PERF_CTL[2:0]
+ * 0x0E0       NB      NB_PERF_CTL[3:0]
+ * 0x0F0       NB      NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x000       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x003       FP      PERF_CTL[3]
+ * 0x004       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x00B       FP      PERF_CTL[3]
+ * 0x00D       FP      PERF_CTL[3]
+ * 0x023       DE      PERF_CTL[2:0]
+ * 0x02D       LS      PERF_CTL[3]
+ * 0x02E       LS      PERF_CTL[3,0]
+ * 0x031       LS      PERF_CTL[2:0] (**)
+ * 0x043       CU      PERF_CTL[2:0]
+ * 0x045       CU      PERF_CTL[2:0]
+ * 0x046       CU      PERF_CTL[2:0]
+ * 0x054       CU      PERF_CTL[2:0]
+ * 0x055       CU      PERF_CTL[2:0]
+ * 0x08F       IC      PERF_CTL[0]
+ * 0x187       DE      PERF_CTL[0]
+ * 0x188       DE      PERF_CTL[0]
+ * 0x0DB       EX      PERF_CTL[5:0]
+ * 0x0DC       LS      PERF_CTL[5:0]
+ * 0x0DD       LS      PERF_CTL[5:0]
+ * 0x0DE       LS      PERF_CTL[5:0]
+ * 0x0DF       LS      PERF_CTL[5:0]
+ * 0x1C0       EX      PERF_CTL[5:3]
+ * 0x1D6       EX      PERF_CTL[5:0]
+ * 0x1D8       EX      PERF_CTL[5:0]
+ *
+ * (*)  depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
+ */
+
+static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+                              struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned int event_code = amd_get_event_code(hwc);
+
+       switch (event_code & AMD_EVENT_TYPE_MASK) {
+       case AMD_EVENT_FP:
+               switch (event_code) {
+               case 0x000:
+                       if (!(hwc->config & 0x0000F000ULL))
+                               break;
+                       if (!(hwc->config & 0x00000F00ULL))
+                               break;
+                       return &amd_f15_PMC3;
+               case 0x004:
+                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+                               break;
+                       return &amd_f15_PMC3;
+               case 0x003:
+               case 0x00B:
+               case 0x00D:
+                       return &amd_f15_PMC3;
+               }
+               return &amd_f15_PMC53;
+       case AMD_EVENT_LS:
+       case AMD_EVENT_DC:
+       case AMD_EVENT_EX_LS:
+               switch (event_code) {
+               case 0x023:
+               case 0x043:
+               case 0x045:
+               case 0x046:
+               case 0x054:
+               case 0x055:
+                       return &amd_f15_PMC20;
+               case 0x02D:
+                       return &amd_f15_PMC3;
+               case 0x02E:
+                       return &amd_f15_PMC30;
+               case 0x031:
+                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+                               return &amd_f15_PMC20;
+                       return &emptyconstraint;
+               case 0x1C0:
+                       return &amd_f15_PMC53;
+               default:
+                       return &amd_f15_PMC50;
+               }
+       case AMD_EVENT_CU:
+       case AMD_EVENT_IC_DE:
+       case AMD_EVENT_DE:
+               switch (event_code) {
+               case 0x08F:
+               case 0x187:
+               case 0x188:
+                       return &amd_f15_PMC0;
+               case 0x0DB ... 0x0DF:
+               case 0x1D6:
+               case 0x1D8:
+                       return &amd_f15_PMC50;
+               default:
+                       return &amd_f15_PMC20;
+               }
+       case AMD_EVENT_NB:
+               /* moved to perf_event_amd_uncore.c */
+               return &emptyconstraint;
+       default:
+               return &emptyconstraint;
+       }
+}
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+                   (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+       return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+       .name                   = "AMD",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = x86_pmu_disable_all,
+       .enable_all             = x86_pmu_enable_all,
+       .enable                 = x86_pmu_enable_event,
+       .disable                = x86_pmu_disable_event,
+       .hw_config              = amd_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_K7_EVNTSEL0,
+       .perfctr                = MSR_K7_PERFCTR0,
+       .addr_offset            = amd_pmu_addr_offset,
+       .event_map              = amd_pmu_event_map,
+       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
+       .num_counters           = AMD64_NUM_COUNTERS,
+       .cntval_bits            = 48,
+       .cntval_mask            = (1ULL << 48) - 1,
+       .apic                   = 1,
+       /* use highest bit to detect overflow */
+       .max_period             = (1ULL << 47) - 1,
+       .get_event_constraints  = amd_get_event_constraints,
+       .put_event_constraints  = amd_put_event_constraints,
+
+       .format_attrs           = amd_format_attr,
+       .events_sysfs_show      = amd_event_sysfs_show,
+
+       .cpu_prepare            = amd_pmu_cpu_prepare,
+       .cpu_starting           = amd_pmu_cpu_starting,
+       .cpu_dead               = amd_pmu_cpu_dead,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               return 0;
+
+       switch (boot_cpu_data.x86) {
+       case 0x15:
+               pr_cont("Fam15h ");
+               x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+               break;
+
+       default:
+               pr_err("core perfctr but no constraints; unknown hardware!\n");
+               return -ENODEV;
+       }
+
+       /*
+        * If core performance counter extensions exists, we must use
+        * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+        * amd_pmu_addr_offset().
+        */
+       x86_pmu.eventsel        = MSR_F15H_PERF_CTL;
+       x86_pmu.perfctr         = MSR_F15H_PERF_CTR;
+       x86_pmu.num_counters    = AMD64_NUM_COUNTERS_CORE;
+
+       pr_cont("core perfctr, ");
+       return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+       int ret;
+
+       /* Performance-monitoring supported from K7 and later: */
+       if (boot_cpu_data.x86 < 6)
+               return -ENODEV;
+
+       x86_pmu = amd_pmu;
+
+       ret = amd_core_pmu_init();
+       if (ret)
+               return ret;
+
+       /* Events are common for all AMDs */
+       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+              sizeof(hw_cache_event_ids));
+
+       return 0;
+}
+
+void amd_pmu_enable_virt(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       cpuc->perf_ctr_virt_mask = 0;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * We only mask out the Host-only bit so that host-only counting works
+        * when SVM is disabled. If someone sets up a guest-only counter when
+        * SVM is disabled the Guest-only bits still gets set and the counter
+        * will not count anything.
+        */
+       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
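
The counter layout documented in amd_pmu_addr_offset() above (four legacy counters starting at MSR 0xc0010000 with stride 1, six core-extension counters starting at MSR 0xc0010200 with stride 2, matching the MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR switch in amd_core_pmu_init()) can be sanity-checked with a small standalone program; this is an illustration only, not kernel code:

    #include <stdio.h>

    #define MSR_K7_EVNTSEL0    0xc0010000u   /* legacy event-select base */
    #define MSR_F15H_PERF_CTL  0xc0010200u   /* core-extension event-select base */

    int main(void)
    {
            for (int i = 0; i < 4; i++)      /* legacy: each offset by 1 */
                    printf("legacy   EVNTSEL%d  = %#x\n", i, MSR_K7_EVNTSEL0 + i);
            for (int i = 0; i < 6; i++)      /* extensions: each offset by 2 */
                    printf("extended PERF_CTL%d = %#x\n", i, MSR_F15H_PERF_CTL + 2u * i);
            return 0;
    }
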
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
new file mode 100644 (file)
index 0000000..51087c2
--- /dev/null
@@ -0,0 +1,959 @@
+/*
+ * Performance events - AMD IBS
+ *
+ *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK  (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK     IBS_OP_MAX_CNT
+
+enum ibs_states {
+       IBS_ENABLED     = 0,
+       IBS_STARTED     = 1,
+       IBS_STOPPING    = 2,
+
+       IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+       struct perf_event       *event;
+       unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+       struct pmu                      pmu;
+       unsigned int                    msr;
+       u64                             config_mask;
+       u64                             cnt_mask;
+       u64                             enable_mask;
+       u64                             valid_mask;
+       u64                             max_period;
+       unsigned long                   offset_mask[1];
+       int                             offset_max;
+       struct cpu_perf_ibs __percpu    *pcpu;
+
+       struct attribute                **format_attrs;
+       struct attribute_group          format_group;
+       const struct attribute_group    *attr_groups[2];
+
+       u64                             (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+       u32             size;
+       union {
+               u32     data[0];        /* data buffer starts here */
+               u32     caps;
+       };
+       u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+       s64 left = local64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int overflow = 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               overflow = 1;
+       }
+
+       if (unlikely(left < (s64)min)) {
+               left += period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               overflow = 1;
+       }
+
+       /*
+        * If the hw period that triggers the sw overflow is too short
+        * we might hit the irq handler. This biases the results.
+        * Thus we shorten the next-to-last period and set the last
+        * period to the max period.
+        */
+       if (left > max) {
+               left -= max;
+               if (left > max)
+                       left = max;
+               else if (left < min)
+                       left = min;
+       }
+
+       *hw_period = (u64)left;
+
+       return overflow;
+}
+
+static  int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int shift = 64 - width;
+       u64 prev_raw_count;
+       u64 delta;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+       prev_raw_count = local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               return 0;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
+
+       return 1;
+}
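+
+/*
+ * The shift trick above yields (new - prev) modulo 2^width even when
+ * the hardware does not sign-extend. Illustrative example for a
+ * hypothetical 48-bit wide counter (shift = 16): prev_raw_count =
+ * 0xFFFFFFFFFFFE and new_raw_count = 0x3 after a wrap give
+ *   delta = ((0x3 << 16) - (0xFFFFFFFFFFFE << 16)) >> 16 = 0x5,
+ * i.e. the five counts that occurred across the wrap.
+ */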
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+       if (perf_ibs_fetch.pmu.type == type)
+               return &perf_ibs_fetch;
+       if (perf_ibs_op.pmu.type == type)
+               return &perf_ibs_op;
+       return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
+ *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0, so IBS supports precise levels 1
+ * and 2 and PERF_EFLAGS_EXACT is set. In rare cases the rip is
+ * invalid because IBS was not able to record it correctly; we then
+ * clear PERF_EFLAGS_EXACT and take the rip from pt_regs instead.
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+       switch (event->attr.precise_ip) {
+       case 0:
+               return -ENOENT;
+       case 1:
+       case 2:
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       switch (event->attr.type) {
+       case PERF_TYPE_HARDWARE:
+               switch (event->attr.config) {
+               case PERF_COUNT_HW_CPU_CYCLES:
+                       *config = 0;
+                       return 0;
+               }
+               break;
+       case PERF_TYPE_RAW:
+               switch (event->attr.config) {
+               case 0x0076:
+                       *config = 0;
+                       return 0;
+               case 0x00C1:
+                       *config = IBS_OP_CNT_CTL;
+                       return 0;
+               }
+               break;
+       default:
+               return -ENOENT;
+       }
+
+       return -EOPNOTSUPP;
+}
+
+static const struct perf_event_attr ibs_notsupp = {
+       .exclude_user   = 1,
+       .exclude_kernel = 1,
+       .exclude_hv     = 1,
+       .exclude_idle   = 1,
+       .exclude_host   = 1,
+       .exclude_guest  = 1,
+};
+
+static int perf_ibs_init(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs;
+       u64 max_cnt, config;
+       int ret;
+
+       perf_ibs = get_ibs_pmu(event->attr.type);
+       if (perf_ibs) {
+               config = event->attr.config;
+       } else {
+               perf_ibs = &perf_ibs_op;
+               ret = perf_ibs_precise_event(event, &config);
+               if (ret)
+                       return ret;
+       }
+
+       if (event->pmu != &perf_ibs->pmu)
+               return -ENOENT;
+
+       if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+               return -EINVAL;
+
+       if (config & ~perf_ibs->config_mask)
+               return -EINVAL;
+
+       if (hwc->sample_period) {
+               if (config & perf_ibs->cnt_mask)
+                       /* raw max_cnt may not be set */
+                       return -EINVAL;
+               if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+                       /*
+                        * The lower 4 bits cannot be set in the ibs
+                        * max cnt, but we allow it in case we adjust
+                        * the sample period to set a frequency.
+                        */
+                       return -EINVAL;
+               hwc->sample_period &= ~0x0FULL;
+               if (!hwc->sample_period)
+                       hwc->sample_period = 0x10;
+       } else {
+               max_cnt = config & perf_ibs->cnt_mask;
+               config &= ~perf_ibs->cnt_mask;
+               event->attr.sample_period = max_cnt << 4;
+               hwc->sample_period = event->attr.sample_period;
+       }
+
+       if (!hwc->sample_period)
+               return -EINVAL;
+
+       /*
+        * If we modify hwc->sample_period, we also need to update
+        * hwc->last_period and hwc->period_left.
+        */
+       hwc->last_period = hwc->sample_period;
+       local64_set(&hwc->period_left, hwc->sample_period);
+
+       hwc->config_base = perf_ibs->msr;
+       hwc->config = config;
+
+       return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+                              struct hw_perf_event *hwc, u64 *period)
+{
+       int overflow;
+
+       /* ignore lower 4 bits in min count: */
+       overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+       local64_set(&hwc->prev_count, 0);
+
+       return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+       return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+       u64 count = 0;
+
+       if (config & IBS_OP_VAL)
+               count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+       if (ibs_caps & IBS_CAPS_RDWROPCNT)
+               count += (config & IBS_OP_CUR_CNT) >> 32;
+
+       return count;
+}
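+
+/*
+ * Example with illustrative numbers: after a rollover (IBS_OP_VAL
+ * set) with a programmed MaxCnt field of 0x1000, the rollover
+ * contributes 0x1000 << 4 = 0x10000 ops; on hardware with
+ * IBS_CAPS_RDWROPCNT the IBS_OP_CUR_CNT field, say 0x23, adds the
+ * residual for a total of 0x10023.
+ */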
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+                     u64 *config)
+{
+       u64 count = perf_ibs->get_count(*config);
+
+       /*
+        * Set width to 64 since we do not overflow on max width but
+        * instead on max count. In perf_ibs_set_period() we clear
+        * prev count manually on overflow.
+        */
+       while (!perf_event_try_update(event, count, 64)) {
+               rdmsrl(event->hw.config_base, *config);
+               count = perf_ibs->get_count(*config);
+       }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+                                        struct hw_perf_event *hwc, u64 config)
+{
+       wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+                                         struct hw_perf_event *hwc, u64 config)
+{
+       config &= ~perf_ibs->cnt_mask;
+       wrmsrl(hwc->config_base, config);
+       config &= ~perf_ibs->enable_mask;
+       wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always need to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags
+ * in perf_ibs_start()/perf_ibs_stop() and do the update/reset
+ * unconditionally.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       u64 period;
+
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+       hwc->state = 0;
+
+       perf_ibs_set_period(perf_ibs, hwc, &period);
+       set_bit(IBS_STARTED, pcpu->state);
+       perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+       perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       u64 config;
+       int stopping;
+
+       stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+       if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+               return;
+
+       rdmsrl(hwc->config_base, config);
+
+       if (stopping) {
+               set_bit(IBS_STOPPING, pcpu->state);
+               perf_ibs_disable_event(perf_ibs, hwc, config);
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       /*
+        * Clear the valid bit so that rollovers are not counted on
+        * update; rollovers are only accounted for in the irq handler.
+        */
+       config &= ~perf_ibs->valid_mask;
+
+       perf_ibs_event_update(perf_ibs, event, &config);
+       hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+       if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+               return -ENOSPC;
+
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       pcpu->event = event;
+
+       if (flags & PERF_EF_START)
+               perf_ibs_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+       if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+               return;
+
+       perf_ibs_stop(event, PERF_EF_UPDATE);
+
+       pcpu->event = NULL;
+
+       perf_event_update_userpage(event);
+}
+
+static void perf_ibs_read(struct perf_event *event) { }
+
+PMU_FORMAT_ATTR(rand_en,       "config:57");
+PMU_FORMAT_ATTR(cnt_ctl,       "config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+       &format_attr_rand_en.attr,
+       NULL,
+};
+
+static struct attribute *ibs_op_format_attrs[] = {
+       NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+       NULL,
+};
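+
+/*
+ * These format attributes are exported in sysfs under
+ * /sys/bus/event_source/devices/ibs_{fetch,op}/format/, so a perf
+ * tool that parses them can set the bits symbolically, e.g.
+ * (illustrative):
+ *
+ *   perf record -a -e ibs_op/cnt_ctl=1/ ...     # count micro-ops
+ *   perf record -a -e ibs_fetch/rand_en=1/ ...  # randomized tagging
+ *
+ * Note that cnt_ctl is only hooked up when the CPU has
+ * IBS_CAPS_OPCNT, see perf_event_ibs_init().
+ */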
+
+static struct perf_ibs perf_ibs_fetch = {
+       .pmu = {
+               .task_ctx_nr    = perf_invalid_context,
+
+               .event_init     = perf_ibs_init,
+               .add            = perf_ibs_add,
+               .del            = perf_ibs_del,
+               .start          = perf_ibs_start,
+               .stop           = perf_ibs_stop,
+               .read           = perf_ibs_read,
+       },
+       .msr                    = MSR_AMD64_IBSFETCHCTL,
+       .config_mask            = IBS_FETCH_CONFIG_MASK,
+       .cnt_mask               = IBS_FETCH_MAX_CNT,
+       .enable_mask            = IBS_FETCH_ENABLE,
+       .valid_mask             = IBS_FETCH_VAL,
+       .max_period             = IBS_FETCH_MAX_CNT << 4,
+       .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
+       .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
+       .format_attrs           = ibs_fetch_format_attrs,
+
+       .get_count              = get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+       .pmu = {
+               .task_ctx_nr    = perf_invalid_context,
+
+               .event_init     = perf_ibs_init,
+               .add            = perf_ibs_add,
+               .del            = perf_ibs_del,
+               .start          = perf_ibs_start,
+               .stop           = perf_ibs_stop,
+               .read           = perf_ibs_read,
+       },
+       .msr                    = MSR_AMD64_IBSOPCTL,
+       .config_mask            = IBS_OP_CONFIG_MASK,
+       .cnt_mask               = IBS_OP_MAX_CNT,
+       .enable_mask            = IBS_OP_ENABLE,
+       .valid_mask             = IBS_OP_VAL,
+       .max_period             = IBS_OP_MAX_CNT << 4,
+       .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
+       .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
+       .format_attrs           = ibs_op_format_attrs,
+
+       .get_count              = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       struct perf_event *event = pcpu->event;
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_sample_data data;
+       struct perf_raw_record raw;
+       struct pt_regs regs;
+       struct perf_ibs_data ibs_data;
+       int offset, size, check_rip, offset_max, throttle = 0;
+       unsigned int msr;
+       u64 *buf, *config, period;
+
+       if (!test_bit(IBS_STARTED, pcpu->state)) {
+               /*
+                * Catch spurious interrupts after stopping IBS: After
+                * disabling IBS there could still be incoming NMIs
+                * with samples that even have the valid bit cleared.
+                * Mark all these NMIs as handled.
+                */
+               return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+       }
+
+       msr = hwc->config_base;
+       buf = ibs_data.regs;
+       rdmsrl(msr, *buf);
+       if (!(*buf++ & perf_ibs->valid_mask))
+               return 0;
+
+       config = &ibs_data.regs[0];
+       perf_ibs_event_update(perf_ibs, event, config);
+       perf_sample_data_init(&data, 0, hwc->last_period);
+       if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+               goto out;       /* no sw counter overflow */
+
+       ibs_data.caps = ibs_caps;
+       size = 1;
+       offset = 1;
+       check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+       if (event->attr.sample_type & PERF_SAMPLE_RAW)
+               offset_max = perf_ibs->offset_max;
+       else if (check_rip)
+               offset_max = 2;
+       else
+               offset_max = 1;
+       do {
+               rdmsrl(msr + offset, *buf++);
+               size++;
+               offset = find_next_bit(perf_ibs->offset_mask,
+                                      perf_ibs->offset_max,
+                                      offset + 1);
+       } while (offset < offset_max);
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               /*
+                * Read IbsBrTarget and IbsOpData4 separately,
+                * depending on their availability. They can't be
+                * added to offset_max as they are staggered.
+                */
+               if (ibs_caps & IBS_CAPS_BRNTRGT) {
+                       rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+                       size++;
+               }
+               if (ibs_caps & IBS_CAPS_OPDATA4) {
+                       rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+                       size++;
+               }
+       }
+       ibs_data.size = sizeof(u64) * size;
+
+       regs = *iregs;
+       if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+               regs.flags &= ~PERF_EFLAGS_EXACT;
+       } else {
+               set_linear_ip(&regs, ibs_data.regs[1]);
+               regs.flags |= PERF_EFLAGS_EXACT;
+       }
+
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               raw.size = sizeof(u32) + ibs_data.size;
+               raw.data = ibs_data.data;
+               data.raw = &raw;
+       }
+
+       throttle = perf_event_overflow(event, &data, &regs);
+out:
+       if (throttle)
+               perf_ibs_disable_event(perf_ibs, hwc, *config);
+       else
+               perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+       perf_event_update_userpage(event);
+
+       return 1;
+}
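+
+/*
+ * Sketch of the PERF_SAMPLE_RAW payload assembled above: a u32
+ * holding ibs_caps, immediately followed by the sampled IBS MSRs in
+ * ascending offset order (control register first), optionally
+ * followed by IbsBrTarget and IbsOpData4 when the corresponding
+ * capability bits are set.
+ */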
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+       int handled = 0;
+
+       handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+       handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+       struct cpu_perf_ibs __percpu *pcpu;
+       int ret;
+
+       pcpu = alloc_percpu(struct cpu_perf_ibs);
+       if (!pcpu)
+               return -ENOMEM;
+
+       perf_ibs->pcpu = pcpu;
+
+       /* register attributes */
+       if (perf_ibs->format_attrs[0]) {
+               memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+               perf_ibs->format_group.name     = "format";
+               perf_ibs->format_group.attrs    = perf_ibs->format_attrs;
+
+               memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+               perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
+               perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
+       }
+
+       ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+       if (ret) {
+               perf_ibs->pcpu = NULL;
+               free_percpu(pcpu);
+       }
+
+       return ret;
+}
+
+static __init int perf_event_ibs_init(void)
+{
+       struct attribute **attr = ibs_op_format_attrs;
+
+       if (!ibs_caps)
+               return -ENODEV; /* ibs not supported by the cpu */
+
+       perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+       if (ibs_caps & IBS_CAPS_OPCNT) {
+               perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+               *attr++ = &format_attr_cnt_ctl.attr;
+       }
+       perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+       register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+       pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+
+       return 0;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void) { return 0; }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+       u32 caps;
+       unsigned int max_level;
+
+       if (!boot_cpu_has(X86_FEATURE_IBS))
+               return 0;
+
+       /* check IBS cpuid feature flags */
+       max_level = cpuid_eax(0x80000000);
+       if (max_level < IBS_CPUID_FEATURES)
+               return IBS_CAPS_DEFAULT;
+
+       caps = cpuid_eax(IBS_CPUID_FEATURES);
+       if (!(caps & IBS_CAPS_AVAIL))
+               /* cpuid flags not valid */
+               return IBS_CAPS_DEFAULT;
+
+       return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+       return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+       return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+       return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+       int offset;
+       u64 val;
+       int valid = 0;
+
+       preempt_disable();
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+               pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+               goto out;
+       }
+
+       if (!get_eilvt(offset)) {
+               pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+               goto out;
+       }
+
+       valid = 1;
+out:
+       preempt_enable();
+
+       return valid;
+}
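+
+/*
+ * For reference: the IBSCTL value checked above carries the LVT
+ * offset in its low bits (IBSCTL_LVT_OFFSET_MASK) and a valid flag
+ * (IBSCTL_LVT_OFFSET_VALID). A hypothetical reading of 0x101 means
+ * "offset 1, valid", whereas 0x001 triggers the first FW_BUG message
+ * because the valid bit is clear.
+ */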
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+       struct pci_dev *cpu_cfg;
+       int nodes;
+       u32 value = 0;
+
+       nodes = 0;
+       cpu_cfg = NULL;
+       do {
+               cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+                                        PCI_DEVICE_ID_AMD_10H_NB_MISC,
+                                        cpu_cfg);
+               if (!cpu_cfg)
+                       break;
+               ++nodes;
+               pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+                                      | IBSCTL_LVT_OFFSET_VALID);
+               pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+                       pci_dev_put(cpu_cfg);
+                       pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+                                value);
+                       return -EINVAL;
+               }
+       } while (1);
+
+       if (!nodes) {
+               pr_debug("No CPU node configured for IBS\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * set up the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset, which then updates
+ * the offset in the per-node IBS_CTL msr. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
+ * uses the new offset.
+ */
+static void force_ibs_eilvt_setup(void)
+{
+       int offset;
+       int ret;
+
+       preempt_disable();
+       /* find the next free available EILVT entry, skip offset 0 */
+       for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+               if (get_eilvt(offset))
+                       break;
+       }
+       preempt_enable();
+
+       if (offset == APIC_EILVT_NR_MAX) {
+               pr_debug("No EILVT entry available\n");
+               return;
+       }
+
+       ret = setup_ibs_ctl(offset);
+       if (ret)
+               goto out;
+
+       if (!ibs_eilvt_valid())
+               goto out;
+
+       pr_info("IBS: LVT offset %d assigned\n", offset);
+
+       return;
+out:
+       preempt_disable();
+       put_eilvt(offset);
+       preempt_enable();
+       return;
+}
+
+static void ibs_eilvt_setup(void)
+{
+       /*
+        * Force LVT offset assignment for family 10h: The offsets are
+        * not assigned by the BIOS for this family, so the OS is
+        * responsible for doing it. If the OS assignment fails, we fall
+        * back to the BIOS settings and try to set things up from there.
+        */
+       if (boot_cpu_data.x86 == 0x10)
+               force_ibs_eilvt_setup();
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+       u64 val;
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       if (!(val & IBSCTL_LVT_OFFSET_VALID))
+               return -EINVAL;
+
+       return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void *dummy)
+{
+       int offset;
+
+       offset = get_ibs_lvt_offset();
+       if (offset < 0)
+               goto failed;
+
+       if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+               return;
+failed:
+       pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+               smp_processor_id());
+}
+
+static void clear_APIC_ibs(void *dummy)
+{
+       int offset;
+
+       offset = get_ibs_lvt_offset();
+       if (offset >= 0)
+               setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+       clear_APIC_ibs(NULL);
+       return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+       ibs_eilvt_setup();
+       setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+       .resume         = perf_ibs_resume,
+       .suspend        = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+       register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int
+perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_STARTING:
+               setup_APIC_ibs(NULL);
+               break;
+       case CPU_DYING:
+               clear_APIC_ibs(NULL);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static __init int amd_ibs_init(void)
+{
+       u32 caps;
+       int ret = -EINVAL;
+
+       caps = __get_ibs_caps();
+       if (!caps)
+               return -ENODEV; /* ibs not supported by the cpu */
+
+       ibs_eilvt_setup();
+
+       if (!ibs_eilvt_valid())
+               goto out;
+
+       perf_ibs_pm_init();
+       cpu_notifier_register_begin();
+       ibs_caps = caps;
+       /* make ibs_caps visible to other cpus: */
+       smp_mb();
+       smp_call_function(setup_APIC_ibs, NULL, 1);
+       __perf_cpu_notifier(perf_ibs_cpu_notifier);
+       cpu_notifier_register_done();
+
+       ret = perf_event_ibs_init();
+out:
+       if (ret)
+               pr_err("Failed to setup IBS, %d\n", ret);
+       return ret;
+}
+
+/* Since we need the pci subsystem to init ibs, we can't do this earlier: */
+device_initcall(amd_ibs_init);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
new file mode 100644 (file)
index 0000000..635e5eb
--- /dev/null
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+
+#include "../perf_event.h"
+#include "iommu.h"
+
+#define COUNTER_SHIFT          16
+
+#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
+#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
+
+/* iommu pmu config masks */
+#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
+#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
+#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
+#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
+#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
+#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
+#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
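+
+/*
+ * The helpers above unpack the user supplied config/config1 words
+ * according to the format attributes defined below. Illustrative
+ * example: counting csource 0x05 restricted to devid 0x1234 is
+ * encoded as
+ *   config  = 0x05 | (0x1234ULL << 8)
+ *   config1 = 0xffff                  (devid_mask)
+ * and _GET_CSOURCE()/_GET_DEVID()/_GET_DEVID_MASK() recover exactly
+ * those fields.
+ */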
+
+static struct perf_amd_iommu __perf_iommu;
+
+struct perf_amd_iommu {
+       struct pmu pmu;
+       u8 max_banks;
+       u8 max_counters;
+       u64 cntr_assign_mask;
+       raw_spinlock_t lock;
+       const struct attribute_group *attr_groups[4];
+};
+
+#define format_group   attr_groups[0]
+#define cpumask_group  attr_groups[1]
+#define events_group   attr_groups[2]
+#define null_group     attr_groups[3]
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource,    "config:0-7");
+PMU_FORMAT_ATTR(devid,      "config:8-23");
+PMU_FORMAT_ATTR(pasid,      "config:24-39");
+PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+
+static struct attribute *iommu_format_attrs[] = {
+       &format_attr_csource.attr,
+       &format_attr_devid.attr,
+       &format_attr_pasid.attr,
+       &format_attr_domid.attr,
+       &format_attr_devid_mask.attr,
+       &format_attr_pasid_mask.attr,
+       &format_attr_domid_mask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+       .name = "format",
+       .attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+struct amd_iommu_event_desc {
+       struct kobj_attribute attr;
+       const char *event;
+};
+
+static ssize_t _iommu_event_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf)
+{
+       struct amd_iommu_event_desc *event =
+               container_of(attr, struct amd_iommu_event_desc, attr);
+       return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event)                    \
+{                                                              \
+       .attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),  \
+       .event = _event,                                        \
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+       AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
+       AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
+       AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
+       AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
+       AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
+       AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
+       AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
+       AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
+       AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
+       AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
+       AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
+       AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
+       { /* end: all zeroes */ },
+};
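+
+/*
+ * These descriptors are exported via the PMU's sysfs events/
+ * directory, so a perf tool that reads it can use the symbolic
+ * names, e.g. (illustrative):
+ *
+ *   perf stat -e amd_iommu/mem_trans_total/ ...
+ *
+ * which is equivalent to specifying csource=0x05 directly.
+ */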
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+                                  struct device_attribute *attr,
+                                  char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+       .attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+{
+       unsigned long flags;
+       int shift, bank, cntr, retval;
+       int max_banks = perf_iommu->max_banks;
+       int max_cntrs = perf_iommu->max_counters;
+
+       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+
+       for (bank = 0, shift = 0; bank < max_banks; bank++) {
+               for (cntr = 0; cntr < max_cntrs; cntr++) {
+                       shift = bank + (bank*3) + cntr;
+                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+                               continue;
+                       } else {
+                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
+                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+                               goto out;
+                       }
+               }
+       }
+       retval = -ENOSPC;
+out:
+       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+       return retval;
+}
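+
+/*
+ * Worked example of the encoding above: bank + (bank * 3) + cntr is
+ * just bank * 4 + cntr, so bank 2 / counter 1 occupies bit 9 of
+ * cntr_assign_mask and the function returns 0x0201 - bank in the
+ * high byte, counter in the low byte - which is what
+ * _GET_BANK()/_GET_CNTR() later read back from hw.extra_reg.reg.
+ */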
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+                                       u8 bank, u8 cntr)
+{
+       unsigned long flags;
+       int max_banks, max_cntrs;
+       int shift = 0;
+
+       max_banks = perf_iommu->max_banks;
+       max_cntrs = perf_iommu->max_counters;
+
+       if ((bank > max_banks) || (cntr > max_cntrs))
+               return -EINVAL;
+
+       shift = bank + cntr + (bank*3);
+
+       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+       perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+       return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_amd_iommu *perf_iommu;
+       u64 config, config1;
+
+       /* check the event attr type for PMU enumeration */
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * IOMMU counters are shared across all cores.
+        * Therefore, they do not support per-process mode.
+        * Also, they do not support event sampling mode.
+        */
+       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       /* IOMMU counters do not have usr/os/guest/host bits */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+           event->attr.exclude_host || event->attr.exclude_guest)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       perf_iommu = &__perf_iommu;
+
+       if (event->pmu != &perf_iommu->pmu)
+               return -ENOENT;
+
+       if (perf_iommu) {
+               config = event->attr.config;
+               config1 = event->attr.config1;
+       } else {
+               return -EINVAL;
+       }
+
+       /* integrate with iommu base devid (0000), assume one iommu */
+       perf_iommu->max_banks =
+               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
+       perf_iommu->max_counters =
+               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
+       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
+               return -EINVAL;
+
+       /* update the hw_perf_event struct with the iommu config data */
+       hwc->config = config;
+       hwc->extra_reg.config = config1;
+
+       return 0;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+       u8 csource = _GET_CSOURCE(ev);
+       u16 devid = _GET_DEVID(ev);
+       u64 reg = 0ULL;
+
+       reg = csource;
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev),
+                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+
+       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev),
+                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+
+       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev),
+                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
+
+       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev),
+                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+       u64 reg = 0ULL;
+
+       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                       _GET_BANK(event), _GET_CNTR(event),
+                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       pr_debug("perf: amd_iommu:perf_iommu_start\n");
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+       hwc->state = 0;
+
+       if (flags & PERF_EF_RELOAD) {
+               u64 prev_raw_count =  local64_read(&hwc->prev_count);
+               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                               _GET_BANK(event), _GET_CNTR(event),
+                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+       }
+
+       perf_iommu_enable_event(event);
+       perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+       u64 count = 0ULL;
+       u64 prev_raw_count = 0ULL;
+       u64 delta = 0ULL;
+       struct hw_perf_event *hwc = &event->hw;
+       pr_debug("perf: amd_iommu:perf_iommu_read\n");
+
+       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                               _GET_BANK(event), _GET_CNTR(event),
+                               IOMMU_PC_COUNTER_REG, &count, false);
+
+       /* IOMMU pc counter register is only 48 bits */
+       count &= 0xFFFFFFFFFFFFULL;
+
+       prev_raw_count =  local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       count) != prev_raw_count)
+               return;
+
+       /* Handle 48-bit counter overflow */
+       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+       delta >>= COUNTER_SHIFT;
+       local64_add(delta, &event->count);
+
+}
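+
+/*
+ * The COUNTER_SHIFT arithmetic above handles a wrap of the 48-bit
+ * counter. Illustrative example: prev_raw_count = 0xFFFFFFFFFFFF and
+ * a new reading of 0x5 yield
+ *   delta = ((0x5 << 16) - (0xFFFFFFFFFFFF << 16)) >> 16 = 0x6,
+ * the number of events counted across the wrap.
+ */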
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+
+       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       perf_iommu_disable_event(event);
+       WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+       hwc->state |= PERF_HES_STOPPED;
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       config = hwc->config;
+       perf_iommu_read(event);
+       hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+       int retval;
+       struct perf_amd_iommu *perf_iommu =
+                       container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+       pr_debug("perf: amd_iommu:perf_iommu_add\n");
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       /* request an iommu bank/counter */
+       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
+       if (retval != -ENOSPC)
+               event->hw.extra_reg.reg = (u16)retval;
+       else
+               return retval;
+
+       if (flags & PERF_EF_START)
+               perf_iommu_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+       struct perf_amd_iommu *perf_iommu =
+                       container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+       pr_debug("perf: amd_iommu:perf_iommu_del\n");
+       perf_iommu_stop(event, PERF_EF_UPDATE);
+
+       /* clear the assigned iommu bank/counter */
+       clear_avail_iommu_bnk_cntr(perf_iommu,
+                                    _GET_BANK(event),
+                                    _GET_CNTR(event));
+
+       perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+{
+       struct attribute **attrs;
+       struct attribute_group *attr_group;
+       int i = 0, j;
+
+       while (amd_iommu_v2_event_descs[i].attr.attr.name)
+               i++;
+
+       attr_group = kzalloc(sizeof(struct attribute *)
+               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
+       if (!attr_group)
+               return -ENOMEM;
+
+       attrs = (struct attribute **)(attr_group + 1);
+       for (j = 0; j < i; j++)
+               attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+       attr_group->name = "events";
+       attr_group->attrs = attrs;
+       perf_iommu->events_group = attr_group;
+
+       return 0;
+}
+
+static __init void amd_iommu_pc_exit(void)
+{
+       if (__perf_iommu.events_group != NULL) {
+               kfree(__perf_iommu.events_group);
+               __perf_iommu.events_group = NULL;
+       }
+}
+
+static __init int _init_perf_amd_iommu(
+       struct perf_amd_iommu *perf_iommu, char *name)
+{
+       int ret;
+
+       raw_spin_lock_init(&perf_iommu->lock);
+
+       /* Init format attributes */
+       perf_iommu->format_group = &amd_iommu_format_group;
+
+       /* Init cpumask attributes to only core 0 */
+       cpumask_set_cpu(0, &iommu_cpumask);
+       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
+
+       /* Init events attributes */
+       if (_init_events_attrs(perf_iommu) != 0)
+               pr_err("perf: amd_iommu: Only support raw events.\n");
+
+       /* Init null attributes */
+       perf_iommu->null_group = NULL;
+       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+
+       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
+       if (ret) {
+               pr_err("perf: amd_iommu: Failed to initialized.\n");
+               amd_iommu_pc_exit();
+       } else {
+               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
+                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
+                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+       }
+
+       return ret;
+}
+
+static struct perf_amd_iommu __perf_iommu = {
+       .pmu = {
+               .event_init     = perf_iommu_event_init,
+               .add            = perf_iommu_add,
+               .del            = perf_iommu_del,
+               .start          = perf_iommu_start,
+               .stop           = perf_iommu_stop,
+               .read           = perf_iommu_read,
+       },
+       .max_banks              = 0x00,
+       .max_counters           = 0x00,
+       .cntr_assign_mask       = 0ULL,
+       .format_group           = NULL,
+       .cpumask_group          = NULL,
+       .events_group           = NULL,
+       .null_group             = NULL,
+};
+
+static __init int amd_iommu_pc_init(void)
+{
+       /* Make sure the IOMMU PC resource is available */
+       if (!amd_iommu_pc_supported())
+               return -ENODEV;
+
+       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+
+       return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
new file mode 100644 (file)
index 0000000..845d173
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG                   0x00
+#define IOMMU_PC_COUNTER_SRC_REG               0x08
+#define IOMMU_PC_PASID_MATCH_REG               0x10
+#define IOMMU_PC_DOMID_MATCH_REG               0x18
+#define IOMMU_PC_DEVID_MATCH_REG               0x20
+#define IOMMU_PC_COUNTER_REPORT_REG            0x28
+
+/* maximum specified banks/counters */
+#define PC_MAX_SPEC_BNKS                       64
+#define PC_MAX_SPEC_CNTRS                      16
+
+/* iommu pc reg masks */
+#define IOMMU_BASE_DEVID                       0x0000
+
+/* amd_iommu_init.c external support functions */
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+
+extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
+                       u8 fxn, u64 *value, bool is_write);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
new file mode 100644 (file)
index 0000000..3db9569
--- /dev/null
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include <asm/cpufeature.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#define NUM_COUNTERS_NB                4
+#define NUM_COUNTERS_L2                4
+#define MAX_COUNTERS           NUM_COUNTERS_NB
+
+#define RDPMC_BASE_NB          6
+#define RDPMC_BASE_L2          10
+
+#define COUNTER_SHIFT          16
+
+struct amd_uncore {
+       int id;
+       int refcnt;
+       int cpu;
+       int num_counters;
+       int rdpmc_base;
+       u32 msr_base;
+       cpumask_t *active_mask;
+       struct pmu *pmu;
+       struct perf_event *events[MAX_COUNTERS];
+       struct amd_uncore *free_when_cpu_online;
+};
+
+static struct amd_uncore * __percpu *amd_uncore_nb;
+static struct amd_uncore * __percpu *amd_uncore_l2;
+
+static struct pmu amd_nb_pmu;
+static struct pmu amd_l2_pmu;
+
+static cpumask_t amd_nb_active_mask;
+static cpumask_t amd_l2_active_mask;
+
+static bool is_nb_event(struct perf_event *event)
+{
+       return event->pmu->type == amd_nb_pmu.type;
+}
+
+static bool is_l2_event(struct perf_event *event)
+{
+       return event->pmu->type == amd_l2_pmu.type;
+}
+
+static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+{
+       if (is_nb_event(event) && amd_uncore_nb)
+               return *per_cpu_ptr(amd_uncore_nb, event->cpu);
+       else if (is_l2_event(event) && amd_uncore_l2)
+               return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+
+       return NULL;
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev, new;
+       s64 delta;
+
+       /*
+        * since we do not enable counter overflow interrupts,
+        * we do not have to worry about prev_count changing on us
+        */
+
+       prev = local64_read(&hwc->prev_count);
+       rdpmcl(hwc->event_base_rdpmc, new);
+       local64_set(&hwc->prev_count, new);
+       delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+       delta >>= COUNTER_SHIFT;
+       local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (flags & PERF_EF_RELOAD)
+               wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+       hwc->state = 0;
+       wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+       perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+       hwc->state |= PERF_HES_STOPPED;
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               amd_uncore_read(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+       int i;
+       struct amd_uncore *uncore = event_to_amd_uncore(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       /* are we already assigned? */
+       if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+               goto out;
+
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (uncore->events[i] == event) {
+                       hwc->idx = i;
+                       goto out;
+               }
+       }
+
+       /* if not, take the first available counter */
+       hwc->idx = -1;
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+                       hwc->idx = i;
+                       break;
+               }
+       }
+
+out:
+       if (hwc->idx == -1)
+               return -EBUSY;
+
+       hwc->config_base = uncore->msr_base + (2 * hwc->idx);
+       hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
+       hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       if (flags & PERF_EF_START)
+               amd_uncore_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
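+
+/*
+ * The index arithmetic above reflects that the uncore MSRs are laid
+ * out as control/counter pairs: counter idx uses msr_base + 2 * idx
+ * as its control register and the following MSR as its count
+ * register, plus rdpmc_base + idx for RDPMC. As an illustrative
+ * example, idx 3 on the NB PMU ends up at MSR_F15H_NB_PERF_CTL + 6
+ * and + 7 with RDPMC index 6 + 3 = 9.
+ */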
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+       int i;
+       struct amd_uncore *uncore = event_to_amd_uncore(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       amd_uncore_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (cmpxchg(&uncore->events[i], event, NULL) == event)
+                       break;
+       }
+
+       hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+       struct amd_uncore *uncore;
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * NB and L2 counters (MSRs) are shared across all cores that share the
+        * same NB / L2 cache. Interrupts can be directed to a single target
+        * core, however, event counts generated by processes running on other
+        * cores cannot be masked out. So we do not support sampling and
+        * per-thread events.
+        */
+       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       /* NB and L2 counters do not have usr/os/guest/host bits */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+           event->attr.exclude_host || event->attr.exclude_guest)
+               return -EINVAL;
+
+       /* and we do not enable counter overflow interrupts */
+       hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+       hwc->idx = -1;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       uncore = event_to_amd_uncore(event);
+       if (!uncore)
+               return -ENODEV;
+
+       /*
+        * since requests can come in on any of the shared cores, we
+        * remap them to a single common cpu.
+        */
+       event->cpu = uncore->cpu;
+
+       return 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+                                           struct device_attribute *attr,
+                                           char *buf)
+{
+       cpumask_t *active_mask;
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       if (pmu->type == amd_nb_pmu.type)
+               active_mask = &amd_nb_active_mask;
+       else if (pmu->type == amd_l2_pmu.type)
+               active_mask = &amd_l2_active_mask;
+       else
+               return 0;
+
+       return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+       .attrs = amd_uncore_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15");
+
+static struct attribute *amd_uncore_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_uncore_format_group = {
+       .name = "format",
+       .attrs = amd_uncore_format_attr,
+};
+
+static const struct attribute_group *amd_uncore_attr_groups[] = {
+       &amd_uncore_attr_group,
+       &amd_uncore_format_group,
+       NULL,
+};
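+
+/*
+ * With the cpumask and format attributes above, a perf tool that
+ * parses sysfs can program raw uncore events system-wide, e.g.
+ * (event/umask values purely illustrative):
+ *
+ *   perf stat -a -e amd_nb/event=0xe0,umask=0x00/ ...
+ *   perf stat -a -e amd_l2/event=0x7d,umask=0x1f/ ...
+ *
+ * Such events are silently remapped to the single cpu that
+ * represents each NB / L2 domain, see amd_uncore_event_init().
+ */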
+
+static struct pmu amd_nb_pmu = {
+       .attr_groups    = amd_uncore_attr_groups,
+       .name           = "amd_nb",
+       .event_init     = amd_uncore_event_init,
+       .add            = amd_uncore_add,
+       .del            = amd_uncore_del,
+       .start          = amd_uncore_start,
+       .stop           = amd_uncore_stop,
+       .read           = amd_uncore_read,
+};
+
+static struct pmu amd_l2_pmu = {
+       .attr_groups    = amd_uncore_attr_groups,
+       .name           = "amd_l2",
+       .event_init     = amd_uncore_event_init,
+       .add            = amd_uncore_add,
+       .del            = amd_uncore_del,
+       .start          = amd_uncore_start,
+       .stop           = amd_uncore_stop,
+       .read           = amd_uncore_read,
+};
+
+static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+{
+       return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
+                       cpu_to_node(cpu));
+}
+
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+{
+       struct amd_uncore *uncore_nb = NULL, *uncore_l2;
+
+       if (amd_uncore_nb) {
+               uncore_nb = amd_uncore_alloc(cpu);
+               if (!uncore_nb)
+                       goto fail;
+               uncore_nb->cpu = cpu;
+               uncore_nb->num_counters = NUM_COUNTERS_NB;
+               uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+               uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+               uncore_nb->active_mask = &amd_nb_active_mask;
+               uncore_nb->pmu = &amd_nb_pmu;
+               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+       }
+
+       if (amd_uncore_l2) {
+               uncore_l2 = amd_uncore_alloc(cpu);
+               if (!uncore_l2)
+                       goto fail;
+               uncore_l2->cpu = cpu;
+               uncore_l2->num_counters = NUM_COUNTERS_L2;
+               uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+               uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+               uncore_l2->active_mask = &amd_l2_active_mask;
+               uncore_l2->pmu = &amd_l2_pmu;
+               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
+       }
+
+       return 0;
+
+fail:
+       if (amd_uncore_nb)
+               *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+       kfree(uncore_nb);
+       return -ENOMEM;
+}
+
+static struct amd_uncore *
+amd_uncore_find_online_sibling(struct amd_uncore *this,
+                              struct amd_uncore * __percpu *uncores)
+{
+       unsigned int cpu;
+       struct amd_uncore *that;
+
+       for_each_online_cpu(cpu) {
+               that = *per_cpu_ptr(uncores, cpu);
+
+               if (!that)
+                       continue;
+
+               if (this == that)
+                       continue;
+
+               if (this->id == that->id) {
+                       that->free_when_cpu_online = this;
+                       this = that;
+                       break;
+               }
+       }
+
+       this->refcnt++;
+       return this;
+}
+
+static void amd_uncore_cpu_starting(unsigned int cpu)
+{
+       unsigned int eax, ebx, ecx, edx;
+       struct amd_uncore *uncore;
+
+       if (amd_uncore_nb) {
+               uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
+               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+               uncore->id = ecx & 0xff;
+
+               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
+               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+       }
+
+       if (amd_uncore_l2) {
+               unsigned int apicid = cpu_data(cpu).apicid;
+               unsigned int nshared;
+
+               uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+               cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
+               nshared = ((eax >> 14) & 0xfff) + 1;
+               uncore->id = apicid - (apicid % nshared);
+
+               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
+               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+       }
+}
+
+static void uncore_online(unsigned int cpu,
+                         struct amd_uncore * __percpu *uncores)
+{
+       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+       kfree(uncore->free_when_cpu_online);
+       uncore->free_when_cpu_online = NULL;
+
+       if (cpu == uncore->cpu)
+               cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static void amd_uncore_cpu_online(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_online(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_online(cpu, amd_uncore_l2);
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+                               struct amd_uncore * __percpu *uncores)
+{
+       unsigned int i;
+       struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+       if (this->cpu != cpu)
+               return;
+
+       /* this cpu is going down, migrate to a shared sibling if possible */
+       for_each_online_cpu(i) {
+               struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+               if (cpu == i)
+                       continue;
+
+               if (this == that) {
+                       perf_pmu_migrate_context(this->pmu, cpu, i);
+                       cpumask_clear_cpu(cpu, that->active_mask);
+                       cpumask_set_cpu(i, that->active_mask);
+                       that->cpu = i;
+                       break;
+               }
+       }
+}
+
+static void amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_down_prepare(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_down_prepare(cpu, amd_uncore_l2);
+}
+
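+/*
+ * Drop this CPU's reference on the shared context and free it once the last
+ * user is gone.
+ */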
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+       if (cpu == uncore->cpu)
+               cpumask_clear_cpu(cpu, uncore->active_mask);
+
+       if (!--uncore->refcnt)
+               kfree(uncore);
+       *per_cpu_ptr(uncores, cpu) = NULL;
+}
+
+static void amd_uncore_cpu_dead(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_dead(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_dead(cpu, amd_uncore_l2);
+}
+
+static int
+amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               if (amd_uncore_cpu_up_prepare(cpu))
+                       return notifier_from_errno(-ENOMEM);
+               break;
+
+       case CPU_STARTING:
+               amd_uncore_cpu_starting(cpu);
+               break;
+
+       case CPU_ONLINE:
+               amd_uncore_cpu_online(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               amd_uncore_cpu_down_prepare(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               amd_uncore_cpu_dead(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block amd_uncore_cpu_notifier_block = {
+       .notifier_call  = amd_uncore_cpu_notifier,
+       .priority       = CPU_PRI_PERF + 1,
+};
+
+static void __init init_cpu_already_online(void *dummy)
+{
+       unsigned int cpu = smp_processor_id();
+
+       amd_uncore_cpu_starting(cpu);
+       amd_uncore_cpu_online(cpu);
+}
+
+static void cleanup_cpu_online(void *dummy)
+{
+       unsigned int cpu = smp_processor_id();
+
+       amd_uncore_cpu_dead(cpu);
+}
+
+static int __init amd_uncore_init(void)
+{
+       unsigned int cpu, cpu2;
+       int ret = -ENODEV;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               goto fail_nodev;
+
+       if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+               goto fail_nodev;
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
+               amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+               if (!amd_uncore_nb) {
+                       ret = -ENOMEM;
+                       goto fail_nb;
+               }
+               ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+               if (ret)
+                       goto fail_nb;
+
+               pr_info("perf: AMD NB counters detected\n");
+               ret = 0;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+               amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
+               if (!amd_uncore_l2) {
+                       ret = -ENOMEM;
+                       goto fail_l2;
+               }
+               ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+               if (ret)
+                       goto fail_l2;
+
+               pr_info("perf: AMD L2I counters detected\n");
+               ret = 0;
+       }
+
+       if (ret)
+               goto fail_nodev;
+
+       cpu_notifier_register_begin();
+
+       /* init cpus already online before registering for hotplug notifier */
+       for_each_online_cpu(cpu) {
+               ret = amd_uncore_cpu_up_prepare(cpu);
+               if (ret)
+                       goto fail_online;
+               smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
+       }
+
+       __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
+       cpu_notifier_register_done();
+
+       return 0;
+
+fail_online:
+       for_each_online_cpu(cpu2) {
+               if (cpu2 == cpu)
+                       break;
+               smp_call_function_single(cpu2, cleanup_cpu_online, NULL, 1);
+       }
+       cpu_notifier_register_done();
+
+       /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+       amd_uncore_nb = amd_uncore_l2 = NULL;
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
+               perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
+               perf_pmu_unregister(&amd_nb_pmu);
+       if (amd_uncore_l2)
+               free_percpu(amd_uncore_l2);
+fail_nb:
+       if (amd_uncore_nb)
+               free_percpu(amd_uncore_nb);
+
+fail_nodev:
+       return ret;
+}
+device_initcall(amd_uncore_init);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
new file mode 100644 (file)
index 0000000..5e830d0
--- /dev/null
@@ -0,0 +1,2442 @@
+/*
+ * Performance events x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+
+#include "perf_event.h"
+
+struct x86_pmu x86_pmu __read_mostly;
+
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+       .enabled = 1,
+};
+
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
+u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+u64 __read_mostly hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+u64 x86_perf_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int shift = 64 - x86_pmu.cntval_bits;
+       u64 prev_raw_count, new_raw_count;
+       int idx = hwc->idx;
+       s64 delta;
+
+       if (idx == INTEL_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               goto again;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
+
+       return new_raw_count;
+}
+
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+       struct extra_reg *er;
+
+       reg = &event->hw.extra_reg;
+
+       if (!x86_pmu.extra_regs)
+               return 0;
+
+       for (er = x86_pmu.extra_regs; er->msr; er++) {
+               if (er->event != (config & er->config_mask))
+                       continue;
+               if (event->attr.config1 & ~er->valid_mask)
+                       return -EINVAL;
+               /* Check if the extra MSRs can be safely accessed */
+               if (!er->extra_msr_access)
+                       return -ENXIO;
+
+               reg->idx = er->idx;
+               reg->config = event->attr.config1;
+               reg->reg = er->msr;
+               break;
+       }
+       return 0;
+}
+
+static atomic_t active_events;
+static atomic_t pmc_refcount;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static bool reserve_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
+                       goto perfctr_fail;
+       }
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
+                       goto eventsel_fail;
+       }
+
+       return true;
+
+eventsel_fail:
+       for (i--; i >= 0; i--)
+               release_evntsel_nmi(x86_pmu_config_addr(i));
+
+       i = x86_pmu.num_counters;
+
+perfctr_fail:
+       for (i--; i >= 0; i--)
+               release_perfctr_nmi(x86_pmu_event_addr(i));
+
+       return false;
+}
+
+static void release_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               release_perfctr_nmi(x86_pmu_event_addr(i));
+               release_evntsel_nmi(x86_pmu_config_addr(i));
+       }
+}
+
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
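+/*
+ * Sanity-check that the performance counters are really there and writable,
+ * and complain if the BIOS has already claimed any of them.
+ */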
+static bool check_hw_exists(void)
+{
+       u64 val, val_fail, val_new = ~0;
+       int i, reg, reg_fail, ret = 0;
+       int bios_fail = 0;
+       int reg_safe = -1;
+
+       /*
+        * Check to see if the BIOS enabled any of the counters; if so,
+        * complain and bail.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu_config_addr(i);
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
+                       bios_fail = 1;
+                       val_fail = val;
+                       reg_fail = reg;
+               } else {
+                       reg_safe = i;
+               }
+       }
+
+       if (x86_pmu.num_counters_fixed) {
+               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+                       if (val & (0x03 << i*4)) {
+                               bios_fail = 1;
+                               val_fail = val;
+                               reg_fail = reg;
+                       }
+               }
+       }
+
+       /*
+        * If all the counters are enabled, the below test will always
+        * fail.  The tools will also become useless in this scenario.
+        * Just fail and disable the hardware counters.
+        */
+
+       if (reg_safe == -1) {
+               reg = reg_safe;
+               goto msr_fail;
+       }
+
+       /*
+        * Read the current value, change it and read it back to see if it
+        * matches; this is needed to detect certain hardware emulators
+        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+        */
+       reg = x86_pmu_event_addr(reg_safe);
+       if (rdmsrl_safe(reg, &val))
+               goto msr_fail;
+       val ^= 0xffffUL;
+       ret = wrmsrl_safe(reg, val);
+       ret |= rdmsrl_safe(reg, &val_new);
+       if (ret || val != val_new)
+               goto msr_fail;
+
+       /*
+        * We still allow the PMU driver to operate:
+        */
+       if (bios_fail) {
+               pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
+               pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
+                             reg_fail, val_fail);
+       }
+
+       return true;
+
+msr_fail:
+       pr_cont("Broken PMU hardware detected, using software events only.\n");
+       pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
+               boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
+               reg, val_new);
+
+       return false;
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+       x86_release_hardware();
+       atomic_dec(&active_events);
+}
+
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+       hw_perf_event_destroy(event);
+
+       /* undo the lbr/bts event accounting */
+       x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
+static inline int x86_pmu_initialized(void)
+{
+       return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       unsigned int cache_type, cache_op, cache_result;
+       u64 config, val;
+
+       config = attr->config;
+
+       cache_type = (config >>  0) & 0xff;
+       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+               return -EINVAL;
+
+       cache_op = (config >>  8) & 0xff;
+       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+               return -EINVAL;
+
+       cache_result = (config >> 16) & 0xff;
+       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+       if (val == 0)
+               return -ENOENT;
+
+       if (val == -1)
+               return -EINVAL;
+
+       hwc->config |= val;
+       attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+       return x86_pmu_extra_regs(val, event);
+}
+
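+/*
+ * Reference-counted reservation of the counter/eventsel MSRs and the DS
+ * buffers; only the first user performs the actual reservation.
+ */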
+int x86_reserve_hardware(void)
+{
+       int err = 0;
+
+       if (!atomic_inc_not_zero(&pmc_refcount)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&pmc_refcount) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               reserve_ds_buffers();
+               }
+               if (!err)
+                       atomic_inc(&pmc_refcount);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       return err;
+}
+
+void x86_release_hardware(void)
+{
+       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+               release_pmc_hardware();
+               release_ds_buffers();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
+
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+       int i;
+
+       if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+               mutex_lock(&pmc_reserve_mutex);
+               for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+                       if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+                               goto fail_unlock;
+               }
+               atomic_inc(&x86_pmu.lbr_exclusive[what]);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       atomic_inc(&active_events);
+       return 0;
+
+fail_unlock:
+       mutex_unlock(&pmc_reserve_mutex);
+       return -EBUSY;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+       atomic_dec(&x86_pmu.lbr_exclusive[what]);
+       atomic_dec(&active_events);
+}
+
+int x86_setup_perfctr(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+
+       if (!is_sampling_event(event)) {
+               hwc->sample_period = x86_pmu.max_period;
+               hwc->last_period = hwc->sample_period;
+               local64_set(&hwc->period_left, hwc->sample_period);
+       }
+
+       if (attr->type == PERF_TYPE_RAW)
+               return x86_pmu_extra_regs(event->attr.config, event);
+
+       if (attr->type == PERF_TYPE_HW_CACHE)
+               return set_ext_hw_attr(hwc, event);
+
+       if (attr->config >= x86_pmu.max_events)
+               return -EINVAL;
+
+       /*
+        * The generic map:
+        */
+       config = x86_pmu.event_map(attr->config);
+
+       if (config == 0)
+               return -ENOENT;
+
+       if (config == -1LL)
+               return -EINVAL;
+
+       /*
+        * Branch tracing:
+        */
+       if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+           !attr->freq && hwc->sample_period == 1) {
+               /* BTS is not supported by this architecture. */
+               if (!x86_pmu.bts_active)
+                       return -EOPNOTSUPP;
+
+               /* BTS is currently only allowed for user-mode. */
+               if (!attr->exclude_kernel)
+                       return -EOPNOTSUPP;
+
+               /* disallow bts if conflicting events are present */
+               if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                       return -EBUSY;
+
+               event->destroy = hw_perf_lbr_event_destroy;
+       }
+
+       hwc->config |= config;
+
+       return 0;
+}
+
+/*
+ * Check that branch_sample_type is compatible with
+ * the settings needed for precise_ip > 1, which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement.
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+       u64 m = event->attr.branch_sample_type;
+       u64 b = 0;
+
+       /* must capture all branches */
+       if (!(m & PERF_SAMPLE_BRANCH_ANY))
+               return 0;
+
+       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_user)
+               b |= PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_kernel)
+               b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+       /*
+        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+        */
+
+       return m == b;
+}
+
+int x86_pmu_hw_config(struct perf_event *event)
+{
+       if (event->attr.precise_ip) {
+               int precise = 0;
+
+               /* Support for constant skid */
+               if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+                       precise++;
+
+                       /* Support for IP fixup */
+                       if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
+                               precise++;
+
+                       if (x86_pmu.pebs_prec_dist)
+                               precise++;
+               }
+
+               if (event->attr.precise_ip > precise)
+                       return -EOPNOTSUPP;
+       }
+       /*
+        * check that PEBS LBR correction does not conflict with
+        * whatever the user is asking with attr->branch_sample_type
+        */
+       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+               u64 *br_type = &event->attr.branch_sample_type;
+
+               if (has_branch_stack(event)) {
+                       if (!precise_br_compat(event))
+                               return -EOPNOTSUPP;
+
+                       /* branch_sample_type is compatible */
+
+               } else {
+                       /*
+                        * user did not specify branch_sample_type
+                        *
+                        * For PEBS fixups, we capture all
+                        * the branches at the priv level of the
+                        * event.
+                        */
+                       *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+                       if (!event->attr.exclude_user)
+                               *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+                       if (!event->attr.exclude_kernel)
+                               *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+               }
+       }
+
+       if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+               event->attach_state |= PERF_ATTACH_TASK_DATA;
+
+       /*
+        * Generate PMC IRQs:
+        * (keep 'enabled' bit clear for now)
+        */
+       event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+       /*
+        * Count user and OS events unless requested not to
+        */
+       if (!event->attr.exclude_user)
+               event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+       if (!event->attr.exclude_kernel)
+               event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+       if (event->attr.type == PERF_TYPE_RAW)
+               event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+
+       if (event->attr.sample_period && x86_pmu.limit_period) {
+               if (x86_pmu.limit_period(event, event->attr.sample_period) >
+                               event->attr.sample_period)
+                       return -EINVAL;
+       }
+
+       return x86_setup_perfctr(event);
+}
+
+/*
+ * Set up the hardware configuration for a given attr_type.
+ */
+static int __x86_pmu_event_init(struct perf_event *event)
+{
+       int err;
+
+       if (!x86_pmu_initialized())
+               return -ENODEV;
+
+       err = x86_reserve_hardware();
+       if (err)
+               return err;
+
+       atomic_inc(&active_events);
+       event->destroy = hw_perf_event_destroy;
+
+       event->hw.idx = -1;
+       event->hw.last_cpu = -1;
+       event->hw.last_tag = ~0ULL;
+
+       /* mark unused */
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+       return x86_pmu.hw_config(event);
+}
+
+void x86_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               rdmsrl(x86_pmu_config_addr(idx), val);
+               if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+                       continue;
+               val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+               wrmsrl(x86_pmu_config_addr(idx), val);
+       }
+}
+
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
+static void x86_pmu_disable(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (!x86_pmu_initialized())
+               return;
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->n_added = 0;
+       cpuc->enabled = 0;
+       barrier();
+
+       x86_pmu.disable_all();
+}
+
+void x86_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+       }
+}
+
+static struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
+{
+       return event->pmu == &pmu;
+}
+
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+       int     weight;
+       int     event;          /* event index */
+       int     counter;        /* counter index */
+       int     unassigned;     /* number of events to be assigned left */
+       int     nr_gp;          /* number of GP counters used */
+       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define        SCHED_STATES_MAX        2
+
+struct perf_sched {
+       int                     max_weight;
+       int                     max_events;
+       int                     max_gp;
+       int                     saved_states;
+       struct event_constraint **constraints;
+       struct sched_state      state;
+       struct sched_state      saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize the iterator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+                           int num, int wmin, int wmax, int gpmax)
+{
+       int idx;
+
+       memset(sched, 0, sizeof(*sched));
+       sched->max_events       = num;
+       sched->max_weight       = wmax;
+       sched->max_gp           = gpmax;
+       sched->constraints      = constraints;
+
+       for (idx = 0; idx < num; idx++) {
+               if (constraints[idx]->weight == wmin)
+                       break;
+       }
+
+       sched->state.event      = idx;          /* start with min weight */
+       sched->state.weight     = wmin;
+       sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+               return;
+
+       sched->saved[sched->saved_states] = sched->state;
+       sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+       if (!sched->saved_states)
+               return false;
+
+       sched->saved_states--;
+       sched->state = sched->saved[sched->saved_states];
+
+       /* continue with next counter: */
+       clear_bit(sched->state.counter++, sched->state.used);
+
+       return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+       int idx;
+
+       if (!sched->state.unassigned)
+               return false;
+
+       if (sched->state.event >= sched->max_events)
+               return false;
+
+       c = sched->constraints[sched->state.event];
+       /* Prefer fixed purpose counters */
+       if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+               idx = INTEL_PMC_IDX_FIXED;
+               for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+                       if (!__test_and_set_bit(idx, sched->state.used))
+                               goto done;
+               }
+       }
+
+       /* Grab the first unused counter starting with idx */
+       idx = sched->state.counter;
+       for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
+               if (!__test_and_set_bit(idx, sched->state.used)) {
+                       if (sched->state.nr_gp++ >= sched->max_gp)
+                               return false;
+
+                       goto done;
+               }
+       }
+
+       return false;
+
+done:
+       sched->state.counter = idx;
+
+       if (c->overlap)
+               perf_sched_save_state(sched);
+
+       return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+       while (!__perf_sched_find_counter(sched)) {
+               if (!perf_sched_restore_state(sched))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+
+       if (!sched->state.unassigned || !--sched->state.unassigned)
+               return false;
+
+       do {
+               /* next event */
+               sched->state.event++;
+               if (sched->state.event >= sched->max_events) {
+                       /* next weight */
+                       sched->state.event = 0;
+                       sched->state.weight++;
+                       if (sched->state.weight > sched->max_weight)
+                               return false;
+               }
+               c = sched->constraints[sched->state.event];
+       } while (c->weight != sched->state.weight);
+
+       sched->state.counter = 0;       /* start with first counter */
+
+       return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign)
+{
+       struct perf_sched sched;
+
+       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+
+       do {
+               if (!perf_sched_find_counter(&sched))
+                       break;  /* failed */
+               if (assign)
+                       assign[sched.state.event] = sched.state.counter;
+       } while (perf_sched_next_event(&sched));
+
+       return sched.state.unassigned;
+}
+EXPORT_SYMBOL_GPL(perf_assign_events);
+
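+/*
+ * Schedule the collected events onto hardware counters: first try the fast
+ * path of reusing each event's previous assignment, then fall back to the
+ * constraint-based scheduler above.
+ */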
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+       struct event_constraint *c;
+       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       struct perf_event *e;
+       int i, wmin, wmax, unsched = 0;
+       struct hw_perf_event *hwc;
+
+       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+       if (x86_pmu.start_scheduling)
+               x86_pmu.start_scheduling(cpuc);
+
+       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+               cpuc->event_constraint[i] = NULL;
+               c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+               cpuc->event_constraint[i] = c;
+
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
+       }
+
+       /*
+        * fastpath, try to reuse previous register
+        */
+       for (i = 0; i < n; i++) {
+               hwc = &cpuc->event_list[i]->hw;
+               c = cpuc->event_constraint[i];
+
+               /* never assigned */
+               if (hwc->idx == -1)
+                       break;
+
+               /* constraint still honored */
+               if (!test_bit(hwc->idx, c->idxmsk))
+                       break;
+
+               /* not already used */
+               if (test_bit(hwc->idx, used_mask))
+                       break;
+
+               __set_bit(hwc->idx, used_mask);
+               if (assign)
+                       assign[i] = hwc->idx;
+       }
+
+       /* slow path */
+       if (i != n) {
+               int gpmax = x86_pmu.num_counters;
+
+               /*
+                * Do not allow scheduling of more than half the available
+                * generic counters.
+                *
+                * This helps avoid counter starvation of the sibling thread by
+                * ensuring at most half the counters cannot be in exclusive
+                * mode. There are no designated counters for the limit; any
+                * N/2 counters can be used. This helps with events with
+                * specific counter constraints.
+                */
+               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+                       gpmax /= 2;
+
+               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+                                            wmax, gpmax, assign);
+       }
+
+       /*
+        * In case of success (unsched = 0), mark events as committed,
+        * so we do not put_constraint() in case new events are added
+        * and fail to be scheduled
+        *
+        * We invoke the lower level commit callback to lock the resource
+        *
+        * We do not need to do all of this in case we are called to
+        * validate an event group (assign == NULL)
+        */
+       if (!unsched && assign) {
+               for (i = 0; i < n; i++) {
+                       e = cpuc->event_list[i];
+                       e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+                       if (x86_pmu.commit_scheduling)
+                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+               }
+       } else {
+               for (i = 0; i < n; i++) {
+                       e = cpuc->event_list[i];
+                       /*
+                        * do not put_constraint() on committed events,
+                        * because they are good to go
+                        */
+                       if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
+                               continue;
+
+                       /*
+                        * release events that failed scheduling
+                        */
+                       if (x86_pmu.put_event_constraints)
+                               x86_pmu.put_event_constraints(cpuc, e);
+               }
+       }
+
+       if (x86_pmu.stop_scheduling)
+               x86_pmu.stop_scheduling(cpuc);
+
+       return unsched ? -EINVAL : 0;
+}
+
+/*
+ * dogrp: true if we must collect sibling events (group)
+ * Returns the total number of events, or a negative error code.
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+
+       /* current number of events already accepted */
+       n = cpuc->n_events;
+
+       if (is_x86_event(leader)) {
+               if (n >= max_count)
+                       return -EINVAL;
+               cpuc->event_list[n] = leader;
+               n++;
+       }
+       if (!dogrp)
+               return n;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry) {
+               if (!is_x86_event(event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               cpuc->event_list[n] = event;
+               n++;
+       }
+       return n;
+}
+
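+/*
+ * Record the assigned counter in the hw event and set up the matching
+ * config/event/rdpmc base addresses for a BTS, fixed or generic counter.
+ */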
+static inline void x86_assign_hw_event(struct perf_event *event,
+                               struct cpu_hw_events *cpuc, int i)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = cpuc->assign[i];
+       hwc->last_cpu = smp_processor_id();
+       hwc->last_tag = ++cpuc->tags[i];
+
+       if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
+               hwc->config_base = 0;
+               hwc->event_base = 0;
+       } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+               hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+       } else {
+               hwc->config_base = x86_pmu_config_addr(hwc->idx);
+               hwc->event_base  = x86_pmu_event_addr(hwc->idx);
+               hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+       }
+}
+
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+                                       struct cpu_hw_events *cpuc,
+                                       int i)
+{
+       return hwc->idx == cpuc->assign[i] &&
+               hwc->last_cpu == smp_processor_id() &&
+               hwc->last_tag == cpuc->tags[i];
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags);
+
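+/*
+ * Re-enable the PMU: reprogram any events that were moved to new counters
+ * while the PMU was disabled, then let the model-specific code enable all
+ * counters.
+ */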
+static void x86_pmu_enable(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int i, added = cpuc->n_added;
+
+       if (!x86_pmu_initialized())
+               return;
+
+       if (cpuc->enabled)
+               return;
+
+       if (cpuc->n_added) {
+               int n_running = cpuc->n_events - cpuc->n_added;
+               /*
+                * apply assignment obtained either from
+                * hw_perf_group_sched_in() or x86_pmu_enable()
+                *
+                * step1: save events moving to new counters
+                */
+               for (i = 0; i < n_running; i++) {
+                       event = cpuc->event_list[i];
+                       hwc = &event->hw;
+
+                       /*
+                        * we can avoid reprogramming counter if:
+                        * - assigned same counter as last time
+                        * - running on same CPU as last time
+                        * - no other event has used the counter since
+                        */
+                       if (hwc->idx == -1 ||
+                           match_prev_assignment(hwc, cpuc, i))
+                               continue;
+
+                       /*
+                        * Ensure we don't accidentally enable a stopped
+                        * counter simply because we rescheduled.
+                        */
+                       if (hwc->state & PERF_HES_STOPPED)
+                               hwc->state |= PERF_HES_ARCH;
+
+                       x86_pmu_stop(event, PERF_EF_UPDATE);
+               }
+
+               /*
+                * step2: reprogram moved events into new counters
+                */
+               for (i = 0; i < cpuc->n_events; i++) {
+                       event = cpuc->event_list[i];
+                       hwc = &event->hw;
+
+                       if (!match_prev_assignment(hwc, cpuc, i))
+                               x86_assign_hw_event(event, cpuc, i);
+                       else if (i < n_running)
+                               continue;
+
+                       if (hwc->state & PERF_HES_ARCH)
+                               continue;
+
+                       x86_pmu_start(event, PERF_EF_RELOAD);
+               }
+               cpuc->n_added = 0;
+               perf_events_lapic_init();
+       }
+
+       cpuc->enabled = 1;
+       barrier();
+
+       x86_pmu.enable_all(added);
+}
+
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+int x86_perf_event_set_period(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       s64 left = local64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int ret = 0, idx = hwc->idx;
+
+       if (idx == INTEL_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+       /*
+        * Quirk: certain CPUs don't like it if just 1 hw_event is left:
+        */
+       if (unlikely(left < 2))
+               left = 2;
+
+       if (left > x86_pmu.max_period)
+               left = x86_pmu.max_period;
+
+       if (x86_pmu.limit_period)
+               left = x86_pmu.limit_period(event, left);
+
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+           local64_read(&hwc->prev_count) != (u64)-left) {
+               /*
+                * The hw event starts counting from this event offset;
+                * mark it so we can extract future deltas:
+                */
+               local64_set(&hwc->prev_count, (u64)-left);
+
+               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+       }
+
+       /*
+        * Due to an erratum on certain CPUs, we need
+        * a second write to be sure the register
+        * is updated properly.
+        */
+       if (x86_pmu.perfctr_second_write) {
+               wrmsrl(hwc->event_base,
+                       (u64)(-left) & x86_pmu.cntval_mask);
+       }
+
+       perf_event_update_userpage(event);
+
+       return ret;
+}
+
+void x86_pmu_enable_event(struct perf_event *event)
+{
+       if (__this_cpu_read(cpu_hw_events.enabled))
+               __x86_pmu_enable_event(&event->hw,
+                                      ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scheduled with existing events.
+ */
+static int x86_pmu_add(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc;
+       int assign[X86_PMC_IDX_MAX];
+       int n, n0, ret;
+
+       hwc = &event->hw;
+
+       n0 = cpuc->n_events;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       /*
+        * If group events scheduling transaction was started,
+        * skip the schedulability test here, it will be performed
+        * at commit time (->commit_txn) as a whole.
+        */
+       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+               goto done_collect;
+
+       ret = x86_pmu.schedule_events(cpuc, n, assign);
+       if (ret)
+               goto out;
+       /*
+        * Copy the new assignment now that we know it is possible;
+        * it will be used by hw_perf_enable().
+        */
+       memcpy(cpuc->assign, assign, n*sizeof(int));
+
+done_collect:
+       /*
+        * Commit the collect_events() state. See x86_pmu_del() and
+        * x86_pmu_*_txn().
+        */
+       cpuc->n_events = n;
+       cpuc->n_added += n - n0;
+       cpuc->n_txn += n - n0;
+
+       ret = 0;
+out:
+       return ret;
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx = event->hw.idx;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               x86_perf_event_set_period(event);
+       }
+
+       event->hw.state = 0;
+
+       cpuc->events[idx] = event;
+       __set_bit(idx, cpuc->active_mask);
+       __set_bit(idx, cpuc->running);
+       x86_pmu.enable(event);
+       perf_event_update_userpage(event);
+}
+
+void perf_event_print_debug(void)
+{
+       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+       u64 pebs, debugctl;
+       struct cpu_hw_events *cpuc;
+       unsigned long flags;
+       int cpu, idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       if (x86_pmu.version >= 2) {
+               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+               pr_info("\n");
+               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+               if (x86_pmu.pebs_constraints) {
+                       rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+                       pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+               }
+               if (x86_pmu.lbr_nr) {
+                       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+                       pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
+               }
+       }
+       pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
+               rdmsrl(x86_pmu_event_addr(idx), pmc_count);
+
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+                       cpu, idx, pmc_ctrl);
+               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+                       cpu, idx, prev_left);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+       }
+       local_irq_restore(flags);
+}
+
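+/*
+ * Stop a counter: disable it in hardware and, if requested, drain the
+ * remaining count delta into the generic event.
+ */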
+void x86_pmu_stop(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+               x86_pmu.disable(event);
+               cpuc->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of an event
+                * that we are disabling:
+                */
+               x86_perf_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static void x86_pmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int i;
+
+       /*
+        * event is descheduled
+        */
+       event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
+
+       /*
+        * If we're called during a txn, we don't need to do anything.
+        * The events never got scheduled and ->cancel_txn will truncate
+        * the event_list.
+        *
+        * XXX assumes any ->del() called during a TXN will only be on
+        * an event added during that same TXN.
+        */
+       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+               return;
+
+       /*
+        * Not a TXN, therefore cleanup properly.
+        */
+       x86_pmu_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < cpuc->n_events; i++) {
+               if (event == cpuc->event_list[i])
+                       break;
+       }
+
+       if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+               return;
+
+       /* If we have a newly added event; make sure to decrease n_added. */
+       if (i >= cpuc->n_events - cpuc->n_added)
+               --cpuc->n_added;
+
+       if (x86_pmu.put_event_constraints)
+               x86_pmu.put_event_constraints(cpuc, event);
+
+       /* Delete the array entry. */
+       while (++i < cpuc->n_events) {
+               cpuc->event_list[i-1] = cpuc->event_list[i];
+               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+       }
+       --cpuc->n_events;
+
+       perf_event_update_userpage(event);
+}
+
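+/*
+ * Generic PMI handler: check every active counter for overflow, re-arm its
+ * sampling period and hand the sample to the core perf code.
+ */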
+int x86_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       int idx, handled = 0;
+       u64 val;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * Some chipsets need to unmask the LVTPC in a particular spot
+        * inside the nmi handler.  As a result, the unmasking was pushed
+        * into all the nmi handlers.
+        *
+        * This generic handler doesn't seem to have any issues where the
+        * unmasking occurs, so it was left at the top.
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask)) {
+                       /*
+                        * Though we deactivated the counter, some CPUs might
+                        * still deliver spurious interrupts that are still
+                        * in flight. Catch them:
+                        */
+                       if (__test_and_clear_bit(idx, cpuc->running))
+                               handled++;
+                       continue;
+               }
+
+               event = cpuc->events[idx];
+
+               val = x86_perf_event_update(event);
+               if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+                       continue;
+
+               /*
+                * event overflow
+                */
+               handled++;
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (!x86_perf_event_set_period(event))
+                       continue;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+
+void perf_events_lapic_init(void)
+{
+       if (!x86_pmu.apic || !x86_pmu_initialized())
+               return;
+
+       /*
+        * Always use NMI for PMU
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+       u64 start_clock;
+       u64 finish_clock;
+       int ret;
+
+       /*
+        * All PMUs/events that share this PMI handler should make sure to
+        * increment active_events for their events.
+        */
+       if (!atomic_read(&active_events))
+               return NMI_DONE;
+
+       start_clock = sched_clock();
+       ret = x86_pmu.handle_irq(regs);
+       finish_clock = sched_clock();
+
+       perf_sample_event_took(finish_clock - start_clock);
+
+       return ret;
+}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
+
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
+
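+/*
+ * CPU hotplug notifier: forward the prepare/starting/online/dying/dead
+ * transitions to the model-specific x86_pmu callbacks and release any
+ * memory queued on kfree_on_online.
+ */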
+static int
+x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       int i, ret = NOTIFY_OK;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+                       cpuc->kfree_on_online[i] = NULL;
+               if (x86_pmu.cpu_prepare)
+                       ret = x86_pmu.cpu_prepare(cpu);
+               break;
+
+       case CPU_STARTING:
+               if (x86_pmu.cpu_starting)
+                       x86_pmu.cpu_starting(cpu);
+               break;
+
+       case CPU_ONLINE:
+               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+                       kfree(cpuc->kfree_on_online[i]);
+                       cpuc->kfree_on_online[i] = NULL;
+               }
+               break;
+
+       case CPU_DYING:
+               if (x86_pmu.cpu_dying)
+                       x86_pmu.cpu_dying(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               if (x86_pmu.cpu_dead)
+                       x86_pmu.cpu_dead(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return ret;
+}
+
+static void __init pmu_check_apic(void)
+{
+       if (cpu_has_apic)
+               return;
+
+       x86_pmu.apic = 0;
+       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+       pr_info("no hardware sampling interrupt available.\n");
+
+       /*
+        * If we have a PMU initialized but no APIC
+        * interrupts, we cannot sample hardware
+        * events (user-space has to fall back and
+        * sample via a hrtimer based software event):
+        */
+       pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+}
+
+static struct attribute_group x86_pmu_format_group = {
+       .name = "format",
+       .attrs = NULL,
+};
+
+/*
+ * Remove all undefined events (x86_pmu.event_map(id) == 0)
+ * out of events_attr attributes.
+ */
+static void __init filter_events(struct attribute **attrs)
+{
+       struct device_attribute *d;
+       struct perf_pmu_events_attr *pmu_attr;
+       int offset = 0;
+       int i, j;
+
+       for (i = 0; attrs[i]; i++) {
+               d = (struct device_attribute *)attrs[i];
+               pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
+               /* str trumps id */
+               if (pmu_attr->event_str)
+                       continue;
+               if (x86_pmu.event_map(i + offset))
+                       continue;
+
+               for (j = i; attrs[j]; j++)
+                       attrs[j] = attrs[j + 1];
+
+               /* Check the shifted attr. */
+               i--;
+
+               /*
+                * event_map() is index based, and the attrs array is organized
+                * by increasing event index. If we shift the events, then
+                * we need to compensate in the event_map() lookup; otherwise
+                * we are looking up the wrong event in the map.
+                */
+               offset++;
+       }
+}
+
+/* Merge two pointer arrays */
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+{
+       struct attribute **new;
+       int j, i;
+
+       for (j = 0; a[j]; j++)
+               ;
+       for (i = 0; b[i]; i++)
+               j++;
+       j++;
+
+       new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
+       if (!new)
+               return NULL;
+
+       j = 0;
+       for (i = 0; a[i]; i++)
+               new[j++] = a[i];
+       for (i = 0; b[i]; i++)
+               new[j++] = b[i];
+       new[j] = NULL;
+
+       return new;
+}
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+                         char *page)
+{
+       struct perf_pmu_events_attr *pmu_attr = \
+               container_of(attr, struct perf_pmu_events_attr, attr);
+       u64 config = x86_pmu.event_map(pmu_attr->id);
+
+       /* string trumps id */
+       if (pmu_attr->event_str)
+               return sprintf(page, "%s", pmu_attr->event_str);
+
+       return x86_pmu.events_sysfs_show(page, config);
+}
+
+EVENT_ATTR(cpu-cycles,                 CPU_CYCLES              );
+EVENT_ATTR(instructions,               INSTRUCTIONS            );
+EVENT_ATTR(cache-references,           CACHE_REFERENCES        );
+EVENT_ATTR(cache-misses,               CACHE_MISSES            );
+EVENT_ATTR(branch-instructions,                BRANCH_INSTRUCTIONS     );
+EVENT_ATTR(branch-misses,              BRANCH_MISSES           );
+EVENT_ATTR(bus-cycles,                 BUS_CYCLES              );
+EVENT_ATTR(stalled-cycles-frontend,    STALLED_CYCLES_FRONTEND );
+EVENT_ATTR(stalled-cycles-backend,     STALLED_CYCLES_BACKEND  );
+EVENT_ATTR(ref-cycles,                 REF_CPU_CYCLES          );
+
+static struct attribute *empty_attrs;
+
+static struct attribute *events_attr[] = {
+       EVENT_PTR(CPU_CYCLES),
+       EVENT_PTR(INSTRUCTIONS),
+       EVENT_PTR(CACHE_REFERENCES),
+       EVENT_PTR(CACHE_MISSES),
+       EVENT_PTR(BRANCH_INSTRUCTIONS),
+       EVENT_PTR(BRANCH_MISSES),
+       EVENT_PTR(BUS_CYCLES),
+       EVENT_PTR(STALLED_CYCLES_FRONTEND),
+       EVENT_PTR(STALLED_CYCLES_BACKEND),
+       EVENT_PTR(REF_CPU_CYCLES),
+       NULL,
+};
+
+static struct attribute_group x86_pmu_events_group = {
+       .name = "events",
+       .attrs = events_attr,
+};
+
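+/*
+ * Illustrative example: for a config with event select 0xc2 and umask 0x01,
+ * the buffer ends up as "event=0xc2,umask=0x01\n"; edge, pc, any, inv and
+ * cmask are only appended when the corresponding bits are set.
+ */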
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
+{
+       u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+       u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+       bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
+       bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
+       bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
+       bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
+       ssize_t ret;
+
+       /*
+        * We have a whole page to spend and only a little data to
+        * write, so we can safely use sprintf().
+        */
+       ret = sprintf(page, "event=0x%02llx", event);
+
+       if (umask)
+               ret += sprintf(page + ret, ",umask=0x%02llx", umask);
+
+       if (edge)
+               ret += sprintf(page + ret, ",edge");
+
+       if (pc)
+               ret += sprintf(page + ret, ",pc");
+
+       if (any)
+               ret += sprintf(page + ret, ",any");
+
+       if (inv)
+               ret += sprintf(page + ret, ",inv");
+
+       if (cmask)
+               ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
+
+       ret += sprintf(page + ret, "\n");
+
+       return ret;
+}
+
+static int __init init_hw_perf_events(void)
+{
+       struct x86_pmu_quirk *quirk;
+       int err;
+
+       pr_info("Performance Events: ");
+
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_INTEL:
+               err = intel_pmu_init();
+               break;
+       case X86_VENDOR_AMD:
+               err = amd_pmu_init();
+               break;
+       default:
+               err = -ENOTSUPP;
+       }
+       if (err != 0) {
+               pr_cont("no PMU driver, software events only.\n");
+               return 0;
+       }
+
+       pmu_check_apic();
+
+       /* sanity check that the hardware exists or is emulated */
+       if (!check_hw_exists())
+               return 0;
+
+       pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
+       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+               quirk->func();
+
+       if (!x86_pmu.intel_ctrl)
+               x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+       perf_events_lapic_init();
+       register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
+
+       unconstrained = (struct event_constraint)
+               __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+                                  0, x86_pmu.num_counters, 0, 0);
+
+       x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+
+       if (x86_pmu.event_attrs)
+               x86_pmu_events_group.attrs = x86_pmu.event_attrs;
+
+       if (!x86_pmu.events_sysfs_show)
+               x86_pmu_events_group.attrs = &empty_attrs;
+       else
+               filter_events(x86_pmu_events_group.attrs);
+
+       if (x86_pmu.cpu_events) {
+               struct attribute **tmp;
+
+               tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
+               if (!WARN_ON(!tmp))
+                       x86_pmu_events_group.attrs = tmp;
+       }
+
+       pr_info("... version:                %d\n",     x86_pmu.version);
+       pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
+       pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
+       pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
+       pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
+       pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
+       pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
+
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+       perf_cpu_notifier(x86_pmu_notifier);
+
+       return 0;
+}
+early_initcall(init_hw_perf_events);
+
+static inline void x86_pmu_read(struct perf_event *event)
+{
+       x86_perf_event_update(event);
+}
+
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test; it will be performed at commit time.
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
+ */
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
+
+       cpuc->txn_flags = txn_flags;
+       if (txn_flags & ~PERF_PMU_TXN_ADD)
+               return;
+
+       perf_pmu_disable(pmu);
+       __this_cpu_write(cpu_hw_events.n_txn, 0);
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(struct pmu *pmu)
+{
+       unsigned int txn_flags;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+       txn_flags = cpuc->txn_flags;
+       cpuc->txn_flags = 0;
+       if (txn_flags & ~PERF_PMU_TXN_ADD)
+               return;
+
+       /*
+        * Truncate collected array by the number of events added in this
+        * transaction. See x86_pmu_add() and x86_pmu_*_txn().
+        */
+       __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+       __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
+       perf_pmu_enable(pmu);
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 on success.
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
+ */
+static int x86_pmu_commit_txn(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int assign[X86_PMC_IDX_MAX];
+       int n, ret;
+
+       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+       if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+               cpuc->txn_flags = 0;
+               return 0;
+       }
+
+       n = cpuc->n_events;
+
+       if (!x86_pmu_initialized())
+               return -EAGAIN;
+
+       ret = x86_pmu.schedule_events(cpuc, n, assign);
+       if (ret)
+               return ret;
+
+       /*
+        * Copy the new assignment now that we know it is schedulable;
+        * it will be used by hw_perf_enable().
+        */
+       memcpy(cpuc->assign, assign, n*sizeof(int));
+
+       cpuc->txn_flags = 0;
+       perf_pmu_enable(pmu);
+       return 0;
+}
+
+/*
+ * A fake_cpuc is used to validate event groups. Because of the
+ * extra-reg logic we also need to allocate fake per-core and per-CPU
+ * structures. Otherwise, group events using extra regs may conflict
+ * without the kernel being able to catch this when the last event
+ * gets added to the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+       kfree(cpuc->shared_regs);
+       kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+       struct cpu_hw_events *cpuc;
+       int cpu = raw_smp_processor_id();
+
+       cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+       if (!cpuc)
+               return ERR_PTR(-ENOMEM);
+
+       /* only needed if we have extra_regs */
+       if (x86_pmu.extra_regs) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       goto error;
+       }
+       cpuc->is_fake = 1;
+       return cpuc;
+error:
+       free_fake_cpuc(cpuc);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * validate that we can schedule this event
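+ * (a fake cpuc is used so the live scheduling state is left untouched)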
+ */
+static int validate_event(struct perf_event *event)
+{
+       struct cpu_hw_events *fake_cpuc;
+       struct event_constraint *c;
+       int ret = 0;
+
+       fake_cpuc = allocate_fake_cpuc();
+       if (IS_ERR(fake_cpuc))
+               return PTR_ERR(fake_cpuc);
+
+       c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
+
+       if (!c || !c->weight)
+               ret = -EINVAL;
+
+       if (x86_pmu.put_event_constraints)
+               x86_pmu.put_event_constraints(fake_cpuc, event);
+
+       free_fake_cpuc(fake_cpuc);
+
+       return ret;
+}
+
+/*
+ * validate a single event group
+ *
+ * validation includes:
+ *     - check events are compatible with each other
+ *     - events do not compete for the same counter
+ *     - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int validate_group(struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+       struct cpu_hw_events *fake_cpuc;
+       int ret = -EINVAL, n;
+
+       fake_cpuc = allocate_fake_cpuc();
+       if (IS_ERR(fake_cpuc))
+               return PTR_ERR(fake_cpuc);
+       /*
+        * The event is not yet connected with its siblings, therefore
+        * we must first collect the existing siblings and then add the
+        * new event before we can simulate the scheduling.
+        */
+       n = collect_events(fake_cpuc, leader, true);
+       if (n < 0)
+               goto out;
+
+       fake_cpuc->n_events = n;
+       n = collect_events(fake_cpuc, event, false);
+       if (n < 0)
+               goto out;
+
+       fake_cpuc->n_events = n;
+
+       ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
+
+out:
+       free_fake_cpuc(fake_cpuc);
+       return ret;
+}
+
+static int x86_pmu_event_init(struct perf_event *event)
+{
+       struct pmu *tmp;
+       int err;
+
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
+       if (!err) {
+               /*
+                * we temporarily connect event to its pmu
+                * such that validate_group() can classify
+                * it as an x86 event using is_x86_event()
+                */
+               tmp = event->pmu;
+               event->pmu = &pmu;
+
+               if (event->group_leader != event)
+                       err = validate_group(event);
+               else
+                       err = validate_event(event);
+
+               event->pmu = tmp;
+       }
+       if (err) {
+               if (event->destroy)
+                       event->destroy(event);
+       }
+
+       if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
+       return err;
+}
+
+static void refresh_pce(void *ignored)
+{
+       if (current->mm)
+               load_mm_cr4(current->mm);
+}
+
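+/*
+ * These callbacks run when a perf event's user page is mmap()ed/unmapped;
+ * they keep a per-mm count of RDPMC users, so the first mapping turns
+ * CR4.PCE on for all CPUs running this mm and the last unmapping turns it
+ * off again (via refresh_pce()).
+ */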
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return;
+
+       if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+       if (!current->mm)
+               return;
+
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return;
+
+       if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+       int idx = event->hw.idx;
+
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return 0;
+
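+       /* RDPMC selects fixed-function counters via bit 30 of the counter index. */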
+       if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
+               idx -= INTEL_PMC_IDX_FIXED;
+               idx |= 1 << 30;
+       }
+
+       return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
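+/*
+ * rdpmc values (assumed semantics): 0 - user-space RDPMC disabled,
+ * 1 - allowed for tasks that have the event's user page mapped (default),
+ * 2 - always allowed, i.e. perf-event-bypassing mode.
+ */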
+static ssize_t set_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t count)
+{
+       unsigned long val;
+       ssize_t ret;
+
+       ret = kstrtoul(buf, 0, &val);
+       if (ret)
+               return ret;
+
+       if (val > 2)
+               return -EINVAL;
+
+       if (x86_pmu.attr_rdpmc_broken)
+               return -ENOTSUPP;
+
+       if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+               /*
+                * Changing into or out of always available, aka
+                * perf-event-bypassing mode.  This path is extremely slow,
+                * but only root can trigger it, so it's okay.
+                */
+               if (val == 2)
+                       static_key_slow_inc(&rdpmc_always_available);
+               else
+                       static_key_slow_dec(&rdpmc_always_available);
+               on_each_cpu(refresh_pce, NULL, 1);
+       }
+
+       x86_pmu.attr_rdpmc = val;
+
+       return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+       &dev_attr_rdpmc.attr,
+       NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+       .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+       &x86_pmu_attr_group,
+       &x86_pmu_format_group,
+       &x86_pmu_events_group,
+       NULL,
+};
+
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (x86_pmu.sched_task)
+               x86_pmu.sched_task(ctx, sched_in);
+}
+
+void perf_check_microcode(void)
+{
+       if (x86_pmu.check_microcode)
+               x86_pmu.check_microcode();
+}
+EXPORT_SYMBOL_GPL(perf_check_microcode);
+
+static struct pmu pmu = {
+       .pmu_enable             = x86_pmu_enable,
+       .pmu_disable            = x86_pmu_disable,
+
+       .attr_groups            = x86_pmu_attr_groups,
+
+       .event_init             = x86_pmu_event_init,
+
+       .event_mapped           = x86_pmu_event_mapped,
+       .event_unmapped         = x86_pmu_event_unmapped,
+
+       .add                    = x86_pmu_add,
+       .del                    = x86_pmu_del,
+       .start                  = x86_pmu_start,
+       .stop                   = x86_pmu_stop,
+       .read                   = x86_pmu_read,
+
+       .start_txn              = x86_pmu_start_txn,
+       .cancel_txn             = x86_pmu_cancel_txn,
+       .commit_txn             = x86_pmu_commit_txn,
+
+       .event_idx              = x86_pmu_event_idx,
+       .sched_task             = x86_pmu_sched_task,
+       .task_ctx_size          = sizeof(struct x86_perf_task_context),
+};
+
+void arch_perf_update_userpage(struct perf_event *event,
+                              struct perf_event_mmap_page *userpg, u64 now)
+{
+       struct cyc2ns_data *data;
+
+       userpg->cap_user_time = 0;
+       userpg->cap_user_time_zero = 0;
+       userpg->cap_user_rdpmc =
+               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+       userpg->pmc_width = x86_pmu.cntval_bits;
+
+       if (!sched_clock_stable())
+               return;
+
+       data = cyc2ns_read_begin();
+
+       /*
+        * Internal timekeeping for enabled/running/stopped times
+        * is always in the local_clock domain.
+        */
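+       /*
+        * User space is expected to convert counter deltas to ns roughly as
+        * (cyc * time_mult) >> time_shift, offset by time_offset; see the
+        * perf_event_mmap_page documentation for the exact algorithm.
+        */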
+       userpg->cap_user_time = 1;
+       userpg->time_mult = data->cyc2ns_mul;
+       userpg->time_shift = data->cyc2ns_shift;
+       userpg->time_offset = data->cyc2ns_offset - now;
+
+       /*
+        * cap_user_time_zero doesn't make sense when we're using a different
+        * time base for the records.
+        */
+       if (event->clock == &local_clock) {
+               userpg->cap_user_time_zero = 1;
+               userpg->time_zero = data->cyc2ns_offset;
+       }
+
+       cyc2ns_read_end(data);
+}
+
+/*
+ * callchain support
+ */
+
+static int backtrace_stack(void *data, char *name)
+{
+       return 0;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+       struct perf_callchain_entry *entry = data;
+
+       perf_callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+       .stack                  = backtrace_stack,
+       .address                = backtrace_address,
+       .walk_stack             = print_context_stack_bp,
+};
+
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest OS callchains yet */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
+
+       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+       return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
+static unsigned long get_segment_base(unsigned int segment)
+{
+       struct desc_struct *desc;
+       int idx = segment >> 3;
+
+       if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+               struct ldt_struct *ldt;
+
+               if (idx >= LDT_ENTRIES)
+                       return 0;
+
+               /* IRQs are off, so this synchronizes with smp_store_release */
+               ldt = lockless_dereference(current->active_mm->context.ldt);
+               if (!ldt || idx >= ldt->size)
+                       return 0;
+
+               desc = &ldt->entries[idx];
+#else
+               return 0;
+#endif
+       } else {
+               if (idx >= GDT_ENTRIES)
+                       return 0;
+
+               desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+       }
+
+       return get_desc_base(desc);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <asm/compat.h>
+
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       /* 32-bit process in 64-bit kernel. */
+       unsigned long ss_base, cs_base;
+       struct stack_frame_ia32 frame;
+       const void __user *fp;
+
+       if (!test_thread_flag(TIF_IA32))
+               return 0;
+
+       cs_base = get_segment_base(regs->cs);
+       ss_base = get_segment_base(regs->ss);
+
+       fp = compat_ptr(ss_base + regs->bp);
+       pagefault_disable();
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               unsigned long bytes;
+               frame.next_frame     = 0;
+               frame.return_address = 0;
+
+               if (!access_ok(VERIFY_READ, fp, 8))
+                       break;
+
+               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+               if (bytes != 0)
+                       break;
+               bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
+               if (bytes != 0)
+                       break;
+
+               if (!valid_user_frame(fp, sizeof(frame)))
+                       break;
+
+               perf_callchain_store(entry, cs_base + frame.return_address);
+               fp = compat_ptr(ss_base + frame.next_frame);
+       }
+       pagefault_enable();
+       return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+    return 0;
+}
+#endif
+
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+       struct stack_frame frame;
+       const void __user *fp;
+
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest OS callchains yet */
+               return;
+       }
+
+       /*
+        * We don't know what to do with VM86 stacks... ignore them for now.
+        */
+       if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+               return;
+
+       fp = (void __user *)regs->bp;
+
+       perf_callchain_store(entry, regs->ip);
+
+       if (!current->mm)
+               return;
+
+       if (perf_callchain_user32(regs, entry))
+               return;
+
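+       /*
+        * Walk the user stack via frame pointers: each frame is assumed to
+        * hold (next frame pointer, return address), see struct stack_frame.
+        */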
+       pagefault_disable();
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               unsigned long bytes;
+               frame.next_frame             = NULL;
+               frame.return_address = 0;
+
+               if (!access_ok(VERIFY_READ, fp, 16))
+                       break;
+
+               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+               if (bytes != 0)
+                       break;
+               bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
+               if (bytes != 0)
+                       break;
+
+               if (!valid_user_frame(fp, sizeof(frame)))
+                       break;
+
+               perf_callchain_store(entry, frame.return_address);
+               fp = (void __user *)frame.next_frame;
+       }
+       pagefault_enable();
+}
+
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ *   VM86 - the good olde 16 bit days, where the linear address is
+ *          20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ *          to figure out what the 32bit base address is.
+ *
+ *    X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
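+/*
+ * Worked VM86 example (illustrative): cs=0x1234, ip=0x0010 gives the linear
+ * address 0x1234 * 0x10 + 0x0010 = 0x12350.
+ */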
+static unsigned long code_segment_base(struct pt_regs *regs)
+{
+       /*
+        * For IA32 we look at the GDT/LDT segment base to convert the
+        * effective IP to a linear address.
+        */
+
+#ifdef CONFIG_X86_32
+       /*
+        * If we are in VM86 mode, add the segment offset to convert to a
+        * linear address.
+        */
+       if (regs->flags & X86_VM_MASK)
+               return 0x10 * regs->cs;
+
+       if (user_mode(regs) && regs->cs != __USER_CS)
+               return get_segment_base(regs->cs);
+#else
+       if (user_mode(regs) && !user_64bit_mode(regs) &&
+           regs->cs != __USER32_CS)
+               return get_segment_base(regs->cs);
+#endif
+       return 0;
+}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+               return perf_guest_cbs->get_guest_ip();
+
+       return regs->ip + code_segment_base(regs);
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+       int misc = 0;
+
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               if (perf_guest_cbs->is_user_mode())
+                       misc |= PERF_RECORD_MISC_GUEST_USER;
+               else
+                       misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+       } else {
+               if (user_mode(regs))
+                       misc |= PERF_RECORD_MISC_USER;
+               else
+                       misc |= PERF_RECORD_MISC_KERNEL;
+       }
+
+       if (regs->flags & PERF_EFLAGS_EXACT)
+               misc |= PERF_RECORD_MISC_EXACT_IP;
+
+       return misc;
+}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+       cap->version            = x86_pmu.version;
+       cap->num_counters_gp    = x86_pmu.num_counters;
+       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+       cap->bit_width_gp       = x86_pmu.cntval_bits;
+       cap->bit_width_fixed    = x86_pmu.cntval_bits;
+       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
+       cap->events_mask_len    = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
new file mode 100644 (file)
index 0000000..b99dc92
--- /dev/null
@@ -0,0 +1,544 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "../perf_event.h"
+
+struct bts_ctx {
+       struct perf_output_handle       handle;
+       struct debug_store              ds_back;
+       int                             started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE                24
+#define BTS_SAFETY_MARGIN      4080
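+/*
+ * A BTS record is three 8-byte fields (branch from, branch to, flags), so the
+ * safety margin corresponds to 170 records worth of space.
+ */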
+
+struct bts_phys {
+       struct page     *page;
+       unsigned long   size;
+       unsigned long   offset;
+       unsigned long   displacement;
+};
+
+struct bts_buffer {
+       size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
+       unsigned int    nr_pages;
+       unsigned int    nr_bufs;
+       unsigned int    cur_buf;
+       bool            snapshot;
+       local_t         data_size;
+       local_t         lost;
+       local_t         head;
+       unsigned long   end;
+       void            **data_pages;
+       struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+static size_t buf_size(struct page *page)
+{
+       return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+       struct bts_buffer *buf;
+       struct page *page;
+       int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+       unsigned long offset;
+       size_t size = nr_pages << PAGE_SHIFT;
+       int pg, nbuf, pad;
+
+       /* count all the high order buffers */
+       for (pg = 0, nbuf = 0; pg < nr_pages;) {
+               page = virt_to_page(pages[pg]);
+               if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+                       return NULL;
+               pg += 1 << page_private(page);
+               nbuf++;
+       }
+
+       /*
+        * To avoid interrupts in overwrite mode, only allow one physical buffer.
+        */
+       if (overwrite && nbuf > 1)
+               return NULL;
+
+       buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->nr_pages = nr_pages;
+       buf->nr_bufs = nbuf;
+       buf->snapshot = overwrite;
+       buf->data_pages = pages;
+       buf->real_size = size - size % BTS_RECORD_SIZE;
+
+       for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+               unsigned int __nr_pages;
+
+               page = virt_to_page(pages[pg]);
+               __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+               buf->buf[nbuf].page = page;
+               buf->buf[nbuf].offset = offset;
+               buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+               buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+               pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+               buf->buf[nbuf].size -= pad;
+
+               pg += __nr_pages;
+               offset += __nr_pages << PAGE_SHIFT;
+       }
+
+       return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+       kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+       return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+       int cpu = raw_smp_processor_id();
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       struct bts_phys *phys = &buf->buf[buf->cur_buf];
+       unsigned long index, thresh = 0, end = phys->size;
+       struct page *page = phys->page;
+
+       index = local_read(&buf->head);
+
+       if (!buf->snapshot) {
+               if (buf->end < phys->offset + buf_size(page))
+                       end = buf->end - phys->offset - phys->displacement;
+
+               index -= phys->offset + phys->displacement;
+
+               if (end - index > BTS_SAFETY_MARGIN)
+                       thresh = end - BTS_SAFETY_MARGIN;
+               else if (end - index > BTS_RECORD_SIZE)
+                       thresh = end - BTS_RECORD_SIZE;
+               else
+                       thresh = end;
+       }
+
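+       /* In snapshot mode the threshold is placed past the maximum so that no PMI is generated. */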
+       ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+       ds->bts_index = ds->bts_buffer_base + index;
+       ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+       ds->bts_interrupt_threshold = !buf->snapshot
+               ? ds->bts_buffer_base + thresh
+               : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+       unsigned long index = head - phys->offset;
+
+       memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+       if (buf->snapshot)
+               return false;
+
+       if (local_read(&buf->data_size) >= bts->handle.size ||
+           bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+               return true;
+
+       return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+       int cpu = raw_smp_processor_id();
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+       if (!buf)
+               return;
+
+       head = index + bts_buffer_offset(buf, buf->cur_buf);
+       old = local_xchg(&buf->head, head);
+
+       if (!buf->snapshot) {
+               if (old == head)
+                       return;
+
+               if (ds->bts_index >= ds->bts_absolute_maximum)
+                       local_inc(&buf->lost);
+
+               /*
+                * old and head are always in the same physical buffer, so we
+                * can subtract them to get the data size.
+                */
+               local_add(head - old, &buf->data_size);
+       } else {
+               local_set(&buf->data_size, head);
+       }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       u64 config = 0;
+
+       if (!buf || bts_buffer_is_full(buf, bts))
+               return;
+
+       event->hw.itrace_started = 1;
+       event->hw.state = 0;
+
+       if (!buf->snapshot)
+               config |= ARCH_PERFMON_EVENTSEL_INT;
+       if (!event->attr.exclude_kernel)
+               config |= ARCH_PERFMON_EVENTSEL_OS;
+       if (!event->attr.exclude_user)
+               config |= ARCH_PERFMON_EVENTSEL_USR;
+
+       bts_config_buffer(buf);
+
+       /*
+        * Local barrier to make sure that the DS configuration is visible
+        * before we enable BTS.
+        */
+       wmb();
+
+       intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       __bts_event_start(event);
+
+       /* PMI handler: this counter is running and likely generating PMIs */
+       ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+       /*
+        * No extra synchronization is mandated by the documentation to have
+        * BTS data stores globally visible.
+        */
+       intel_pmu_disable_bts();
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
+       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       /* PMI handler: don't restart this counter */
+       ACCESS_ONCE(bts->started) = 0;
+
+       __bts_event_stop(event);
+
+       if (flags & PERF_EF_UPDATE)
+               bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       if (bts->handle.event && bts->started)
+               __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       if (bts->handle.event)
+               __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+       unsigned long head, space, next_space, pad, gap, skip, wakeup;
+       unsigned int next_buf;
+       struct bts_phys *phys, *next_phys;
+       int ret;
+
+       if (buf->snapshot)
+               return 0;
+
+       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+       if (WARN_ON_ONCE(head != local_read(&buf->head)))
+               return -EINVAL;
+
+       phys = &buf->buf[buf->cur_buf];
+       space = phys->offset + phys->displacement + phys->size - head;
+       pad = space;
+       if (space > handle->size) {
+               space = handle->size;
+               space -= space % BTS_RECORD_SIZE;
+       }
+       if (space <= BTS_SAFETY_MARGIN) {
+               /* See if next phys buffer has more space */
+               next_buf = buf->cur_buf + 1;
+               if (next_buf >= buf->nr_bufs)
+                       next_buf = 0;
+               next_phys = &buf->buf[next_buf];
+               gap = buf_size(phys->page) - phys->displacement - phys->size +
+                     next_phys->displacement;
+               skip = pad + gap;
+               if (handle->size >= skip) {
+                       next_space = next_phys->size;
+                       if (next_space + skip > handle->size) {
+                               next_space = handle->size - skip;
+                               next_space -= next_space % BTS_RECORD_SIZE;
+                       }
+                       if (next_space > space || !space) {
+                               if (pad)
+                                       bts_buffer_pad_out(phys, head);
+                               ret = perf_aux_output_skip(handle, skip);
+                               if (ret)
+                                       return ret;
+                               /* Advance to next phys buffer */
+                               phys = next_phys;
+                               space = next_space;
+                               head = phys->offset + phys->displacement;
+                               /*
+                                * After this, cur_buf and head won't match ds
+                                * anymore, so we must not be racing with
+                                * bts_update().
+                                */
+                               buf->cur_buf = next_buf;
+                               local_set(&buf->head, head);
+                       }
+               }
+       }
+
+       /* Don't go far beyond wakeup watermark */
+       wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+                handle->head;
+       if (space > wakeup) {
+               space = wakeup;
+               space -= space % BTS_RECORD_SIZE;
+       }
+
+       buf->end = head + space;
+
+       /*
+        * If we have no space, the lost notification would have been sent when
+        * we hit absolute_maximum - see bts_update()
+        */
+       if (!space)
+               return -ENOSPC;
+
+       return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct perf_event *event = bts->handle.event;
+       struct bts_buffer *buf;
+       s64 old_head;
+       int err;
+
+       if (!event || !bts->started)
+               return 0;
+
+       buf = perf_get_aux(&bts->handle);
+       /*
+        * Skip snapshot counters: they don't use the interrupt, but
+        * there's no other way of telling, because the pointer will
+        * keep moving
+        */
+       if (!buf || buf->snapshot)
+               return 0;
+
+       old_head = local_read(&buf->head);
+       bts_update(bts);
+
+       /* no new data */
+       if (old_head == local_read(&buf->head))
+               return 0;
+
+       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+                           !!local_xchg(&buf->lost, 0));
+
+       buf = perf_aux_output_begin(&bts->handle, event);
+       if (!buf)
+               return 1;
+
+       err = bts_buffer_reset(buf, &bts->handle);
+       if (err)
+               perf_aux_output_end(&bts->handle, 0, false);
+
+       return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+       bts_event_stop(event, PERF_EF_UPDATE);
+
+       if (buf) {
+               if (buf->snapshot)
+                       bts->handle.head =
+                               local_xchg(&buf->data_size,
+                                          buf->nr_pages << PAGE_SHIFT);
+               perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+                                   !!local_xchg(&buf->lost, 0));
+       }
+
+       cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+       cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+       cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+       cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+       struct bts_buffer *buf;
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = -EBUSY;
+
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               return -EBUSY;
+
+       if (bts->handle.event)
+               return -EBUSY;
+
+       buf = perf_aux_output_begin(&bts->handle, event);
+       if (!buf)
+               return -EINVAL;
+
+       ret = bts_buffer_reset(buf, &bts->handle);
+       if (ret) {
+               perf_aux_output_end(&bts->handle, 0, false);
+               return ret;
+       }
+
+       bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+       bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+       bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+       if (mode & PERF_EF_START) {
+               bts_event_start(event, 0);
+               if (hwc->state & PERF_HES_STOPPED) {
+                       bts_event_del(event, 0);
+                       return -EBUSY;
+               }
+       }
+
+       return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+       x86_release_hardware();
+       x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+       int ret;
+
+       if (event->attr.type != bts_pmu.type)
+               return -ENOENT;
+
+       if (x86_add_exclusive(x86_lbr_exclusive_bts))
+               return -EBUSY;
+
+       /*
+        * BTS leaks kernel addresses even when CPL0 tracing is
+        * disabled, so disallow intel_bts driver for unprivileged
+        * users on paranoid systems since it provides trace data
+        * to the user in a zero-copy fashion.
+        *
+        * Note that the default paranoia setting permits unprivileged
+        * users to profile the kernel.
+        */
+       if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+           !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       ret = x86_reserve_hardware();
+       if (ret) {
+               x86_del_exclusive(x86_lbr_exclusive_bts);
+               return ret;
+       }
+
+       event->destroy = bts_event_destroy;
+
+       return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+               return -ENODEV;
+
+       bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+       bts_pmu.task_ctx_nr     = perf_sw_context;
+       bts_pmu.event_init      = bts_event_init;
+       bts_pmu.add             = bts_event_add;
+       bts_pmu.del             = bts_event_del;
+       bts_pmu.start           = bts_event_start;
+       bts_pmu.stop            = bts_event_stop;
+       bts_pmu.read            = bts_event_read;
+       bts_pmu.setup_aux       = bts_buffer_setup_aux;
+       bts_pmu.free_aux        = bts_buffer_free_aux;
+
+       return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+arch_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
new file mode 100644 (file)
index 0000000..68fa55b
--- /dev/null
@@ -0,0 +1,3796 @@
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+       [PERF_COUNT_HW_CPU_CYCLES]              = 0x003c,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x00c0,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4f2e,
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x412e,
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x00c4,
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x00c5,
+       [PERF_COUNT_HW_BUS_CYCLES]              = 0x013c,
+       [PERF_COUNT_HW_REF_CPU_CYCLES]          = 0x0300, /* pseudo-encoding */
+};
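+/*
+ * These are raw (event select | umask << 8) encodings, e.g. 0x003c is event
+ * 0x3c with umask 0 (unhalted core cycles).
+ */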
+
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
+{
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+       INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+       INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+       INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+       INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+       INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+       INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+       INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+       INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+       INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+       INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+       EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+       INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+       INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
+       INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+       EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
+{
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_skl_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+       INTEL_UEVENT_EXTRA_REG(0x01b7,
+                              MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x02b7,
+                              MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       /*
+        * Note: the low 8 bits of the eventsel code are not a contiguous
+        * field; they contain some bits that would #GP and are therefore
+        * masked out.
+        */
+       INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+       EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
+
+struct attribute *nhm_events_attrs[] = {
+       EVENT_PTR(mem_ld_nhm),
+       NULL,
+};
+
+struct attribute *snb_events_attrs[] = {
+       EVENT_PTR(mem_ld_snb),
+       EVENT_PTR(mem_st_snb),
+       NULL,
+};
+
+static struct event_constraint intel_hsw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+       /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+       /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+       /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),        /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+       EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+       return intel_perfmon_event_map[hw_event];
+}
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
+#define SKL_DEMAND_RFO                 BIT_ULL(1)
+#define SKL_ANY_RESPONSE               BIT_ULL(16)
+#define SKL_SUPPLIER_NONE              BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
+#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT                    BIT_ULL(30)
+#define SKL_SNOOP_NONE                 BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define SKL_SNOOP_MISS                 BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define SKL_SNOOP_HITM                 BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
+#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+#define SNB_DMND_DATA_RD       (1ULL << 0)
+#define SNB_DMND_RFO           (1ULL << 1)
+#define SNB_DMND_IFETCH                (1ULL << 2)
+#define SNB_DMND_WB            (1ULL << 3)
+#define SNB_PF_DATA_RD         (1ULL << 4)
+#define SNB_PF_RFO             (1ULL << 5)
+#define SNB_PF_IFETCH          (1ULL << 6)
+#define SNB_LLC_DATA_RD                (1ULL << 7)
+#define SNB_LLC_RFO            (1ULL << 8)
+#define SNB_LLC_IFETCH         (1ULL << 9)
+#define SNB_BUS_LOCKS          (1ULL << 10)
+#define SNB_STRM_ST            (1ULL << 11)
+#define SNB_OTHER              (1ULL << 15)
+#define SNB_RESP_ANY           (1ULL << 16)
+#define SNB_NO_SUPP            (1ULL << 17)
+#define SNB_LLC_HITM           (1ULL << 18)
+#define SNB_LLC_HITE           (1ULL << 19)
+#define SNB_LLC_HITS           (1ULL << 20)
+#define SNB_LLC_HITF           (1ULL << 21)
+#define SNB_LOCAL              (1ULL << 22)
+#define SNB_REMOTE             (0xffULL << 23)
+#define SNB_SNP_NONE           (1ULL << 31)
+#define SNB_SNP_NOT_NEEDED     (1ULL << 32)
+#define SNB_SNP_MISS           (1ULL << 33)
+#define SNB_NO_FWD             (1ULL << 34)
+#define SNB_SNP_FWD            (1ULL << 35)
+#define SNB_HITM               (1ULL << 36)
+#define SNB_NON_DRAM           (1ULL << 37)
+
+#define SNB_DMND_READ          (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE         (SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+                                SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+                                SNB_HITM)
+
+#define SNB_DRAM_ANY           (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE                (SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS          SNB_RESP_ANY
+#define SNB_L3_MISS            (SNB_DRAM_ANY|SNB_NON_DRAM)
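+
+/*
+ * For illustration: the LL read-miss entry in snb_hw_cache_extra_regs below
+ * expands to SNB_DMND_READ|SNB_L3_MISS, i.e. the data-read request bits
+ * combined with every DRAM (local and remote, any snoop) and non-DRAM
+ * response bit, which together encode "any LLC miss".
+ */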
+
+static __initconst const u64 snb_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+       },
+ },
+};
+
+static __initconst const u64 snb_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
+               [ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
+               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+
+};
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
+#define HSW_DEMAND_RFO                 BIT_ULL(1)
+#define HSW_ANY_RESPONSE               BIT_ULL(16)
+#define HSW_SUPPLIER_NONE              BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
+#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE                 BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define HSW_SNOOP_MISS                 BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define HSW_SNOOP_HITM                 BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
+#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
+                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
+                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL              BIT(26)
+#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
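+
+/*
+ * Note: HSW_SNOOP_DRAM is HSW_ANY_SNOOP with HSW_SNOOP_NON_DRAM masked out,
+ * so the NODE entries in hsw_hw_cache_extra_regs below only count accesses
+ * that were actually satisfied from (local or remote) DRAM.
+ */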
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 westmere_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       /*
+        * Use RFO, not WRITEBACK, because a write miss would typically occur
+        * on RFO.
+        */
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.ANY              */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+};
+
+/*
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
+ */
+
+#define NHM_DMND_DATA_RD       (1 << 0)
+#define NHM_DMND_RFO           (1 << 1)
+#define NHM_DMND_IFETCH                (1 << 2)
+#define NHM_DMND_WB            (1 << 3)
+#define NHM_PF_DATA_RD         (1 << 4)
+#define NHM_PF_DATA_RFO                (1 << 5)
+#define NHM_PF_IFETCH          (1 << 6)
+#define NHM_OFFCORE_OTHER      (1 << 7)
+#define NHM_UNCORE_HIT         (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM    (1 << 10)
+                               /* reserved */
+#define NHM_REMOTE_CACHE_FWD   (1 << 12)
+#define NHM_REMOTE_DRAM                (1 << 13)
+#define NHM_LOCAL_DRAM         (1 << 14)
+#define NHM_NON_DRAM           (1 << 15)
+
+#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE             (NHM_REMOTE_DRAM)
+
+#define NHM_DMND_READ          (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)
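+
+/*
+ * Worked example: the LL read-miss entry in nehalem_hw_cache_extra_regs below
+ * is NHM_DMND_READ|NHM_L3_MISS, i.e. demand data reads answered from local
+ * DRAM, remote DRAM, a remote cache forward or a non-DRAM source; everything
+ * except an uncore (L3) hit.
+ */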
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
+       },
+ },
+};
+
+static __initconst const u64 nehalem_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       /*
+        * Use RFO, not WRITEBACK, because a write miss would typically occur
+        * on RFO.
+        */
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+};
+
+static __initconst const u64 core2_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static __initconst const u64 atom_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
+       EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ          SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE         SNB_DMND_RFO
+#define SLM_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS         SNB_RESP_ANY
+#define SLM_LLC_MISS           (SLM_SNP_ANY|SNB_NON_DRAM)
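+
+/*
+ * Note: the SLM masks reuse the SNB_* request/response bits above;
+ * SLM_LLC_MISS is built only from the snoop-response and non-DRAM bits,
+ * unlike the SNB/HSW L3-miss masks, which also include DRAM source bits.
+ */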
+
+static __initconst const u64 slm_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+       },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0104, /* LD_DCU_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0804, /* LD_DTLB_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+               [ C(RESULT_MISS)   ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+#define KNL_OT_L2_HITE         BIT_ULL(19) /* Other Tile L2 Hit (E-state) */
+#define KNL_OT_L2_HITF         BIT_ULL(20) /* Other Tile L2 Hit (F-state) */
+#define KNL_MCDRAM_LOCAL       BIT_ULL(21)
+#define KNL_MCDRAM_FAR         BIT_ULL(22)
+#define KNL_DDR_LOCAL          BIT_ULL(23)
+#define KNL_DDR_FAR            BIT_ULL(24)
+#define KNL_DRAM_ANY           (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+                                   KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ            SLM_DMND_READ
+#define KNL_L2_WRITE           SLM_DMND_WRITE
+#define KNL_L2_PREFETCH                SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS          SLM_LLC_ACCESS
+#define KNL_L2_MISS            (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+                                  KNL_DRAM_ANY | SNB_SNP_ANY | \
+                                                 SNB_NON_DRAM)
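+
+/*
+ * On KNL the "LL" cache level maps to the per-tile L2: KNL_L2_MISS combines
+ * the other-tile hit bits, all MCDRAM/DDR sources and the SNB snoop and
+ * non-DRAM bits, i.e. (roughly) any request not satisfied by the requesting
+ * tile's own L2.
+ */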
+
+static __initconst const u64 knl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+       [C(LL)] = {
+               [C(OP_READ)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = 0,
+               },
+               [C(OP_WRITE)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
+               },
+               [C(OP_PREFETCH)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
+               },
+       },
+};
+
+/*
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function may be called consecutively and must leave everything in the
+ * disabled state each time it is called.
+ *
+ * Consecutive calls write the same disable value to the related registers, so
+ * the PMU state remains unchanged; hw.state in intel_bts_disable_local()
+ * likewise stays PERF_HES_STOPPED across consecutive calls.
+ */
+static void __intel_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               intel_pmu_disable_bts();
+       else
+               intel_bts_disable_local();
+
+       intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+       __intel_pmu_disable_all();
+       intel_pmu_lbr_disable_all();
+}
+
+static void __intel_pmu_enable_all(int added, bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       intel_pmu_pebs_enable_all();
+       intel_pmu_lbr_enable_all(pmi);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
+                       x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+               struct perf_event *event =
+                       cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+
+               if (WARN_ON_ONCE(!event))
+                       return;
+
+               intel_pmu_enable_bts(event->hw.config);
+       } else
+               intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+       __intel_pmu_enable_all(added, false);
+}
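+
+/*
+ * Note: the PMI handler (intel_pmu_handle_irq() below) pairs
+ * __intel_pmu_disable_all() on entry with __intel_pmu_enable_all(0, true) on
+ * exit when the PMU is active; see the consecutive-call note above
+ * __intel_pmu_disable_all().
+ */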
+
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
+ *
+ * The official story:
+ *   These chips need to be 'reset' when adding counters by programming the
+ *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ *   in sequence on the same PMC or on different PMCs.
+ *
+ * In practice it appears some of these events do in fact count, and
+ * we need to program all 4 events.
+ */
+static void intel_pmu_nhm_workaround(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       static const unsigned long nhm_magic[4] = {
+               0x4300B5,
+               0x4300D2,
+               0x4300B1,
+               0x4300B1
+       };
+       struct perf_event *event;
+       int i;
+
+       /*
+        * The erratum requires the following steps:
+        * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+        * 2) Configure 4 PERFEVTSELx with the magic events and clear
+        *    the corresponding PMCx;
+        * 3) Set bits 0-3 of MSR_CORE_PERF_GLOBAL_CTRL;
+        * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+        * 5) Clear the 4 pairs of PERFEVTSELx and PMCx;
+        */
+
+       /*
+        * The steps we actually take differ slightly from the above:
+        * A) To reduce MSR operations, we don't run step 1) as those
+        *    registers are already cleared before this function is called;
+        * B) Call x86_perf_event_update to save PMCx before configuring
+        *    PERFEVTSELx with the magic number;
+        * C) For step 5), we only clear a PERFEVTSELx when it is not
+        *    currently in use;
+        * D) Call x86_perf_event_set_period to restore PMCx;
+        */
+
+       /* We always operate on 4 pairs of PERF counters */
+       for (i = 0; i < 4; i++) {
+               event = cpuc->events[i];
+               if (event)
+                       x86_perf_event_update(event);
+       }
+
+       for (i = 0; i < 4; i++) {
+               wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+               wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+       }
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+       for (i = 0; i < 4; i++) {
+               event = cpuc->events[i];
+
+               if (event) {
+                       x86_perf_event_set_period(event);
+                       __x86_pmu_enable_event(&event->hw,
+                                       ARCH_PERFMON_EVENTSEL_ENABLE);
+               } else
+                       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
+       }
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+       if (added)
+               intel_pmu_nhm_workaround();
+       intel_pmu_enable_all(added);
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
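+
+/*
+ * Writing a set bit to MSR_CORE_PERF_GLOBAL_OVF_CTRL clears the corresponding
+ * bit in MSR_CORE_PERF_GLOBAL_STATUS, so these two helpers form a
+ * read/acknowledge pair used by the overflow handling below.
+ */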
+
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+       u64 ctrl_val, mask;
+
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static void intel_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+               intel_pmu_disable_bts();
+               intel_pmu_drain_bts_buffer();
+               return;
+       }
+
+       cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+
+       /*
+        * The LBR must be disabled before the actual event is disabled,
+        * because any event may be combined with LBR.
+        */
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_disable(event);
+
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_disable_fixed(hwc);
+               return;
+       }
+
+       x86_pmu_disable_event(event);
+
+       if (unlikely(event->attr.precise_ip))
+               intel_pmu_pebs_disable(event);
+}
+
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+       u64 ctrl_val, bits, mask;
+
+       /*
+        * Enable IRQ generation (0x8),
+        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+        * if requested:
+        */
+       bits = 0x8ULL;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+               bits |= 0x2;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+               bits |= 0x1;
+
+       /*
+        * ANY bit is supported in v3 and up
+        */
+       if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+               bits |= 0x4;
+
+       bits <<= (idx * 4);
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       ctrl_val |= bits;
+       wrmsrl(hwc->config_base, ctrl_val);
+}
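+
+/*
+ * Example: for fixed counter 1 (idx == 1) with OS, USR and PMI enabled,
+ * bits == 0xb and the function above sets the 4-bit field at bits 4-7 of
+ * MSR_ARCH_PERFMON_FIXED_CTR_CTRL to 0xb; intel_pmu_disable_fixed() clears
+ * the same 0xf0 mask.
+ */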
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+               if (!__this_cpu_read(cpu_hw_events.enabled))
+                       return;
+
+               intel_pmu_enable_bts(hwc->config);
+               return;
+       }
+       /*
+        * The LBR must be enabled before the actual event is enabled,
+        * because any event may be combined with LBR.
+        */
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_enable(event);
+
+       if (event->attr.exclude_host)
+               cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
+       if (event->attr.exclude_guest)
+               cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+
+       if (unlikely(event_is_checkpointed(event)))
+               cpuc->intel_cp_status |= (1ull << hwc->idx);
+
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_enable_fixed(hwc);
+               return;
+       }
+
+       if (unlikely(event->attr.precise_ip))
+               intel_pmu_pebs_enable(event);
+
+       __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+int intel_pmu_save_and_restart(struct perf_event *event)
+{
+       x86_perf_event_update(event);
+       /*
+        * For a checkpointed counter always reset back to 0.  This
+        * avoids a situation where the counter overflows, aborts the
+        * transaction and is then set back to shortly before the
+        * overflow, and overflows and aborts again.
+        */
+       if (unlikely(event_is_checkpointed(event))) {
+               /* No race with NMIs because the counter should not be armed */
+               wrmsrl(event->hw.event_base, 0);
+               local64_set(&event->hw.prev_count, 0);
+       }
+       return x86_perf_event_set_period(event);
+}
+
+static void intel_pmu_reset(void)
+{
+       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+       unsigned long flags;
+       int idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
+               wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+               wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+
+       if (ds)
+               ds->bts_index = ds->bts_buffer_base;
+
+       /* Ack all overflows and disable fixed counters */
+       if (x86_pmu.version >= 2) {
+               intel_pmu_ack_status(intel_pmu_get_status());
+               wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+       }
+
+       /* Reset LBRs and LBR freezing */
+       if (x86_pmu.lbr_nr) {
+               update_debugctlmsr(get_debugctlmsr() &
+                       ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+       }
+
+       local_irq_restore(flags);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int bit, loops;
+       u64 status;
+       int handled;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * There is no known reason not to always do the late ACK,
+        * but just in case, keep it opt-in.
+        */
+       if (!x86_pmu.late_ack)
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+       __intel_pmu_disable_all();
+       handled = intel_pmu_drain_bts_buffer();
+       handled += intel_bts_interrupt();
+       status = intel_pmu_get_status();
+       if (!status)
+               goto done;
+
+       loops = 0;
+again:
+       intel_pmu_lbr_read();
+       intel_pmu_ack_status(status);
+       if (++loops > 100) {
+               static bool warned = false;
+               if (!warned) {
+                       WARN(1, "perfevents: irq loop stuck!\n");
+                       perf_event_print_debug();
+                       warned = true;
+               }
+               intel_pmu_reset();
+               goto done;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+
+       /*
+        * Ignore a range of extra bits in status that do not indicate
+        * overflow by themselves.
+        */
+       status &= ~(GLOBAL_STATUS_COND_CHG |
+                   GLOBAL_STATUS_ASIF |
+                   GLOBAL_STATUS_LBRS_FROZEN);
+       if (!status)
+               goto done;
+
+       /*
+        * PEBS overflow sets bit 62 in the global status register
+        */
+       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+               handled++;
+               x86_pmu.drain_pebs(regs);
+               /*
+                * There are cases where, even though the PEBS ovfl bit is set
+                * in GLOBAL_OVF_STATUS, the PEBS events may also have their
+                * overflow bits set for their counters. We must clear them
+                * here because they have been processed as exact samples in
+                * the drain_pebs() routine. They must not be processed again
+                * in the for_each_bit_set() loop for regular samples below.
+                */
+               status &= ~cpuc->pebs_enabled;
+               status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
+       }
+
+       /*
+        * Intel PT
+        */
+       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+               handled++;
+               intel_pt_interrupt();
+       }
+
+       /*
+        * Checkpointed counters can lead to 'spurious' PMIs because the
+        * rollback caused by the PMI will have cleared the overflow status
+        * bit. Therefore always force probe these counters.
+        */
+       status |= cpuc->intel_cp_status;
+
+       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               handled++;
+
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (has_branch_stack(event))
+                       data.br_stack = &cpuc->lbr_stack;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = intel_pmu_get_status();
+       if (status)
+               goto again;
+
+done:
+       /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+       if (cpuc->enabled)
+               __intel_pmu_enable_all(0, true);
+
+       /*
+        * Only unmask the NMI after the overflow counters
+        * have been reset. This avoids spurious NMIs on
+        * Haswell CPUs.
+        */
+       if (x86_pmu.late_ack)
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+       return handled;
+}
+
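+/*
+ * A fixed-period (period == 1, non-freq) BRANCH_INSTRUCTIONS event is
+ * steered to the BTS hardware via the BTS constraint.
+ */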
+static struct event_constraint *
+intel_bts_constraints(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned int hw_event, bts_event;
+
+       if (event->attr.freq)
+               return NULL;
+
+       hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+       bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+       if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
+               return &bts_constraint;
+
+       return NULL;
+}
+
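+/*
+ * Pick the alternate OFFCORE_RSP extra register (RSP_0 <-> RSP_1) when the
+ * PMU supports it and the event's config is valid for the alternate MSR;
+ * the caller retries the allocation with the returned index.
+ */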
+static int intel_alt_er(int idx, u64 config)
+{
+       int alt_idx = idx;
+
+       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
+               return idx;
+
+       if (idx == EXTRA_REG_RSP_0)
+               alt_idx = EXTRA_REG_RSP_1;
+
+       if (idx == EXTRA_REG_RSP_1)
+               alt_idx = EXTRA_REG_RSP_0;
+
+       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+               return idx;
+
+       return alt_idx;
+}
+
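+/*
+ * Rewrite the event to use the OFFCORE_RSP register that was actually
+ * allocated: patch the event code in hw.config and point extra_reg.reg at
+ * the matching MSR.
+ */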
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+       event->hw.extra_reg.idx = idx;
+
+       if (idx == EXTRA_REG_RSP_0) {
+               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+       } else if (idx == EXTRA_REG_RSP_1) {
+               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+       }
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
+static struct event_constraint *
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+                                  struct perf_event *event,
+                                  struct hw_perf_event_extra *reg)
+{
+       struct event_constraint *c = &emptyconstraint;
+       struct er_account *era;
+       unsigned long flags;
+       int idx = reg->idx;
+
+       /*
+        * reg->alloc can be set due to existing state, so for fake cpuc we
+        * need to ignore this, otherwise we might fail to allocate proper fake
+        * state for this extra reg constraint. Also see the comment below.
+        */
+       if (reg->alloc && !cpuc->is_fake)
+               return NULL; /* call x86_get_event_constraint() */
+
+again:
+       era = &cpuc->shared_regs->regs[idx];
+       /*
+        * we use spin_lock_irqsave() to avoid lockdep issues when
+        * passing a fake cpuc
+        */
+       raw_spin_lock_irqsave(&era->lock, flags);
+
+       if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+               /*
+                * If it's a fake cpuc -- as per validate_{group,event}() we
+                * shouldn't touch event state and we can avoid doing so
+                * since both will only call get_event_constraints() once
+                * on each event; this avoids the need for reg->alloc.
+                *
+                * Not doing the ER fixup will only result in era->reg being
+                * wrong, but since we won't actually try and program hardware
+                * this isn't a problem either.
+                */
+               if (!cpuc->is_fake) {
+                       if (idx != reg->idx)
+                               intel_fixup_er(event, idx);
+
+                       /*
+                        * x86_schedule_events() can call get_event_constraints()
+                        * multiple times on events in the case of incremental
+                        * scheduling. reg->alloc ensures we only do the ER
+                        * allocation once.
+                        */
+                       reg->alloc = 1;
+               }
+
+               /* lock in msr value */
+               era->config = reg->config;
+               era->reg = reg->reg;
+
+               /* one more user */
+               atomic_inc(&era->ref);
+
+               /*
+                * need to call x86_get_event_constraint()
+                * to check if associated event has constraints
+                */
+               c = NULL;
+       } else {
+               idx = intel_alt_er(idx, reg->config);
+               if (idx != reg->idx) {
+                       raw_spin_unlock_irqrestore(&era->lock, flags);
+                       goto again;
+               }
+       }
+       raw_spin_unlock_irqrestore(&era->lock, flags);
+
+       return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+                                  struct hw_perf_event_extra *reg)
+{
+       struct er_account *era;
+
+       /*
+        * Only put constraint if extra reg was actually allocated. Also takes
+        * care of events which do not use an extra shared reg.
+        *
+        * Also, if this is a fake cpuc we shouldn't touch any event state
+        * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+        * either since it'll be thrown out.
+        */
+       if (!reg->alloc || cpuc->is_fake)
+               return;
+
+       era = &cpuc->shared_regs->regs[reg->idx];
+
+       /* one fewer user */
+       atomic_dec(&era->ref);
+
+       /* allocate again next time */
+       reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+                             struct perf_event *event)
+{
+       struct event_constraint *c = NULL, *d;
+       struct hw_perf_event_extra *xreg, *breg;
+
+       xreg = &event->hw.extra_reg;
+       if (xreg->idx != EXTRA_REG_NONE) {
+               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+               if (c == &emptyconstraint)
+                       return c;
+       }
+       breg = &event->hw.branch_reg;
+       if (breg->idx != EXTRA_REG_NONE) {
+               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+               if (d == &emptyconstraint) {
+                       __intel_shared_reg_put_constraints(cpuc, xreg);
+                       c = d;
+               }
+       }
+       return c;
+}
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       if (x86_pmu.event_constraints) {
+               for_each_event_constraint(c, x86_pmu.event_constraints) {
+                       if ((event->hw.config & c->cmask) == c->code) {
+                               event->hw.flags |= c->flags;
+                               return c;
+                       }
+               }
+       }
+
+       return &unconstrained;
+}
+
+static struct event_constraint *
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       c = intel_bts_constraints(event);
+       if (c)
+               return c;
+
+       c = intel_shared_regs_constraints(cpuc, event);
+       if (c)
+               return c;
+
+       c = intel_pebs_constraints(event);
+       if (c)
+               return c;
+
+       return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = true;
+       /*
+        * Lock shared state until we are done scheduling in
+        * stop_event_scheduling(); this makes scheduling
+        * appear as a transaction.
+        */
+       raw_spin_lock(&excl_cntrs->lock);
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct event_constraint *c = cpuc->event_constraint[idx];
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       lockdep_assert_held(&excl_cntrs->lock);
+
+       if (c->flags & PERF_X86_EVENT_EXCL)
+               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+       else
+               xl->state[cntr] = INTEL_EXCL_SHARED;
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = false;
+       /*
+        * release shared state lock (acquired in intel_start_scheduling())
+        */
+       raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                          int idx, struct event_constraint *c)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xlo;
+       int tid = cpuc->excl_thread_id;
+       int is_excl, i;
+
+       /*
+        * validating a group does not require
+        * enforcing cross-thread exclusion
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return c;
+
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return c;
+
+       /*
+        * because we modify the constraint, we need
+        * to make a copy. Static constraints come
+        * from static const tables.
+        *
+        * only needed when constraint has not yet
+        * been cloned (marked dynamic)
+        */
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+               struct event_constraint *cx;
+
+               /*
+                * grab pre-allocated constraint entry
+                */
+               cx = &cpuc->constraint_list[idx];
+
+               /*
+                * initialize dynamic constraint
+                * with static constraint
+                */
+               *cx = *c;
+
+               /*
+                * mark constraint as dynamic, so we
+                * can free it later on
+                */
+               cx->flags |= PERF_X86_EVENT_DYNAMIC;
+               c = cx;
+       }
+
+       /*
+        * From here on, the constraint is dynamic.
+        * Either it was just allocated above, or it
+        * was allocated during an earlier invocation
+        * of this function.
+        */
+
+       /*
+        * state of sibling HT
+        */
+       xlo = &excl_cntrs->states[tid ^ 1];
+
+       /*
+        * event requires exclusive counter access
+        * across HT threads
+        */
+       is_excl = c->flags & PERF_X86_EVENT_EXCL;
+       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+               if (!cpuc->n_excl++)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+       }
+
+       /*
+        * Modify static constraint with current dynamic
+        * state of thread
+        *
+        * EXCLUSIVE: sibling counter measuring exclusive event
+        * SHARED   : sibling counter measuring non-exclusive event
+        * UNUSED   : sibling counter unused
+        */
+       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
+               /*
+                * exclusive event in sibling counter
+                * our corresponding counter cannot be used
+                * regardless of our event
+                */
+               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+                       __clear_bit(i, c->idxmsk);
+               /*
+                * if measuring an exclusive event, sibling
+                * measuring non-exclusive, then counter cannot
+                * be used
+                */
+               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+                       __clear_bit(i, c->idxmsk);
+       }
+
+       /*
+        * recompute actual bit weight for scheduling algorithm
+        */
+       c->weight = hweight64(c->idxmsk64);
+
+       /*
+        * if we return an empty mask, then switch
+        * back to static empty constraint to avoid
+        * the cost of freeing later on
+        */
+       if (c->weight == 0)
+               c = &emptyconstraint;
+
+       return c;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c1 = NULL;
+       struct event_constraint *c2;
+
+       if (idx >= 0) /* the fake cpuc path passes idx < 0 */
+               c1 = cpuc->event_constraint[idx];
+
+       /*
+        * first time only
+        * - static constraint: no change across incremental scheduling calls
+        * - dynamic constraint: handled by intel_get_excl_constraints()
+        */
+       c2 = __intel_get_event_constraints(cpuc, idx, event);
+       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+               c1->weight = c2->weight;
+               c2 = c1;
+       }
+
+       if (cpuc->excl_cntrs)
+               return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+       return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+               struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       int tid = cpuc->excl_thread_id;
+       struct intel_excl_states *xl;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake)
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+               if (!--cpuc->n_excl)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+       }
+
+       /*
+        * If event was actually assigned, then mark the counter state as
+        * unused now.
+        */
+       if (hwc->idx >= 0) {
+               xl = &excl_cntrs->states[tid];
+
+               /*
+                * put_constraint may be called from x86_schedule_events()
+                * which already has the lock held so here make locking
+                * conditional.
+                */
+               if (!xl->sched_started)
+                       raw_spin_lock(&excl_cntrs->lock);
+
+               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+               if (!xl->sched_started)
+                       raw_spin_unlock(&excl_cntrs->lock);
+       }
+}
+
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
+                                       struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+
+       reg = &event->hw.extra_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
+
+       reg = &event->hw.branch_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
+}
+
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+                                       struct perf_event *event)
+{
+       intel_put_shared_regs_event_constraints(cpuc, event);
+
+       /*
+        * If the PMU has exclusive counter restrictions, then
+        * all events are subject to them and must call the
+        * put_excl_constraints() routine.
+        */
+       if (cpuc->excl_cntrs)
+               intel_put_excl_constraints(cpuc, event);
+}
+
+static void intel_pebs_aliases_core2(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.ANY_P
+                * (0x00c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * INST_RETIRED.ANY_P counts the number of cycles that retire
+                * CNTMASK instructions. By setting CNTMASK to a value (16)
+                * larger than the maximum number of instructions that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or fewer instructions, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
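+               /*
+                * With the format fields defined below (event: config:0-7,
+                * inv: config:23, cmask: config:24-31) this encodes to
+                * 0x108000c0 before the non-event bits of the original
+                * config are merged back in.
+                */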
+               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use UOPS_RETIRED.ALL
+                * (0x01c2), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * UOPS_RETIRED.ALL counts the number of cycles that retire
+                * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+                * larger than the maximum number of micro-ops that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or fewer micro-ops, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+                * (0x01c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * The PREC_DIST event has special support to minimize sample
+                * shadowing effects. One drawback is that it can only be
+                * programmed on counter 1, but that seems like an
+                * acceptable trade-off.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+       if (event->attr.precise_ip < 3)
+               return intel_pebs_aliases_snb(event);
+       return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+       if (event->attr.precise_ip < 3)
+               return intel_pebs_aliases_core2(event);
+       return intel_pebs_aliases_precdist(event);
+}
+
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+       unsigned long flags = x86_pmu.free_running_flags;
+
+       if (event->attr.use_clockid)
+               flags &= ~PERF_SAMPLE_TIME;
+       return flags;
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+       int ret = x86_pmu_hw_config(event);
+
+       if (ret)
+               return ret;
+
+       if (event->attr.precise_ip) {
+               if (!event->attr.freq) {
+                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+                       if (!(event->attr.sample_type &
+                             ~intel_pmu_free_running_flags(event)))
+                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+               }
+               if (x86_pmu.pebs_aliases)
+                       x86_pmu.pebs_aliases(event);
+       }
+
+       if (needs_branch_stack(event)) {
+               ret = intel_pmu_setup_lbr_filter(event);
+               if (ret)
+                       return ret;
+
+               /*
+                * BTS is set up earlier in this path, so don't account twice
+                */
+               if (!intel_pmu_has_bts(event)) {
+                       /* disallow lbr if conflicting events are present */
+                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                               return -EBUSY;
+
+                       event->destroy = hw_perf_lbr_event_destroy;
+               }
+       }
+
+       if (event->attr.type != PERF_TYPE_RAW)
+               return 0;
+
+       if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+               return 0;
+
+       if (x86_pmu.version < 3)
+               return -EINVAL;
+
+       if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+       return 0;
+}
+
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+       if (x86_pmu.guest_get_msrs)
+               return x86_pmu.guest_get_msrs(nr);
+       *nr = 0;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+
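+       /*
+        * GLOBAL_CTRL is switched across the VM entry/exit so that the host
+        * and the guest each only see the counters they are meant to use.
+        */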
+       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
+       arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+       arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+       /*
+        * If a PMU counter has PEBS enabled, it is not enough to disable the
+        * counter on guest entry, since a PEBS memory write can overshoot the
+        * guest entry and corrupt guest memory. Disabling PEBS solves the
+        * problem.
+        */
+       arr[1].msr = MSR_IA32_PEBS_ENABLE;
+       arr[1].host = cpuc->pebs_enabled;
+       arr[1].guest = 0;
+
+       *nr = 2;
+       return arr;
+}
+
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
+               struct perf_event *event = cpuc->events[idx];
+
+               arr[idx].msr = x86_pmu_config_addr(idx);
+               arr[idx].host = arr[idx].guest = 0;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               arr[idx].host = arr[idx].guest =
+                       event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
+
+               if (event->attr.exclude_host)
+                       arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+               else if (event->attr.exclude_guest)
+                       arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+       }
+
+       *nr = x86_pmu.num_counters;
+       return arr;
+}
+
+static void core_pmu_enable_event(struct perf_event *event)
+{
+       if (!event->attr.exclude_host)
+               x86_pmu_enable_event(event);
+}
+
+static void core_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+               if (!test_bit(idx, cpuc->active_mask) ||
+                               cpuc->events[idx]->attr.exclude_host)
+                       continue;
+
+               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+       }
+}
+
+static int hsw_hw_config(struct perf_event *event)
+{
+       int ret = intel_pmu_hw_config(event);
+
+       if (ret)
+               return ret;
+       if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+               return 0;
+       event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+
+       /*
+        * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
+        * PEBS or in ANY thread mode. Since the results are nonsensical, forbid
+        * this combination.
+        */
+       if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
+            ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+             event->attr.precise_ip > 0))
+               return -EOPNOTSUPP;
+
+       if (event_is_checkpointed(event)) {
+               /*
+                * Sampling of checkpointed events can cause situations where
+                * the CPU constantly aborts because of an overflow, which is
+                * then checkpointed back and ignored. Forbid checkpointing
+                * for sampling.
+                *
+                * But still allow a long sampling period, so that perf stat
+                * from KVM works.
+                */
+               if (event->attr.sample_period > 0 &&
+                   event->attr.sample_period < 0x7fffffff)
+                       return -EOPNOTSUPP;
+       }
+       return 0;
+}
+
+static struct event_constraint counter2_constraint =
+                       EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       c = intel_get_event_constraints(cpuc, idx, event);
+
+       /* Handle special quirk on in_tx_checkpointed only in counter 2 */
+       if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
+               if (c->idxmsk64 & (1U << 2))
+                       return &counter2_constraint;
+               return &emptyconstraint;
+       }
+
+       return c;
+}
+
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
+               if (left < 128)
+                       left = 128;
+               left &= ~0x3fu;
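+               /* e.g. a requested period of 130 becomes 128; anything below 128 is raised to 128 first */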
+       }
+       return left;
+}
+
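+/*
+ * sysfs format attributes: expose the architectural event fields (event
+ * select, unit mask, edge detect, ...) as named bit ranges of attr.config
+ * so that the perf tool can parse symbolic event strings.
+ */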
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(pc,    "config:19"     );
+PMU_FORMAT_ATTR(any,   "config:21"     ); /* v3 + */
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+PMU_FORMAT_ATTR(in_tx,  "config:32");
+PMU_FORMAT_ATTR(in_tx_cp, "config:33");
+
+static struct attribute *intel_arch_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+ssize_t intel_event_sysfs_show(char *page, u64 config)
+{
+       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+       return x86_event_sysfs_show(page, config, event);
+}
+
+struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+       struct intel_shared_regs *regs;
+       int i;
+
+       regs = kzalloc_node(sizeof(struct intel_shared_regs),
+                           GFP_KERNEL, cpu_to_node(cpu));
+       if (regs) {
+               /*
+                * initialize the locks to keep lockdep happy
+                */
+               for (i = 0; i < EXTRA_REG_MAX; i++)
+                       raw_spin_lock_init(&regs->regs[i].lock);
+
+               regs->core_id = -1;
+       }
+       return regs;
+}
+
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+       struct intel_excl_cntrs *c;
+
+       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+                        GFP_KERNEL, cpu_to_node(cpu));
+       if (c) {
+               raw_spin_lock_init(&c->lock);
+               c->core_id = -1;
+       }
+       return c;
+}
+
+static int intel_pmu_cpu_prepare(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       goto err;
+       }
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+               if (!cpuc->constraint_list)
+                       goto err_shared_regs;
+
+               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+               if (!cpuc->excl_cntrs)
+                       goto err_constraint_list;
+
+               cpuc->excl_thread_id = 0;
+       }
+
+       return NOTIFY_OK;
+
+err_constraint_list:
+       kfree(cpuc->constraint_list);
+       cpuc->constraint_list = NULL;
+
+err_shared_regs:
+       kfree(cpuc->shared_regs);
+       cpuc->shared_regs = NULL;
+
+err:
+       return NOTIFY_BAD;
+}
+
+static void intel_pmu_cpu_starting(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       int core_id = topology_core_id(cpu);
+       int i;
+
+       init_debug_store_on_cpu(cpu);
+       /*
+        * Deal with CPUs that don't clear their LBRs on power-up.
+        */
+       intel_pmu_lbr_reset();
+
+       cpuc->lbr_sel = NULL;
+
+       if (!cpuc->shared_regs)
+               return;
+
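+       /*
+        * If a sibling thread on this core has already allocated shared_regs,
+        * reuse its copy and queue ours for freeing via kfree_on_online[].
+        */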
+       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+                       struct intel_shared_regs *pc;
+
+                       pc = per_cpu(cpu_hw_events, i).shared_regs;
+                       if (pc && pc->core_id == core_id) {
+                               cpuc->kfree_on_online[0] = cpuc->shared_regs;
+                               cpuc->shared_regs = pc;
+                               break;
+                       }
+               }
+               cpuc->shared_regs->core_id = core_id;
+               cpuc->shared_regs->refcnt++;
+       }
+
+       if (x86_pmu.lbr_sel_map)
+               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+                       struct intel_excl_cntrs *c;
+
+                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
+                       if (c && c->core_id == core_id) {
+                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+                               cpuc->excl_cntrs = c;
+                               cpuc->excl_thread_id = 1;
+                               break;
+                       }
+               }
+               cpuc->excl_cntrs->core_id = core_id;
+               cpuc->excl_cntrs->refcnt++;
+       }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_excl_cntrs *c;
+
+       c = cpuc->excl_cntrs;
+       if (c) {
+               if (c->core_id == -1 || --c->refcnt == 0)
+                       kfree(c);
+               cpuc->excl_cntrs = NULL;
+               kfree(cpuc->constraint_list);
+               cpuc->constraint_list = NULL;
+       }
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_shared_regs *pc;
+
+       pc = cpuc->shared_regs;
+       if (pc) {
+               if (pc->core_id == -1 || --pc->refcnt == 0)
+                       kfree(pc);
+               cpuc->shared_regs = NULL;
+       }
+
+       free_excl_cntrs(cpu);
+
+       fini_debug_store_on_cpu(cpu);
+}
+
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+                                bool sched_in)
+{
+       if (x86_pmu.pebs_active)
+               intel_pmu_pebs_sched_task(ctx, sched_in);
+       if (x86_pmu.lbr_nr)
+               intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_any.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       &format_attr_in_tx.attr,
+       &format_attr_in_tx_cp.attr,
+
+       &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
+       &format_attr_ldlat.attr, /* PEBS load latency */
+       NULL,
+};
+
+static struct attribute *skl_format_attr[] = {
+       &format_attr_frontend.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu core_pmu = {
+       .name                   = "core",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = x86_pmu_disable_all,
+       .enable_all             = core_pmu_enable_all,
+       .enable                 = core_pmu_enable_event,
+       .disable                = x86_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+
+       /*
+        * Intel PMCs cannot be accessed sanely above 32-bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL<<31) - 1,
+       .get_event_constraints  = intel_get_event_constraints,
+       .put_event_constraints  = intel_put_event_constraints,
+       .event_constraints      = intel_core_event_constraints,
+       .guest_get_msrs         = core_guest_get_msrs,
+       .format_attrs           = intel_arch_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+       /*
+        * A virtual (or funny metal) CPU can define x86_pmu.extra_regs
+        * together with PMU version 1 and thus be using core_pmu with
+        * shared_regs. We need the following callbacks here to allocate
+        * it properly.
+        */
+       .cpu_prepare            = intel_pmu_cpu_prepare,
+       .cpu_starting           = intel_pmu_cpu_starting,
+       .cpu_dying              = intel_pmu_cpu_dying,
+};
+
+static __initconst const struct x86_pmu intel_pmu = {
+       .name                   = "Intel",
+       .handle_irq             = intel_pmu_handle_irq,
+       .disable_all            = intel_pmu_disable_all,
+       .enable_all             = intel_pmu_enable_all,
+       .enable                 = intel_pmu_enable_event,
+       .disable                = intel_pmu_disable_event,
+       .hw_config              = intel_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       /*
+        * Intel PMCs cannot be accessed sanely above 32 bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL << 31) - 1,
+       .get_event_constraints  = intel_get_event_constraints,
+       .put_event_constraints  = intel_put_event_constraints,
+       .pebs_aliases           = intel_pebs_aliases_core2,
+
+       .format_attrs           = intel_arch3_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+       .cpu_prepare            = intel_pmu_cpu_prepare,
+       .cpu_starting           = intel_pmu_cpu_starting,
+       .cpu_dying              = intel_pmu_cpu_dying,
+       .guest_get_msrs         = intel_guest_get_msrs,
+       .sched_task             = intel_pmu_sched_task,
+};
+
+static __init void intel_clovertown_quirk(void)
+{
+       /*
+        * PEBS is unreliable due to:
+        *
+        *   AJ67  - PEBS may experience CPL leaks
+        *   AJ68  - PEBS PMI may be delayed by one event
+        *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+        *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+        *
+        * AJ67 could be worked around by restricting the OS/USR flags.
+        * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+        *
+        * AJ106 could possibly be worked around by not allowing LBR
+        *       usage from PEBS, including the fixup.
+        * AJ68  could possibly be worked around by always programming
+        *       a pebs_event_reset[0] value and coping with the lost events.
+        *
+        * But taken together it might just make sense to not enable PEBS on
+        * these chips.
+        */
+       pr_warn("PEBS disabled due to CPU errata\n");
+       x86_pmu.pebs = 0;
+       x86_pmu.pebs_constraints = NULL;
+}
+
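+/*
+ * PEBS is broken on SandyBridge unless the microcode is at least at the
+ * per-stepping revision listed below; unknown models/steppings are treated
+ * as broken.
+ */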
+static int intel_snb_pebs_broken(int cpu)
+{
+       u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+       switch (cpu_data(cpu).x86_model) {
+       case 42: /* SNB */
+               rev = 0x28;
+               break;
+
+       case 45: /* SNB-EP */
+               switch (cpu_data(cpu).x86_mask) {
+               case 6: rev = 0x618; break;
+               case 7: rev = 0x70c; break;
+               }
+       }
+
+       return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+       int pebs_broken = 0;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+                       break;
+       }
+       put_online_cpus();
+
+       if (pebs_broken == x86_pmu.pebs_broken)
+               return;
+
+       /*
+        * Serialized by the microcode lock.
+        */
+       if (x86_pmu.pebs_broken) {
+               pr_info("PEBS enabled due to microcode update\n");
+               x86_pmu.pebs_broken = 0;
+       } else {
+               pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+               x86_pmu.pebs_broken = 1;
+       }
+}
+
+/*
+ * Under certain circumstances, accessing certain MSRs may cause a #GP fault.
+ * This function tests whether the given MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+       u64 val_old, val_new, val_tmp;
+
+       /*
+        * Read the current value, change it and read it back to see if it
+        * matches; this is needed to detect certain hardware emulators
+        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+        */
+       if (rdmsrl_safe(msr, &val_old))
+               return false;
+
+       /*
+        * Only change the bits which can be updated by wrmsrl.
+        */
+       val_tmp = val_old ^ mask;
+       if (wrmsrl_safe(msr, val_tmp) ||
+           rdmsrl_safe(msr, &val_new))
+               return false;
+
+       if (val_new != val_tmp)
+               return false;
+
+       /*
+        * At this point the MSR is known to be safely accessible.
+        * Restore the old value and return.
+        */
+       wrmsrl(msr, val_old);
+
+       return true;
+}
+
+static __init void intel_sandybridge_quirk(void)
+{
+       x86_pmu.check_microcode = intel_snb_check_microcode;
+       intel_snb_check_microcode();
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+       int bit;
+
+       /* disable events that are reported as not present by cpuid */
+       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+               pr_warn("CPUID marked event: \'%s\' unavailable\n",
+                       intel_arch_events_map[bit].name);
+       }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+       union cpuid10_ebx ebx;
+
+       ebx.full = x86_pmu.events_maskl;
+       if (ebx.split.no_branch_misses_retired) {
+               /*
+                * Erratum AAJ80 detected, we work it around by using
+                * the BR_MISP_EXEC.ANY event. This will over-count
+                * branch-misses, but it's still much better than the
+                * architectural event which is often completely bogus:
+                */
+               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+               ebx.split.no_branch_misses_retired = 0;
+               x86_pmu.events_maskl = ebx.full;
+               pr_info("CPU erratum AAJ80 worked around\n");
+       }
+}
+
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However, detecting
+ * whether HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround at early boot, and verify whether
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled.
+ */
+static __init void intel_ht_bug(void)
+{
+       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+       x86_pmu.start_scheduling = intel_start_scheduling;
+       x86_pmu.commit_scheduling = intel_commit_scheduling;
+       x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
+EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+       EVENT_PTR(tx_start),
+       EVENT_PTR(tx_commit),
+       EVENT_PTR(tx_abort),
+       EVENT_PTR(tx_capacity),
+       EVENT_PTR(tx_conflict),
+       EVENT_PTR(el_start),
+       EVENT_PTR(el_commit),
+       EVENT_PTR(el_abort),
+       EVENT_PTR(el_capacity),
+       EVENT_PTR(el_conflict),
+       EVENT_PTR(cycles_t),
+       EVENT_PTR(cycles_ct),
+       EVENT_PTR(mem_ld_hsw),
+       EVENT_PTR(mem_st_hsw),
+       NULL
+};
+
+__init int intel_pmu_init(void)
+{
+       union cpuid10_edx edx;
+       union cpuid10_eax eax;
+       union cpuid10_ebx ebx;
+       struct event_constraint *c;
+       unsigned int unused;
+       struct extra_reg *er;
+       int version, i;
+
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+               switch (boot_cpu_data.x86) {
+               case 0x6:
+                       return p6_pmu_init();
+               case 0xb:
+                       return knc_pmu_init();
+               case 0xf:
+                       return p4_pmu_init();
+               }
+               return -ENODEV;
+       }
+
+       /*
+        * Check whether the Architectural PerfMon supports
+        * Branch Misses Retired hw_event or not.
+        */
+       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
+               return -ENODEV;
+
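+       /*
+        * Architectural perfmon v1 lacks the global control/status MSRs,
+        * so use the simpler core PMU driver for it.
+        */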
+       version = eax.split.version_id;
+       if (version < 2)
+               x86_pmu = core_pmu;
+       else
+               x86_pmu = intel_pmu;
+
+       x86_pmu.version                 = version;
+       x86_pmu.num_counters            = eax.split.num_counters;
+       x86_pmu.cntval_bits             = eax.split.bit_width;
+       x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
+
+       x86_pmu.events_maskl            = ebx.full;
+       x86_pmu.events_mask_len         = eax.split.mask_length;
+
+       x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+       /*
+        * Quirk: v2 perfmon does not report fixed-purpose events, so
+        * assume at least 3 events:
+        */
+       if (version > 1)
+               x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+       if (boot_cpu_has(X86_FEATURE_PDCM)) {
+               u64 capabilities;
+
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+               x86_pmu.intel_cap.capabilities = capabilities;
+       }
+
+       intel_ds_init();
+
+       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
+       /*
+        * Install the hw-cache-events table:
+        */
+       switch (boot_cpu_data.x86_model) {
+       case 14: /* 65nm Core "Yonah" */
+               pr_cont("Core events, ");
+               break;
+
+       case 15: /* 65nm Core2 "Merom"          */
+               x86_add_quirk(intel_clovertown_quirk);
+       case 22: /* 65nm Core2 "Merom-L"        */
+       case 23: /* 45nm Core2 "Penryn"         */
+       case 29: /* 45nm Core2 "Dunnington (MP)" */
+               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               intel_pmu_lbr_init_core();
+
+               x86_pmu.event_constraints = intel_core2_event_constraints;
+               x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
+               pr_cont("Core2 events, ");
+               break;
+
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_nhm();
+
+               x86_pmu.event_constraints = intel_nehalem_event_constraints;
+               x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
+               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+               x86_pmu.extra_regs = intel_nehalem_extra_regs;
+
+               x86_pmu.cpu_events = nhm_events_attrs;
+
+               /* UOPS_ISSUED.STALLED_CYCLES */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+               intel_pmu_pebs_data_source_nhm();
+               x86_add_quirk(intel_nehalem_quirk);
+
+               pr_cont("Nehalem events, ");
+               break;
+
+       case 28: /* 45nm Atom "Pineview"   */
+       case 38: /* 45nm Atom "Lincroft"   */
+       case 39: /* 32nm Atom "Penwell"    */
+       case 53: /* 32nm Atom "Cloverview" */
+       case 54: /* 32nm Atom "Cedarview"  */
+               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               intel_pmu_lbr_init_atom();
+
+               x86_pmu.event_constraints = intel_gen_event_constraints;
+               x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
+               pr_cont("Atom events, ");
+               break;
+
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 76: /* 14nm Atom "Airmont"                   */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+               memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+                       sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_atom();
+
+               x86_pmu.event_constraints = intel_slm_event_constraints;
+               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_slm_extra_regs;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               pr_cont("Silvermont events, ");
+               break;
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_nhm();
+
+               x86_pmu.event_constraints = intel_westmere_event_constraints;
+               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+               x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_westmere_extra_regs;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+
+               x86_pmu.cpu_events = nhm_events_attrs;
+
+               /* UOPS_ISSUED.STALLED_CYCLES */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+               intel_pmu_pebs_data_source_nhm();
+               pr_cont("Westmere events, ");
+               break;
+
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+               x86_add_quirk(intel_sandybridge_quirk);
+               x86_add_quirk(intel_ht_bug);
+               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_snb_event_constraints;
+               x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+               if (boot_cpu_data.x86_model == 45)
+                       x86_pmu.extra_regs = intel_snbep_extra_regs;
+               else
+                       x86_pmu.extra_regs = intel_snb_extra_regs;
+
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.cpu_events = snb_events_attrs;
+
+               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+
+               pr_cont("SandyBridge events, ");
+               break;
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+               x86_add_quirk(intel_ht_bug);
+               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               /* dTLB-load-misses on IVB is different than SNB */
+               hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
+               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_ivb_event_constraints;
+               x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               if (boot_cpu_data.x86_model == 62)
+                       x86_pmu.extra_regs = intel_snbep_extra_regs;
+               else
+                       x86_pmu.extra_regs = intel_snb_extra_regs;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.cpu_events = snb_events_attrs;
+
+               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+               pr_cont("IvyBridge events, ");
+               break;
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+               x86_add_quirk(intel_ht_bug);
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_hsw();
+
+               x86_pmu.event_constraints = intel_hsw_event_constraints;
+               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.lbr_double_abort = true;
+               pr_cont("Haswell events, ");
+               break;
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+                                                                         HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+               intel_pmu_lbr_init_hsw();
+
+               x86_pmu.event_constraints = intel_bdw_event_constraints;
+               x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.limit_period = bdw_limit_period;
+               pr_cont("Broadwell events, ");
+               break;
+
+       case 87: /* Knights Landing Xeon Phi */
+               memcpy(hw_cache_event_ids,
+                      slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs,
+                      knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_knl();
+
+               x86_pmu.event_constraints = intel_slm_event_constraints;
+               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_knl_extra_regs;
+
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               pr_cont("Knights Landing events, ");
+               break;
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_skl();
+
+               x86_pmu.event_constraints = intel_skl_event_constraints;
+               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_skl_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+                                                 skl_format_attr);
+               WARN_ON(!x86_pmu.format_attrs);
+               x86_pmu.cpu_events = hsw_events_attrs;
+               pr_cont("Skylake events, ");
+               break;
+
+       default:
+               switch (x86_pmu.version) {
+               case 1:
+                       x86_pmu.event_constraints = intel_v1_event_constraints;
+                       pr_cont("generic architected perfmon v1, ");
+                       break;
+               default:
+                       /*
+                        * default constraints for v2 and up
+                        */
+                       x86_pmu.event_constraints = intel_gen_event_constraints;
+                       pr_cont("generic architected perfmon, ");
+                       break;
+               }
+       }
+
+       if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+                    x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+               x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+       }
+       x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+       if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+                    x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+               x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+       }
+
+       x86_pmu.intel_ctrl |=
+               ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
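+       /*
+        * Worked example (illustrative values): with 4 generic and 3 fixed
+        * counters, the two assignments above yield
+        * intel_ctrl = 0xf | (0x7ULL << INTEL_PMC_IDX_FIXED), i.e. generic
+        * counters in bits 0-3 and fixed counters starting at bit 32.
+        */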
+
+       if (x86_pmu.event_constraints) {
+               /*
+                * event on fixed counter2 (REF_CYCLES) only works on this
+                * counter, so do not extend mask to generic counters
+                */
+               for_each_event_constraint(c, x86_pmu.event_constraints) {
+                       if (c->cmask == FIXED_EVENT_FLAGS
+                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+                       }
+                       c->idxmsk64 &=
+                               ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+                       c->weight = hweight64(c->idxmsk64);
+               }
+       }
+
+       /*
+        * Accessing LBR MSRs may cause a #GP under certain circumstances,
+        * e.g. KVM doesn't support the LBR MSRs.
+        * Check all LBR MSRs here.
+        * Disable LBR access if any LBR MSR cannot be accessed.
+        */
+       if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+               x86_pmu.lbr_nr = 0;
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+                     check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+                       x86_pmu.lbr_nr = 0;
+       }
+
+       /*
+        * Accessing extra MSRs may cause a #GP under certain circumstances,
+        * e.g. KVM doesn't support offcore events.
+        * Check all extra_regs here.
+        */
+       if (x86_pmu.extra_regs) {
+               for (er = x86_pmu.extra_regs; er->msr; er++) {
+                       er->extra_msr_access = check_msr(er->msr, 0x11UL);
+                       /* Disable LBR select mapping */
+                       if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+                               x86_pmu.lbr_sel_map = NULL;
+               }
+       }
+
+       /* Support full width counters using alternative MSR range */
+       if (x86_pmu.intel_cap.full_width_write) {
+               x86_pmu.max_period = x86_pmu.cntval_mask;
+               x86_pmu.perfctr = MSR_IA32_PMC0;
+               pr_cont("full-width counters, ");
+       }
+
+       return 0;
+}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+       int cpu = smp_processor_id();
+       int w, c;
+       /*
+        * problem not present on this CPU model, nothing to do
+        */
+       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+               return 0;
+
+       w = cpumask_weight(topology_sibling_cpumask(cpu));
+       if (w > 1) {
+               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+               return 0;
+       }
+
+       if (lockup_detector_suspend() != 0) {
+               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+               return 0;
+       }
+
+       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+       x86_pmu.start_scheduling = NULL;
+       x86_pmu.commit_scheduling = NULL;
+       x86_pmu.stop_scheduling = NULL;
+
+       lockup_detector_resume();
+
+       get_online_cpus();
+
+       for_each_online_cpu(c) {
+               free_excl_cntrs(c);
+       }
+
+       put_online_cpus();
+       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+       return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
new file mode 100644 (file)
index 0000000..93cb412
--- /dev/null
@@ -0,0 +1,1381 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC     0x0c8f
+#define MSR_IA32_QM_CTR                0x0c8e
+#define MSR_IA32_QM_EVTSEL     0x0c8d
+
+static u32 cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:              The cached Resource Monitoring ID
+ * @closid:            The cached Class Of Service ID
+ * @rmid_usecnt:       The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+       u32                     rmid;
+       u32                     closid;
+       int                     rmid_usecnt;
+};
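+
+/*
+ * Both halves are always written back together, e.g. (as in
+ * intel_cqm_event_start() below):
+ *
+ *     wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, state->closid);
+ *
+ * so a stale cached value would silently clobber the other field.
+ */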
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+/*
+ * Protects cache_groups, cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR         (1ULL << 63)
+#define RMID_VAL_UNAVAIL       (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static u32 intel_cqm_rotation_rmid;
+
+#define INVALID_RMID           (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(u32 rmid)
+{
+       if (!rmid || rmid == INVALID_RMID)
+               return false;
+
+       return true;
+}
+
+static u64 __rmid_read(u32 rmid)
+{
+       u64 val;
+
+       /*
+        * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+        * it just says that to increase confusion.
+        */
+       wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+       rdmsrl(MSR_IA32_QM_CTR, val);
+
+       /*
+        * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+        * the number of cachelines tagged with @rmid.
+        */
+       return val;
+}
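+
+/*
+ * The raw count is in units of cqm_l3_scale; that factor is exported
+ * below as the llc_occupancy.scale attribute so that tools such as
+ * perf can convert the value to bytes.
+ */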
+
+enum rmid_recycle_state {
+       RMID_YOUNG = 0,
+       RMID_AVAILABLE,
+       RMID_DIRTY,
+};
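+
+/*
+ * Recycling lifecycle of an entry (see __put_rmid() and
+ * intel_cqm_rmid_stabilize() below):
+ *
+ *     RMID_YOUNG      just placed on the limbo list
+ *     RMID_AVAILABLE  has waited at least __rmid_queue_time_ms in limbo
+ *     RMID_DIRTY      occupancy still above __intel_cqm_threshold,
+ *                     stays in limbo for another round
+ *
+ * AVAILABLE entries that are not DIRTY are handed to a waiting group,
+ * used as the rotation RMID, or returned to the free list.
+ */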
+
+struct cqm_rmid_entry {
+       u32 rmid;
+       enum rmid_recycle_state state;
+       struct list_head list;
+       unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries off the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can look up a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
+{
+       struct cqm_rmid_entry *entry;
+
+       entry = cqm_rmid_ptrs[rmid];
+       WARN_ON(entry->rmid != rmid);
+
+       return entry;
+}
+
+/*
+ * Returns INVALID_RMID if no free RMID is available.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static u32 __get_rmid(void)
+{
+       struct cqm_rmid_entry *entry;
+
+       lockdep_assert_held(&cache_mutex);
+
+       if (list_empty(&cqm_rmid_free_lru))
+               return INVALID_RMID;
+
+       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+       list_del(&entry->list);
+
+       return entry->rmid;
+}
+
+static void __put_rmid(u32 rmid)
+{
+       struct cqm_rmid_entry *entry;
+
+       lockdep_assert_held(&cache_mutex);
+
+       WARN_ON(!__rmid_valid(rmid));
+       entry = __rmid_entry(rmid);
+
+       entry->queue_time = jiffies;
+       entry->state = RMID_YOUNG;
+
+       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+       struct cqm_rmid_entry *entry;
+       unsigned int nr_rmids;
+       int r = 0;
+
+       nr_rmids = cqm_max_rmid + 1;
+       cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+                               nr_rmids, GFP_KERNEL);
+       if (!cqm_rmid_ptrs)
+               return -ENOMEM;
+
+       for (; r <= cqm_max_rmid; r++) {
+               struct cqm_rmid_entry *entry;
+
+               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+               if (!entry)
+                       goto fail;
+
+               INIT_LIST_HEAD(&entry->list);
+               entry->rmid = r;
+               cqm_rmid_ptrs[r] = entry;
+
+               list_add_tail(&entry->list, &cqm_rmid_free_lru);
+       }
+
+       /*
+        * RMID 0 is special and is always allocated. It's used for all
+        * tasks that are not monitored.
+        */
+       entry = __rmid_entry(0);
+       list_del(&entry->list);
+
+       mutex_lock(&cache_mutex);
+       intel_cqm_rotation_rmid = __get_rmid();
+       mutex_unlock(&cache_mutex);
+
+       return 0;
+fail:
+       while (r--)
+               kfree(cqm_rmid_ptrs[r]);
+
+       kfree(cqm_rmid_ptrs);
+       return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+       /* Per-cpu and task events don't mix */
+       if ((a->attach_state & PERF_ATTACH_TASK) !=
+           (b->attach_state & PERF_ATTACH_TASK))
+               return false;
+
+#ifdef CONFIG_CGROUP_PERF
+       if (a->cgrp != b->cgrp)
+               return false;
+#endif
+
+       /* If not task event, we're machine wide */
+       if (!(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Events that target the same task are placed into the same cache group.
+        */
+       if (a->hw.target == b->hw.target)
+               return true;
+
+       /*
+        * Are we an inherited event?
+        */
+       if (b->parent == a)
+               return true;
+
+       return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return perf_cgroup_from_task(event->hw.target, event->ctx);
+
+       return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *                    PROHIBITS
+ *     system-wide  ->  cgroup and task
+ *     cgroup       ->  system-wide
+ *                  ->  task in cgroup
+ *     task         ->  system-wide
+ *                  ->  task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+       /*
+        * We can have any number of cgroups but only one system-wide
+        * event at a time.
+        */
+       if (a->cgrp && b->cgrp) {
+               struct perf_cgroup *ac = a->cgrp;
+               struct perf_cgroup *bc = b->cgrp;
+
+               /*
+                * This condition should have been caught in
+                * __match_event() and we should be sharing an RMID.
+                */
+               WARN_ON_ONCE(ac == bc);
+
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+
+       if (a->cgrp || b->cgrp) {
+               struct perf_cgroup *ac, *bc;
+
+               /*
+                * cgroup and system-wide events are mutually exclusive
+                */
+               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+                       return true;
+
+               /*
+                * Ensure neither event is part of the other's cgroup
+                */
+               ac = event_to_cgroup(a);
+               bc = event_to_cgroup(b);
+               if (ac == bc)
+                       return true;
+
+               /*
+                * Must have cgroup and non-intersecting task events.
+                */
+               if (!ac || !bc)
+                       return false;
+
+               /*
+                * We have cgroup and task events, and the task belongs
+                * to a cgroup. Check for overlap.
+                */
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+#endif
+       /*
+        * If one of them is not a task, same story as above with cgroups.
+        */
+       if (!(a->attach_state & PERF_ATTACH_TASK) ||
+           !(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Must be non-overlapping.
+        */
+       return false;
+}
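+
+/*
+ * For example, a plain system-wide event (no PERF_ATTACH_TASK, no
+ * cgroup) conflicts with every task and cgroup event, so only one side
+ * can hold an RMID at a time; the rotation worker arbitrates between
+ * them.
+ */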
+
+struct rmid_read {
+       u32 rmid;
+       atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
+{
+       struct perf_event *event;
+       struct list_head *head = &group->hw.cqm_group_entry;
+       u32 old_rmid = group->hw.cqm_rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       /*
+        * If our RMID is being deallocated, perform a read now.
+        */
+       if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+               struct rmid_read rr = {
+                       .value = ATOMIC64_INIT(0),
+                       .rmid = old_rmid,
+               };
+
+               on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+                                &rr, 1);
+               local64_set(&group->count, atomic64_read(&rr.value));
+       }
+
+       raw_spin_lock_irq(&cache_lock);
+
+       group->hw.cqm_rmid = rmid;
+       list_for_each_entry(event, head, hw.cqm_group_entry)
+               event->hw.cqm_rmid = rmid;
+
+       raw_spin_unlock_irq(&cache_lock);
+
+       return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
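+
+/*
+ * Example, assuming a 64 byte conversion factor (cqm_l3_scale): a
+ * max_recycle_threshold of 65536 bytes corresponds to a limit of
+ * 65536 / 64 = 1024 cachelines, so recycling may hand out an RMID that
+ * still has up to 1024 lines tagged against it.
+ */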
+
+/*
+ * On this cpu, check the occupancy of each limbo RMID that is due for
+ * recycling and mark it dirty if it is still above the threshold.
+ */
+static void intel_cqm_stable(void *arg)
+{
+       struct cqm_rmid_entry *entry;
+
+       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+               if (entry->state != RMID_AVAILABLE)
+                       break;
+
+               if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+                       entry->state = RMID_DIRTY;
+       }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(u32 rmid)
+{
+       struct perf_event *leader, *event;
+
+       lockdep_assert_held(&cache_mutex);
+
+       leader = list_first_entry(&cache_groups, struct perf_event,
+                                 hw.cqm_groups_entry);
+       event = leader;
+
+       list_for_each_entry_continue(event, &cache_groups,
+                                    hw.cqm_groups_entry) {
+               if (__rmid_valid(event->hw.cqm_rmid))
+                       continue;
+
+               if (__conflict_event(event, leader))
+                       continue;
+
+               intel_cqm_xchg_rmid(event, rmid);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250    /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
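+
+/*
+ * I.e. an RMID placed in limbo at jiffies J is not considered for
+ * recycling before J + msecs_to_jiffies(__rmid_queue_time_ms); see the
+ * time_after() check in intel_cqm_rmid_stabilize().
+ */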
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+       struct cqm_rmid_entry *entry, *tmp;
+
+       lockdep_assert_held(&cache_mutex);
+
+       *available = 0;
+       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+               unsigned long min_queue_time;
+               unsigned long now = jiffies;
+
+               /*
+                * We hold RMIDs placed into limbo for a minimum queue
+                * time. Before the minimum queue time has elapsed we do
+                * not recycle RMIDs.
+                *
+                * The reasoning is that until a sufficient time has
+                * passed since we stopped using an RMID, any RMID
+                * placed onto the limbo list will likely still have
+                * data tagged in the cache, which means we'll probably
+                * fail to recycle it anyway.
+                *
+                * We can save ourselves an expensive IPI by skipping
+                * any RMIDs that have not been queued for the minimum
+                * time.
+                */
+               min_queue_time = entry->queue_time +
+                       msecs_to_jiffies(__rmid_queue_time_ms);
+
+               if (time_after(min_queue_time, now))
+                       break;
+
+               entry->state = RMID_AVAILABLE;
+               (*available)++;
+       }
+
+       /*
+        * Fast return if none of the RMIDs on the limbo list have been
+        * sitting on the queue for the minimum queue time.
+        */
+       if (!*available)
+               return false;
+
+       /*
+        * Test whether an RMID is free for each package.
+        */
+       on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+               /*
+                * Exhausted all RMIDs that have waited min queue time.
+                */
+               if (entry->state == RMID_YOUNG)
+                       break;
+
+               if (entry->state == RMID_DIRTY)
+                       continue;
+
+               list_del(&entry->list); /* remove from limbo */
+
+               /*
+                * The rotation RMID gets priority if it's
+                * currently invalid, in which case we skip adding
+                * the RMID to the free lru.
+                */
+               if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+                       intel_cqm_rotation_rmid = entry->rmid;
+                       continue;
+               }
+
+               /*
+                * If we have groups waiting for RMIDs, hand
+                * them one now provided they don't conflict.
+                */
+               if (intel_cqm_sched_in_event(entry->rmid))
+                       continue;
+
+               /*
+                * Otherwise place it onto the free list.
+                */
+               list_add_tail(&entry->list, &cqm_rmid_free_lru);
+       }
+
+       return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+       struct perf_event *rotor;
+       u32 rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       rotor = list_first_entry(&cache_groups, struct perf_event,
+                                hw.cqm_groups_entry);
+
+       /*
+        * The group at the front of the list should always have a valid
+        * RMID. If it doesn't then no groups have RMIDs assigned and we
+        * don't need to rotate the list.
+        */
+       if (next == rotor)
+               return;
+
+       rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+       __put_rmid(rmid);
+
+       list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+       struct perf_event *group, *g;
+       u32 rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+               if (group == event)
+                       continue;
+
+               rmid = group->hw.cqm_rmid;
+
+               /*
+                * Skip events that don't have a valid RMID.
+                */
+               if (!__rmid_valid(rmid))
+                       continue;
+
+               /*
+                * No conflict? No problem! Leave the event alone.
+                */
+               if (!__conflict_event(group, event))
+                       continue;
+
+               intel_cqm_xchg_rmid(group, INVALID_RMID);
+               __put_rmid(rmid);
+       }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons:
+ *   1. To handle the scheduling of conflicting events
+ *   2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There are problems with the hardware interface; when you change the
+ * task:RMID map, cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensures that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+       struct perf_event *group, *start = NULL;
+       unsigned int threshold_limit;
+       unsigned int nr_needed = 0;
+       unsigned int nr_available;
+       bool rotated = false;
+
+       mutex_lock(&cache_mutex);
+
+again:
+       /*
+        * Fast path through this function if there are no groups and no
+        * RMIDs that need cleaning.
+        */
+       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+               goto out;
+
+       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+               if (!__rmid_valid(group->hw.cqm_rmid)) {
+                       if (!start)
+                               start = group;
+                       nr_needed++;
+               }
+       }
+
+       /*
+        * We have some event groups, but they all have RMIDs assigned
+        * and no RMIDs need cleaning.
+        */
+       if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+               goto out;
+
+       if (!nr_needed)
+               goto stabilize;
+
+       /*
+        * We have more event groups without RMIDs than available RMIDs,
+        * or we have event groups that conflict with the ones currently
+        * scheduled.
+        *
+        * We force deallocate the rmid of the group at the head of
+        * cache_groups. The first event group without an RMID then gets
+        * assigned intel_cqm_rotation_rmid. This ensures we always make
+        * forward progress.
+        *
+        * Rotate the cache_groups list so the previous head is now the
+        * tail.
+        */
+       __intel_cqm_pick_and_rotate(start);
+
+       /*
+        * If the rotation is going to succeed, reduce the threshold so
+        * that we don't needlessly reuse dirty RMIDs.
+        */
+       if (__rmid_valid(intel_cqm_rotation_rmid)) {
+               intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+               intel_cqm_rotation_rmid = __get_rmid();
+
+               intel_cqm_sched_out_conflicting_events(start);
+
+               if (__intel_cqm_threshold)
+                       __intel_cqm_threshold--;
+       }
+
+       rotated = true;
+
+stabilize:
+       /*
+        * We now need to stabilize the RMID we freed above (if any) to
+        * ensure that the next time we rotate we have an RMID with zero
+        * occupancy value.
+        *
+        * Alternatively, if we didn't need to perform any rotation,
+        * we'll have a bunch of RMIDs in limbo that need stabilizing.
+        */
+       threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+       while (intel_cqm_rmid_stabilize(&nr_available) &&
+              __intel_cqm_threshold < threshold_limit) {
+               unsigned int steal_limit;
+
+               /*
+                * Don't spin if nobody is actively waiting for an RMID,
+                * the rotation worker will be kicked as soon as an
+                * event needs an RMID anyway.
+                */
+               if (!nr_needed)
+                       break;
+
+               /* Allow max 25% of RMIDs to be in limbo. */
+               steal_limit = (cqm_max_rmid + 1) / 4;
+
+               /*
+                * We failed to stabilize any RMIDs so our rotation
+                * logic is now stuck. In order to make forward progress
+                * we have a few options:
+                *
+                *   1. rotate ("steal") another RMID
+                *   2. increase the threshold
+                *   3. do nothing
+                *
+                * We do both of 1. and 2. until we hit the steal limit.
+                *
+                * The steal limit prevents all RMIDs ending up on the
+                * limbo list. This can happen if every RMID has a
+                * non-zero occupancy above threshold_limit, and the
+                * occupancy values aren't dropping fast enough.
+                *
+                * Note that there is prioritisation at work here - we'd
+                * rather increase the number of RMIDs on the limbo list
+                * than increase the threshold, because increasing the
+                * threshold skews the event data (because we reuse
+                * dirty RMIDs) - threshold bumps are a last resort.
+                */
+               if (nr_available < steal_limit)
+                       goto again;
+
+               __intel_cqm_threshold++;
+       }
+
+out:
+       mutex_unlock(&cache_mutex);
+       return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+       unsigned long delay;
+
+       __intel_cqm_rmid_rotate();
+
+       delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+       schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+                                 struct perf_event **group)
+{
+       struct perf_event *iter;
+       bool conflict = false;
+       u32 rmid;
+
+       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+               rmid = iter->hw.cqm_rmid;
+
+               if (__match_event(iter, event)) {
+                       /* All tasks in a group share an RMID */
+                       event->hw.cqm_rmid = rmid;
+                       *group = iter;
+                       return;
+               }
+
+               /*
+                * We only care about conflicts for events that are
+                * actually scheduled in (and hence have a valid RMID).
+                */
+               if (__conflict_event(iter, event) && __rmid_valid(rmid))
+                       conflict = true;
+       }
+
+       if (conflict)
+               rmid = INVALID_RMID;
+       else
+               rmid = __get_rmid();
+
+       event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+       unsigned long flags;
+       u32 rmid;
+       u64 val;
+
+       /*
+        * Task events are handled by intel_cqm_event_count().
+        */
+       if (event->cpu == -1)
+               return;
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+       rmid = event->hw.cqm_rmid;
+
+       if (!__rmid_valid(rmid))
+               goto out;
+
+       val = __rmid_read(rmid);
+
+       /*
+        * Ignore this reading on error states and do not update the value.
+        */
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               goto out;
+
+       local64_set(&event->count, val);
+out:
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+       struct rmid_read *rr = info;
+       u64 val;
+
+       val = __rmid_read(rr->rmid);
+
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               return;
+
+       atomic64_add(val, &rr->value);
+}
+
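+/*
+ * Group leaders are the events linked on cache_groups via
+ * hw.cqm_groups_entry; member events are instead linked on the
+ * leader's hw.cqm_group_entry list (see intel_cqm_event_init()).
+ */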
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+       return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+       unsigned long flags;
+       struct rmid_read rr = {
+               .value = ATOMIC64_INIT(0),
+       };
+
+       /*
+        * We only need to worry about task events. System-wide events
+        * are handled like usual, i.e. entirely with
+        * intel_cqm_event_read().
+        */
+       if (event->cpu != -1)
+               return __perf_event_count(event);
+
+       /*
+        * Only the group leader gets to report values. This stops us
+        * reporting duplicate values to userspace, and gives us a clear
+        * rule for which task gets to report the values.
+        *
+        * Note that it is impossible to attribute these values to
+        * specific packages - we forfeit that ability when we create
+        * task events.
+        */
+       if (!cqm_group_leader(event))
+               return 0;
+
+       /*
+        * Getting up-to-date values requires an SMP IPI which is not
+        * possible if we're being called in interrupt context. Return
+        * the cached values instead.
+        */
+       if (unlikely(in_interrupt()))
+               goto out;
+
+       /*
+        * Notice that we don't perform the reading of an RMID
+        * atomically, because we can't hold a spin lock across the
+        * IPIs.
+        *
+        * Speculatively perform the read, since @event might be
+        * assigned a different (possibly invalid) RMID while we're
+        * busy performing the IPI calls. It's therefore necessary to
+        * check @event's RMID afterwards, and if it has changed,
+        * discard the result of the read.
+        */
+       rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+       if (!__rmid_valid(rr.rmid))
+               goto out;
+
+       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+       if (event->hw.cqm_rmid == rr.rmid)
+               local64_set(&event->count, atomic64_read(&rr.value));
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+       return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+       u32 rmid = event->hw.cqm_rmid;
+
+       if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+               return;
+
+       event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+       if (state->rmid_usecnt++) {
+               if (!WARN_ON_ONCE(state->rmid != rmid))
+                       return;
+       } else {
+               WARN_ON_ONCE(state->rmid);
+       }
+
+       state->rmid = rmid;
+       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+       if (event->hw.cqm_state & PERF_HES_STOPPED)
+               return;
+
+       event->hw.cqm_state |= PERF_HES_STOPPED;
+
+       intel_cqm_event_read(event);
+
+       if (!--state->rmid_usecnt) {
+               state->rmid = 0;
+               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
+       } else {
+               WARN_ON_ONCE(!state->rmid);
+       }
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+       unsigned long flags;
+       u32 rmid;
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+
+       event->hw.cqm_state = PERF_HES_STOPPED;
+       rmid = event->hw.cqm_rmid;
+
+       if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+               intel_cqm_event_start(event, mode);
+
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+       return 0;
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+       struct perf_event *group_other = NULL;
+
+       mutex_lock(&cache_mutex);
+
+       /*
+        * If there's another event in this group...
+        */
+       if (!list_empty(&event->hw.cqm_group_entry)) {
+               group_other = list_first_entry(&event->hw.cqm_group_entry,
+                                              struct perf_event,
+                                              hw.cqm_group_entry);
+               list_del(&event->hw.cqm_group_entry);
+       }
+
+       /*
+        * And we're the group leader...
+        */
+       if (cqm_group_leader(event)) {
+               /*
+                * If there was a group_other, make that leader, otherwise
+                * destroy the group and return the RMID.
+                */
+               if (group_other) {
+                       list_replace(&event->hw.cqm_groups_entry,
+                                    &group_other->hw.cqm_groups_entry);
+               } else {
+                       u32 rmid = event->hw.cqm_rmid;
+
+                       if (__rmid_valid(rmid))
+                               __put_rmid(rmid);
+                       list_del(&event->hw.cqm_groups_entry);
+               }
+       }
+
+       mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+       struct perf_event *group = NULL;
+       bool rotate = false;
+
+       if (event->attr.type != intel_cqm_pmu.type)
+               return -ENOENT;
+
+       if (event->attr.config & ~QOS_EVENT_MASK)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+       INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+       event->destroy = intel_cqm_event_destroy;
+
+       mutex_lock(&cache_mutex);
+
+       /* Will also set rmid */
+       intel_cqm_setup_event(event, &group);
+
+       if (group) {
+               list_add_tail(&event->hw.cqm_group_entry,
+                             &group->hw.cqm_group_entry);
+       } else {
+               list_add_tail(&event->hw.cqm_groups_entry,
+                             &cache_groups);
+
+               /*
+                * All RMIDs are either in use or have recently been
+                * used. Kick the rotation worker to clean/free some.
+                *
+                * We only do this for the group leader, rather than for
+                * every event in a group to save on needless work.
+                */
+               if (!__rmid_valid(event->hw.cqm_rmid))
+                       rotate = true;
+       }
+
+       mutex_unlock(&cache_mutex);
+
+       if (rotate)
+               schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+       return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+       EVENT_PTR(intel_cqm_llc),
+       EVENT_PTR(intel_cqm_llc_pkg),
+       EVENT_PTR(intel_cqm_llc_unit),
+       EVENT_PTR(intel_cqm_llc_scale),
+       EVENT_PTR(intel_cqm_llc_snapshot),
+       NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+       .name = "events",
+       .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+       .name = "format",
+       .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+                          char *page)
+{
+       ssize_t rv;
+
+       mutex_lock(&cache_mutex);
+       rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+       mutex_unlock(&cache_mutex);
+
+       return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+                           struct device_attribute *attr,
+                           const char *buf, size_t count)
+{
+       unsigned int bytes, cachelines;
+       int ret;
+
+       ret = kstrtouint(buf, 0, &bytes);
+       if (ret)
+               return ret;
+
+       mutex_lock(&cache_mutex);
+
+       __intel_cqm_max_threshold = bytes;
+       cachelines = bytes / cqm_l3_scale;
+
+       /*
+        * The new maximum takes effect immediately.
+        */
+       if (__intel_cqm_threshold > cachelines)
+               __intel_cqm_threshold = cachelines;
+
+       mutex_unlock(&cache_mutex);
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+       &dev_attr_max_recycle_threshold.attr,
+       NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+       .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+       &intel_cqm_events_group,
+       &intel_cqm_format_group,
+       &intel_cqm_group,
+       NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+       .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+       .attr_groups         = intel_cqm_attr_groups,
+       .task_ctx_nr         = perf_sw_context,
+       .event_init          = intel_cqm_event_init,
+       .add                 = intel_cqm_event_add,
+       .del                 = intel_cqm_event_stop,
+       .start               = intel_cqm_event_start,
+       .stop                = intel_cqm_event_stop,
+       .read                = intel_cqm_event_read,
+       .count               = intel_cqm_event_count,
+};
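+
+/*
+ * Once registered, the counter is typically consumed system-wide, e.g.:
+ *
+ *     perf stat -a -e intel_cqm/llc_occupancy/ -- sleep 1
+ *
+ * (exact event syntax depends on the perf tool version in use).
+ */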
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+       int reader;
+
+       /* First online cpu in package becomes the reader */
+       reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
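+       /* cpumask_any_and() returns >= nr_cpu_ids if no reader exists yet */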
+       if (reader >= nr_cpu_ids)
+               cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_starting(unsigned int cpu)
+{
+       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+       state->rmid = 0;
+       state->closid = 0;
+       state->rmid_usecnt = 0;
+
+       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+       int target;
+
+       /* Is @cpu the current cqm reader for this package? */
+       if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+               return;
+
+       /* Find another online reader in this package */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       if (target < nr_cpu_ids)
+               cpumask_set_cpu(target, &cqm_cpumask);
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+                                 unsigned long action, void *hcpu)
+{
+       unsigned int cpu  = (unsigned long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               intel_cqm_cpu_exit(cpu);
+               break;
+       case CPU_STARTING:
+               intel_cqm_cpu_starting(cpu);
+               cqm_pick_event_reader(cpu);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+       { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+       {}
+};
+
+static int __init intel_cqm_init(void)
+{
+       char *str, scale[20];
+       int i, cpu, ret;
+
+       if (!x86_match_cpu(intel_cqm_match))
+               return -ENODEV;
+
+       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+       /*
+        * It's possible that not all resources support the same number
+        * of RMIDs. Instead of making scheduling much more complicated
+        * (where we have to match a task's RMID to a cpu that supports
+        * that many RMIDs) just find the minimum RMIDs supported across
+        * all cpus.
+        *
+        * Also, check that the scales match on all cpus.
+        */
+       cpu_notifier_register_begin();
+
+       for_each_online_cpu(cpu) {
+               struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+               if (c->x86_cache_max_rmid < cqm_max_rmid)
+                       cqm_max_rmid = c->x86_cache_max_rmid;
+
+               if (c->x86_cache_occ_scale != cqm_l3_scale) {
+                       pr_err("Multiple LLC scale values, disabling\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * A reasonable upper limit on the max threshold is the number
+        * of lines tagged per RMID if all RMIDs have the same number of
+        * lines tagged in the LLC.
+        *
+        * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+        */
+       __intel_cqm_max_threshold =
+               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
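+       /*
+        * E.g. a 35 MB (35840 KB) LLC with 56 RMIDs gives
+        * 35840 * 1024 / 56 = 655360 bytes (~640 KB) per RMID,
+        * i.e. roughly 1.8% of the LLC, as noted above.
+        */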
+
+       snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+       str = kstrdup(scale, GFP_KERNEL);
+       if (!str) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       event_attr_intel_cqm_llc_scale.event_str = str;
+
+       ret = intel_cqm_setup_rmid_cache();
+       if (ret)
+               goto out;
+
+       for_each_online_cpu(i) {
+               intel_cqm_cpu_starting(i);
+               cqm_pick_event_reader(i);
+       }
+
+       __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+       if (ret)
+               pr_err("Intel CQM perf registration failed: %d\n", ret);
+       else
+               pr_info("Intel CQM monitoring enabled\n");
+
+out:
+       cpu_notifier_register_done();
+
+       return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
new file mode 100644 (file)
index 0000000..7946c42
--- /dev/null
@@ -0,0 +1,694 @@
+/*
+ * perf_event_intel_cstate.c: support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file exports cstate-related free running (read-only) counters
+ * for perf. These counters may be used simultaneously by other tools,
+ * such as turbostat. However, it still makes sense to implement them
+ * in perf, because we can conveniently collect them together with
+ * other events, and allow tools to use them without special MSR
+ * access code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ *  - 'cstate_core': The counter is available for each physical core.
+ *    The counters include CORE_C*_RESIDENCY.
+ *  - 'cstate_pkg': The counter is available for each physical package.
+ *    The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer's Manual, Vol. 3b.
+ *
+ * Model specific counters:
+ *     MSR_CORE_C1_RES: CORE C1 Residency Counter
+ *                      perf code: 0x00
+ *                      Available model: SLM,AMT
+ *                      Scope: Core (each processor core has a MSR)
+ *     MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ *                            perf code: 0x01
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ *                            perf code: 0x02
+ *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ *                            perf code: 0x03
+ *                            Available model: SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
+ *                            perf code: 0x00
+ *                            Available model: SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
+ *                            perf code: 0x01
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
+ *                            perf code: 0x02
+ *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
+ *                            perf code: 0x03
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
+ *                            perf code: 0x04
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
+ *                            perf code: 0x05
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ *                            perf code: 0x06
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *
+ */
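As a rough illustration of how these free-running counters might be consumed once the PMUs below are registered, the hedged user-space sketch here opens one cstate_core event system-wide on CPU 0 via perf_event_open(). The PMU name, the sysfs "type" file and the c6-residency config value 0x02 follow from the description above; the CPU choice, error handling and one-second interval are illustrative only.

/* Hypothetical user-space reader for a cstate residency event (sketch only). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long count;
        FILE *f;
        int type, fd;

        /* The dynamic PMU type is published in sysfs once the PMU registers. */
        f = fopen("/sys/bus/event_source/devices/cstate_core/type", "r");
        if (!f)
                return 1;
        if (fscanf(f, "%d", &type) != 1) {
                fclose(f);
                return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0x02;             /* c6-residency, per the event list above */

        /* System-wide on CPU 0; sampling would be rejected by the driver. */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        if (fd < 0)
                return 1;

        sleep(1);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("core C6 residency delta: %llu\n", count);
        close(fd);
        return 0;
}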
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format)                \
+static ssize_t __cstate_##_var##_show(struct kobject *kobj,    \
+                               struct kobj_attribute *attr,    \
+                               char *page)                     \
+{                                                              \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
+       return sprintf(page, _format "\n");                     \
+}                                                              \
+static struct kobj_attribute format_attr_##_var =              \
+       __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf);
+
+struct perf_cstate_msr {
+       u64     msr;
+       struct  perf_pmu_events_attr *attr;
+       bool    (*test)(int idx);
+};
+
+
+/* cstate_core PMU */
+
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_id {
+       /*
+        * cstate_core events
+        */
+       PERF_CSTATE_CORE_C1_RES = 0,
+       PERF_CSTATE_CORE_C3_RES,
+       PERF_CSTATE_CORE_C6_RES,
+       PERF_CSTATE_CORE_C7_RES,
+
+       PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+bool test_core(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES ||
+                   idx == PERF_CSTATE_CORE_C7_RES)
+                       return true;
+               break;
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_CSTATE_CORE_C1_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
+
+static struct perf_cstate_msr core_msr[] = {
+       [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES,          &evattr_cstate_core_c1, test_core, },
+       [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY,    &evattr_cstate_core_c3, test_core, },
+       [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY,    &evattr_cstate_core_c6, test_core, },
+       [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY,    &evattr_cstate_core_c7, test_core, },
+};
+
+static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group core_events_attr_group = {
+       .name = "events",
+       .attrs = core_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
+static struct attribute *core_format_attrs[] = {
+       &format_attr_core_event.attr,
+       NULL,
+};
+
+static struct attribute_group core_format_attr_group = {
+       .name = "format",
+       .attrs = core_format_attrs,
+};
+
+static cpumask_t cstate_core_cpu_mask;
+static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
+
+static struct attribute *cstate_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+       .attrs = cstate_cpumask_attrs,
+};
+
+static const struct attribute_group *core_attr_groups[] = {
+       &core_events_attr_group,
+       &core_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+/* cstate_core PMU end */
+
+
+/* cstate_pkg PMU */
+
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_id {
+       /*
+        * cstate_pkg events
+        */
+       PERF_CSTATE_PKG_C2_RES = 0,
+       PERF_CSTATE_PKG_C3_RES,
+       PERF_CSTATE_PKG_C6_RES,
+       PERF_CSTATE_PKG_C7_RES,
+       PERF_CSTATE_PKG_C8_RES,
+       PERF_CSTATE_PKG_C9_RES,
+       PERF_CSTATE_PKG_C10_RES,
+
+       PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+bool test_pkg(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               if (idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES)
+                       return true;
+               break;
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_CSTATE_PKG_C2_RES ||
+                   idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES)
+                       return true;
+               break;
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_CSTATE_PKG_C6_RES)
+                       return true;
+               break;
+       case 69: /* 22nm Haswell ULT */
+               if (idx == PERF_CSTATE_PKG_C2_RES ||
+                   idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES ||
+                   idx == PERF_CSTATE_PKG_C8_RES ||
+                   idx == PERF_CSTATE_PKG_C9_RES ||
+                   idx == PERF_CSTATE_PKG_C10_RES)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
+
+static struct perf_cstate_msr pkg_msr[] = {
+       [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY,      &evattr_cstate_pkg_c2,  test_pkg, },
+       [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY,      &evattr_cstate_pkg_c3,  test_pkg, },
+       [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY,      &evattr_cstate_pkg_c6,  test_pkg, },
+       [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY,      &evattr_cstate_pkg_c7,  test_pkg, },
+       [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY,      &evattr_cstate_pkg_c8,  test_pkg, },
+       [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY,      &evattr_cstate_pkg_c9,  test_pkg, },
+       [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY,    &evattr_cstate_pkg_c10, test_pkg, },
+};
+
+static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group pkg_events_attr_group = {
+       .name = "events",
+       .attrs = pkg_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
+static struct attribute *pkg_format_attrs[] = {
+       &format_attr_pkg_event.attr,
+       NULL,
+};
+static struct attribute_group pkg_format_attr_group = {
+       .name = "format",
+       .attrs = pkg_format_attrs,
+};
+
+static cpumask_t cstate_pkg_cpu_mask;
+
+static const struct attribute_group *pkg_attr_groups[] = {
+       &pkg_events_attr_group,
+       &pkg_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+/* cstate_pkg PMU end */
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       if (pmu == &cstate_core_pmu)
+               return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
+       else if (pmu == &cstate_pkg_pmu)
+               return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+       else
+               return 0;
+}
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config;
+       int ret = 0;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       if (event->pmu == &cstate_core_pmu) {
+               if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+                       return -EINVAL;
+               if (!core_msr[cfg].attr)
+                       return -EINVAL;
+               event->hw.event_base = core_msr[cfg].msr;
+       } else if (event->pmu == &cstate_pkg_pmu) {
+               if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+                       return -EINVAL;
+               if (!pkg_msr[cfg].attr)
+                       return -EINVAL;
+               event->hw.event_base = pkg_msr[cfg].msr;
+       } else
+               return -ENOENT;
+
+       /* must be done before validate_group */
+       event->hw.config = cfg;
+       event->hw.idx = -1;
+
+       return ret;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+       u64 val;
+
+       rdmsrl(event->hw.event_base, val);
+       return val;
+}
+
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev_raw_count, new_raw_count;
+
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       new_raw_count = cstate_pmu_read_counter(event);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                           new_raw_count) != prev_raw_count)
+               goto again;
+
+       local64_add(new_raw_count - prev_raw_count, &event->count);
+}
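The update path above is the usual pattern for a free-running counter: remember the previous raw value, read the new one, and publish only the delta, retrying if a concurrent updater moved prev_count in between. A minimal user-space analogue of that pattern, with a fake monotonic counter standing in for the MSR read and all names purely illustrative:

/* Hedged sketch of the prev_count/cmpxchg accumulation pattern used above. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t prev_count;
static _Atomic uint64_t total;

/* Stand-in for cstate_pmu_read_counter()/rdmsrl(): any monotonic counter. */
static uint64_t read_counter(void)
{
        static _Atomic uint64_t fake;
        return atomic_fetch_add(&fake, 7) + 7;
}

static void update(void)
{
        uint64_t prev, now;

        do {
                prev = atomic_load(&prev_count);
                now = read_counter();
                /* Retry if a concurrent updater moved prev_count under us. */
        } while (!atomic_compare_exchange_weak(&prev_count, &prev, now));

        atomic_fetch_add(&total, now - prev);
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                update();
        printf("accumulated delta: %llu\n",
               (unsigned long long)atomic_load(&total));
        return 0;
}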
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+       local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+       cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+       cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+       if (mode & PERF_EF_START)
+               cstate_pmu_event_start(event, mode);
+
+       return 0;
+}
+
+static void cstate_cpu_exit(int cpu)
+{
+       int i, id, target;
+
+       /* cpu exit for cstate core */
+       if (has_cstate_core) {
+               id = topology_core_id(cpu);
+               target = -1;
+
+               for_each_online_cpu(i) {
+                       if (i == cpu)
+                               continue;
+                       if (id == topology_core_id(i)) {
+                               target = i;
+                               break;
+                       }
+               }
+               if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
+                       cpumask_set_cpu(target, &cstate_core_cpu_mask);
+               WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
+               if (target >= 0)
+                       perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+       }
+
+       /* cpu exit for cstate pkg */
+       if (has_cstate_pkg) {
+               id = topology_physical_package_id(cpu);
+               target = -1;
+
+               for_each_online_cpu(i) {
+                       if (i == cpu)
+                               continue;
+                       if (id == topology_physical_package_id(i)) {
+                               target = i;
+                               break;
+                       }
+               }
+               if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
+                       cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
+               WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
+               if (target >= 0)
+                       perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+       }
+}
+
+static void cstate_cpu_init(int cpu)
+{
+       int i, id;
+
+       /* cpu init for cstate core */
+       if (has_cstate_core) {
+               id = topology_core_id(cpu);
+               for_each_cpu(i, &cstate_core_cpu_mask) {
+                       if (id == topology_core_id(i))
+                               break;
+               }
+               if (i >= nr_cpu_ids)
+                       cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+       }
+
+       /* cpu init for cstate pkg */
+       if (has_cstate_pkg) {
+               id = topology_physical_package_id(cpu);
+               for_each_cpu(i, &cstate_pkg_cpu_mask) {
+                       if (id == topology_physical_package_id(i))
+                               break;
+               }
+               if (i >= nr_cpu_ids)
+                       cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+       }
+}
+
+static int cstate_cpu_notifier(struct notifier_block *self,
+                                 unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               break;
+       case CPU_STARTING:
+               cstate_cpu_init(cpu);
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_DYING:
+               break;
+       case CPU_ONLINE:
+       case CPU_DEAD:
+               break;
+       case CPU_DOWN_PREPARE:
+               cstate_cpu_exit(cpu);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Probe the cstate events and insert the available ones into the sysfs attrs.
+ * Return false if there are no available events.
+ */
+static bool cstate_probe_msr(struct perf_cstate_msr *msr,
+                            struct attribute   **events_attrs,
+                            int max_event_nr)
+{
+       int i, j = 0;
+       u64 val;
+
+       /* Probe the cstate events. */
+       for (i = 0; i < max_event_nr; i++) {
+               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+                       msr[i].attr = NULL;
+       }
+
+       /* List remaining events in the sysfs attrs. */
+       for (i = 0; i < max_event_nr; i++) {
+               if (msr[i].attr)
+                       events_attrs[j++] = &msr[i].attr->attr.attr;
+       }
+       events_attrs[j] = NULL;
+
+       return (j > 0) ? true : false;
+}
+
+static int __init cstate_init(void)
+{
+       /* SLM has different MSR for PKG C6 */
+       switch (boot_cpu_data.x86_model) {
+       case 55:
+       case 76:
+       case 77:
+               pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+       }
+
+       if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
+               has_cstate_core = true;
+
+       if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
+               has_cstate_pkg = true;
+
+       return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static void __init cstate_cpumask_init(void)
+{
+       int cpu;
+
+       cpu_notifier_register_begin();
+
+       for_each_online_cpu(cpu)
+               cstate_cpu_init(cpu);
+
+       __perf_cpu_notifier(cstate_cpu_notifier);
+
+       cpu_notifier_register_done();
+}
+
+static struct pmu cstate_core_pmu = {
+       .attr_groups    = core_attr_groups,
+       .name           = "cstate_core",
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = cstate_pmu_event_init,
+       .add            = cstate_pmu_event_add, /* must have */
+       .del            = cstate_pmu_event_del, /* must have */
+       .start          = cstate_pmu_event_start,
+       .stop           = cstate_pmu_event_stop,
+       .read           = cstate_pmu_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct pmu cstate_pkg_pmu = {
+       .attr_groups    = pkg_attr_groups,
+       .name           = "cstate_pkg",
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = cstate_pmu_event_init,
+       .add            = cstate_pmu_event_add, /* must have */
+       .del            = cstate_pmu_event_del, /* must have */
+       .start          = cstate_pmu_event_start,
+       .stop           = cstate_pmu_event_stop,
+       .read           = cstate_pmu_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static void __init cstate_pmus_register(void)
+{
+       int err;
+
+       if (has_cstate_core) {
+               err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+               if (WARN_ON(err))
+                       pr_info("Failed to register PMU %s error %d\n",
+                               cstate_core_pmu.name, err);
+       }
+
+       if (has_cstate_pkg) {
+               err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
+               if (WARN_ON(err))
+                       pr_info("Failed to register PMU %s error %d\n",
+                               cstate_pkg_pmu.name, err);
+       }
+}
+
+static int __init cstate_pmu_init(void)
+{
+       int err;
+
+       if (cpu_has_hypervisor)
+               return -ENODEV;
+
+       err = cstate_init();
+       if (err)
+               return err;
+
+       cstate_cpumask_init();
+
+       cstate_pmus_register();
+
+       return 0;
+}
+
+device_initcall(cstate_pmu_init);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
new file mode 100644 (file)
index 0000000..ce7211a
--- /dev/null
@@ -0,0 +1,1410 @@
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE                24
+
+#define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
+#define PEBS_FIXUP_SIZE                PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+       u32 flags, ip;
+       u32 ax, bc, cx, dx;
+       u32 si, di, bp, sp;
+};
+
+ */
+
+union intel_x86_pebs_dse {
+       u64 val;
+       struct {
+               unsigned int ld_dse:4;
+               unsigned int ld_stlb_miss:1;
+               unsigned int ld_locked:1;
+               unsigned int ld_reserved:26;
+       };
+       struct {
+               unsigned int st_l1d_hit:1;
+               unsigned int st_reserved1:3;
+               unsigned int st_stlb_miss:1;
+               unsigned int st_locked:1;
+               unsigned int st_reserved2:26;
+       };
+};
+
+
+/*
+ * Map PEBS Load Latency Data Source encodings to generic
+ * memory data source information
+ */
+#define P(a, b) PERF_MEM_S(a, b)
+#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
+
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
+       P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+       OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
+       OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
+       OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
+       OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
+       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
+       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+       OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
+       OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
+       OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
+       OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
+       OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
+       OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
+
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+       pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
+       pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+       pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+}
+
+static u64 precise_store_data(u64 status)
+{
+       union intel_x86_pebs_dse dse;
+       u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
+
+       dse.val = status;
+
+       /*
+        * bit 4: TLB access
+        * 1 = the store missed the 2nd level TLB
+        *
+        * so the translation was either served by the page walker or the OS;
+        * otherwise the store hit the 2nd level TLB
+        */
+       if (dse.st_stlb_miss)
+               val |= P(TLB, MISS);
+       else
+               val |= P(TLB, HIT);
+
+       /*
+        * bit 0: hit L1 data cache
+        * if not set, then all we know is that
+        * it missed L1D
+        */
+       if (dse.st_l1d_hit)
+               val |= P(LVL, HIT);
+       else
+               val |= P(LVL, MISS);
+
+       /*
+        * bit 5: Locked prefix
+        */
+       if (dse.st_locked)
+               val |= P(LOCK, LOCKED);
+
+       return val;
+}
+
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
+{
+       union perf_mem_data_src dse;
+
+       dse.val = PERF_MEM_NA;
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+               dse.mem_op = PERF_MEM_OP_STORE;
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+               dse.mem_op = PERF_MEM_OP_LOAD;
+
+       /*
+        * L1 info is only valid for the following events:
+        *
+        * MEM_UOPS_RETIRED.STLB_MISS_STORES
+        * MEM_UOPS_RETIRED.LOCK_STORES
+        * MEM_UOPS_RETIRED.SPLIT_STORES
+        * MEM_UOPS_RETIRED.ALL_STORES
+        */
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+               if (status & 1)
+                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+               else
+                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+       }
+       return dse.val;
+}
+
+static u64 load_latency_data(u64 status)
+{
+       union intel_x86_pebs_dse dse;
+       u64 val;
+       int model = boot_cpu_data.x86_model;
+       int fam = boot_cpu_data.x86;
+
+       dse.val = status;
+
+       /*
+        * use the mapping table for bit 0-3
+        */
+       val = pebs_data_source[dse.ld_dse];
+
+       /*
+        * Nehalem models do not provide the TLB or lock information
+        */
+       if (fam == 0x6 && (model == 26 || model == 30
+           || model == 31 || model == 46)) {
+               val |= P(TLB, NA) | P(LOCK, NA);
+               return val;
+       }
+       /*
+        * bit 4: TLB access
+        * 0 = did not miss 2nd level TLB
+        * 1 = missed 2nd level TLB
+        */
+       if (dse.ld_stlb_miss)
+               val |= P(TLB, MISS) | P(TLB, L2);
+       else
+               val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+       /*
+        * bit 5: locked prefix
+        */
+       if (dse.ld_locked)
+               val |= P(LOCK, LOCKED);
+
+       return val;
+}
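As a small worked example of the decode above (user-space, illustrative only, with the load-latency bit layout copied from union intel_x86_pebs_dse): a raw status of 0x23 carries dse index 0x3, a clear stlb_miss bit and a set locked bit, so load_latency_data() would start from the 0x03 "L2 hit" entry of pebs_data_source[] and then OR in the TLB-hit and LOCKED bits.

/* Hedged illustration of the dse bit layout used above. */
#include <stdio.h>
#include <stdint.h>

/* Load-latency view copied from union intel_x86_pebs_dse above. */
union pebs_dse {
        uint64_t val;
        struct {
                unsigned int ld_dse:4;
                unsigned int ld_stlb_miss:1;
                unsigned int ld_locked:1;
                unsigned int ld_reserved:26;
        };
};

int main(void)
{
        union pebs_dse dse = { .val = 0x23 };   /* 0b100011 */

        printf("source index 0x%x, stlb_miss %u, locked %u\n",
               dse.ld_dse, dse.ld_stlb_miss, dse.ld_locked);
        /* load_latency_data() would map index 0x3 via pebs_data_source[]
         * (L2 hit), add the TLB-hit bits because stlb_miss is 0, and add
         * P(LOCK, LOCKED) because the locked bit is 1. */
        return 0;
}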
+
+struct pebs_record_core {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+};
+
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+       struct {
+               u32 cycles_last_block     : 32,
+                   hle_abort             : 1,
+                   rtm_abort             : 1,
+                   instruction_abort     : 1,
+                   non_instruction_abort : 1,
+                   retry                 : 1,
+                   data_conflict         : 1,
+                   capacity_writes       : 1,
+                   capacity_reads        : 1;
+       };
+       u64         value;
+};
+
+#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
+
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+       u64 tsc;
+};
+
+void init_debug_store_on_cpu(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                    (u32)((u64)(unsigned long)ds),
+                    (u32)((u64)(unsigned long)ds >> 32));
+}
+
+void fini_debug_store_on_cpu(int cpu)
+{
+       if (!per_cpu(cpu_hw_events, cpu).ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static DEFINE_PER_CPU(void *, insn_buffer);
+
+static int alloc_pebs_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       int node = cpu_to_node(cpu);
+       int max;
+       void *buffer, *ibuffer;
+
+       if (!x86_pmu.pebs)
+               return 0;
+
+       buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+       if (unlikely(!buffer))
+               return -ENOMEM;
+
+       /*
+        * HSW+ already provides us the eventing ip; no need to allocate this
+        * buffer then.
+        */
+       if (x86_pmu.intel_cap.pebs_format < 2) {
+               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+               if (!ibuffer) {
+                       kfree(buffer);
+                       return -ENOMEM;
+               }
+               per_cpu(insn_buffer, cpu) = ibuffer;
+       }
+
+       max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
+
+       ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+       ds->pebs_index = ds->pebs_buffer_base;
+       ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+               max * x86_pmu.pebs_record_size;
+
+       return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds || !x86_pmu.pebs)
+               return;
+
+       kfree(per_cpu(insn_buffer, cpu));
+       per_cpu(insn_buffer, cpu) = NULL;
+
+       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       ds->pebs_buffer_base = 0;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       int node = cpu_to_node(cpu);
+       int max, thresh;
+       void *buffer;
+
+       if (!x86_pmu.bts)
+               return 0;
+
+       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+       if (unlikely(!buffer)) {
+               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+               return -ENOMEM;
+       }
+
+       max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+       thresh = max / 16;
+
+       ds->bts_buffer_base = (u64)(unsigned long)buffer;
+       ds->bts_index = ds->bts_buffer_base;
+       ds->bts_absolute_maximum = ds->bts_buffer_base +
+               max * BTS_RECORD_SIZE;
+       ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+               thresh * BTS_RECORD_SIZE;
+
+       return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds || !x86_pmu.bts)
+               return;
+
+       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       ds->bts_buffer_base = 0;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+       int node = cpu_to_node(cpu);
+       struct debug_store *ds;
+
+       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+       if (unlikely(!ds))
+               return -ENOMEM;
+
+       per_cpu(cpu_hw_events, cpu).ds = ds;
+
+       return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds)
+               return;
+
+       per_cpu(cpu_hw_events, cpu).ds = NULL;
+       kfree(ds);
+}
+
+void release_ds_buffers(void)
+{
+       int cpu;
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               fini_debug_store_on_cpu(cpu);
+
+       for_each_possible_cpu(cpu) {
+               release_pebs_buffer(cpu);
+               release_bts_buffer(cpu);
+               release_ds_buffer(cpu);
+       }
+       put_online_cpus();
+}
+
+void reserve_ds_buffers(void)
+{
+       int bts_err = 0, pebs_err = 0;
+       int cpu;
+
+       x86_pmu.bts_active = 0;
+       x86_pmu.pebs_active = 0;
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       if (!x86_pmu.bts)
+               bts_err = 1;
+
+       if (!x86_pmu.pebs)
+               pebs_err = 1;
+
+       get_online_cpus();
+
+       for_each_possible_cpu(cpu) {
+               if (alloc_ds_buffer(cpu)) {
+                       bts_err = 1;
+                       pebs_err = 1;
+               }
+
+               if (!bts_err && alloc_bts_buffer(cpu))
+                       bts_err = 1;
+
+               if (!pebs_err && alloc_pebs_buffer(cpu))
+                       pebs_err = 1;
+
+               if (bts_err && pebs_err)
+                       break;
+       }
+
+       if (bts_err) {
+               for_each_possible_cpu(cpu)
+                       release_bts_buffer(cpu);
+       }
+
+       if (pebs_err) {
+               for_each_possible_cpu(cpu)
+                       release_pebs_buffer(cpu);
+       }
+
+       if (bts_err && pebs_err) {
+               for_each_possible_cpu(cpu)
+                       release_ds_buffer(cpu);
+       } else {
+               if (x86_pmu.bts && !bts_err)
+                       x86_pmu.bts_active = 1;
+
+               if (x86_pmu.pebs && !pebs_err)
+                       x86_pmu.pebs_active = 1;
+
+               for_each_online_cpu(cpu)
+                       init_debug_store_on_cpu(cpu);
+       }
+
+       put_online_cpus();
+}
+
+/*
+ * BTS
+ */
+
+struct event_constraint bts_constraint =
+       EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
+
+void intel_pmu_enable_bts(u64 config)
+{
+       unsigned long debugctlmsr;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr |= DEBUGCTLMSR_TR;
+       debugctlmsr |= DEBUGCTLMSR_BTS;
+       if (config & ARCH_PERFMON_EVENTSEL_INT)
+               debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+void intel_pmu_disable_bts(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       unsigned long debugctlmsr;
+
+       if (!cpuc->ds)
+               return;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr &=
+               ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+                 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+int intel_pmu_drain_bts_buffer(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct bts_record {
+               u64     from;
+               u64     to;
+               u64     flags;
+       };
+       struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+       struct bts_record *at, *base, *top;
+       struct perf_output_handle handle;
+       struct perf_event_header header;
+       struct perf_sample_data data;
+       unsigned long skip = 0;
+       struct pt_regs regs;
+
+       if (!event)
+               return 0;
+
+       if (!x86_pmu.bts_active)
+               return 0;
+
+       base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+       top  = (struct bts_record *)(unsigned long)ds->bts_index;
+
+       if (top <= base)
+               return 0;
+
+       memset(&regs, 0, sizeof(regs));
+
+       ds->bts_index = ds->bts_buffer_base;
+
+       perf_sample_data_init(&data, 0, event->hw.last_period);
+
+       /*
+        * BTS leaks kernel addresses in branches across the cpl boundary,
+        * such as traps or system calls, so unless the user is asking for
+        * kernel tracing (and right now it's not possible), we'd need to
+        * filter them out. But first we need to count how many of those we
+        * have in the current batch. This is an extra O(n) pass, however,
+        * have in the current batch. This is an extra O(n) pass; however,
+        * it's much faster than the output pass, especially considering that
+        * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see
+        * alloc_bts_buffer()).
+       for (at = base; at < top; at++) {
+               /*
+                * Note that right now *this* BTS code only works if
+                * attr::exclude_kernel is set, but let's keep this extra
+                * check here in case that changes.
+                */
+               if (event->attr.exclude_kernel &&
+                   (kernel_ip(at->from) || kernel_ip(at->to)))
+                       skip++;
+       }
+
+       /*
+        * Prepare a generic sample, i.e. fill in the invariant fields.
+        * We will overwrite the from and to address before we output
+        * the sample.
+        */
+       perf_prepare_sample(&header, &data, event, &regs);
+
+       if (perf_output_begin(&handle, event, header.size *
+                             (top - base - skip)))
+               return 1;
+
+       for (at = base; at < top; at++) {
+               /* Filter out any records that contain kernel addresses. */
+               if (event->attr.exclude_kernel &&
+                   (kernel_ip(at->from) || kernel_ip(at->to)))
+                       continue;
+
+               data.ip         = at->from;
+               data.addr       = at->to;
+
+               perf_output_sample(&handle, &header, &data, event);
+       }
+
+       perf_output_end(&handle);
+
+       /* There's new data available. */
+       event->hw.interrupts++;
+       event->pending_kill = POLL_IN;
+       return 1;
+}
+
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+       struct pt_regs regs;
+
+       x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (!sched_in)
+               intel_pmu_drain_pebs_buffer();
+}
+
+/*
+ * PEBS
+ */
+struct event_constraint intel_core2_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_atom_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
+       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
+       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
+       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_snb_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+        EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       if (!event->attr.precise_ip)
+               return NULL;
+
+       if (x86_pmu.pebs_constraints) {
+               for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+                       if ((event->hw.config & c->cmask) == c->code) {
+                               event->hw.flags |= c->flags;
+                               return c;
+                       }
+               }
+       }
+
+       return &emptyconstraint;
+}
+
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
+void intel_pmu_pebs_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool first_pebs;
+       u64 threshold;
+
+       hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+       first_pebs = !pebs_is_enabled(cpuc);
+       cpuc->pebs_enabled |= 1ULL << hwc->idx;
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+               cpuc->pebs_enabled |= 1ULL << 63;
+
+       /*
+        * When the event is constrained enough we can use a larger
+        * threshold and run the event with less frequent PMI.
+        */
+       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+               threshold = ds->pebs_absolute_maximum -
+                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+               if (first_pebs)
+                       perf_sched_cb_inc(event->ctx->pmu);
+       } else {
+               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+               /*
+                * If not all events can use larger buffer,
+                * roll back to threshold = 1
+                */
+               if (!first_pebs &&
+                   (ds->pebs_interrupt_threshold > threshold))
+                       perf_sched_cb_dec(event->ctx->pmu);
+       }
+
+       /* Use auto-reload if possible to save an MSR write in the PMI */
+       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+               ds->pebs_event_reset[hwc->idx] =
+                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+       }
+
+       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+               ds->pebs_interrupt_threshold = threshold;
+}
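To make the two threshold choices above concrete, a hedged back-of-the-envelope sketch: with the 64KB PEBS buffer defined at the top of this file and the 192-byte pebs_record_hsw layout (24 u64 fields), the buffer holds 341 records, so the free-running case takes roughly one PMI per several hundred records while the default case interrupts after every record. The max_pebs_events value of 8 and the 4KB page size are assumptions for illustration only.

/* Hedged arithmetic sketch of the two interrupt thresholds chosen above. */
#include <stdio.h>

int main(void)
{
        unsigned long buffer_size = 4096UL << 4;  /* PEBS_BUFFER_SIZE, assuming 4KB pages */
        unsigned long record_size = 192;          /* sizeof(struct pebs_record_hsw): 24 u64s */
        unsigned long max_pebs_events = 8;        /* assumed, for illustration only */
        unsigned long records = buffer_size / record_size;

        /* Free-running ("large PEBS") mode: the PMI fires only when
         * max_pebs_events slots remain before the end of the buffer. */
        printf("records per buffer: %lu\n", records);
        printf("large-PEBS: ~1 PMI per %lu records\n", records - max_pebs_events);
        /* Default mode: threshold = base + one record, i.e. one PMI per record. */
        printf("default: 1 PMI per record\n");
        return 0;
}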
+
+void intel_pmu_pebs_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool large_pebs = ds->pebs_interrupt_threshold >
+               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+       if (large_pebs)
+               intel_pmu_drain_pebs_buffer();
+
+       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+               cpuc->pebs_enabled &= ~(1ULL << 63);
+
+       if (large_pebs && !pebs_is_enabled(cpuc))
+               perf_sched_cb_dec(event->ctx->pmu);
+
+       if (cpuc->enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+void intel_pmu_pebs_enable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->pebs_enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+void intel_pmu_pebs_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->pebs_enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       unsigned long from = cpuc->lbr_entries[0].from;
+       unsigned long old_to, to = cpuc->lbr_entries[0].to;
+       unsigned long ip = regs->ip;
+       int is_64bit = 0;
+       void *kaddr;
+       int size;
+
+       /*
+        * We don't need to fix up the IP if the PEBS assist is fault-like
+        */
+       if (!x86_pmu.intel_cap.pebs_trap)
+               return 1;
+
+       /*
+        * No LBR entry, no basic block, no rewinding
+        */
+       if (!cpuc->lbr_stack.nr || !from || !to)
+               return 0;
+
+       /*
+        * Basic blocks should never cross user/kernel boundaries
+        */
+       if (kernel_ip(ip) != kernel_ip(to))
+               return 0;
+
+       /*
+        * unsigned math, either ip is before the start (impossible) or
+        * the basic block is larger than 1 page (sanity)
+        */
+       if ((ip - to) > PEBS_FIXUP_SIZE)
+               return 0;
+
+       /*
+        * We sampled a branch insn, rewind using the LBR stack
+        */
+       if (ip == to) {
+               set_linear_ip(regs, from);
+               return 1;
+       }
+
+       size = ip - to;
+       if (!kernel_ip(ip)) {
+               int bytes;
+               u8 *buf = this_cpu_read(insn_buffer);
+
+               /* 'size' must fit our buffer, see above */
+               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+               if (bytes != 0)
+                       return 0;
+
+               kaddr = buf;
+       } else {
+               kaddr = (void *)to;
+       }
+
+       do {
+               struct insn insn;
+
+               old_to = to;
+
+#ifdef CONFIG_X86_64
+               is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
+#endif
+               insn_init(&insn, kaddr, size, is_64bit);
+               insn_get_length(&insn);
+               /*
+                * Make sure there was not a problem decoding the
+                * instruction and getting the length.  This is
+                * doubly important because we have an infinite
+                * loop if insn.length=0.
+                */
+               if (!insn.length)
+                       break;
+
+               to += insn.length;
+               kaddr += insn.length;
+               size -= insn.length;
+       } while (to < ip);
+
+       if (to == ip) {
+               set_linear_ip(regs, old_to);
+               return 1;
+       }
+
+       /*
+        * Even though we decoded the basic block, the instruction stream
+        * never matched the given IP; either the TO or the IP got corrupted.
+        */
+       return 0;
+}
+
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
+{
+       if (pebs->tsx_tuning) {
+               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+               return tsx.cycles_last_block;
+       }
+       return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
+{
+       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+       /* For RTM XABORTs also log the abort code from AX */
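+       /*
+        * In the RTM abort status, EAX bit 0 is set when the abort came
+        * from an explicit XABORT and bits 31:24 hold the XABORT imm8
+        * argument; that byte is what gets merged into the abort-code
+        * field below.
+        */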
+       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+       return txn;
+}
+
+static void setup_pebs_sample_data(struct perf_event *event,
+                                  struct pt_regs *iregs, void *__pebs,
+                                  struct perf_sample_data *data,
+                                  struct pt_regs *regs)
+{
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+               (PERF_X86_EVENT_PEBS_ST_HSW | \
+                PERF_X86_EVENT_PEBS_LD_HSW | \
+                PERF_X86_EVENT_PEBS_NA_HSW)
+       /*
+        * We cast to the biggest pebs_record but are careful not to
+        * unconditionally access the 'extra' entries.
+        */
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct pebs_record_skl *pebs = __pebs;
+       u64 sample_type;
+       int fll, fst, dsrc;
+       int fl = event->hw.flags;
+
+       if (pebs == NULL)
+               return;
+
+       sample_type = event->attr.sample_type;
+       dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+       fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+       fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+
+       perf_sample_data_init(data, 0, event->hw.last_period);
+
+       data->period = event->hw.last_period;
+
+       /*
+        * Use latency for weight (only avail with PEBS-LL)
+        */
+       if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+               data->weight = pebs->lat;
+
+       /*
+        * data.data_src encodes the data source
+        */
+       if (dsrc) {
+               u64 val = PERF_MEM_NA;
+               if (fll)
+                       val = load_latency_data(pebs->dse);
+               else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+                       val = precise_datala_hsw(event, pebs->dse);
+               else if (fst)
+                       val = precise_store_data(pebs->dse);
+               data->data_src.val = val;
+       }
+
+       /*
+        * We use the interrupt regs as a base because the PEBS record
+        * does not contain a full regs set; specifically it seems to
+        * lack segment descriptors, which get used by things like
+        * user_mode().
+        *
+        * In the simple case fix up only the IP and BP, SP regs, so that
+        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN function properly.
+        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+        */
+       *regs = *iregs;
+       regs->flags = pebs->flags;
+       set_linear_ip(regs, pebs->ip);
+       regs->bp = pebs->bp;
+       regs->sp = pebs->sp;
+
+       if (sample_type & PERF_SAMPLE_REGS_INTR) {
+               regs->ax = pebs->ax;
+               regs->bx = pebs->bx;
+               regs->cx = pebs->cx;
+               regs->dx = pebs->dx;
+               regs->si = pebs->si;
+               regs->di = pebs->di;
+               regs->bp = pebs->bp;
+               regs->sp = pebs->sp;
+
+               regs->flags = pebs->flags;
+#ifndef CONFIG_X86_32
+               regs->r8 = pebs->r8;
+               regs->r9 = pebs->r9;
+               regs->r10 = pebs->r10;
+               regs->r11 = pebs->r11;
+               regs->r12 = pebs->r12;
+               regs->r13 = pebs->r13;
+               regs->r14 = pebs->r14;
+               regs->r15 = pebs->r15;
+#endif
+       }
+
+       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
+               regs->ip = pebs->real_ip;
+               regs->flags |= PERF_EFLAGS_EXACT;
+       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+               regs->flags |= PERF_EFLAGS_EXACT;
+       else
+               regs->flags &= ~PERF_EFLAGS_EXACT;
+
+       if ((sample_type & PERF_SAMPLE_ADDR) &&
+           x86_pmu.intel_cap.pebs_format >= 1)
+               data->addr = pebs->dla;
+
+       if (x86_pmu.intel_cap.pebs_format >= 2) {
+               /* Only set the TSX weight when no memory weight. */
+               if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+                       data->weight = intel_hsw_weight(pebs);
+
+               if (sample_type & PERF_SAMPLE_TRANSACTION)
+                       data->txn = intel_hsw_transaction(pebs);
+       }
+
+       /*
+        * v3 supplies an accurate time stamp, so we use that
+        * for the time stamp.
+        *
+        * We can only do this for the default trace clock.
+        */
+       if (x86_pmu.intel_cap.pebs_format >= 3 &&
+               event->attr.use_clockid == 0)
+               data->time = native_sched_clock_from_tsc(pebs->tsc);
+
+       if (has_branch_stack(event))
+               data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       void *at;
+       u64 pebs_status;
+
+       /*
+        * fmt0 does not have a status bitfield (does not use
+        * the pebs_record_nhm format)
+        */
+       if (x86_pmu.intel_cap.pebs_format < 1)
+               return base;
+
+       if (base == NULL)
+               return NULL;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
+
+               if (test_bit(bit, (unsigned long *)&p->status)) {
+                       /* PEBS v3 has accurate status bits */
+                       if (x86_pmu.intel_cap.pebs_format >= 3)
+                               return at;
+
+                       if (p->status == (1 << bit))
+                               return at;
+
+                       /* clear non-PEBS bit and re-check */
+                       pebs_status = p->status & cpuc->pebs_enabled;
+                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
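+                       /*
+                        * pebs_enabled also carries the load-latency and
+                        * precise-store enable bits in its upper half;
+                        * only the low counter bits are status bits, so
+                        * mask the rest off before comparing.
+                        */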
+                       if (pebs_status == (1 << bit))
+                               return at;
+               }
+       }
+       return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+                                  struct pt_regs *iregs,
+                                  void *base, void *top,
+                                  int bit, int count)
+{
+       struct perf_sample_data data;
+       struct pt_regs regs;
+       void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+       if (!intel_pmu_save_and_restart(event) &&
+           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+               return;
+
+       while (count > 1) {
+               setup_pebs_sample_data(event, iregs, at, &data, &regs);
+               perf_event_output(event, &data, &regs);
+               at += x86_pmu.pebs_record_size;
+               at = get_next_pebs_record_by_bit(at, top, bit);
+               count--;
+       }
+
+       setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+       /*
+        * All but the last record are processed above.
+        * The last one is left so that the overflow handler can be called.
+        */
+       if (perf_event_overflow(event, &data, &regs)) {
+               x86_pmu_stop(event, 0);
+               return;
+       }
+
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+       struct pebs_record_core *at, *top;
+       int n;
+
+       if (!x86_pmu.pebs_active)
+               return;
+
+       at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+       /*
+        * Whatever else happens, drain the thing
+        */
+       ds->pebs_index = ds->pebs_buffer_base;
+
+       if (!test_bit(0, cpuc->active_mask))
+               return;
+
+       WARN_ON_ONCE(!event);
+
+       if (!event->attr.precise_ip)
+               return;
+
+       n = top - at;
+       if (n <= 0)
+               return;
+
+       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct perf_event *event;
+       void *base, *at, *top;
+       short counts[MAX_PEBS_EVENTS] = {};
+       short error[MAX_PEBS_EVENTS] = {};
+       int bit, i;
+
+       if (!x86_pmu.pebs_active)
+               return;
+
+       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+       ds->pebs_index = ds->pebs_buffer_base;
+
+       if (unlikely(base >= top))
+               return;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
+               u64 pebs_status;
+
+               /* PEBS v3 has accurate status bits */
+               if (x86_pmu.intel_cap.pebs_format >= 3) {
+                       for_each_set_bit(bit, (unsigned long *)&p->status,
+                                        MAX_PEBS_EVENTS)
+                               counts[bit]++;
+
+                       continue;
+               }
+
+               pebs_status = p->status & cpuc->pebs_enabled;
+               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+               /*
+                * On some CPUs the PEBS status can be zero when PEBS is
+                * racing with clearing of GLOBAL_STATUS.
+                *
+                * Normally we would drop that record, but in the
+                * case when there is only a single active PEBS event
+                * we can assume it's for that event.
+                */
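+               /*
+                * Note: x & (x - 1) is zero exactly when at most one bit
+                * of x is set, so the check below only fires when a
+                * single PEBS event is enabled.
+                */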
+               if (!pebs_status && cpuc->pebs_enabled &&
+                       !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+                       pebs_status = cpuc->pebs_enabled;
+
+               bit = find_first_bit((unsigned long *)&pebs_status,
+                                       x86_pmu.max_pebs_events);
+               if (bit >= x86_pmu.max_pebs_events)
+                       continue;
+
+               /*
+                * The PEBS hardware does not deal well with the situation
+                * when events happen near to each other and multiple bits
+                * are set. But it should happen rarely.
+                *
+                * If these events include one PEBS and multiple non-PEBS
+                * events, it doesn't impact the PEBS record; the record
+                * will be handled normally (slow path).
+                *
+                * If these events include two or more PEBS events, the
+                * records for the events can be collapsed into a single
+                * one, and it's not possible to reconstruct all events
+                * that caused the PEBS record. This is called a collision.
+                * When a collision happens, the record is dropped.
+                */
+               if (p->status != (1ULL << bit)) {
+                       for_each_set_bit(i, (unsigned long *)&pebs_status,
+                                        x86_pmu.max_pebs_events)
+                               error[i]++;
+                       continue;
+               }
+
+               counts[bit]++;
+       }
+
+       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+               if ((counts[bit] == 0) && (error[bit] == 0))
+                       continue;
+
+               event = cpuc->events[bit];
+               WARN_ON_ONCE(!event);
+               WARN_ON_ONCE(!event->attr.precise_ip);
+
+               /* log dropped samples number */
+               if (error[bit])
+                       perf_log_lost_samples(event, error[bit]);
+
+               if (counts[bit]) {
+                       __intel_pmu_pebs_event(event, iregs, base,
+                                              top, bit, counts[bit]);
+               }
+       }
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+void __init intel_ds_init(void)
+{
+       /*
+        * No support for 32-bit formats
+        */
+       if (!boot_cpu_has(X86_FEATURE_DTES64))
+               return;
+
+       x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+       x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+       x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+       if (x86_pmu.pebs) {
+               char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+               int format = x86_pmu.intel_cap.pebs_format;
+
+               switch (format) {
+               case 0:
+                       pr_cont("PEBS fmt0%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+                       /*
+                        * Using >PAGE_SIZE buffers makes the WRMSR to
+                        * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+                        * mysteriously hang on Core2.
+                        *
+                        * As a workaround, we don't do this.
+                        */
+                       x86_pmu.pebs_buffer_size = PAGE_SIZE;
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+                       break;
+
+               case 1:
+                       pr_cont("PEBS fmt1%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       break;
+
+               case 2:
+                       pr_cont("PEBS fmt2%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       break;
+
+               case 3:
+                       pr_cont("PEBS fmt3%c, ", pebs_type);
+                       x86_pmu.pebs_record_size =
+                                               sizeof(struct pebs_record_skl);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+                       break;
+
+               default:
+                       pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
+                       x86_pmu.pebs = 0;
+               }
+       }
+}
+
+void perf_restore_debug_store(void)
+{
+       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
+}
diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c
new file mode 100644 (file)
index 0000000..548d5f7
--- /dev/null
@@ -0,0 +1,321 @@
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/hardirq.h>
+
+#include "../perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x002a,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x0016,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0028,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0029,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x0012,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x002b,
+};
+
+static const u64 __initconst knc_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               /* On Xeon Phi event "0" is a valid DATA_READ          */
+               /*   (L1 Data Cache Reads) Instruction.                */
+               /* We code this as ARCH_PERFMON_EVENTSEL_INT as this   */
+               /* bit will always be set in x86_pmu_hw_config().      */
+               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+                                               /* DATA_READ           */
+               [ C(RESULT_MISS)   ] = 0x0003,  /* DATA_READ_MISS      */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE          */
+               [ C(RESULT_MISS)   ] = 0x0004,  /* DATA_WRITE_MISS     */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0011,  /* L1_DATA_PF1         */
+               [ C(RESULT_MISS)   ] = 0x001c,  /* L1_DATA_PF1_MISS    */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ          */
+               [ C(RESULT_MISS)   ] = 0x000e,  /* CODE_CACHE_MISS    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x10cb,  /* L2_READ_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x10cc,  /* L2_WRITE_HIT */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x10fc,  /* L2_DATA_PF2      */
+               [ C(RESULT_MISS)   ] = 0x10fe,  /* L2_DATA_PF2_MISS */
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+                                               /* DATA_READ */
+                                               /* see note on L1 OP_READ */
+               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE */
+               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ */
+               [ C(RESULT_MISS)   ] = 0x000d,  /* CODE_PAGE_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0012,  /* BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x002b,  /* BRANCHES_MISPREDICTED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+       return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+       INTEL_EVENT_CONSTRAINT(0xc3, 0x1),      /* HWP_L2HIT */
+       INTEL_EVENT_CONSTRAINT(0xc4, 0x1),      /* HWP_L2MISS */
+       INTEL_EVENT_CONSTRAINT(0xc8, 0x1),      /* L2_READ_HIT_E */
+       INTEL_EVENT_CONSTRAINT(0xc9, 0x1),      /* L2_READ_HIT_M */
+       INTEL_EVENT_CONSTRAINT(0xca, 0x1),      /* L2_READ_HIT_S */
+       INTEL_EVENT_CONSTRAINT(0xcb, 0x1),      /* L2_READ_MISS */
+       INTEL_EVENT_CONSTRAINT(0xcc, 0x1),      /* L2_WRITE_HIT */
+       INTEL_EVENT_CONSTRAINT(0xce, 0x1),      /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+       INTEL_EVENT_CONSTRAINT(0xcf, 0x1),      /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+       INTEL_EVENT_CONSTRAINT(0xd7, 0x1),      /* L2_VICTIM_REQ_WITH_DATA */
+       INTEL_EVENT_CONSTRAINT(0xe3, 0x1),      /* SNP_HITM_BUNIT */
+       INTEL_EVENT_CONSTRAINT(0xe6, 0x1),      /* SNP_HIT_L2 */
+       INTEL_EVENT_CONSTRAINT(0xe7, 0x1),      /* SNP_HITM_L2 */
+       INTEL_EVENT_CONSTRAINT(0xf1, 0x1),      /* L2_DATA_READ_MISS_CACHE_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf2, 0x1),      /* L2_DATA_WRITE_MISS_CACHE_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf6, 0x1),      /* L2_DATA_READ_MISS_MEM_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf7, 0x1),      /* L2_DATA_WRITE_MISS_MEM_FILL */
+       INTEL_EVENT_CONSTRAINT(0xfc, 0x1),      /* L2_DATA_PF2 */
+       INTEL_EVENT_CONSTRAINT(0xfd, 0x1),      /* L2_DATA_PF2_DROP */
+       INTEL_EVENT_CONSTRAINT(0xfe, 0x1),      /* L2_DATA_PF2_MISS */
+       INTEL_EVENT_CONSTRAINT(0xff, 0x1),      /* L2_DATA_HIT_INFLIGHT_PF2 */
+       EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS                0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL   0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL          0x0000002f
+
+#define KNC_ENABLE_COUNTER0                    0x00000001
+#define KNC_ENABLE_COUNTER1                    0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+       u64 val;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+       val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+       u64 val;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+       val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static inline u64 knc_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void knc_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
+}
+
+static int knc_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int handled = 0;
+       int bit, loops;
+       u64 status;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       knc_pmu_disable_all();
+
+       status = knc_pmu_get_status();
+       if (!status) {
+               knc_pmu_enable_all(0);
+               return handled;
+       }
+
+       loops = 0;
+again:
+       knc_pmu_ack_status(status);
+       if (++loops > 100) {
+               WARN_ONCE(1, "perf: irq loop stuck!\n");
+               perf_event_print_debug();
+               goto done;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+
+       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               handled++;
+
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = knc_pmu_get_status();
+       if (status)
+               goto again;
+
+done:
+       /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+       if (cpuc->enabled)
+               knc_pmu_enable_all(0);
+
+       return handled;
+}
+
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *intel_knc_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+static const struct x86_pmu knc_pmu __initconst = {
+       .name                   = "knc",
+       .handle_irq             = knc_pmu_handle_irq,
+       .disable_all            = knc_pmu_disable_all,
+       .enable_all             = knc_pmu_enable_all,
+       .enable                 = knc_pmu_enable_event,
+       .disable                = knc_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_KNC_EVNTSEL0,
+       .perfctr                = MSR_KNC_PERFCTR0,
+       .event_map              = knc_pmu_event_map,
+       .max_events             = ARRAY_SIZE(knc_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 39) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       .cntval_bits            = 40,
+       .cntval_mask            = (1ULL << 40) - 1,
+       .get_event_constraints  = x86_get_event_constraints,
+       .event_constraints      = knc_event_constraints,
+       .format_attrs           = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+       x86_pmu = knc_pmu;
+
+       memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, 
+               sizeof(hw_cache_event_ids));
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
new file mode 100644 (file)
index 0000000..69dd118
--- /dev/null
@@ -0,0 +1,1062 @@
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+enum {
+       LBR_FORMAT_32           = 0x00,
+       LBR_FORMAT_LIP          = 0x01,
+       LBR_FORMAT_EIP          = 0x02,
+       LBR_FORMAT_EIP_FLAGS    = 0x03,
+       LBR_FORMAT_EIP_FLAGS2   = 0x04,
+       LBR_FORMAT_INFO         = 0x05,
+       LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
+};
+
+static enum {
+       LBR_EIP_FLAGS           = 1,
+       LBR_TSX                 = 2,
+} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
+       [LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
+       [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
+};
+
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
+#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT            2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT         5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT                6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT                7 /* do not capture relative jumps */
+#define LBR_FAR_BIT            8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT     9 /* enable call stack */
+
+/*
+ * The following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the perf constraint code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT               63 /* don't read LBR_INFO. */
+
+#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
+#define LBR_USER       (1 << LBR_USER_BIT)
+#define LBR_JCC                (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN     (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR                (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO    (1ULL << LBR_NO_INFO_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
+#define LBR_IGN                0       /* ignored */
+
+#define LBR_ANY                 \
+       (LBR_JCC        |\
+        LBR_REL_CALL   |\
+        LBR_IND_CALL   |\
+        LBR_RETURN     |\
+        LBR_REL_JMP    |\
+        LBR_IND_JMP    |\
+        LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
+#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
+
+/*
+ * x86 control flow change classification
+ * x86 control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+       X86_BR_NONE             = 0,      /* unknown */
+
+       X86_BR_USER             = 1 << 0, /* branch target is user */
+       X86_BR_KERNEL           = 1 << 1, /* branch target is kernel */
+
+       X86_BR_CALL             = 1 << 2, /* call */
+       X86_BR_RET              = 1 << 3, /* return */
+       X86_BR_SYSCALL          = 1 << 4, /* syscall */
+       X86_BR_SYSRET           = 1 << 5, /* syscall return */
+       X86_BR_INT              = 1 << 6, /* sw interrupt */
+       X86_BR_IRET             = 1 << 7, /* return from interrupt */
+       X86_BR_JCC              = 1 << 8, /* conditional */
+       X86_BR_JMP              = 1 << 9, /* jump */
+       X86_BR_IRQ              = 1 << 10,/* hw interrupt or trap or fault */
+       X86_BR_IND_CALL         = 1 << 11,/* indirect calls */
+       X86_BR_ABORT            = 1 << 12,/* transaction abort */
+       X86_BR_IN_TX            = 1 << 13,/* in transaction */
+       X86_BR_NO_TX            = 1 << 14,/* not in transaction */
+       X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
+       X86_BR_CALL_STACK       = 1 << 16,/* call stack */
+       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY       \
+       (X86_BR_CALL    |\
+        X86_BR_RET     |\
+        X86_BR_SYSCALL |\
+        X86_BR_SYSRET  |\
+        X86_BR_INT     |\
+        X86_BR_IRET    |\
+        X86_BR_JCC     |\
+        X86_BR_JMP      |\
+        X86_BR_IRQ      |\
+        X86_BR_ABORT    |\
+        X86_BR_IND_CALL |\
+        X86_BR_IND_JMP  |\
+        X86_BR_ZERO_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL                 \
+       (X86_BR_CALL            |\
+        X86_BR_IND_CALL        |\
+        X86_BR_ZERO_CALL       |\
+        X86_BR_SYSCALL         |\
+        X86_BR_IRQ             |\
+        X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI;
+ * otherwise it becomes nearly impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       u64 debugctl, lbr_select = 0, orig_debugctl;
+
+       /*
+        * No need to unfreeze manually, as v4 can do that as part
+        * of the GLOBAL_STATUS ack.
+        */
+       if (pmi && x86_pmu.version >= 4)
+               return;
+
+       /*
+        * No need to reprogram LBR_SELECT in a PMI, as it
+        * did not change.
+        */
+       if (cpuc->lbr_sel)
+               lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+       if (!pmi && cpuc->lbr_sel)
+               wrmsrl(MSR_LBR_SELECT, lbr_select);
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       orig_debugctl = debugctl;
+       debugctl |= DEBUGCTLMSR_LBR;
+       /*
+        * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+        * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+        * may cause superfluous increase/decrease of LBR_TOS.
+        */
+       if (!(lbr_select & LBR_CALL_STACK))
+               debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+       if (orig_debugctl != debugctl)
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+       u64 debugctl;
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++)
+               wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               wrmsrl(x86_pmu.lbr_from + i, 0);
+               wrmsrl(x86_pmu.lbr_to   + i, 0);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       wrmsrl(MSR_LBR_INFO_0 + i, 0);
+       }
+}
+
+void intel_pmu_lbr_reset(void)
+{
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+               intel_pmu_lbr_reset_32();
+       else
+               intel_pmu_lbr_reset_64();
+}
+
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+       u64 tos;
+
+       rdmsrl(x86_pmu.lbr_tos, tos);
+       return tos;
+}
+
+enum {
+       LBR_NONE,
+       LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+       int i;
+       unsigned lbr_idx, mask;
+       u64 tos;
+
+       if (task_ctx->lbr_callstack_users == 0 ||
+           task_ctx->lbr_stack_state == LBR_NONE) {
+               intel_pmu_lbr_reset();
+               return;
+       }
+
+       mask = x86_pmu.lbr_nr - 1;
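+       /*
+        * lbr_nr is a power of two on all supported parts, so lbr_nr - 1
+        * acts as a wrap-around mask when indexing the circular LBR
+        * stack relative to TOS.
+        */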
+       tos = task_ctx->tos;
+       for (i = 0; i < tos; i++) {
+               lbr_idx = (tos - i) & mask;
+               wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+               wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+       }
+       wrmsrl(x86_pmu.lbr_tos, tos);
+       task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+       int i;
+       unsigned lbr_idx, mask;
+       u64 tos;
+
+       if (task_ctx->lbr_callstack_users == 0) {
+               task_ctx->lbr_stack_state = LBR_NONE;
+               return;
+       }
+
+       mask = x86_pmu.lbr_nr - 1;
+       tos = intel_pmu_lbr_tos();
+       for (i = 0; i < tos; i++) {
+               lbr_idx = (tos - i) & mask;
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+               rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+       }
+       task_ctx->tos = tos;
+       task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       /*
+        * If LBR callstack feature is enabled and the stack was saved when
+        * the task was scheduled out, restore the stack. Otherwise flush
+        * the LBR stack.
+        */
+       task_ctx = ctx ? ctx->task_ctx_data : NULL;
+       if (task_ctx) {
+               if (sched_in) {
+                       __intel_pmu_lbr_restore(task_ctx);
+                       cpuc->lbr_context = ctx;
+               } else {
+                       __intel_pmu_lbr_save(task_ctx);
+               }
+               return;
+       }
+
+       /*
+        * When sampling the branch stack in system-wide mode, it may be
+        * necessary to flush the stack on context switch. This happens
+        * when the branch stack does not tag its entries with the pid
+        * of the current task. Otherwise it becomes impossible to
+        * associate a branch entry with a task. This ambiguity is more
+        * likely to appear when the branch stack supports priv level
+        * filtering and the user sets it to monitor only at the user
+        * level (which could be a useful measurement in system-wide
+        * mode). In that case, the risk is high of having a branch
+        * stack with branches from multiple tasks.
+        */
+       if (sched_in) {
+               intel_pmu_lbr_reset();
+               cpuc->lbr_context = ctx;
+       }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+       return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
+void intel_pmu_lbr_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       /*
+        * Reset the LBR stack if we changed task context to
+        * avoid data leaks.
+        */
+       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
+               intel_pmu_lbr_reset();
+               cpuc->lbr_context = event->ctx;
+       }
+       cpuc->br_sel = event->hw.branch_reg.reg;
+
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+                                       event->ctx->task_ctx_data) {
+               task_ctx = event->ctx->task_ctx_data;
+               task_ctx->lbr_callstack_users++;
+       }
+
+       cpuc->lbr_users++;
+       perf_sched_cb_inc(event->ctx->pmu);
+}
+
+void intel_pmu_lbr_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+                                       event->ctx->task_ctx_data) {
+               task_ctx = event->ctx->task_ctx_data;
+               task_ctx->lbr_callstack_users--;
+       }
+
+       cpuc->lbr_users--;
+       WARN_ON_ONCE(cpuc->lbr_users < 0);
+       perf_sched_cb_dec(event->ctx->pmu);
+
+       if (cpuc->enabled && !cpuc->lbr_users) {
+               __intel_pmu_lbr_disable();
+               /* avoid stale pointer */
+               cpuc->lbr_context = NULL;
+       }
+}
+
+void intel_pmu_lbr_enable_all(bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->lbr_users)
+               __intel_pmu_lbr_enable(pmi);
+}
+
+void intel_pmu_lbr_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->lbr_users)
+               __intel_pmu_lbr_disable();
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+       unsigned long mask = x86_pmu.lbr_nr - 1;
+       u64 tos = intel_pmu_lbr_tos();
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               unsigned long lbr_idx = (tos - i) & mask;
+               union {
+                       struct {
+                               u32 from;
+                               u32 to;
+                       };
+                       u64     lbr;
+               } msr_lastbranch;
+
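+               /*
+                * In the 32-bit LBR format a single MSR packs both
+                * addresses; the union splits its low half into 'from'
+                * and its high half into 'to'.
+                */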
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+               cpuc->lbr_entries[i].from       = msr_lastbranch.from;
+               cpuc->lbr_entries[i].to         = msr_lastbranch.to;
+               cpuc->lbr_entries[i].mispred    = 0;
+               cpuc->lbr_entries[i].predicted  = 0;
+               cpuc->lbr_entries[i].reserved   = 0;
+       }
+       cpuc->lbr_stack.nr = i;
+}
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+       bool need_info = false;
+       unsigned long mask = x86_pmu.lbr_nr - 1;
+       int lbr_format = x86_pmu.intel_cap.lbr_format;
+       u64 tos = intel_pmu_lbr_tos();
+       int i;
+       int out = 0;
+       int num = x86_pmu.lbr_nr;
+
+       if (cpuc->lbr_sel) {
+               need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+               if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+                       num = tos;
+       }
+
+       for (i = 0; i < num; i++) {
+               unsigned long lbr_idx = (tos - i) & mask;
+               u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+               int skip = 0;
+               u16 cycles = 0;
+               int lbr_flags = lbr_desc[lbr_format];
+
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+
+               if (lbr_format == LBR_FORMAT_INFO && need_info) {
+                       u64 info;
+
+                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+                       mis = !!(info & LBR_INFO_MISPRED);
+                       pred = !mis;
+                       in_tx = !!(info & LBR_INFO_IN_TX);
+                       abort = !!(info & LBR_INFO_ABORT);
+                       cycles = (info & LBR_INFO_CYCLES);
+               }
+               if (lbr_flags & LBR_EIP_FLAGS) {
+                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
+                       pred = !mis;
+                       skip = 1;
+               }
+               if (lbr_flags & LBR_TSX) {
+                       in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+                       abort = !!(from & LBR_FROM_FLAG_ABORT);
+                       skip = 3;
+               }
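+               /*
+                * Shifting left and then arithmetically right by 'skip'
+                * bits strips the flag bits stored at the top of FROM
+                * (e.g. skip = 3 drops MISPRED/IN_TX/ABORT) and
+                * sign-extends the remaining address back to canonical
+                * form.
+                */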
+               from = (u64)((((s64)from) << skip) >> skip);
+
+               /*
+                * Some CPUs report duplicated abort records,
+                * with the second entry not having an abort bit set.
+                * Skip them here. This loop runs backwards,
+                * so we need to undo the previous record.
+                * If the abort just happened outside the window
+                * the extra entry cannot be removed.
+                */
+               if (abort && x86_pmu.lbr_double_abort && out > 0)
+                       out--;
+
+               cpuc->lbr_entries[out].from      = from;
+               cpuc->lbr_entries[out].to        = to;
+               cpuc->lbr_entries[out].mispred   = mis;
+               cpuc->lbr_entries[out].predicted = pred;
+               cpuc->lbr_entries[out].in_tx     = in_tx;
+               cpuc->lbr_entries[out].abort     = abort;
+               cpuc->lbr_entries[out].cycles    = cycles;
+               cpuc->lbr_entries[out].reserved  = 0;
+               out++;
+       }
+       cpuc->lbr_stack.nr = out;
+}
+
+void intel_pmu_lbr_read(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (!cpuc->lbr_users)
+               return;
+
+       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+               intel_pmu_lbr_read_32(cpuc);
+       else
+               intel_pmu_lbr_read_64(cpuc);
+
+       intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+       u64 br_type = event->attr.branch_sample_type;
+       int mask = 0;
+
+       if (br_type & PERF_SAMPLE_BRANCH_USER)
+               mask |= X86_BR_USER;
+
+       if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+               mask |= X86_BR_KERNEL;
+
+       /* we ignore BRANCH_HV here */
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY)
+               mask |= X86_BR_ANY;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+               mask |= X86_BR_ANY_CALL;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+               mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+       if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+               mask |= X86_BR_IND_CALL;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+               mask |= X86_BR_ABORT;
+
+       if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+               mask |= X86_BR_IN_TX;
+
+       if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+               mask |= X86_BR_NO_TX;
+
+       if (br_type & PERF_SAMPLE_BRANCH_COND)
+               mask |= X86_BR_JCC;
+
+       if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+               if (!x86_pmu_has_lbr_callstack())
+                       return -EOPNOTSUPP;
+               if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+                       return -EINVAL;
+               mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+                       X86_BR_CALL_STACK;
+       }
+
+       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+               mask |= X86_BR_IND_JMP;
+
+       if (br_type & PERF_SAMPLE_BRANCH_CALL)
+               mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+       /*
+        * Stash the actual user request into reg; it may
+        * be used by fixup code for some CPUs.
+        */
+       event->hw.branch_reg.reg = mask;
+       return 0;
+}
+
+/*
+ * Set up the HW LBR filter.
+ * Used only when available; it may not be enough to disambiguate
+ * all branches and may need the help of the SW filter.
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+       u64 br_type = event->attr.branch_sample_type;
+       u64 mask = 0, v;
+       int i;
+
+       for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+               if (!(br_type & (1ULL << i)))
+                       continue;
+
+               v = x86_pmu.lbr_sel_map[i];
+               if (v == LBR_NOT_SUPP)
+                       return -EOPNOTSUPP;
+
+               if (v != LBR_IGN)
+                       mask |= v;
+       }
+
+       reg = &event->hw.branch_reg;
+       reg->idx = EXTRA_REG_LBR;
+
+       /*
+        * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+        * in suppress mode. So LBR_SELECT should be set to
+        * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+        */
+       reg->config = mask ^ x86_pmu.lbr_sel_mask;
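+       /*
+        * Worked example (purely illustrative): with lbr_sel_mask = 0x1ff
+        * and mask = LBR_USER | LBR_JCC = 0x006, the XOR yields 0x1f9:
+        * only the USER and JCC suppress bits end up cleared, so the LBR
+        * records user-level conditional branches and nothing else, while
+        * bits above LBR_SEL_MASK (such as LBR_CALL_STACK) pass through
+        * unchanged.
+        */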
+
+       if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+           (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+           (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+               reg->config |= LBR_NO_INFO;
+
+       return 0;
+}
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+       int ret = 0;
+
+       /*
+        * no LBR on this PMU
+        */
+       if (!x86_pmu.lbr_nr)
+               return -EOPNOTSUPP;
+
+       /*
+        * setup SW LBR filter
+        */
+       ret = intel_pmu_setup_sw_lbr_filter(event);
+       if (ret)
+               return ret;
+
+       /*
+        * setup HW LBR filter, if any
+        */
+       if (x86_pmu.lbr_sel_map)
+               ret = intel_pmu_setup_hw_lbr_filter(event);
+
+       return ret;
+}
+
+/*
+ * Return the type of control flow change at address "from".
+ * The instruction is not necessarily a branch (e.g. in case of an interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to, int abort)
+{
+       struct insn insn;
+       void *addr;
+       int bytes_read, bytes_left;
+       int ret = X86_BR_NONE;
+       int ext, to_plm, from_plm;
+       u8 buf[MAX_INSN_SIZE];
+       int is64 = 0;
+
+       to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+       from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+       /*
+        * May be zero if the LBR did not fill up after a reset by the time
+        * we get a PMU interrupt.
+        */
+       if (from == 0 || to == 0)
+               return X86_BR_NONE;
+
+       if (abort)
+               return X86_BR_ABORT | to_plm;
+
+       if (from_plm == X86_BR_USER) {
+               /*
+                * can happen if measuring at the user level only
+                * and we interrupt in a kernel thread, e.g., idle.
+                */
+               if (!current->mm)
+                       return X86_BR_NONE;
+
+               /* may fail if text not present */
+               bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+                                               MAX_INSN_SIZE);
+               bytes_read = MAX_INSN_SIZE - bytes_left;
+               if (!bytes_read)
+                       return X86_BR_NONE;
+
+               addr = buf;
+       } else {
+               /*
+                * The LBR logs any address in the IP, even if the IP just
+                * faulted. This means userspace can control the from address.
+                * Ensure we don't blindly read any address by validating it is
+                * a known text address.
+                */
+               if (kernel_text_address(from)) {
+                       addr = (void *)from;
+                       /*
+                        * Assume we can get the maximum possible size
+                        * when grabbing kernel data.  This is not
+                        * _strictly_ true since we could possibly be
+                        * executing up next to a memory hole, but
+                        * it is very unlikely to be a problem.
+                        */
+                       bytes_read = MAX_INSN_SIZE;
+               } else {
+                       return X86_BR_NONE;
+               }
+       }
+
+       /*
+        * decoder needs to know the ABI especially
+        * on 64-bit systems running 32-bit apps
+        */
+#ifdef CONFIG_X86_64
+       is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+       insn_init(&insn, addr, bytes_read, is64);
+       insn_get_opcode(&insn);
+       if (!insn.opcode.got)
+               return X86_BR_ABORT;
+
+       switch (insn.opcode.bytes[0]) {
+       case 0xf:
+               switch (insn.opcode.bytes[1]) {
+               case 0x05: /* syscall */
+               case 0x34: /* sysenter */
+                       ret = X86_BR_SYSCALL;
+                       break;
+               case 0x07: /* sysret */
+               case 0x35: /* sysexit */
+                       ret = X86_BR_SYSRET;
+                       break;
+               case 0x80 ... 0x8f: /* conditional */
+                       ret = X86_BR_JCC;
+                       break;
+               default:
+                       ret = X86_BR_NONE;
+               }
+               break;
+       case 0x70 ... 0x7f: /* conditional */
+               ret = X86_BR_JCC;
+               break;
+       case 0xc2: /* near ret */
+       case 0xc3: /* near ret */
+       case 0xca: /* far ret */
+       case 0xcb: /* far ret */
+               ret = X86_BR_RET;
+               break;
+       case 0xcf: /* iret */
+               ret = X86_BR_IRET;
+               break;
+       case 0xcc ... 0xce: /* int */
+               ret = X86_BR_INT;
+               break;
+       case 0xe8: /* call near rel */
+               insn_get_immediate(&insn);
+               if (insn.immediate1.value == 0) {
+                       /* zero length call */
+                       ret = X86_BR_ZERO_CALL;
+                       break;
+               }
+       case 0x9a: /* call far absolute */
+               ret = X86_BR_CALL;
+               break;
+       case 0xe0 ... 0xe3: /* loop jmp */
+               ret = X86_BR_JCC;
+               break;
+       case 0xe9 ... 0xeb: /* jmp */
+               ret = X86_BR_JMP;
+               break;
+       case 0xff: /* call near absolute, call far absolute ind */
+               insn_get_modrm(&insn);
+               ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+               switch (ext) {
+               case 2: /* near ind call */
+               case 3: /* far ind call */
+                       ret = X86_BR_IND_CALL;
+                       break;
+               case 4:
+               case 5:
+                       ret = X86_BR_IND_JMP;
+                       break;
+               }
+               break;
+       default:
+               ret = X86_BR_NONE;
+       }
+       /*
+        * Interrupts, traps and faults (and thus ring transitions) may
+        * occur on any instruction. Thus, to classify them correctly,
+        * we need to first look at the from and to priv levels. If they
+        * are different and to is in the kernel, then it indicates
+        * a ring transition. If the from instruction is not a ring
+        * transition instr (syscall, sysenter, int), then it means
+        * it was an irq, trap or fault.
+        *
+        * We have no way of detecting kernel to kernel faults.
+        */
+       if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+           && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+               ret = X86_BR_IRQ;
+
+       /*
+        * branch priv level determined by target as
+        * is done by HW when LBR_SELECT is implemented
+        */
+       if (ret != X86_BR_NONE)
+               ret |= to_plm;
+
+       return ret;
+}
+
+/*
+ * Implement the actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in a PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+       u64 from, to;
+       int br_sel = cpuc->br_sel;
+       int i, j, type;
+       bool compress = false;
+
+       /* if sampling all branches, then nothing to filter */
+       if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+               return;
+
+       for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+               from = cpuc->lbr_entries[i].from;
+               to = cpuc->lbr_entries[i].to;
+
+               type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+               if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+                       if (cpuc->lbr_entries[i].in_tx)
+                               type |= X86_BR_IN_TX;
+                       else
+                               type |= X86_BR_NO_TX;
+               }
+
+               /* if type does not correspond, then discard */
+               if (type == X86_BR_NONE || (br_sel & type) != type) {
+                       cpuc->lbr_entries[i].from = 0;
+                       compress = true;
+               }
+       }
+
+       if (!compress)
+               return;
+
+       /* remove all entries with from=0 */
+       for (i = 0; i < cpuc->lbr_stack.nr; ) {
+               if (!cpuc->lbr_entries[i].from) {
+                       j = i;
+                       while (++j < cpuc->lbr_stack.nr)
+                               cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+                       cpuc->lbr_stack.nr--;
+                       if (!cpuc->lbr_entries[i].from)
+                               continue;
+               }
+               i++;
+       }
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_REL_JMP
+                                               | LBR_IND_JMP | LBR_FAR,
+       /*
+        * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+        */
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
+        LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+       /*
+        * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+        */
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+};
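
A hedged sketch of how a map like the one above is typically folded into a single LBR_SELECT-style value from the user's branch-sample mask; the constants and the fold() helper below are illustrative, not the kernel's actual setup code:

#include <stdio.h>

/* illustrative LBR_SELECT-style bits, not the real MSR encoding */
#define LBR_JCC      (1 << 0)
#define LBR_IND_JMP  (1 << 1)
#define LBR_IND_CALL (1 << 2)

/* illustrative sample_branch_type shifts */
enum { BR_COND_SHIFT, BR_IND_CALL_SHIFT, BR_MAX_SHIFT };

static const int sel_map[BR_MAX_SHIFT] = {
	[BR_COND_SHIFT]     = LBR_JCC,
	/* NHM-style erratum workaround: IND_CALL also needs IND_JMP */
	[BR_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
};

static unsigned long long fold(unsigned long long br_type)
{
	unsigned long long mask = 0;
	int i;

	for (i = 0; i < BR_MAX_SHIFT; i++)
		if (br_type & (1ULL << i))
			mask |= sel_map[i];
	return mask;
}

int main(void)
{
	printf("%#llx\n", fold(1ULL << BR_IND_CALL_SHIFT)); /* IND_CALL|IND_JMP */
	return 0;
}

The extra indirect jumps captured because of the erratum are exactly what the opcode-based software filter above later discards.
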
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_RETURN | LBR_CALL_STACK,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
+};
+
+/* core */
+void __init intel_pmu_lbr_init_core(void)
+{
+       x86_pmu.lbr_nr     = 4;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+       /*
+        * SW branch filter usage:
+        * - compensate for lack of HW filter
+        */
+       pr_cont("4-deep LBR, ");
+}
+
+/* nehalem/westmere */
+void __init intel_pmu_lbr_init_nhm(void)
+{
+       x86_pmu.lbr_nr     = 16;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - work around the LBR_SEL errata (see above)
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR, but that means far
+        *   jmps need to be filtered out
+        */
+       pr_cont("16-deep LBR, ");
+}
+
+/* sandy bridge */
+void __init intel_pmu_lbr_init_snb(void)
+{
+       x86_pmu.lbr_nr   = 16;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR, but that means far
+        *   jmps need to be filtered out
+        */
+       pr_cont("16-deep LBR, ");
+}
+
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+       x86_pmu.lbr_nr   = 16;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+       pr_cont("16-deep LBR, ");
+}
+
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+       x86_pmu.lbr_nr   = 32;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR, but that means far
+        *   jmps need to be filtered out
+        */
+       pr_cont("32-deep LBR, ");
+}
+
+/* atom */
+void __init intel_pmu_lbr_init_atom(void)
+{
+       /*
+        * only models starting at stepping 10 seem
+        * to have an operational LBR which can freeze
+        * on PMU interrupt
+        */
+       if (boot_cpu_data.x86_model == 28
+           && boot_cpu_data.x86_mask < 10) {
+               pr_cont("LBR disabled due to erratum");
+               return;
+       }
+
+       x86_pmu.lbr_nr     = 8;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+       /*
+        * SW branch filter usage:
+        * - compensate for lack of HW filter
+        */
+       pr_cont("8-deep LBR, ");
+}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+       x86_pmu.lbr_nr     = 8;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+       pr_cont("8-deep LBR, ");
+}
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
new file mode 100644 (file)
index 0000000..0a5ede1
--- /dev/null
@@ -0,0 +1,1376 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/perf_event_p4.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used on HT-enabled CPUs
+ */
+struct p4_event_bind {
+       unsigned int opcode;                    /* Event code and ESCR selector */
+       unsigned int escr_msr[2];               /* ESCR MSR for this event */
+       unsigned int escr_emask;                /* valid ESCR EventMask bits */
+       unsigned int shared;                    /* event is shared across threads */
+       char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on absence */
+};
+
+struct p4_pebs_bind {
+       unsigned int metric_pebs;
+       unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert)                     \
+       [P4_PEBS_METRIC__##name] = {                            \
+               .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,   \
+               .metric_vert = vert,                            \
+       }
+
+/*
+ * Note that we have P4_PEBS_ENABLE_UOP_TAG always set here.
+ *
+ * It's needed for mapping the P4_PEBS_CONFIG_METRIC_MASK bits of
+ * the event configuration to find out which values are to be
+ * written into the MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * registers
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+       P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,  0x0000001, 0x0000001),
+       P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,  0x0000002, 0x0000001),
+       P4_GEN_PEBS_BIND(dtlb_load_miss_retired,        0x0000004, 0x0000001),
+       P4_GEN_PEBS_BIND(dtlb_store_miss_retired,       0x0000004, 0x0000002),
+       P4_GEN_PEBS_BIND(dtlb_all_miss_retired,         0x0000004, 0x0000003),
+       P4_GEN_PEBS_BIND(tagged_mispred_branch,         0x0018000, 0x0000010),
+       P4_GEN_PEBS_BIND(mob_load_replay_retired,       0x0000200, 0x0000001),
+       P4_GEN_PEBS_BIND(split_load_retired,            0x0000400, 0x0000001),
+       P4_GEN_PEBS_BIND(split_store_retired,           0x0000400, 0x0000002),
+};
+
+/*
+ * Note that we don't use CCCR1 here; there is an
+ * exception for P4_BSQ_ALLOCATION, but we just have
+ * no workaround for it.
+ *
+ * Consider this binding as the resources which a particular
+ * event may borrow; it doesn't contain the EventMask,
+ * Tags and friends -- they are left to the caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+       [P4_EVENT_TC_DELIVER_MODE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+               .shared         = 1,
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_BPU_FETCH_REQUEST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+               .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_ITLB_REFERENCE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+               .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_MEMORY_CANCEL] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_MEMORY_COMPLETE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_LOAD_PORT_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_STORE_PORT_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_MOB_LOAD_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+               .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_PAGE_WALK_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+               .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+               .shared         = 1,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_IOQ_ALLOCATION] = {
+               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+               .escr_msr       = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
+               .cntr           = { {2, -1, -1}, {3, -1, -1} },
+       },
+       [P4_EVENT_FSB_DATA_ACTIVITY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+               .shared         = 1,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
+               .cntr           = { {0, -1, -1}, {1, -1, -1} },
+       },
+       [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+               .escr_msr       = { MSR_P4_BSU_ESCR1, MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
+               .cntr           = { {2, -1, -1}, {3, -1, -1} },
+       },
+       [P4_EVENT_SSE_INPUT_ASSIST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_PACKED_SP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_PACKED_DP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_SCALAR_SP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_SCALAR_DP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_64BIT_MMX_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_128BIT_MMX_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_X87_FP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_TC_MISC] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
+               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+               .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_TC_MS_XFER] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_UOP_QUEUE_WRITES] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+               .escr_msr       = { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+               .escr_msr       = { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RESOURCE_STALL] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+               .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_WC_BUFFER] = {
+               .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
+               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_B2B_CYCLES] = {
+               .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BNR] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BNR),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_SNOOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_RESPONSE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_FRONT_END_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_EXECUTION_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_REPLAY_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_INSTR_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_UOPS_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_UOP_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
+               .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_BRANCH_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_X87_ASSIST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_MACHINE_CLEAR] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_INSTR_COMPLETED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, metric)                           \
+       p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
+                           P4_ESCR_EMASK_BIT(event, bit))              | \
+       p4_config_pack_cccr(metric                                      | \
+                           P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+
+static __initconst const u64 p4_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__1stl_cache_load_miss_retired),
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
+       },
+},
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__dtlb_load_miss_retired),
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__dtlb_store_miss_retired),
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+                                               P4_PEBS_METRIC__none),
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+                                               P4_PEBS_METRIC__none),
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * Because Netburst is quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * i.e. different events which have the same functionality but
+ * utilize non-intersecting resources (ESCR/CCCR/counter registers).
+ *
+ * This allows us to relax the restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC; they are
+ * either kept up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+       u64 original;
+       u64 alternative;
+} p4_event_aliases[] = {
+       {
+               /*
+                * Non-halted cycles can be substituted with non-sleeping cycles (see
+                * Intel SDM Vol3b for details). We need this alias to be able
+                * to run nmi-watchdog and 'perf top' (or any other user space tool
+                * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+                * simultaneously.
+                */
+       .original       =
+               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+       .alternative    =
+               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)             |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+               p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT          |
+                                   P4_CCCR_COMPARE),
+       },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+       u64 config_match;
+       int i;
+
+       /*
+        * Only an event with the special mark is allowed;
+        * we have to be sure it didn't come in as a malformed
+        * RAW event.
+        */
+       if (!(config & P4_CONFIG_ALIASABLE))
+               return 0;
+
+       config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+       for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+               if (config_match == p4_event_aliases[i].original) {
+                       config_match = p4_event_aliases[i].alternative;
+                       break;
+               } else if (config_match == p4_event_aliases[i].alternative) {
+                       config_match = p4_event_aliases[i].original;
+                       break;
+               }
+       }
+
+       if (i >= ARRAY_SIZE(p4_event_aliases))
+               return 0;
+
+       return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
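
A standalone sketch of the symmetric alias lookup above: given either member of a known-equivalent pair of configs it returns the other one, or 0 when the config is not aliasable; the config values are placeholders, not real P4 event encodings:

#include <stdio.h>

struct alias { unsigned long long original, alternative; };

/* placeholder pair standing in for GLOBAL_POWER_EVENTS / EXECUTION_EVENT */
static const struct alias aliases[] = {
	{ .original = 0x1111, .alternative = 0x2222 },
};

static unsigned long long get_alias(unsigned long long config)
{
	unsigned int i;

	for (i = 0; i < sizeof(aliases) / sizeof(aliases[0]); i++) {
		if (config == aliases[i].original)
			return aliases[i].alternative;
		if (config == aliases[i].alternative)
			return aliases[i].original;
	}
	return 0;	/* not aliasable */
}

int main(void)
{
	printf("%#llx %#llx\n", get_alias(0x1111), get_alias(0x2222));
	return 0;
}
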
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))       |
+               P4_CONFIG_ALIASABLE,
+
+  /*
+   * retired instructions
+   * for the sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
+               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES]        =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
+               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
+       p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+       unsigned int evnt = p4_config_unpack_event(config);
+       struct p4_event_bind *bind = NULL;
+
+       if (evnt < ARRAY_SIZE(p4_event_bind_map))
+               bind = &p4_event_bind_map[evnt];
+
+       return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+       struct p4_event_bind *bind;
+       unsigned int esel;
+       u64 config;
+
+       config = p4_general_events[hw_event];
+       bind = p4_config_get_bind(config);
+       esel = P4_OPCODE_ESEL(bind->opcode);
+       config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+       return config;
+}
+
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+       /* The INSTR_COMPLETED event only exists for models 3, 4 and 6 (Prescott) */
+       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+               if (boot_cpu_data.x86_model != 3 &&
+                       boot_cpu_data.x86_model != 4 &&
+                       boot_cpu_data.x86_model != 6)
+                       return false;
+       }
+
+       /*
+        * For reference:
+        * - IQ_ESCR0 and IQ_ESCR1 exist only on models 1 and 2
+        */
+
+       return true;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+       unsigned int v, emask;
+
+       /* User data may have an out-of-bounds event index */
+       v = p4_config_unpack_event(event->attr.config);
+       if (v >= ARRAY_SIZE(p4_event_bind_map))
+               return -EINVAL;
+
+       /* It may be unsupported: */
+       if (!p4_event_match_cpu_model(v))
+               return -EINVAL;
+
+       /*
+        * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
+        * in Architectural Performance Monitoring: it specifies not
+        * on _which_ logical cpu to count but rather _when_, i.e. it
+        * depends on the logical cpu state -- count the event if one
+        * cpu is active, none, both or any -- so we just allow the
+        * user to pass any desired value.
+        *
+        * In turn we always set the Tx_OS/Tx_USR bits bound to the
+        * logical cpu, without propagating them to the other cpu.
+        */
+
+       /*
+        * if an event is shared across the logical threads
+        * the user needs special permissions to be able to use it
+        */
+       if (p4_ht_active() && p4_event_bind_map[v].shared) {
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
+
+       /* ESCR EventMask bits may be invalid */
+       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+       if (emask & ~p4_event_bind_map[v].escr_emask)
+               return -EINVAL;
+
+       /*
+        * it may have some invalid PEBS bits
+        */
+       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
+               return -EINVAL;
+
+       v = p4_config_unpack_metric(event->attr.config);
+       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
+               return -EINVAL;
+
+       return 0;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+       int cpu = get_cpu();
+       int rc = 0;
+       u32 escr, cccr;
+
+       /*
+        * the reason we grab the cpu this early is that, if we get
+        * scheduled for the first time on the same cpu, we will not need
+        * to swap the thread-specific flags in config (and will save
+        * some cpu cycles)
+        */
+
+       cccr = p4_default_cccr_conf(cpu);
+       escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+                                        event->attr.exclude_user);
+       event->hw.config = p4_config_pack_escr(escr) |
+                          p4_config_pack_cccr(cccr);
+
+       if (p4_ht_active() && p4_ht_thread(cpu))
+               event->hw.config = p4_set_ht_bit(event->hw.config);
+
+       if (event->attr.type == PERF_TYPE_RAW) {
+               struct p4_event_bind *bind;
+               unsigned int esel;
+               /*
+                * Clear the bits we reserve to be managed by the kernel
+                * itself and never allow them from user space
+                */
+                event->attr.config &= P4_CONFIG_MASK;
+
+               rc = p4_validate_raw_event(event);
+               if (rc)
+                       goto out;
+
+               /*
+                * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
+                * bits since we keep additional info here (for cache events etc.)
+                */
+               event->hw.config |= event->attr.config;
+               bind = p4_config_get_bind(event->attr.config);
+               if (!bind) {
+                       rc = -EINVAL;
+                       goto out;
+               }
+               esel = P4_OPCODE_ESEL(bind->opcode);
+               event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+       }
+
+       rc = x86_setup_perfctr(event);
+out:
+       put_cpu();
+       return rc;
+}
+
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+       u64 v;
+
+       /* an official way for overflow indication */
+       rdmsrl(hwc->config_base, v);
+       if (v & P4_CCCR_OVF) {
+               wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
+               return 1;
+       }
+
+       /*
+        * In some circumstances the overflow might issue an NMI but not
+        * set the P4_CCCR_OVF bit. Because a counter holds a negative value,
+        * we simply check whether the high bit is set; if it's cleared, it
+        * means the counter has crossed zero and continued counting before
+        * the real NMI signal was received:
+        */
+       rdmsrl(hwc->event_base, v);
+       if (!(v & ARCH_P4_UNFLAGGED_BIT))
+               return 1;
+
+       return 0;
+}
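
A sketch of the "unflagged overflow" test above on a bare counter value; the bit position assumes a 40-bit P4 counter and is illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CNT_HIGH_BIT (1ULL << 39)	/* top bit of an assumed 40-bit counter */

/*
 * Counters are programmed with a negative value, so while counting up
 * towards the overflow the top bit stays set; once it is clear, the
 * counter has crossed zero even if the OVF flag was never latched.
 */
static bool counter_wrapped(uint64_t raw)
{
	return !(raw & CNT_HIGH_BIT);
}

int main(void)
{
	printf("%d %d\n", counter_wrapped(0xffffffff80ULL), counter_wrapped(0x12ULL));
	return 0;
}
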
+
+static void p4_pmu_disable_pebs(void)
+{
+       /*
+        * FIXME
+        *
+        * It's still allowed that two threads set up the same cache
+        * events, so we can't simply clear the metrics until we know
+        * no one is depending on us; we would need some kind of counter
+        * for "ReplayEvent" users.
+        *
+        * What is more complex is RAW events: if the user (for some
+        * reason) passes a cache event metric with an improper
+        * event opcode, it's fine from the hardware point of view
+        * but complete nonsense as far as the "meaning" of such an action goes.
+        *
+        * So for the moment let's leave the metrics turned on forever -- it's
+        * OK for now but needs to be revisited!
+        *
+        * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
+        * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
+        */
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       /*
+        * If the event gets disabled while the counter is in an overflowed
+        * state, we need to clear P4_CCCR_OVF, otherwise the interrupt gets
+        * asserted again and again
+        */
+       (void)wrmsrl_safe(hwc->config_base,
+               p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               p4_pmu_disable_event(event);
+       }
+
+       p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+       struct p4_pebs_bind *bind;
+       unsigned int idx;
+
+       BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+       idx = p4_config_unpack_metric(config);
+       if (idx == P4_PEBS_METRIC__none)
+               return;
+
+       bind = &p4_pebs_bind_map[idx];
+
+       (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+       (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,      (u64)bind->metric_vert);
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int thread = p4_ht_config_thread(hwc->config);
+       u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+       unsigned int idx = p4_config_unpack_event(hwc->config);
+       struct p4_event_bind *bind;
+       u64 escr_addr, cccr;
+
+       bind = &p4_event_bind_map[idx];
+       escr_addr = bind->escr_msr[thread];
+
+       /*
+        * - we don't support cascaded counters yet
+        * - and counter 1 is broken (erratum)
+        */
+       WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+       WARN_ON_ONCE(hwc->idx == 1);
+
+       /* we need a real Event value */
+       escr_conf &= ~P4_ESCR_EVENT_MASK;
+       escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+       cccr = p4_config_unpack_cccr(hwc->config);
+
+       /*
+        * it could be a cache event, so we need to write the metrics
+        * into additional MSRs
+        */
+       p4_pmu_enable_pebs(hwc->config);
+
+       (void)wrmsrl_safe(escr_addr, escr_conf);
+       (void)wrmsrl_safe(hwc->config_base,
+                               (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               p4_pmu_enable_event(event);
+       }
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int idx, handled = 0;
+       u64 val;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               int overflow;
+
+               if (!test_bit(idx, cpuc->active_mask)) {
+                       /* catch in-flight IRQs */
+                       if (__test_and_clear_bit(idx, cpuc->running))
+                               handled++;
+                       continue;
+               }
+
+               event = cpuc->events[idx];
+               hwc = &event->hw;
+
+               WARN_ON_ONCE(hwc->idx != idx);
+
+               /* it might be an unflagged overflow */
+               overflow = p4_pmu_clear_cccr_ovf(hwc);
+
+               val = x86_perf_event_update(event);
+               if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+                       continue;
+
+               handled += overflow;
+
+               /* event overflow for sure */
+               perf_sample_data_init(&data, 0, hwc->last_period);
+
+               if (!x86_perf_event_set_period(event))
+                       continue;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       /*
+        * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+        * been observed that the OVF bit flag has to be cleared first _before_
+        * the LVTPC can be unmasked.
+        *
+        * The reason is the NMI line will continue to be asserted while the OVF
+        * bit is set.  This causes a second NMI to be generated if the LVTPC is
+        * unmasked before the OVF bit is cleared, leading to unknown NMI
+        * messages.
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+       return handled;
+}
+
+/*
+ * swap thread-specific fields according to the thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+       u32 escr, cccr;
+
+       /*
+        * either we are lucky and continue on the same cpu, or there is no HT support
+        */
+       if (!p4_should_swap_ts(hwc->config, cpu))
+               return;
+
+       /*
+        * the event was migrated from another logical
+        * cpu, so we need to swap the thread-specific flags
+        */
+
+       escr = p4_config_unpack_escr(hwc->config);
+       cccr = p4_config_unpack_cccr(hwc->config);
+
+       if (p4_ht_thread(cpu)) {
+               cccr &= ~P4_CCCR_OVF_PMI_T0;
+               cccr |= P4_CCCR_OVF_PMI_T1;
+               if (escr & P4_ESCR_T0_OS) {
+                       escr &= ~P4_ESCR_T0_OS;
+                       escr |= P4_ESCR_T1_OS;
+               }
+               if (escr & P4_ESCR_T0_USR) {
+                       escr &= ~P4_ESCR_T0_USR;
+                       escr |= P4_ESCR_T1_USR;
+               }
+               hwc->config  = p4_config_pack_escr(escr);
+               hwc->config |= p4_config_pack_cccr(cccr);
+               hwc->config |= P4_CONFIG_HT;
+       } else {
+               cccr &= ~P4_CCCR_OVF_PMI_T1;
+               cccr |= P4_CCCR_OVF_PMI_T0;
+               if (escr & P4_ESCR_T1_OS) {
+                       escr &= ~P4_ESCR_T1_OS;
+                       escr |= P4_ESCR_T0_OS;
+               }
+               if (escr & P4_ESCR_T1_USR) {
+                       escr &= ~P4_ESCR_T1_USR;
+                       escr |= P4_ESCR_T0_USR;
+               }
+               hwc->config  = p4_config_pack_escr(escr);
+               hwc->config |= p4_config_pack_cccr(cccr);
+               hwc->config &= ~P4_CONFIG_HT;
+       }
+}
+
+/*
+ * ESCR address hashing is tricky: the ESCRs are not sequential
+ * in MSR space, but they all start at MSR_P4_BSU_ESCR0 (0x03a0)
+ * and every ESCR address lies within the range [0x3a0, 0x3e1],
+ *
+ * so indexing by the offset from the base gives a ~70% filled table
+ */
+
+#define P4_ESCR_MSR_BASE               0x000003a0
+#define P4_ESCR_MSR_MAX                        0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE         (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr)           (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)   [P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+       unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+       if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE      ||
+                       !p4_escr_table[idx]             ||
+                       p4_escr_table[idx] != addr)) {
+               WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+               return -1;
+       }
+
+       return idx;
+}
+
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+                       struct p4_event_bind *bind)
+{
+       int i, j;
+
+       for (i = 0; i < P4_CNTR_LIMIT; i++) {
+               j = bind->cntr[thread][i];
+               if (j != -1 && !test_bit(j, used_mask))
+                       return j;
+       }
+
+       return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+       int cpu = smp_processor_id();
+       struct hw_perf_event *hwc;
+       struct p4_event_bind *bind;
+       unsigned int i, thread, num;
+       int cntr_idx, escr_idx;
+       u64 config_alias;
+       int pass;
+
+       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+       bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+       for (i = 0, num = n; i < n; i++, num--) {
+
+               hwc = &cpuc->event_list[i]->hw;
+               thread = p4_ht_thread(cpu);
+               pass = 0;
+
+again:
+               /*
+                * It's possible to hit a circular lock
+                * between original and alternative events
+                * if both are scheduled already.
+                */
+               if (pass > 2)
+                       goto done;
+
+               bind = p4_config_get_bind(hwc->config);
+               escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+               if (unlikely(escr_idx == -1))
+                       goto done;
+
+               if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+                       cntr_idx = hwc->idx;
+                       if (assign)
+                               assign[i] = hwc->idx;
+                       goto reserve;
+               }
+
+               cntr_idx = p4_next_cntr(thread, used_mask, bind);
+               if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+                       /*
+                        * Check whether an event alias is still available.
+                        */
+                       config_alias = p4_get_alias_event(hwc->config);
+                       if (!config_alias)
+                               goto done;
+                       hwc->config = config_alias;
+                       pass++;
+                       goto again;
+               }
+               /*
+                * Perf does test runs to see if a whole group can be assigned
+                * together successfully.  There can be multiple rounds of this.
+                * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+                * bits, such that the next round of group assignments will
+                * cause the above p4_should_swap_ts to pass instead of fail.
+                * This leads to counters exclusive to thread0 being used by
+                * thread1.
+                *
+                * Solve this with a cheap hack, reset the idx back to -1 to
+                * force a new lookup (p4_next_cntr) to get the right counter
+                * for the right thread.
+                *
+                * This probably doesn't comply with the general spirit of how
+                * perf wants to work, but P4 is special. :-(
+                */
+               if (p4_should_swap_ts(hwc->config, cpu))
+                       hwc->idx = -1;
+               p4_pmu_swap_config_ts(hwc, cpu);
+               if (assign)
+                       assign[i] = cntr_idx;
+reserve:
+               set_bit(cntr_idx, used_mask);
+               set_bit(escr_idx, escr_mask);
+       }
+
+done:
+       return num ? -EINVAL : 0;
+}
+
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht,   "config:63"   );
+
+static struct attribute *intel_p4_formats_attr[] = {
+       &format_attr_cccr.attr,
+       &format_attr_escr.attr,
+       &format_attr_ht.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu p4_pmu = {
+       .name                   = "Netburst P4/Xeon",
+       .handle_irq             = p4_pmu_handle_irq,
+       .disable_all            = p4_pmu_disable_all,
+       .enable_all             = p4_pmu_enable_all,
+       .enable                 = p4_pmu_enable_event,
+       .disable                = p4_pmu_disable_event,
+       .eventsel               = MSR_P4_BPU_CCCR0,
+       .perfctr                = MSR_P4_BPU_PERFCTR0,
+       .event_map              = p4_pmu_event_map,
+       .max_events             = ARRAY_SIZE(p4_general_events),
+       .get_event_constraints  = x86_get_event_constraints,
+       /*
+        * If HT is disabled we may need to use all
+        * ARCH_P4_MAX_CCCR counters simultaneously,
+        * though for now leave it restricted, assuming
+        * HT is on
+        */
+       .num_counters           = ARCH_P4_MAX_CCCR,
+       .apic                   = 1,
+       .cntval_bits            = ARCH_P4_CNTRVAL_BITS,
+       .cntval_mask            = ARCH_P4_CNTRVAL_MASK,
+       .max_period             = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+       .hw_config              = p4_hw_config,
+       .schedule_events        = p4_pmu_schedule_events,
+       /*
+        * This handles erratum N15 in Intel doc 249199-029:
+        * the counter may not be updated correctly on write,
+        * so we need a second write operation to do the trick
+        * (the official workaround didn't work).
+        *
+        * The idea is taken from the OProfile code.
+        */
+       .perfctr_second_write   = 1,
+
+       .format_attrs           = intel_p4_formats_attr,
+};
+
+__init int p4_pmu_init(void)
+{
+       unsigned int low, high;
+       int i, reg;
+
+       /* If we get stripped -- indexing fails */
+       BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
+
+       rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+       if (!(low & (1 << 7))) {
+               pr_cont("unsupported Netburst CPU model %d ",
+                       boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+               sizeof(hw_cache_event_ids));
+
+       pr_cont("Netburst events, ");
+
+       x86_pmu = p4_pmu;
+
+       /*
+        * Even though the counters are configured to interrupt a particular
+        * logical processor when an overflow happens, testing has shown that
+        * on kdump kernels (which use a single cpu), thread1's counter
+        * continues to run and will report an NMI on thread0.  Due to the
+        * overflow bug, this leads to a stream of unknown NMIs.
+        *
+        * Solve this by zeroing out the registers to mimic a reset.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu_config_addr(i);
+               wrmsrl_safe(reg, 0ULL);
+       }
+
+       return 0;
+}
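
The ESCR "hashing" above is just a direct-mapped table keyed by the MSR's offset from MSR_P4_BSU_ESCR0, where a slot only counts as valid if it echoes its own address back. Below is a small standalone sketch of that lookup, not part of the patch; only the 0x03a0 base comes from the code above, and the second table entry uses a hypothetical ESCR address purely for illustration.

#include <stdio.h>

#define ESCR_BASE		0x03a0	/* MSR_P4_BSU_ESCR0 */
#define ESCR_MAX		0x03e1
#define ESCR_TABLE_SIZE		(ESCR_MAX - ESCR_BASE + 1)

/* sparse, direct-mapped: a slot is valid only if it stores its own address */
static const unsigned int escr_table[ESCR_TABLE_SIZE] = {
	[0x03a0 - ESCR_BASE] = 0x03a0,	/* MSR_P4_BSU_ESCR0 */
	[0x03b0 - ESCR_BASE] = 0x03b0,	/* hypothetical ESCR, for illustration */
};

static int escr_idx(unsigned int msr)
{
	unsigned int idx = msr - ESCR_BASE;

	if (idx >= ESCR_TABLE_SIZE || escr_table[idx] != msr)
		return -1;		/* out of range, or a hole in the table */
	return idx;
}

int main(void)
{
	/* prints: 0 16 -1 -1 */
	printf("%d %d %d %d\n", escr_idx(0x03a0), escr_idx(0x03b0),
	       escr_idx(0x03a1), escr_idx(0x0400));
	return 0;
}
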
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
new file mode 100644 (file)
index 0000000..1f5c47a
--- /dev/null
@@ -0,0 +1,279 @@
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf_event.h"
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,       /* CPU_CLK_UNHALTED */
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,       /* INST_RETIRED     */
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,       /* L2_RQSTS:M:E:S:I */
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,       /* L2_RQSTS:I       */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,       /* BR_INST_RETIRED  */
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,       /* BR_MISS_PRED_RETIRED */
+  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,       /* BUS_DRDY_CLOCKS  */
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2,    /* RESOURCE_STALLS  */
+
+};
+
+static const u64 __initconst p6_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS       */
+               [ C(RESULT_MISS)   ] = 0x0045,  /* DCU_LINES_IN        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0f29,  /* L2_LD:M:E:S:I       */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
+               [ C(RESULT_MISS)   ] = 0x0f28,  /* L2_IFETCH:M:E:S:I  */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0025,  /* L2_M_LINES_INM     */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
+               [ C(RESULT_MISS)   ] = 0x0085,  /* ITLB_MISS          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4,  /* BR_INST_RETIRED      */
+               [ C(RESULT_MISS)   ] = 0x00c5,  /* BR_MISS_PRED_RETIRED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+       return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT                   0x0000002EULL
+
+static struct event_constraint p6_event_constraints[] =
+{
+       INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
+       INTEL_EVENT_CONSTRAINT(0x10, 0x1),      /* FP_COMP_OPS_EXE */
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2),      /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2),      /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2),      /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1),      /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+       u64 val;
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(int added)
+{
+       unsigned long val;
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val = P6_NOP_EVENT;
+
+       (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+
+       /*
+        * p6 only has a global event enable, set on PerfEvtSel0.
+        * We "disable" events by programming P6_NOP_EVENT
+        * and we rely on p6_pmu_enable_all() being called
+        * to actually enable the events.
+        */
+
+       (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(pc,    "config:19"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *intel_p6_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu p6_pmu = {
+       .name                   = "p6",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = p6_pmu_disable_all,
+       .enable_all             = p6_pmu_enable_all,
+       .enable                 = p6_pmu_enable_event,
+       .disable                = p6_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_P6_EVNTSEL0,
+       .perfctr                = MSR_P6_PERFCTR0,
+       .event_map              = p6_pmu_event_map,
+       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 31) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       /*
+        * Events have 40 bits implemented. However they are designed such
+        * that bits [32-39] are sign extensions of bit 31. As such the
+        * effective width of an event for a P6-like PMU is 32 bits only.
+        *
+        * See IA-32 Intel Architecture Software developer manual Vol 3B
+        */
+       .cntval_bits            = 32,
+       .cntval_mask            = (1ULL << 32) - 1,
+       .get_event_constraints  = x86_get_event_constraints,
+       .event_constraints      = p6_event_constraints,
+
+       .format_attrs           = intel_p6_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+};
+
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+       if (boot_cpu_data.x86_mask < 9) {
+               /*
+                * PPro erratum 26; fixed in stepping 9 and above.
+                */
+               pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+               x86_pmu.attr_rdpmc_broken = 1;
+               x86_pmu.attr_rdpmc = 0;
+       }
+}
+
+__init int p6_pmu_init(void)
+{
+       x86_pmu = p6_pmu;
+
+       switch (boot_cpu_data.x86_model) {
+       case  1: /* Pentium Pro */
+               x86_add_quirk(p6_pmu_rdpmc_quirk);
+               break;
+
+       case  3: /* Pentium II - Klamath */
+       case  5: /* Pentium II - Deschutes */
+       case  6: /* Pentium II - Mendocino */
+               break;
+
+       case  7: /* Pentium III - Katmai */
+       case  8: /* Pentium III - Coppermine */
+       case 10: /* Pentium III Xeon */
+       case 11: /* Pentium III - Tualatin */
+               break;
+
+       case  9: /* Pentium M - Banias */
+       case 13: /* Pentium M - Dothan */
+               break;
+
+       default:
+               pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+               sizeof(hw_cache_event_ids));
+
+       return 0;
+}
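
The comment above about bits [32-39] mirroring bit 31 is why .cntval_bits is 32 for this PMU. Roughly, the generic x86 update path computes a wrap-safe delta by shifting both raw reads up by (64 - cntval_bits) before subtracting; here is a simplified standalone sketch of that arithmetic (helper name and constants are illustrative, not from the patch).

#include <stdio.h>
#include <stdint.h>

#define CNTVAL_BITS	32	/* effective width of a P6-style counter */

/* wrap-safe delta between two raw counter reads at the given width */
static uint64_t counter_delta(uint64_t prev_raw, uint64_t new_raw)
{
	int shift = 64 - CNTVAL_BITS;
	uint64_t delta = (new_raw << shift) - (prev_raw << shift);

	return delta >> shift;	/* drop the bits above the counter width */
}

int main(void)
{
	/* the counter wrapped from near the top of 32 bits to a small value */
	printf("%llu\n",
	       (unsigned long long)counter_delta(0xfffffff0ULL, 0x10ULL));
	/* prints 32 */
	return 0;
}
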
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
new file mode 100644 (file)
index 0000000..6af7cf7
--- /dev/null
@@ -0,0 +1,1188 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+#include <asm/intel_pt.h>
+
+#include "../perf_event.h"
+#include "pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+       CR_EAX = 0,
+       CR_ECX,
+       CR_EDX,
+       CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m)                                         \
+       [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
+                           .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+       const char      *name;
+       u32             leaf;
+       u8              reg;
+       u32             mask;
+} pt_caps[] = {
+       PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
+       PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
+       PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
+       PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
+       PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
+       PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
+       PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
+       PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
+       PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
+       PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
+       PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+       struct pt_cap_desc *cd = &pt_caps[cap];
+       u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+       unsigned int shift = __ffs(cd->mask);
+
+       return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct dev_ext_attribute *ea =
+               container_of(attr, struct dev_ext_attribute, attr);
+       enum pt_capabilities cap = (long)ea->var;
+
+       return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+       .name   = "caps",
+};
+
+PMU_FORMAT_ATTR(cyc,           "config:1"      );
+PMU_FORMAT_ATTR(mtc,           "config:9"      );
+PMU_FORMAT_ATTR(tsc,           "config:10"     );
+PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
+PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
+PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
+PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
+
+static struct attribute *pt_formats_attr[] = {
+       &format_attr_cyc.attr,
+       &format_attr_mtc.attr,
+       &format_attr_tsc.attr,
+       &format_attr_noretcomp.attr,
+       &format_attr_mtc_period.attr,
+       &format_attr_cyc_thresh.attr,
+       &format_attr_psb_period.attr,
+       NULL,
+};
+
+static struct attribute_group pt_format_group = {
+       .name   = "format",
+       .attrs  = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+       &pt_cap_group,
+       &pt_format_group,
+       NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+       struct dev_ext_attribute *de_attrs;
+       struct attribute **attrs;
+       size_t size;
+       int ret;
+       long i;
+
+       attrs = NULL;
+
+       for (i = 0; i < PT_CPUID_LEAVES; i++) {
+               cpuid_count(20, i,
+                           &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
+       }
+
+       ret = -ENOMEM;
+       size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+       attrs = kzalloc(size, GFP_KERNEL);
+       if (!attrs)
+               goto fail;
+
+       size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+       de_attrs = kzalloc(size, GFP_KERNEL);
+       if (!de_attrs)
+               goto fail;
+
+       for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+               struct dev_ext_attribute *de_attr = de_attrs + i;
+
+               de_attr->attr.attr.name = pt_caps[i].name;
+
+               sysfs_attr_init(&de_attr->attr.attr);
+
+               de_attr->attr.attr.mode         = S_IRUGO;
+               de_attr->attr.show              = pt_cap_show;
+               de_attr->var                    = (void *)i;
+
+               attrs[i] = &de_attr->attr.attr;
+       }
+
+       pt_cap_group.attrs = attrs;
+
+       return 0;
+
+fail:
+       kfree(attrs);
+
+       return ret;
+}
+
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC    | \
+                         RTIT_CTL_CYC_THRESH   | \
+                         RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC   (RTIT_CTL_MTC_EN        | \
+                        RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
+                       RTIT_CTL_DISRETC        | \
+                       RTIT_CTL_CYC_PSB        | \
+                       RTIT_CTL_MTC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+       u64 config = event->attr.config;
+       u64 allowed, requested;
+
+       if ((config & PT_CONFIG_MASK) != config)
+               return false;
+
+       if (config & RTIT_CTL_CYC_PSB) {
+               if (!pt_cap_get(PT_CAP_psb_cyc))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_psb_periods);
+               requested = (config & RTIT_CTL_PSB_FREQ) >>
+                       RTIT_CTL_PSB_FREQ_OFFSET;
+               if (requested && (!(allowed & BIT(requested))))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+               requested = (config & RTIT_CTL_CYC_THRESH) >>
+                       RTIT_CTL_CYC_THRESH_OFFSET;
+               if (requested && (!(allowed & BIT(requested))))
+                       return false;
+       }
+
+       if (config & RTIT_CTL_MTC) {
+               /*
+                * In the unlikely case that CPUID lists valid mtc periods,
+                * but not the mtc capability, drop out here.
+                *
+                * Spec says that setting mtc period bits while mtc bit in
+                * CPUID is 0 will #GP, so better safe than sorry.
+                */
+               if (!pt_cap_get(PT_CAP_mtc))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_mtc_periods);
+               if (!allowed)
+                       return false;
+
+               requested = (config & RTIT_CTL_MTC_RANGE) >>
+                       RTIT_CTL_MTC_RANGE_OFFSET;
+
+               if (!(allowed & BIT(requested)))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static void pt_config(struct perf_event *event)
+{
+       u64 reg;
+
+       if (!event->hw.itrace_started) {
+               event->hw.itrace_started = 1;
+               wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+       }
+
+       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+       if (!event->attr.exclude_kernel)
+               reg |= RTIT_CTL_OS;
+       if (!event->attr.exclude_user)
+               reg |= RTIT_CTL_USR;
+
+       reg |= (event->attr.config & PT_CONFIG_MASK);
+
+       wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+       u64 ctl;
+
+       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+       if (start)
+               ctl |= RTIT_CTL_TRACEEN;
+       else
+               ctl &= ~RTIT_CTL_TRACEEN;
+       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+       /*
+        * A wrmsr that disables trace generation serializes other PT
+        * registers and causes all data packets to be written to memory,
+        * but a fence is required for the data to become globally visible.
+        *
+        * The below WMB, separating the data store and aux_head store, matches
+        * the consumer's RMB that separates the aux_head load and data load.
+        */
+       if (!start)
+               wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+                            unsigned int output_off)
+{
+       u64 reg;
+
+       wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+       reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+       wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table:     actual ToPA table entries, as understood by PT hardware
+ * @list:      linkage to struct pt_buffer's list of tables
+ * @phys:      physical address of this page
+ * @offset:    offset of the first entry in this table in the buffer
+ * @size:      total size of all entries in this table
+ * @last:      index of the last initialized entry in this table
+ */
+struct topa {
+       struct topa_entry       table[TENTS_PER_PAGE];
+       struct list_head        list;
+       u64                     phys;
+       u64                     offset;
+       size_t                  size;
+       int                     last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu:       CPU on which to allocate.
+ * @gfp:       Allocation flags.
+ *
+ * Return:     On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+       int node = cpu_to_node(cpu);
+       struct topa *topa;
+       struct page *p;
+
+       p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+       if (!p)
+               return NULL;
+
+       topa = page_address(p);
+       topa->last = 0;
+       topa->phys = page_to_phys(p);
+
+       /*
+        * In case of single-entry ToPA, always put the self-referencing END
+        * link as the 2nd entry in the table
+        */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+               TOPA_ENTRY(topa, 1)->end = 1;
+       }
+
+       return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa:      Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+       free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf:        PT buffer that's being extended.
+ * @topa:       New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
+ * current "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+       struct topa *last = buf->last;
+
+       list_add_tail(&topa->list, &buf->tables);
+
+       if (!buf->first) {
+               buf->first = buf->last = buf->cur = topa;
+               return;
+       }
+
+       topa->offset = last->offset + last->size;
+       buf->last = topa;
+
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return;
+
+       BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+       TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+       TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa:      ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+       /* single-entry ToPA is a special case */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return !!topa->last;
+
+       return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf:       PT buffer being initialized.
+ * @gfp:       Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return:     0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+       struct topa *topa = buf->last;
+       int order = 0;
+       struct page *p;
+
+       p = virt_to_page(buf->data_pages[buf->nr_pages]);
+       if (PagePrivate(p))
+               order = page_private(p);
+
+       if (topa_table_full(topa)) {
+               topa = topa_alloc(buf->cpu, gfp);
+               if (!topa)
+                       return -ENOMEM;
+
+               topa_insert_table(buf, topa);
+       }
+
+       TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+       TOPA_ENTRY(topa, -1)->size = order;
+       if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(topa, -1)->intr = 1;
+               TOPA_ENTRY(topa, -1)->stop = 1;
+       }
+
+       topa->last++;
+       topa->size += sizes(order);
+
+       buf->nr_pages += 1ul << order;
+
+       return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf:       PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+       struct topa *topa;
+
+       list_for_each_entry(topa, &buf->tables, list) {
+               int i;
+
+               pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+                        topa->phys, topa->offset, topa->size);
+               for (i = 0; i < TENTS_PER_PAGE; i++) {
+                       pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+                                &topa->table[i],
+                                (unsigned long)topa->table[i].base << TOPA_SHIFT,
+                                sizes(topa->table[i].size),
+                                topa->table[i].end ?  'E' : ' ',
+                                topa->table[i].intr ? 'I' : ' ',
+                                topa->table[i].stop ? 'S' : ' ',
+                                *(u64 *)&topa->table[i]);
+                       if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+                            topa->table[i].stop) ||
+                           topa->table[i].end)
+                               break;
+               }
+       }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf:       PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+       buf->output_off = 0;
+       buf->cur_idx++;
+
+       if (buf->cur_idx == buf->cur->last) {
+               if (buf->cur == buf->last)
+                       buf->cur = buf->first;
+               else
+                       buf->cur = list_entry(buf->cur->list.next, struct topa,
+                                             list);
+               buf->cur_idx = 0;
+       }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt:                Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+       u64 topa_idx, base, old;
+
+       /* offset of the first region in this table from the beginning of buf */
+       base = buf->cur->offset + buf->output_off;
+
+       /* offset of the current output region within this table */
+       for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+               base += sizes(buf->cur->table[topa_idx].size);
+
+       if (buf->snapshot) {
+               local_set(&buf->data_size, base);
+       } else {
+               old = (local64_xchg(&buf->head, base) &
+                      ((buf->nr_pages << PAGE_SHIFT) - 1));
+               if (base < old)
+                       base += buf->nr_pages << PAGE_SHIFT;
+
+               local_add(base - old, &buf->data_size);
+       }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf:       PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+       return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf:       PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+       return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt:                Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+       int advance = 0;
+       u64 status;
+
+       rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+       if (status & RTIT_STATUS_ERROR) {
+               pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+               pt_topa_dump(buf);
+               status &= ~RTIT_STATUS_ERROR;
+       }
+
+       if (status & RTIT_STATUS_STOPPED) {
+               status &= ~RTIT_STATUS_STOPPED;
+
+               /*
+                * On systems that only do single-entry ToPA, hitting STOP
+                * means we are already losing data; need to let the decoder
+                * know.
+                */
+               if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+                   buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+                       local_inc(&buf->lost);
+                       advance++;
+               }
+       }
+
+       /*
+        * Also on single-entry ToPA implementations, the interrupt will come
+        * before the output reaches its output region's boundary.
+        */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+           pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+               void *head = pt_buffer_region(buf);
+
+               /* everything within this margin needs to be zeroed out */
+               memset(head + buf->output_off, 0,
+                      pt_buffer_region_size(buf) -
+                      buf->output_off);
+               advance++;
+       }
+
+       if (advance)
+               pt_buffer_advance(buf);
+
+       wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf:       PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+       u64 offset, base_topa;
+
+       rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+       buf->cur = phys_to_virt(base_topa);
+
+       rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+       /* offset within current output region */
+       buf->output_off = offset >> 32;
+       /* index of current output region within this table */
+       buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf:       PT buffer.
+ * @pg:                Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+       struct topa_entry *te = buf->topa_index[pg];
+
+       /* one region */
+       if (buf->first == buf->last && buf->first->last == 1)
+               return pg;
+
+       do {
+               pg++;
+               pg &= buf->nr_pages - 1;
+       } while (buf->topa_index[pg] == te);
+
+       return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf:       PT buffer.
+ * @handle:    Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected, and to wake up the consumer after a certain fraction
+ * of the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+                                  struct perf_output_handle *handle)
+{
+       unsigned long head = local64_read(&buf->head);
+       unsigned long idx, npages, wakeup;
+
+       /* can't stop in the middle of an output region */
+       if (buf->output_off + handle->size + 1 <
+           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+               return -EINVAL;
+
+       /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return 0;
+
+       /* clear STOP and INT from current entry */
+       buf->topa_index[buf->stop_pos]->stop = 0;
+       buf->topa_index[buf->intr_pos]->intr = 0;
+
+       /* how many pages till the STOP marker */
+       npages = handle->size >> PAGE_SHIFT;
+
+       /* if it's on a page boundary, fill up one more page */
+       if (!offset_in_page(head + handle->size + 1))
+               npages++;
+
+       idx = (head >> PAGE_SHIFT) + npages;
+       idx &= buf->nr_pages - 1;
+       buf->stop_pos = idx;
+
+       wakeup = handle->wakeup >> PAGE_SHIFT;
+
+       /* in the worst case, wake up the consumer one page before hard stop */
+       idx = (head >> PAGE_SHIFT) + npages - 1;
+       if (idx > wakeup)
+               idx = wakeup;
+
+       idx &= buf->nr_pages - 1;
+       buf->intr_pos = idx;
+
+       buf->topa_index[buf->stop_pos]->stop = 1;
+       buf->topa_index[buf->intr_pos]->intr = 1;
+
+       return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf:       PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+       struct topa *cur = buf->first, *prev = buf->last;
+       struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+               *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+       int pg = 0, idx = 0;
+
+       while (pg < buf->nr_pages) {
+               int tidx;
+
+               /* pages within one topa entry */
+               for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+                       buf->topa_index[pg] = te_prev;
+
+               te_prev = te_cur;
+
+               if (idx == cur->last - 1) {
+                       /* advance to next topa table */
+                       idx = 0;
+                       cur = list_entry(cur->list.next, struct topa, list);
+               } else {
+                       idx++;
+               }
+               te_cur = TOPA_ENTRY(cur, idx);
+       }
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf:       PT buffer.
+ * @head:      Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+       int pg;
+
+       if (buf->snapshot)
+               head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+       pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+       pg = pt_topa_next_entry(buf, pg);
+
+       buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+       buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+                       (unsigned long)buf->cur) / sizeof(struct topa_entry);
+       buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+       local64_set(&buf->head, head);
+       local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf:       PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+       struct topa *topa, *iter;
+
+       list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+               /*
+                * right now, this is in free_aux() path only, so
+                * no need to unlink this table from the list
+                */
+               topa_free(topa);
+       }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf:       PT buffer.
+ * @nr_pages:  Total number of pages in this buffer.
+ * @gfp:       Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+                              gfp_t gfp)
+{
+       struct topa *topa;
+       int err;
+
+       topa = topa_alloc(buf->cpu, gfp);
+       if (!topa)
+               return -ENOMEM;
+
+       topa_insert_table(buf, topa);
+
+       while (buf->nr_pages < nr_pages) {
+               err = topa_insert_pages(buf, gfp);
+               if (err) {
+                       pt_buffer_fini_topa(buf);
+                       return -ENOMEM;
+               }
+       }
+
+       pt_buffer_setup_topa_index(buf);
+
+       /* link last table to the first one, unless we're double buffering */
+       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+               TOPA_ENTRY(buf->last, -1)->end = 1;
+       }
+
+       pt_topa_dump(buf);
+       return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu:       Cpu on which to allocate, -1 means current.
+ * @pages:     Array of pointers to buffer pages passed from perf core.
+ * @nr_pages:  Number of pages in the buffer.
+ * @snapshot:  If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return:     Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+       struct pt_buffer *buf;
+       int node, ret;
+
+       if (!nr_pages)
+               return NULL;
+
+       if (cpu == -1)
+               cpu = raw_smp_processor_id();
+       node = cpu_to_node(cpu);
+
+       buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+                          GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->cpu = cpu;
+       buf->snapshot = snapshot;
+       buf->data_pages = pages;
+
+       INIT_LIST_HEAD(&buf->tables);
+
+       ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+       if (ret) {
+               kfree(buf);
+               return NULL;
+       }
+
+       return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data:      PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+       struct pt_buffer *buf = data;
+
+       pt_buffer_fini_topa(buf);
+       kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf:       PT buffer.
+ * @pt:                Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+       if (buf->snapshot)
+               return false;
+
+       if (local_read(&buf->data_size) >= pt->handle.size)
+               return true;
+
+       return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf;
+       struct perf_event *event = pt->handle.event;
+
+       /*
+        * There may be a dangling PT bit in the interrupt status register
+        * after PT has been disabled by pt_event_stop(). Make sure we don't
+        * do anything (particularly, re-enable) for this event here.
+        */
+       if (!ACCESS_ONCE(pt->handle_nmi))
+               return;
+
+       pt_config_start(false);
+
+       if (!event)
+               return;
+
+       buf = perf_get_aux(&pt->handle);
+       if (!buf)
+               return;
+
+       pt_read_offset(buf);
+
+       pt_handle_status(pt);
+
+       pt_update_head(pt);
+
+       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+                           local_xchg(&buf->lost, 0));
+
+       if (!event->hw.state) {
+               int ret;
+
+               buf = perf_aux_output_begin(&pt->handle, event);
+               if (!buf) {
+                       event->hw.state = PERF_HES_STOPPED;
+                       return;
+               }
+
+               pt_buffer_reset_offsets(buf, pt->handle.head);
+               /* snapshot counters don't use PMI, so it's safe */
+               ret = pt_buffer_reset_markers(buf, &pt->handle);
+               if (ret) {
+                       perf_aux_output_end(&pt->handle, 0, true);
+                       return;
+               }
+
+               pt_config_buffer(buf->cur->table, buf->cur_idx,
+                                buf->output_off);
+               pt_config(event);
+       }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+       if (!buf || pt_buffer_is_full(buf, pt)) {
+               event->hw.state = PERF_HES_STOPPED;
+               return;
+       }
+
+       ACCESS_ONCE(pt->handle_nmi) = 1;
+       event->hw.state = 0;
+
+       pt_config_buffer(buf->cur->table, buf->cur_idx,
+                        buf->output_off);
+       pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+       /*
+        * Protect against the PMI racing with disabling wrmsr,
+        * see comment in intel_pt_interrupt().
+        */
+       ACCESS_ONCE(pt->handle_nmi) = 0;
+       pt_config_start(false);
+
+       if (event->hw.state == PERF_HES_STOPPED)
+               return;
+
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_UPDATE) {
+               struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+               if (!buf)
+                       return;
+
+               if (WARN_ON_ONCE(pt->handle.event != event))
+                       return;
+
+               pt_read_offset(buf);
+
+               pt_handle_status(pt);
+
+               pt_update_head(pt);
+       }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf;
+
+       pt_event_stop(event, PERF_EF_UPDATE);
+
+       buf = perf_get_aux(&pt->handle);
+
+       if (buf) {
+               if (buf->snapshot)
+                       pt->handle.head =
+                               local_xchg(&buf->data_size,
+                                          buf->nr_pages << PAGE_SHIFT);
+               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+                                   local_xchg(&buf->lost, 0));
+       }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+       struct pt_buffer *buf;
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = -EBUSY;
+
+       if (pt->handle.event)
+               goto fail;
+
+       buf = perf_aux_output_begin(&pt->handle, event);
+       ret = -EINVAL;
+       if (!buf)
+               goto fail_stop;
+
+       pt_buffer_reset_offsets(buf, pt->handle.head);
+       if (!buf->snapshot) {
+               ret = pt_buffer_reset_markers(buf, &pt->handle);
+               if (ret)
+                       goto fail_end_stop;
+       }
+
+       if (mode & PERF_EF_START) {
+               pt_event_start(event, 0);
+               ret = -EBUSY;
+               if (hwc->state == PERF_HES_STOPPED)
+                       goto fail_end_stop;
+       } else {
+               hwc->state = PERF_HES_STOPPED;
+       }
+
+       return 0;
+
+fail_end_stop:
+       perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+       hwc->state = PERF_HES_STOPPED;
+fail:
+       return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+       x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+       if (event->attr.type != pt_pmu.pmu.type)
+               return -ENOENT;
+
+       if (!pt_event_valid(event))
+               return -EINVAL;
+
+       if (x86_add_exclusive(x86_lbr_exclusive_pt))
+               return -EBUSY;
+
+       event->destroy = pt_event_destroy;
+
+       return 0;
+}
+
+void cpu_emergency_stop_pt(void)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+       if (pt->handle.event)
+               pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
+static __init int pt_init(void)
+{
+       int ret, cpu, prior_warn = 0;
+
+       BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+       if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+               return -ENODEV;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               u64 ctl;
+
+               ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+               if (!ret && (ctl & RTIT_CTL_TRACEEN))
+                       prior_warn++;
+       }
+       put_online_cpus();
+
+       if (prior_warn) {
+               x86_add_exclusive(x86_lbr_exclusive_pt);
+               pr_warn("PT is enabled at boot time, doing nothing\n");
+
+               return -EBUSY;
+       }
+
+       ret = pt_pmu_hw_init();
+       if (ret)
+               return ret;
+
+       if (!pt_cap_get(PT_CAP_topa_output)) {
+               pr_warn("ToPA output is not supported on this CPU\n");
+               return -ENODEV;
+       }
+
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               pt_pmu.pmu.capabilities =
+                       PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+       pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+       pt_pmu.pmu.attr_groups  = pt_attr_groups;
+       pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
+       pt_pmu.pmu.event_init   = pt_event_init;
+       pt_pmu.pmu.add          = pt_event_add;
+       pt_pmu.pmu.del          = pt_event_del;
+       pt_pmu.pmu.start        = pt_event_start;
+       pt_pmu.pmu.stop         = pt_event_stop;
+       pt_pmu.pmu.read         = pt_event_read;
+       pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
+       pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
+       ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+       return ret;
+}
+arch_initcall(pt_init);
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
new file mode 100644 (file)
index 0000000..336878a
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when the output gets this close to the region
+ * boundary, switch buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+#define TOPA_SHIFT 12
+
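+/*
+ * A ToPA entry's 4-bit size field encodes the output region size as
+ * 4KB << tsz, e.g. tsz == 0 is a 4KB region and tsz == 9 is a 2MB
+ * region; sizes() decodes that field into bytes.
+ */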
+static inline unsigned int sizes(unsigned int tsz)
+{
+       return 1 << (tsz + TOPA_SHIFT);
+}
+
+struct topa_entry {
+       u64     end     : 1;
+       u64     rsvd0   : 1;
+       u64     intr    : 1;
+       u64     rsvd1   : 1;
+       u64     stop    : 1;
+       u64     rsvd2   : 1;
+       u64     size    : 4;
+       u64     rsvd3   : 2;
+       u64     base    : 36;
+       u64     rsvd4   : 16;
+};
+
+#define PT_CPUID_LEAVES                2
+#define PT_CPUID_REGS_NUM      4 /* number of registers (eax, ebx, ecx, edx) */
+
+enum pt_capabilities {
+       PT_CAP_max_subleaf = 0,
+       PT_CAP_cr3_filtering,
+       PT_CAP_psb_cyc,
+       PT_CAP_mtc,
+       PT_CAP_topa_output,
+       PT_CAP_topa_multiple_entries,
+       PT_CAP_single_range_output,
+       PT_CAP_payloads_lip,
+       PT_CAP_mtc_periods,
+       PT_CAP_cycle_thresholds,
+       PT_CAP_psb_periods,
+};
+
+struct pt_pmu {
+       struct pmu              pmu;
+       u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *             cpu, depending on perf event configuration
+ * @cpu:       cpu for per-cpu allocation
+ * @tables:    list of ToPA tables in this buffer
+ * @first:     shorthand for first topa table
+ * @last:      shorthand for last topa table
+ * @cur:       current topa table
+ * @nr_pages:  buffer size in pages
+ * @cur_idx:   current output region's index within @cur table
+ * @output_off:        offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost:      if data was lost/truncated
+ * @head:      logical write offset inside the buffer
+ * @snapshot:  if this is for a snapshot/overwrite counter
+ * @stop_pos:  STOP topa entry in the buffer
+ * @intr_pos:  INT topa entry in the buffer
+ * @data_pages:        array of pages from perf
+ * @topa_index:        table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+       int                     cpu;
+       struct list_head        tables;
+       struct topa             *first, *last, *cur;
+       unsigned int            cur_idx;
+       size_t                  output_off;
+       unsigned long           nr_pages;
+       local_t                 data_size;
+       local_t                 lost;
+       local64_t               head;
+       bool                    snapshot;
+       unsigned long           stop_pos, intr_pos;
+       void                    **data_pages;
+       struct topa_entry       *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:    perf output handle
+ * @handle_nmi:        do handle PT PMI on this cpu; there's an active event
+ */
+struct pt {
+       struct perf_output_handle handle;
+       int                     handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
new file mode 100644 (file)
index 0000000..b834a3f
--- /dev/null
@@ -0,0 +1,767 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption;
+ * however, here we only expose the energy consumption free running
+ * counters (pp0, pkg, dram, pp1/gpu).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ *  pp0 counter: consumption of all physical cores (power plane 0)
+ *       event: rapl_energy_cores
+ *    perf code: 0x1
+ *
+ *  pkg counter: consumption of the whole processor package
+ *       event: rapl_energy_pkg
+ *    perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ *       event: rapl_energy_dram
+ *    perf code: 0x3
+ *
+ *  gpu counter: consumption of the builtin-gpu domain (client only)
+ *       event: rapl_energy_gpu
+ *    perf code: 0x4
+ *
+ * We manage those counters as free running (read-only). They may be
+ * used simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
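+/*
+ * Illustrative sketch (not part of the original patch): a userspace tool
+ * that has read a raw count from one of these events over an interval of
+ * t seconds could convert it as below. The helper name is hypothetical
+ * and it would live in userspace (with <stdint.h> and <math.h>), so it is
+ * kept under #if 0 here.
+ */
+#if 0
+static double rapl_raw_to_watts(uint64_t raw_count, double seconds)
+{
+	double joules = ldexp((double)raw_count, -32);	/* counts are 2^-32 J */
+
+	return joules / seconds;			/* Watts = Joules / seconds */
+}
+#endif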
+
+#define pr_fmt(fmt) "RAPL PMU: " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT  0       /* all cores */
+#define INTEL_RAPL_PP0         0x1     /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT  1       /* entire package */
+#define INTEL_RAPL_PKG         0x2     /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT  2       /* DRAM */
+#define INTEL_RAPL_RAM         0x3     /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT  3       /* gpu */
+#define INTEL_RAPL_PP1         0x4     /* pseudo-encoding */
+
+#define NR_RAPL_DOMAINS         0x4
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+       "pp0-core",
+       "package",
+       "dram",
+       "pp1-gpu",
+};
+
+/* Clients have PP0, PKG, PP1 */
+#define RAPL_IDX_CLN   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT)
+
+/* Haswell/Broadwell clients have PP0, PKG, RAM, PP1 */
+#define RAPL_IDX_HSW   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT|\
+                        1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL   (1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK        0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)          \
+static ssize_t __rapl_##_var##_show(struct kobject *kobj,      \
+                               struct kobj_attribute *attr,    \
+                               char *page)                     \
+{                                                              \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
+       return sprintf(page, _format "\n");                     \
+}                                                              \
+static struct kobj_attribute format_attr_##_var =              \
+       __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_CNTR_WIDTH 32
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
+static struct perf_pmu_events_attr event_attr_##v = {                          \
+       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
+       .id             = 0,                                                    \
+       .event_str      = str,                                                  \
+};
+
+struct rapl_pmu {
+       raw_spinlock_t          lock;
+       int                     n_active;
+       int                     cpu;
+       struct list_head        active_list;
+       struct pmu              *pmu;
+       ktime_t                 timer_interval;
+       struct hrtimer          hrtimer;
+};
+
+struct rapl_pmus {
+       struct pmu              pmu;
+       unsigned int            maxpkg;
+       struct rapl_pmu         *pmus[];
+};
+
+ /* 1/2^hw_unit Joule */
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus;
+static cpumask_t rapl_cpu_mask;
+static unsigned int rapl_cntr_mask;
+static u64 rapl_timer_ms;
+
+static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
+{
+       return rapl_pmus->pmus[topology_logical_package_id(cpu)];
+}
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+       u64 raw;
+       rdmsrl(event->hw.event_base, raw);
+       return raw;
+}
+
+static inline u64 rapl_scale(u64 v, int cfg)
+{
+       if (cfg > NR_RAPL_DOMAINS) {
+               pr_warn("Invalid domain %d, failed to scale data\n", cfg);
+               return v;
+       }
+       /*
+        * scale delta to smallest unit (1/2^32)
+        * users must then scale back: count * 1/2^32 to get Joules,
+        * i.e. ldexp(count, -32).
+        * Watts = Joules/Time delta
+        */
+       return v << (32 - rapl_hw_unit[cfg - 1]);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev_raw_count, new_raw_count;
+       s64 delta, sdelta;
+       int shift = RAPL_CNTR_WIDTH;
+
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       rdmsrl(event->hw.event_base, new_raw_count);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                           new_raw_count) != prev_raw_count) {
+               cpu_relax();
+               goto again;
+       }
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
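+       /*
+        * Worked example (illustrative): with a 32-bit counter width,
+        * prev = 0xffffff00 and new = 0x00000010 give
+        * ((new << 32) - (prev << 32)) >> 32 = 0x110, so a counter
+        * wraparound is handled without an explicit overflow check.
+        */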
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       sdelta = rapl_scale(delta, event->hw.config);
+
+       local64_add(sdelta, &event->count);
+
+       return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+                    HRTIMER_MODE_REL_PINNED);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+       struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+       struct perf_event *event;
+       unsigned long flags;
+
+       if (!pmu->n_active)
+               return HRTIMER_NORESTART;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       list_for_each_entry(event, &pmu->active_list, active_entry)
+               rapl_event_update(event);
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+       hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+       return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+       struct hrtimer *hr = &pmu->hrtimer;
+
+       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+                                  struct perf_event *event)
+{
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       event->hw.state = 0;
+
+       list_add_tail(&event->active_entry, &pmu->active_list);
+
+       local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+       pmu->n_active++;
+       if (pmu->n_active == 1)
+               rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+       __rapl_pmu_event_start(pmu, event);
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       /* mark event as deactivated and stopped */
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               WARN_ON_ONCE(pmu->n_active <= 0);
+               pmu->n_active--;
+               if (pmu->n_active == 0)
+                       hrtimer_cancel(&pmu->hrtimer);
+
+               list_del(&event->active_entry);
+
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       /* check if update of sw counter is necessary */
+       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of an event
+                * that we are disabling:
+                */
+               rapl_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_START)
+               __rapl_pmu_event_start(pmu, event);
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+       return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+       int bit, msr, ret = 0;
+       struct rapl_pmu *pmu;
+
+       /* only look at RAPL events */
+       if (event->attr.type != rapl_pmus->pmu.type)
+               return -ENOENT;
+
+       /* check only supported bits are set */
+       if (event->attr.config & ~RAPL_EVENT_MASK)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       /*
+        * check event is known (determines counter)
+        */
+       switch (cfg) {
+       case INTEL_RAPL_PP0:
+               bit = RAPL_IDX_PP0_NRG_STAT;
+               msr = MSR_PP0_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_PKG:
+               bit = RAPL_IDX_PKG_NRG_STAT;
+               msr = MSR_PKG_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_RAM:
+               bit = RAPL_IDX_RAM_NRG_STAT;
+               msr = MSR_DRAM_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_PP1:
+               bit = RAPL_IDX_PP1_NRG_STAT;
+               msr = MSR_PP1_ENERGY_STATUS;
+               break;
+       default:
+               return -EINVAL;
+       }
+       /* check event supported */
+       if (!(rapl_cntr_mask & (1 << bit)))
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       /* must be done before validate_group */
+       pmu = cpu_to_rapl_pmu(event->cpu);
+       event->cpu = pmu->cpu;
+       event->pmu_private = pmu;
+       event->hw.event_base = msr;
+       event->hw.config = cfg;
+       event->hw.idx = bit;
+
+       return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+       rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+       .attrs = rapl_pmu_attrs,
+};
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
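+
+/*
+ * Usage sketch (illustrative, not part of the original patch): once this
+ * PMU is registered under the name "power", the counters can be read
+ * system-wide with the perf tool, e.g.:
+ *
+ *   perf stat -a -e power/energy-pkg/,power/energy-cores/ sleep 1
+ *
+ * perf applies the .unit and .scale attributes above, so the output is
+ * reported in Joules.
+ */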
+
+static struct attribute *rapl_events_srv_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_gpu),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_gpu_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_gpu_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_hsw_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_gpu),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_gpu_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_gpu_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_knl_attr[] = {
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+       .name = "events",
+       .attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+       .name = "format",
+       .attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+       &rapl_pmu_attr_group,
+       &rapl_pmu_format_group,
+       &rapl_pmu_events_group,
+       NULL,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+       int target;
+
+       /* Check if exiting cpu is used for collecting rapl events */
+       if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+               return;
+
+       pmu->cpu = -1;
+       /* Find a new cpu to collect rapl events */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       /* Migrate rapl events to the new target */
+       if (target < nr_cpu_ids) {
+               cpumask_set_cpu(target, &rapl_cpu_mask);
+               pmu->cpu = target;
+               perf_pmu_migrate_context(pmu->pmu, cpu, target);
+       }
+}
+
+static void rapl_cpu_init(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+       int target;
+
+       /*
+        * Check if there is an online cpu in the package which collects rapl
+        * events already.
+        */
+       target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu));
+       if (target < nr_cpu_ids)
+               return;
+
+       cpumask_set_cpu(cpu, &rapl_cpu_mask);
+       pmu->cpu = cpu;
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+
+       if (pmu)
+               return 0;
+
+       pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+       if (!pmu)
+               return -ENOMEM;
+
+       raw_spin_lock_init(&pmu->lock);
+       INIT_LIST_HEAD(&pmu->active_list);
+       pmu->pmu = &rapl_pmus->pmu;
+       pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+       pmu->cpu = -1;
+       rapl_hrtimer_init(pmu);
+       rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu;
+       return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+                            unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               rapl_cpu_prepare(cpu);
+               break;
+
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               rapl_cpu_init(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               rapl_cpu_exit(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int rapl_check_hw_unit(bool apply_quirk)
+{
+       u64 msr_rapl_power_unit_bits;
+       int i;
+
+       /* protect rdmsrl() to handle virtualization */
+       if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+               return -1;
+       for (i = 0; i < NR_RAPL_DOMAINS; i++)
+               rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+       /*
+        * The DRAM domain on HSW servers and KNL has a fixed energy unit which
+        * can be different from the unit of the power unit MSR. See
+        * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+        * of 2, Datasheet, September 2014, Reference Number: 330784-001"
+        */
+       if (apply_quirk)
+               rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+
+       /*
+        * Calculate the timer rate:
+        * Use a reference of 200W for scaling the timeout to avoid counter
+        * overflows. 200W = 200 Joules/sec.
+        * Divide the interval by 2 to avoid lockstep (2 * 100).
+        * If the hw unit is 32, then we use 2 ms, i.e. 1/200/2 seconds.
+        */
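+       /*
+        * Worked example (illustrative): with the common hw unit of 2^-16 J,
+        * a 32-bit counter overflows after 2^32 * 2^-16 J / 200 W ~= 328 s at
+        * 200 W; the formula below then yields 5 ms * 2^15 = 163840 ms, i.e.
+        * roughly half of that.
+        */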
+       rapl_timer_ms = 2;
+       if (rapl_hw_unit[0] < 32) {
+               rapl_timer_ms = (1000 / (2 * 100));
+               rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
+       }
+       return 0;
+}
+
+static void __init rapl_advertise(void)
+{
+       int i;
+
+       pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
+               hweight32(rapl_cntr_mask), rapl_timer_ms);
+
+       for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+               if (rapl_cntr_mask & (1 << i)) {
+                       pr_info("hw unit of domain %s 2^-%d Joules\n",
+                               rapl_domain_names[i], rapl_hw_unit[i]);
+               }
+       }
+}
+
+static int __init rapl_prepare_cpus(void)
+{
+       unsigned int cpu, pkg;
+       int ret;
+
+       for_each_online_cpu(cpu) {
+               pkg = topology_logical_package_id(cpu);
+               if (rapl_pmus->pmus[pkg])
+                       continue;
+
+               ret = rapl_cpu_prepare(cpu);
+               if (ret)
+                       return ret;
+               rapl_cpu_init(cpu);
+       }
+       return 0;
+}
+
+static void __init cleanup_rapl_pmus(void)
+{
+       int i;
+
+       for (i = 0; i < rapl_pmus->maxpkg; i++)
+               kfree(rapl_pmus->pmus[i]);
+       kfree(rapl_pmus);
+}
+
+static int __init init_rapl_pmus(void)
+{
+       int maxpkg = topology_max_packages();
+       size_t size;
+
+       size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *);
+       rapl_pmus = kzalloc(size, GFP_KERNEL);
+       if (!rapl_pmus)
+               return -ENOMEM;
+
+       rapl_pmus->maxpkg               = maxpkg;
+       rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
+       rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
+       rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
+       rapl_pmus->pmu.add              = rapl_pmu_event_add;
+       rapl_pmus->pmu.del              = rapl_pmu_event_del;
+       rapl_pmus->pmu.start            = rapl_pmu_event_start;
+       rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
+       rapl_pmus->pmu.read             = rapl_pmu_event_read;
+       return 0;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
+       [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+       [1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+       bool apply_quirk = false;
+       int ret;
+
+       if (!x86_match_cpu(rapl_cpu_match))
+               return -ENODEV;
+
+       switch (boot_cpu_data.x86_model) {
+       case 42: /* Sandy Bridge */
+       case 58: /* Ivy Bridge */
+               rapl_cntr_mask = RAPL_IDX_CLN;
+               rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+               break;
+       case 63: /* Haswell-Server */
+               apply_quirk = true;
+               rapl_cntr_mask = RAPL_IDX_SRV;
+               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+               break;
+       case 60: /* Haswell */
+       case 69: /* Haswell-Celeron */
+       case 61: /* Broadwell */
+               rapl_cntr_mask = RAPL_IDX_HSW;
+               rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
+               break;
+       case 45: /* Sandy Bridge-EP */
+       case 62: /* IvyTown */
+               rapl_cntr_mask = RAPL_IDX_SRV;
+               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+               break;
+       case 87: /* Knights Landing */
+               apply_quirk = true;
+               rapl_cntr_mask = RAPL_IDX_KNL;
+               rapl_pmu_events_group.attrs = rapl_events_knl_attr;
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       ret = rapl_check_hw_unit(apply_quirk);
+       if (ret)
+               return ret;
+
+       ret = init_rapl_pmus();
+       if (ret)
+               return ret;
+
+       cpu_notifier_register_begin();
+
+       ret = rapl_prepare_cpus();
+       if (ret)
+               goto out;
+
+       ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+       if (ret)
+               goto out;
+
+       __perf_cpu_notifier(rapl_cpu_notifier);
+       cpu_notifier_register_done();
+       rapl_advertise();
+       return 0;
+
+out:
+       pr_warn("Initialization failed (%d), disabled\n", ret);
+       cleanup_rapl_pmus();
+       cpu_notifier_register_done();
+       return ret;
+}
+device_initcall(rapl_pmu_init);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
new file mode 100644 (file)
index 0000000..7012d18
--- /dev/null
@@ -0,0 +1,1412 @@
+#include "uncore.h"
+
+static struct intel_uncore_type *empty_uncore[] = { NULL, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
+
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
+struct pci_extra_dev *uncore_extra_pci_dev;
+static int max_packages;
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint uncore_constraint_fixed =
+       EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+struct event_constraint uncore_constraint_empty =
+       EVENT_CONSTRAINT(0, 0, 0);
+
+static int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+       struct pci2phy_map *map;
+       int phys_id = -1;
+
+       raw_spin_lock(&pci2phy_map_lock);
+       list_for_each_entry(map, &pci2phy_map_head, list) {
+               if (map->segment == pci_domain_nr(bus)) {
+                       phys_id = map->pbus_to_physid[bus->number];
+                       break;
+               }
+       }
+       raw_spin_unlock(&pci2phy_map_lock);
+
+       return phys_id;
+}
+
+static void uncore_free_pcibus_map(void)
+{
+       struct pci2phy_map *map, *tmp;
+
+       list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
+               list_del(&map->list);
+               kfree(map);
+       }
+}
+
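+/*
+ * Find the pci2phy_map for @segment, allocating one on first use. The
+ * allocation drops pci2phy_map_lock, so the list is looked up again
+ * afterwards in case another CPU raced us and already inserted a map
+ * for the same segment.
+ */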
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+       struct pci2phy_map *map, *alloc = NULL;
+       int i;
+
+       lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+       list_for_each_entry(map, &pci2phy_map_head, list) {
+               if (map->segment == segment)
+                       goto end;
+       }
+
+       if (!alloc) {
+               raw_spin_unlock(&pci2phy_map_lock);
+               alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+               raw_spin_lock(&pci2phy_map_lock);
+
+               if (!alloc)
+                       return NULL;
+
+               goto lookup;
+       }
+
+       map = alloc;
+       alloc = NULL;
+       map->segment = segment;
+       for (i = 0; i < 256; i++)
+               map->pbus_to_physid[i] = -1;
+       list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+       kfree(alloc);
+       return map;
+}
+
+ssize_t uncore_event_show(struct kobject *kobj,
+                         struct kobj_attribute *attr, char *buf)
+{
+       struct uncore_event_desc *event =
+               container_of(attr, struct uncore_event_desc, attr);
+       return sprintf(buf, "%s", event->config);
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+       return pmu->boxes[topology_logical_package_id(cpu)];
+}
+
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       u64 count;
+
+       rdmsrl(event->hw.event_base, count);
+
+       return count;
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       unsigned long flags;
+       bool ok = false;
+
+       /*
+        * reg->alloc can be set due to existing state, so for fake box we
+        * need to ignore this, otherwise we might fail to allocate proper
+        * fake state for this extra reg constraint.
+        */
+       if (reg1->idx == EXTRA_REG_NONE ||
+           (!uncore_box_is_fake(box) && reg1->alloc))
+               return NULL;
+
+       er = &box->shared_regs[reg1->idx];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (!atomic_read(&er->ref) ||
+           (er->config1 == reg1->config && er->config2 == reg2->config)) {
+               atomic_inc(&er->ref);
+               er->config1 = reg1->config;
+               er->config2 = reg2->config;
+               ok = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (ok) {
+               if (!uncore_box_is_fake(box))
+                       reg1->alloc = 1;
+               return NULL;
+       }
+
+       return &uncore_constraint_empty;
+}
+
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+       /*
+        * Only put constraint if extra reg was actually allocated. Also
+        * takes care of event which do not use an extra shared reg.
+        *
+        * Also, if this is a fake box we shouldn't touch any event state
+        * (reg->alloc) and we don't care about leaving inconsistent box
+        * state either since it will be thrown out.
+        */
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       er = &box->shared_regs[reg1->idx];
+       atomic_dec(&er->ref);
+       reg1->alloc = 0;
+}
+
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       u64 config;
+
+       er = &box->shared_regs[idx];
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       config = er->config;
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       return config;
+}
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box,
+                                  struct perf_event *event, int idx)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = idx;
+       hwc->last_tag = ++box->tags[idx];
+
+       if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+               hwc->event_base = uncore_fixed_ctr(box);
+               hwc->config_base = uncore_fixed_ctl(box);
+               return;
+       }
+
+       hwc->config_base = uncore_event_ctl(box, hwc->idx);
+       hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
+}
+
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+       u64 prev_count, new_count, delta;
+       int shift;
+
+       if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+               shift = 64 - uncore_fixed_ctr_bits(box);
+       else
+               shift = 64 - uncore_perf_ctr_bits(box);
+
+       /* the hrtimer might modify the previous event value */
+again:
+       prev_count = local64_read(&event->hw.prev_count);
+       new_count = uncore_read_counter(box, event);
+       if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+               goto again;
+
+       delta = (new_count << shift) - (prev_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP and broken
+ * for SandyBridge. So we use hrtimer to periodically poll the counter
+ * to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+       struct intel_uncore_box *box;
+       struct perf_event *event;
+       unsigned long flags;
+       int bit;
+
+       box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+       if (!box->n_active || box->cpu != smp_processor_id())
+               return HRTIMER_NORESTART;
+       /*
+        * disable local interrupts to prevent uncore_pmu_event_start/stop
+        * from interrupting the update process
+        */
+       local_irq_save(flags);
+
+       /*
+        * handle boxes with an active event list as opposed to active
+        * counters
+        */
+       list_for_each_entry(event, &box->active_list, active_entry) {
+               uncore_perf_event_update(box, event);
+       }
+
+       for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+               uncore_perf_event_update(box, box->events[bit]);
+
+       local_irq_restore(flags);
+
+       hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
+       return HRTIMER_RESTART;
+}
+
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+                     HRTIMER_MODE_REL_PINNED);
+}
+
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       box->hrtimer.function = uncore_pmu_hrtimer;
+}
+
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+                                                int node)
+{
+       int i, size, numshared = type->num_shared_regs;
+       struct intel_uncore_box *box;
+
+       size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);
+
+       box = kzalloc_node(size, GFP_KERNEL, node);
+       if (!box)
+               return NULL;
+
+       for (i = 0; i < numshared; i++)
+               raw_spin_lock_init(&box->shared_regs[i].lock);
+
+       uncore_pmu_init_hrtimer(box);
+       box->cpu = -1;
+       box->pci_phys_id = -1;
+       box->pkgid = -1;
+
+       /* set default hrtimer timeout */
+       box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+       INIT_LIST_HEAD(&box->active_list);
+
+       return box;
+}
+
+/*
+ * Using uncore_pmu_event_init pmu event_init callback
+ * as a detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_uncore_event(struct perf_event *event)
+{
+       return event->pmu->event_init == uncore_pmu_event_init;
+}
+
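+/*
+ * Collect the leader (and, if @dogrp is set, its uncore siblings) into
+ * box->event_list, bounded by the number of generic plus fixed counters.
+ * Returns the new number of collected events, or -EINVAL on overflow.
+ */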
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
+                     bool dogrp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = box->pmu->type->num_counters;
+       if (box->pmu->type->fixed_ctl)
+               max_count++;
+
+       if (box->n_events >= max_count)
+               return -EINVAL;
+
+       n = box->n_events;
+
+       if (is_uncore_event(leader)) {
+               box->event_list[n] = leader;
+               n++;
+       }
+
+       if (!dogrp)
+               return n;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry) {
+               if (!is_uncore_event(event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               box->event_list[n] = event;
+               n++;
+       }
+       return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_type *type = box->pmu->type;
+       struct event_constraint *c;
+
+       if (type->ops->get_constraint) {
+               c = type->ops->get_constraint(box, event);
+               if (c)
+                       return c;
+       }
+
+       if (event->attr.config == UNCORE_FIXED_EVENT)
+               return &uncore_constraint_fixed;
+
+       if (type->constraints) {
+               for_each_event_constraint(c, type->constraints) {
+                       if ((event->hw.config & c->cmask) == c->code)
+                               return c;
+               }
+       }
+
+       return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       if (box->pmu->type->ops->put_constraint)
+               box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+       unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+       struct event_constraint *c;
+       int i, wmin, wmax, ret = 0;
+       struct hw_perf_event *hwc;
+
+       bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+       for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+               c = uncore_get_event_constraint(box, box->event_list[i]);
+               box->event_constraint[i] = c;
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
+       }
+
+       /* fastpath, try to reuse previous register */
+       for (i = 0; i < n; i++) {
+               hwc = &box->event_list[i]->hw;
+               c = box->event_constraint[i];
+
+               /* never assigned */
+               if (hwc->idx == -1)
+                       break;
+
+               /* constraint still honored */
+               if (!test_bit(hwc->idx, c->idxmsk))
+                       break;
+
+               /* not already used */
+               if (test_bit(hwc->idx, used_mask))
+                       break;
+
+               __set_bit(hwc->idx, used_mask);
+               if (assign)
+                       assign[i] = hwc->idx;
+       }
+       /* slow path */
+       if (i != n)
+               ret = perf_assign_events(box->event_constraint, n,
+                                        wmin, wmax, n, assign);
+
+       if (!assign || ret) {
+               for (i = 0; i < n; i++)
+                       uncore_put_event_constraint(box, box->event_list[i]);
+       }
+       return ret ? -EINVAL : 0;
+}
+
+static void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int idx = event->hw.idx;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+               return;
+
+       event->hw.state = 0;
+       box->events[idx] = event;
+       box->n_active++;
+       __set_bit(idx, box->active_mask);
+
+       local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+       uncore_enable_event(box, event);
+
+       if (box->n_active == 1) {
+               uncore_enable_box(box);
+               uncore_pmu_start_hrtimer(box);
+       }
+}
+
+static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+               uncore_disable_event(box, event);
+               box->n_active--;
+               box->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+
+               if (box->n_active == 0) {
+                       uncore_disable_box(box);
+                       uncore_pmu_cancel_hrtimer(box);
+               }
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of an event
+                * that we are disabling:
+                */
+               uncore_perf_event_update(box, event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+       int assign[UNCORE_PMC_IDX_MAX];
+       int i, n, ret;
+
+       if (!box)
+               return -ENODEV;
+
+       ret = n = uncore_collect_events(box, event, false);
+       if (ret < 0)
+               return ret;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       ret = uncore_assign_events(box, assign, n);
+       if (ret)
+               return ret;
+
+       /* save events moving to new counters */
+       for (i = 0; i < box->n_events; i++) {
+               event = box->event_list[i];
+               hwc = &event->hw;
+
+               if (hwc->idx == assign[i] &&
+                       hwc->last_tag == box->tags[assign[i]])
+                       continue;
+               /*
+                * Ensure we don't accidentally enable a stopped
+                * counter simply because we rescheduled.
+                */
+               if (hwc->state & PERF_HES_STOPPED)
+                       hwc->state |= PERF_HES_ARCH;
+
+               uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+       }
+
+       /* reprogram moved events into new counters */
+       for (i = 0; i < n; i++) {
+               event = box->event_list[i];
+               hwc = &event->hw;
+
+               if (hwc->idx != assign[i] ||
+                       hwc->last_tag != box->tags[assign[i]])
+                       uncore_assign_hw_event(box, event, assign[i]);
+               else if (i < box->n_events)
+                       continue;
+
+               if (hwc->state & PERF_HES_ARCH)
+                       continue;
+
+               uncore_pmu_event_start(event, 0);
+       }
+       box->n_events = n;
+
+       return 0;
+}
+
+static void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int i;
+
+       uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < box->n_events; i++) {
+               if (event == box->event_list[i]) {
+                       uncore_put_event_constraint(box, event);
+
+                       for (++i; i < box->n_events; i++)
+                               box->event_list[i - 1] = box->event_list[i];
+
+                       --box->n_events;
+                       break;
+               }
+       }
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+}
+
+void uncore_pmu_event_read(struct perf_event *event)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+                               struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+       struct intel_uncore_box *fake_box;
+       int ret = -EINVAL, n;
+
+       fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
+       if (!fake_box)
+               return -ENOMEM;
+
+       fake_box->pmu = pmu;
+       /*
+        * The event is not yet connected with its
+        * siblings, therefore we must first collect
+        * existing siblings, then add the new event
+        * before we can simulate the scheduling.
+        */
+       n = uncore_collect_events(fake_box, leader, true);
+       if (n < 0)
+               goto out;
+
+       fake_box->n_events = n;
+       n = uncore_collect_events(fake_box, event, false);
+       if (n < 0)
+               goto out;
+
+       fake_box->n_events = n;
+
+       ret = uncore_assign_events(fake_box, NULL, n);
+out:
+       kfree(fake_box);
+       return ret;
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       struct hw_perf_event *hwc = &event->hw;
+       int ret;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       pmu = uncore_event_to_pmu(event);
+       /* no device found for this pmu */
+       if (pmu->func_id < 0)
+               return -ENOENT;
+
+       /*
+        * The uncore PMU measures at all privilege levels all the time,
+        * so it doesn't make sense to specify any exclude bits.
+        */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+                       event->attr.exclude_hv || event->attr.exclude_idle)
+               return -EINVAL;
+
+       /* Sampling not supported yet */
+       if (hwc->sample_period)
+               return -EINVAL;
+
+       /*
+        * Place all uncore events for a particular physical package
+        * onto a single cpu
+        */
+       if (event->cpu < 0)
+               return -EINVAL;
+       box = uncore_pmu_to_box(pmu, event->cpu);
+       if (!box || box->cpu < 0)
+               return -EINVAL;
+       event->cpu = box->cpu;
+       event->pmu_private = box;
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+       if (event->attr.config == UNCORE_FIXED_EVENT) {
+               /* no fixed counter */
+               if (!pmu->type->fixed_ctl)
+                       return -EINVAL;
+               /*
+                * if there is only one fixed counter, only the first pmu
+                * can access the fixed counter
+                */
+               if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+                       return -EINVAL;
+
+               /* fixed counters have event field hardcoded to zero */
+               hwc->config = 0ULL;
+       } else {
+               hwc->config = event->attr.config & pmu->type->event_mask;
+               if (pmu->type->ops->hw_config) {
+                       ret = pmu->type->ops->hw_config(box, event);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       if (event->group_leader != event)
+               ret = uncore_validate_group(pmu, event);
+       else
+               ret = 0;
+
+       return ret;
+}
+
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group uncore_pmu_attr_group = {
+       .attrs = uncore_pmu_attrs,
+};
+
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+       int ret;
+
+       if (!pmu->type->pmu) {
+               pmu->pmu = (struct pmu) {
+                       .attr_groups    = pmu->type->attr_groups,
+                       .task_ctx_nr    = perf_invalid_context,
+                       .event_init     = uncore_pmu_event_init,
+                       .add            = uncore_pmu_event_add,
+                       .del            = uncore_pmu_event_del,
+                       .start          = uncore_pmu_event_start,
+                       .stop           = uncore_pmu_event_stop,
+                       .read           = uncore_pmu_event_read,
+               };
+       } else {
+               pmu->pmu = *pmu->type->pmu;
+               pmu->pmu.attr_groups = pmu->type->attr_groups;
+       }
+
+       if (pmu->type->num_boxes == 1) {
+               if (strlen(pmu->type->name) > 0)
+                       sprintf(pmu->name, "uncore_%s", pmu->type->name);
+               else
+                       sprintf(pmu->name, "uncore");
+       } else {
+               sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
+                       pmu->pmu_idx);
+       }
+
+       ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+       if (!ret)
+               pmu->registered = true;
+       return ret;
+}
+
+static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
+{
+       if (!pmu->registered)
+               return;
+       perf_pmu_unregister(&pmu->pmu);
+       pmu->registered = false;
+}
+
+static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       if (pmu) {
+               pkg = topology_physical_package_id(cpu);
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (box)
+                               uncore_box_exit(box);
+               }
+       }
+}
+
+static void __init uncore_exit_boxes(void *dummy)
+{
+       struct intel_uncore_type **types;
+
+       for (types = uncore_msr_uncores; *types; types++)
+               __uncore_exit_boxes(*types, smp_processor_id());
+}
+
+static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
+{
+       int pkg;
+
+       for (pkg = 0; pkg < max_packages; pkg++)
+               kfree(pmu->boxes[pkg]);
+       kfree(pmu->boxes);
+}
+
+static void __init uncore_type_exit(struct intel_uncore_type *type)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       int i;
+
+       if (pmu) {
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       uncore_pmu_unregister(pmu);
+                       uncore_free_boxes(pmu);
+               }
+               kfree(type->pmus);
+               type->pmus = NULL;
+       }
+       kfree(type->events_group);
+       type->events_group = NULL;
+}
+
+static void __init uncore_types_exit(struct intel_uncore_type **types)
+{
+       for (; *types; types++)
+               uncore_type_exit(*types);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
+{
+       struct intel_uncore_pmu *pmus;
+       struct attribute_group *attr_group;
+       struct attribute **attrs;
+       size_t size;
+       int i, j;
+
+       pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
+       if (!pmus)
+               return -ENOMEM;
+
+       size = max_packages * sizeof(struct intel_uncore_box *);
+
+       for (i = 0; i < type->num_boxes; i++) {
+               pmus[i].func_id = setid ? i : -1;
+               pmus[i].pmu_idx = i;
+               pmus[i].type    = type;
+               pmus[i].boxes   = kzalloc(size, GFP_KERNEL);
+               if (!pmus[i].boxes)
+                       return -ENOMEM;
+       }
+
+       type->pmus = pmus;
+       type->unconstrainted = (struct event_constraint)
+               __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+                               0, type->num_counters, 0, 0);
+
+       if (type->event_descs) {
+               for (i = 0; type->event_descs[i].attr.attr.name; i++);
+
+               attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
+                                       sizeof(*attr_group), GFP_KERNEL);
+               if (!attr_group)
+                       return -ENOMEM;
+
+               attrs = (struct attribute **)(attr_group + 1);
+               attr_group->name = "events";
+               attr_group->attrs = attrs;
+
+               for (j = 0; j < i; j++)
+                       attrs[j] = &type->event_descs[j].attr.attr;
+
+               type->events_group = attr_group;
+       }
+
+       type->pmu_group = &uncore_pmu_attr_group;
+       return 0;
+}
+
+static int __init
+uncore_types_init(struct intel_uncore_type **types, bool setid)
+{
+       int ret;
+
+       for (; *types; types++) {
+               ret = uncore_type_init(*types, setid);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/*
+ * add a pci uncore device
+ */
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct intel_uncore_type *type;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int phys_id, pkg, ret;
+
+       phys_id = uncore_pcibus_to_physid(pdev->bus);
+       if (phys_id < 0)
+               return -ENODEV;
+
+       pkg = topology_phys_to_logical_pkg(phys_id);
+       if (WARN_ON_ONCE(pkg < 0))
+               return -EINVAL;
+
+       if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+               int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+
+               uncore_extra_pci_dev[pkg].dev[idx] = pdev;
+               pci_set_drvdata(pdev, NULL);
+               return 0;
+       }
+
+       type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+       /*
+        * For performance monitoring units with multiple boxes,
+        * each box has a different function id.
+        */
+       pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+       /*
+        * Knights Landing uses a common PCI device ID for multiple instances
+        * of an uncore PMU device type. There is only one entry per device
+        * type in the knl_uncore_pci_ids table despite multiple devices being
+        * present for some device types. Hence the PCI device idx is 0 for
+        * all devices, so advance the pmu pointer to an unused array element.
+        */
+       if (boot_cpu_data.x86_model == 87) {
+               while (pmu->func_id >= 0)
+                       pmu++;
+       }
+
+       if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
+               return -EINVAL;
+
+       box = uncore_alloc_box(type, NUMA_NO_NODE);
+       if (!box)
+               return -ENOMEM;
+
+       if (pmu->func_id < 0)
+               pmu->func_id = pdev->devfn;
+       else
+               WARN_ON_ONCE(pmu->func_id != pdev->devfn);
+
+       atomic_inc(&box->refcnt);
+       box->pci_phys_id = phys_id;
+       box->pkgid = pkg;
+       box->pci_dev = pdev;
+       box->pmu = pmu;
+       uncore_box_init(box);
+       pci_set_drvdata(pdev, box);
+
+       pmu->boxes[pkg] = box;
+       if (atomic_inc_return(&pmu->activeboxes) > 1)
+               return 0;
+
+       /* First active box registers the pmu */
+       ret = uncore_pmu_register(pmu);
+       if (ret) {
+               pci_set_drvdata(pdev, NULL);
+               pmu->boxes[pkg] = NULL;
+               uncore_box_exit(box);
+               kfree(box);
+       }
+       return ret;
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+       struct intel_uncore_box *box = pci_get_drvdata(pdev);
+       struct intel_uncore_pmu *pmu;
+       int i, phys_id, pkg;
+
+       phys_id = uncore_pcibus_to_physid(pdev->bus);
+       pkg = topology_phys_to_logical_pkg(phys_id);
+
+       if (!box) {
+               for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+                       if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
+                               uncore_extra_pci_dev[pkg].dev[i] = NULL;
+                               break;
+                       }
+               }
+               WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+               return;
+       }
+
+       pmu = box->pmu;
+       if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
+               return;
+
+       pci_set_drvdata(pdev, NULL);
+       pmu->boxes[pkg] = NULL;
+       if (atomic_dec_return(&pmu->activeboxes) == 0)
+               uncore_pmu_unregister(pmu);
+       uncore_box_exit(box);
+       kfree(box);
+}
+
+static int __init uncore_pci_init(void)
+{
+       size_t size;
+       int ret;
+
+       switch (boot_cpu_data.x86_model) {
+       case 45: /* Sandy Bridge-EP */
+               ret = snbep_uncore_pci_init();
+               break;
+       case 62: /* Ivy Bridge-EP */
+               ret = ivbep_uncore_pci_init();
+               break;
+       case 63: /* Haswell-EP */
+               ret = hswep_uncore_pci_init();
+               break;
+       case 79: /* BDX-EP */
+       case 86: /* BDX-DE */
+               ret = bdx_uncore_pci_init();
+               break;
+       case 42: /* Sandy Bridge */
+               ret = snb_uncore_pci_init();
+               break;
+       case 58: /* Ivy Bridge */
+               ret = ivb_uncore_pci_init();
+               break;
+       case 60: /* Haswell */
+       case 69: /* Haswell Celeron */
+               ret = hsw_uncore_pci_init();
+               break;
+       case 61: /* Broadwell */
+               ret = bdw_uncore_pci_init();
+               break;
+       case 87: /* Knights Landing */
+               ret = knl_uncore_pci_init();
+               break;
+       case 94: /* SkyLake */
+               ret = skl_uncore_pci_init();
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       if (ret)
+               return ret;
+
+       size = max_packages * sizeof(struct pci_extra_dev);
+       uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
+       if (!uncore_extra_pci_dev) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = uncore_types_init(uncore_pci_uncores, false);
+       if (ret)
+               goto errtype;
+
+       uncore_pci_driver->probe = uncore_pci_probe;
+       uncore_pci_driver->remove = uncore_pci_remove;
+
+       ret = pci_register_driver(uncore_pci_driver);
+       if (ret)
+               goto errtype;
+
+       pcidrv_registered = true;
+       return 0;
+
+errtype:
+       uncore_types_exit(uncore_pci_uncores);
+       kfree(uncore_extra_pci_dev);
+       uncore_extra_pci_dev = NULL;
+       uncore_free_pcibus_map();
+err:
+       uncore_pci_uncores = empty_uncore;
+       return ret;
+}
+
+static void __init uncore_pci_exit(void)
+{
+       if (pcidrv_registered) {
+               pcidrv_registered = false;
+               pci_unregister_driver(uncore_pci_driver);
+               uncore_types_exit(uncore_pci_uncores);
+               kfree(uncore_extra_pci_dev);
+               uncore_free_pcibus_map();
+       }
+}
+
+static void uncore_cpu_dying(int cpu)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (box && atomic_dec_return(&box->refcnt) == 0)
+                               uncore_box_exit(box);
+               }
+       }
+}
+
+static void uncore_cpu_starting(int cpu, bool init)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg, ncpus = 1;
+
+       if (init) {
+               /*
+                * On init we get the number of online cpus in the package
+                * and set the refcount for all of them.
+                */
+               ncpus = cpumask_weight(topology_core_cpumask(cpu));
+       }
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (!box)
+                               continue;
+                       /* The first cpu on a package activates the box */
+                       if (atomic_add_return(ncpus, &box->refcnt) == ncpus)
+                               uncore_box_init(box);
+               }
+       }
+}
+
+static int uncore_cpu_prepare(int cpu)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       if (pmu->boxes[pkg])
+                               continue;
+                       /* First cpu of a package allocates the box */
+                       box = uncore_alloc_box(type, cpu_to_node(cpu));
+                       if (!box)
+                               return -ENOMEM;
+                       box->pmu = pmu;
+                       box->pkgid = pkg;
+                       pmu->boxes[pkg] = box;
+               }
+       }
+       return 0;
+}
+
+static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
+                                  int new_cpu)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
+       for (i = 0; i < type->num_boxes; i++, pmu++) {
+               box = pmu->boxes[pkg];
+               if (!box)
+                       continue;
+
+               if (old_cpu < 0) {
+                       WARN_ON_ONCE(box->cpu != -1);
+                       box->cpu = new_cpu;
+                       continue;
+               }
+
+               WARN_ON_ONCE(box->cpu != old_cpu);
+               box->cpu = -1;
+               if (new_cpu < 0)
+                       continue;
+
+               uncore_pmu_cancel_hrtimer(box);
+               perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
+               box->cpu = new_cpu;
+       }
+}
+
+static void uncore_change_context(struct intel_uncore_type **uncores,
+                                 int old_cpu, int new_cpu)
+{
+       for (; *uncores; uncores++)
+               uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
+}
+
+static void uncore_event_exit_cpu(int cpu)
+{
+       int target;
+
+       /* Check if exiting cpu is used for collecting uncore events */
+       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+               return;
+
+       /* Find a new cpu to collect uncore events */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       /* Migrate uncore events to the new target */
+       if (target < nr_cpu_ids)
+               cpumask_set_cpu(target, &uncore_cpu_mask);
+       else
+               target = -1;
+
+       uncore_change_context(uncore_msr_uncores, cpu, target);
+       uncore_change_context(uncore_pci_uncores, cpu, target);
+}
+
+static void uncore_event_init_cpu(int cpu)
+{
+       int target;
+
+       /*
+        * Check if there is an online cpu in the package
+        * which collects uncore events already.
+        */
+       target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu));
+       if (target < nr_cpu_ids)
+               return;
+
+       cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+       uncore_change_context(uncore_msr_uncores, -1, cpu);
+       uncore_change_context(uncore_pci_uncores, -1, cpu);
+}
+
+static int uncore_cpu_notifier(struct notifier_block *self,
+                              unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               return notifier_from_errno(uncore_cpu_prepare(cpu));
+
+       case CPU_STARTING:
+               uncore_cpu_starting(cpu, false);
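+               /* Fall through: the starting cpu becomes the event collector if needed */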
+       case CPU_DOWN_FAILED:
+               uncore_event_init_cpu(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DYING:
+               uncore_cpu_dying(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               uncore_event_exit_cpu(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block uncore_cpu_nb = {
+       .notifier_call  = uncore_cpu_notifier,
+       /*
+        * To migrate uncore events, our notifier should be executed
+        * before perf core's notifier.
+        */
+       .priority       = CPU_PRI_PERF + 1,
+};
+
+static int __init type_pmu_register(struct intel_uncore_type *type)
+{
+       int i, ret;
+
+       for (i = 0; i < type->num_boxes; i++) {
+               ret = uncore_pmu_register(&type->pmus[i]);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int __init uncore_msr_pmus_register(void)
+{
+       struct intel_uncore_type **types = uncore_msr_uncores;
+       int ret;
+
+       for (; *types; types++) {
+               ret = type_pmu_register(*types);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int __init uncore_cpu_init(void)
+{
+       int ret;
+
+       switch (boot_cpu_data.x86_model) {
+       case 26: /* Nehalem */
+       case 30:
+       case 37: /* Westmere */
+       case 44:
+               nhm_uncore_cpu_init();
+               break;
+       case 42: /* Sandy Bridge */
+       case 58: /* Ivy Bridge */
+       case 60: /* Haswell */
+       case 69: /* Haswell */
+       case 70: /* Haswell */
+       case 61: /* Broadwell */
+       case 71: /* Broadwell */
+               snb_uncore_cpu_init();
+               break;
+       case 45: /* Sandy Bridge-EP */
+               snbep_uncore_cpu_init();
+               break;
+       case 46: /* Nehalem-EX */
+       case 47: /* Westmere-EX aka. Xeon E7 */
+               nhmex_uncore_cpu_init();
+               break;
+       case 62: /* Ivy Bridge-EP */
+               ivbep_uncore_cpu_init();
+               break;
+       case 63: /* Haswell-EP */
+               hswep_uncore_cpu_init();
+               break;
+       case 79: /* BDX-EP */
+       case 86: /* BDX-DE */
+               bdx_uncore_cpu_init();
+               break;
+       case 87: /* Knights Landing */
+               knl_uncore_cpu_init();
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       ret = uncore_types_init(uncore_msr_uncores, true);
+       if (ret)
+               goto err;
+
+       ret = uncore_msr_pmus_register();
+       if (ret)
+               goto err;
+       return 0;
+err:
+       uncore_types_exit(uncore_msr_uncores);
+       uncore_msr_uncores = empty_uncore;
+       return ret;
+}
+
+static void __init uncore_cpu_setup(void *dummy)
+{
+       uncore_cpu_starting(smp_processor_id(), true);
+}
+
+/* Use a static bitmap to avoid allocating a few bytes for the normal case */
+static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC);
+
+static int __init uncore_cpumask_init(bool msr)
+{
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               unsigned int pkg = topology_logical_package_id(cpu);
+               int ret;
+
+               if (test_and_set_bit(pkg, packages))
+                       continue;
+               /*
+                * The first online cpu of each package allocates and takes
+                * the refcounts for all other online cpus in that package.
+                * If MSRs are not enabled, no allocation is required.
+                */
+               if (msr) {
+                       ret = uncore_cpu_prepare(cpu);
+                       if (ret)
+                               return ret;
+               }
+               uncore_event_init_cpu(cpu);
+               smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1);
+       }
+       __register_cpu_notifier(&uncore_cpu_nb);
+       return 0;
+}
+
+static int __init intel_uncore_init(void)
+{
+       int pret, cret, ret;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return -ENODEV;
+
+       if (cpu_has_hypervisor)
+               return -ENODEV;
+
+       max_packages = topology_max_packages();
+
+       pret = uncore_pci_init();
+       cret = uncore_cpu_init();
+
+       if (cret && pret)
+               return -ENODEV;
+
+       cpu_notifier_register_begin();
+       ret = uncore_cpumask_init(!cret);
+       if (ret)
+               goto err;
+       cpu_notifier_register_done();
+       return 0;
+
+err:
+       /* Undo box->init_box() */
+       on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1);
+       uncore_types_exit(uncore_msr_uncores);
+       uncore_pci_exit();
+       cpu_notifier_register_done();
+       return ret;
+}
+device_initcall(intel_uncore_init);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
new file mode 100644 (file)
index 0000000..79766b9
--- /dev/null
@@ -0,0 +1,378 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <asm/apicdef.h>
+
+#include <linux/perf_event.h>
+#include "../perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN            32
+#define UNCORE_PMU_HRTIMER_INTERVAL    (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT             0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC     8
+#define UNCORE_PMC_IDX_FIXED           UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_MAX             (UNCORE_PMC_IDX_FIXED + 1)
+
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_TYPE(data)      ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data)       (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV           0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX       3
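+
+/*
+ * A PCI id entry's driver_data packs the uncore type in bits 15:8 and the
+ * box/device index in bits 7:0. UNCORE_EXTRA_PCI_DEV marks auxiliary devices
+ * which are tracked in uncore_extra_pci_dev[] instead of getting a box.
+ */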
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+struct pci_extra_dev {
+       struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
+};
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+
+struct intel_uncore_type {
+       const char *name;
+       int num_counters;
+       int num_boxes;
+       int perf_ctr_bits;
+       int fixed_ctr_bits;
+       unsigned perf_ctr;
+       unsigned event_ctl;
+       unsigned event_mask;
+       unsigned fixed_ctr;
+       unsigned fixed_ctl;
+       unsigned box_ctl;
+       unsigned msr_offset;
+       unsigned num_shared_regs:8;
+       unsigned single_fixed:1;
+       unsigned pair_ctr_ctl:1;
+       unsigned *msr_offsets;
+       struct event_constraint unconstrainted;
+       struct event_constraint *constraints;
+       struct intel_uncore_pmu *pmus;
+       struct intel_uncore_ops *ops;
+       struct uncore_event_desc *event_descs;
+       const struct attribute_group *attr_groups[4];
+       struct pmu *pmu; /* for custom pmu ops */
+};
+
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
+
+struct intel_uncore_ops {
+       void (*init_box)(struct intel_uncore_box *);
+       void (*exit_box)(struct intel_uncore_box *);
+       void (*disable_box)(struct intel_uncore_box *);
+       void (*enable_box)(struct intel_uncore_box *);
+       void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+       void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+       u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+       int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+       struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+                                                  struct perf_event *);
+       void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+       struct pmu                      pmu;
+       char                            name[UNCORE_PMU_NAME_LEN];
+       int                             pmu_idx;
+       int                             func_id;
+       bool                            registered;
+       atomic_t                        activeboxes;
+       struct intel_uncore_type        *type;
+       struct intel_uncore_box         **boxes;
+};
+
+struct intel_uncore_extra_reg {
+       raw_spinlock_t lock;
+       u64 config, config1, config2;
+       atomic_t ref;
+};
+
+struct intel_uncore_box {
+       int pci_phys_id;
+       int pkgid;
+       int n_active;   /* number of active events */
+       int n_events;
+       int cpu;        /* cpu to collect events */
+       unsigned long flags;
+       atomic_t refcnt;
+       struct perf_event *events[UNCORE_PMC_IDX_MAX];
+       struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
+       unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+       u64 tags[UNCORE_PMC_IDX_MAX];
+       struct pci_dev *pci_dev;
+       struct intel_uncore_pmu *pmu;
+       u64 hrtimer_duration; /* hrtimer timeout for this box */
+       struct hrtimer hrtimer;
+       struct list_head list;
+       struct list_head active_list;
+       void *io_addr;
+       struct intel_uncore_extra_reg shared_regs[0];
+};
+
+#define UNCORE_BOX_FLAG_INITIATED      0
+
+struct uncore_event_desc {
+       struct kobj_attribute attr;
+       const char *config;
+};
+
+struct pci2phy_map {
+       struct list_head list;
+       int segment;
+       int pbus_to_physid[256];
+};
+
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
+ssize_t uncore_event_show(struct kobject *kobj,
+                         struct kobj_attribute *attr, char *buf);
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config)                        \
+{                                                              \
+       .attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
+       .config = _config,                                      \
+}
+
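+/*
+ * Example: INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0") defines
+ * a named uncore event whose config string is exposed through
+ * uncore_event_show().
+ */
+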
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)                        \
+static ssize_t __uncore_##_var##_show(struct kobject *kobj,            \
+                               struct kobj_attribute *attr,            \
+                               char *page)                             \
+{                                                                      \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
+       return sprintf(page, _format "\n");                             \
+}                                                                      \
+static struct kobj_attribute format_attr_##_var =                      \
+       __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
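+/*
+ * Example: DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7") creates
+ * format_attr_event, whose show() routine emits "config:0-7\n" for the
+ * corresponding file in a PMU's sysfs "format" group.
+ */
+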
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+       return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       return idx * 8 + box->pmu->type->perf_ctr;
+}
+
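+/*
+ * MSR based boxes locate their registers at the type's base address plus a
+ * per-box offset, taken either from an explicit msr_offsets[] table or
+ * computed as msr_offset * pmu_idx.
+ */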
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+       struct intel_uncore_pmu *pmu = box->pmu;
+       return pmu->type->msr_offsets ?
+               pmu->type->msr_offsets[pmu->pmu_idx] :
+               pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+       if (!box->pmu->type->box_ctl)
+               return 0;
+       return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+       if (!box->pmu->type->fixed_ctl)
+               return 0;
+       return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
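+/*
+ * When pair_ctr_ctl is set, a box's control and counter MSRs are interleaved,
+ * so consecutive event indices are two registers apart.
+ */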
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       return box->pmu->type->event_ctl +
+               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+               uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       return box->pmu->type->perf_ctr +
+               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+               uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+       if (box->pci_dev)
+               return uncore_pci_fixed_ctl(box);
+       else
+               return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+       if (box->pci_dev)
+               return uncore_pci_fixed_ctr(box);
+       else
+               return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       if (box->pci_dev)
+               return uncore_pci_event_ctl(box, idx);
+       else
+               return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       if (box->pci_dev)
+               return uncore_pci_perf_ctr(box, idx);
+       else
+               return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+       return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+       return box->pmu->type->num_counters;
+}
+
+static inline void uncore_disable_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->type->ops->disable_box)
+               box->pmu->type->ops->disable_box(box);
+}
+
+static inline void uncore_enable_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->type->ops->enable_box)
+               box->pmu->type->ops->enable_box(box);
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->init_box)
+                       box->pmu->type->ops->init_box(box);
+       }
+}
+
+static inline void uncore_box_exit(struct intel_uncore_box *box)
+{
+       if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->exit_box)
+                       box->pmu->type->ops->exit_box(box);
+       }
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+       return (box->pkgid < 0);
+}
+
+static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+       return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+       return event->pmu_private;
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
+extern struct pci_extra_dev *uncore_extra_pci_dev;
+extern struct event_constraint uncore_constraint_empty;
+
+/* perf_event_intel_uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
+int skl_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
+
+/* perf_event_intel_uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+int hswep_uncore_pci_init(void);
+void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
new file mode 100644 (file)
index 0000000..cda5693
--- /dev/null
@@ -0,0 +1,1227 @@
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK     0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK      0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0         (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET                (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN          (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22                (1 << 22)
+#define NHMEX_PMON_CTL_INVERT          (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK      0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK      (NHMEX_PMON_CTL_EV_SEL_MASK | \
+                                        NHMEX_PMON_CTL_UMASK_MASK | \
+                                        NHMEX_PMON_CTL_EDGE_DET | \
+                                        NHMEX_PMON_CTL_INVERT | \
+                                        NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL            0xc00
+#define NHMEX_U_MSR_PMON_CTR                   0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL                        0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN                 (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL       0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL             (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL            (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK            \
+               (NHMEX_PMON_CTL_EV_SEL_MASK |   \
+                NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL           0xd00
+#define NHMEX_C0_MSR_PMON_CTR0                 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0              0xd10
+#define NHMEX_C_MSR_OFFSET                     0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL           0xc20
+#define NHMEX_B0_MSR_PMON_CTR0                 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0                 0xc30
+#define NHMEX_B_MSR_OFFSET                     0x40
+#define NHMEX_B0_MSR_MATCH                     0xe45
+#define NHMEX_B0_MSR_MASK                      0xe46
+#define NHMEX_B1_MSR_MATCH                     0xe4d
+#define NHMEX_B1_MSR_MASK                      0xe4e
+
+#define NHMEX_B_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT          1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK           \
+               (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT         6
+#define NHMEX_B_PMON_CTR_MASK          \
+               (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK            \
+               (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+                NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL           0xc40
+#define NHMEX_S0_MSR_PMON_CTR0                 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0                 0xc50
+#define NHMEX_S_MSR_OFFSET                     0x80
+#define NHMEX_S0_MSR_MM_CFG                    0xe48
+#define NHMEX_S0_MSR_MATCH                     0xe49
+#define NHMEX_S0_MSR_MASK                      0xe4a
+#define NHMEX_S1_MSR_MM_CFG                    0xe58
+#define NHMEX_S1_MSR_MATCH                     0xe59
+#define NHMEX_S1_MSR_MASK                      0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN                 (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV             0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL                        0xca0
+#define NHMEX_M0_MSR_PMU_DSP                   0xca5
+#define NHMEX_M0_MSR_PMU_ISS                   0xca6
+#define NHMEX_M0_MSR_PMU_MAP                   0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR               0xca8
+#define NHMEX_M0_MSR_PMU_PGT                   0xca9
+#define NHMEX_M0_MSR_PMU_PLD                   0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC           0xcab
+#define NHMEX_M0_MSR_PMU_CTL0                  0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0                  0xcb1
+#define NHMEX_M_MSR_OFFSET                     0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG                        0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG                        0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN                 (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK           0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK            0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT           34
+
+#define NHMEX_M_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN                        (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT      2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK       \
+       (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT    4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK     \
+       (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE             (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE             (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT         9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK          \
+       (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT    19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK     \
+       (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK                    \
+               (NHMEX_M_PMON_CTL_COUNT_MODE_MASK |     \
+                NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |   \
+                NHMEX_M_PMON_CTL_WRAP_MODE |           \
+                NHMEX_M_PMON_CTL_FLAG_MODE |           \
+                NHMEX_M_PMON_CTL_INC_SEL_MASK |        \
+                NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * If bit 7 is not set, bits 9~13 select the event;
+ * otherwise bits 19~21 select the event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+                               NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+                          NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+                               NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+               EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+                               MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+               EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+                               MBOX_SET_FLAG_SEL_MASK, \
+                               (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL                 0xe00
+#define NHMEX_R_MSR_PMON_CTL0                  0xe10
+#define NHMEX_R_MSR_PMON_CNT0                  0xe11
+#define NHMEX_R_MSR_OFFSET                     0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n)           \
+               ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)                (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)                (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)                \
+               (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)   \
+               (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)    \
+               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)     \
+               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)   \
+               (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)    \
+               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)     \
+               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT          1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK           \
+               (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN                        (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK            NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL                 0xc80
+#define NHMEX_W_MSR_PMON_CNT0                  0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0              0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR             0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL             0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN           (1ULL << 31)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+                               ((1ULL << (n)) - 1)))
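+/* e.g. __BITS_VALUE(x, 1, 8) extracts field 1 of width 8, i.e. bits 15:8 of x */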
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       u64 config;
+
+       if (msr) {
+               rdmsrl(msr, config);
+               config &= ~((1ULL << uncore_num_counters(box)) - 1);
+               /* WBox has a fixed counter */
+               if (uncore_msr_fixed_ctl(box))
+                       config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+               wrmsrl(msr, config);
+       }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       u64 config;
+
+       if (msr) {
+               rdmsrl(msr, config);
+               config |= (1ULL << uncore_num_counters(box)) - 1;
+               /* WBox has a fixed counter */
+               if (uncore_msr_fixed_ctl(box))
+                       config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+               wrmsrl(msr, config);
+       }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+       else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+       else
+               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT()                         \
+       .init_box       = nhmex_uncore_msr_init_box,            \
+       .exit_box       = nhmex_uncore_msr_exit_box,            \
+       .disable_box    = nhmex_uncore_msr_disable_box,         \
+       .enable_box     = nhmex_uncore_msr_enable_box,          \
+       .disable_event  = nhmex_uncore_msr_disable_event,       \
+       .read_counter   = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event   = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_edge.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+       .name           = "format",
+       .attrs          = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 1,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .event_ctl      = NHMEX_U_MSR_PMON_EV_SEL,
+       .perf_ctr       = NHMEX_U_MSR_PMON_CTR,
+       .event_mask     = NHMEX_U_PMON_RAW_EVENT_MASK,
+       .box_ctl        = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+       .ops            = &nhmex_uncore_ops,
+       .format_group   = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+       0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 6,
+       .num_boxes              = 10,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_C0_MSR_PMON_EV_SEL0,
+       .perf_ctr               = NHMEX_C0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+       .msr_offsets            = nhmex_cbox_msr_offsets,
+       .pair_ctr_ctl           = 1,
+       .ops                    = &nhmex_uncore_ops,
+       .format_group           = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+       .name                   = "wbox",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_W_MSR_PMON_CNT0,
+       .perf_ctr               = NHMEX_W_MSR_PMON_EVT_SEL0,
+       .fixed_ctr              = NHMEX_W_MSR_PMON_FIXED_CTR,
+       .fixed_ctl              = NHMEX_W_MSR_PMON_FIXED_CTL,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_W_MSR_GLOBAL_CTL,
+       .pair_ctr_ctl           = 1,
+       .event_descs            = nhmex_uncore_wbox_events,
+       .ops                    = &nhmex_uncore_ops,
+       .format_group           = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int ctr, ev_sel;
+
+       ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+               NHMEX_B_PMON_CTR_SHIFT;
+       ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+                 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+       /* events that do not use the match/mask registers */
+       if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+           (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+               return 0;
+
+       if (box->pmu->pmu_idx == 0)
+               reg1->reg = NHMEX_B0_MSR_MATCH;
+       else
+               reg1->reg = NHMEX_B1_MSR_MATCH;
+       reg1->idx = 0;
+       reg1->config = event->attr.config1;
+       reg2->config = event->attr.config2;
+       return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg1->reg, reg1->config);
+               wrmsrl(reg1->reg + 1, reg2->config);
+       }
+       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+               (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select the counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+       EVENT_CONSTRAINT(0, 1, 0xc0),
+       EVENT_CONSTRAINT(0x40, 2, 0xc0),
+       EVENT_CONSTRAINT(0x80, 4, 0xc0),
+       EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+       EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+       &format_attr_event5.attr,
+       &format_attr_counter.attr,
+       &format_attr_match.attr,
+       &format_attr_mask.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_bbox_msr_enable_event,
+       .hw_config              = nhmex_bbox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+       .name                   = "bbox",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_B0_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_B0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_B_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+       .msr_offset             = NHMEX_B_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 1,
+       .constraints            = nhmex_uncore_bbox_constraints,
+       .ops                    = &nhmex_uncore_bbox_ops,
+       .format_group           = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       /* only TO_R_PROG_EV event uses the match/mask register */
+       if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+           NHMEX_S_EVENT_TO_R_PROG_EV)
+               return 0;
+
+       if (box->pmu->pmu_idx == 0)
+               reg1->reg = NHMEX_S0_MSR_MM_CFG;
+       else
+               reg1->reg = NHMEX_S1_MSR_MM_CFG;
+       reg1->idx = 0;
+       reg1->config = event->attr.config1;
+       reg2->config = event->attr.config2;
+       return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg1->reg, 0);
+               wrmsrl(reg1->reg + 1, reg1->config);
+               wrmsrl(reg1->reg + 2, reg2->config);
+               wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+       }
+       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match.attr,
+       &format_attr_mask.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+       .name                   = "format",
+       .attrs                  = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_sbox_msr_enable_event,
+       .hw_config              = nhmex_sbox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_S0_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_S0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+       .msr_offset             = NHMEX_S_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 1,
+       .ops                    = &nhmex_uncore_sbox_ops,
+       .format_group           = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+       EXTRA_REG_NHMEX_M_FILTER,
+       EXTRA_REG_NHMEX_M_DSP,
+       EXTRA_REG_NHMEX_M_ISS,
+       EXTRA_REG_NHMEX_M_MAP,
+       EXTRA_REG_NHMEX_M_MSC_THR,
+       EXTRA_REG_NHMEX_M_PGT,
+       EXTRA_REG_NHMEX_M_PLD,
+       EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+       MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+       MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+       MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+       MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+       /* event 0xa uses two extra registers */
+       MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+       MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+       MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+       /* events 0xd ~ 0x10 use the same extra register */
+       MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+       EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       bool ret = false;
+       u64 mask;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               er = &box->shared_regs[idx];
+               raw_spin_lock_irqsave(&er->lock, flags);
+               if (!atomic_read(&er->ref) || er->config == config) {
+                       atomic_inc(&er->ref);
+                       er->config = config;
+                       ret = true;
+               }
+               raw_spin_unlock_irqrestore(&er->lock, flags);
+
+               return ret;
+       }
+       /*
+        * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+        * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+        * fields which are shared.
+        */
+       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       if (WARN_ON_ONCE(idx >= 4))
+               return false;
+
+       /* mask of the shared fields */
+       if (uncore_nhmex)
+               mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+       else
+               mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       /* add mask of the non-shared field if it's in use */
+       if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+               if (uncore_nhmex)
+                       mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               else
+                       mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       }
+
+       if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+               atomic_add(1 << (idx * 8), &er->ref);
+               if (uncore_nhmex)
+                       mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+                               NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               else
+                       mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+                               WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               er->config &= ~mask;
+               er->config |= (config & mask);
+               ret = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               er = &box->shared_regs[idx];
+               atomic_dec(&er->ref);
+               return;
+       }
+
+       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+       atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+       u64 config = reg1->config;
+
+       /* get the non-shared control bits and shift them */
+       idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       if (uncore_nhmex)
+               config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       else
+               config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       if (new_idx > orig_idx) {
+               idx = new_idx - orig_idx;
+               config <<= 3 * idx;
+       } else {
+               idx = orig_idx - new_idx;
+               config >>= 3 * idx;
+       }
+
+       /* add the shared control bits back */
+       if (uncore_nhmex)
+               config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       else
+               config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       if (modify) {
+               /* adjust the main event selector */
+               if (new_idx > orig_idx)
+                       hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+               else
+                       hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+               reg1->config = config;
+               reg1->idx = ~0xff | new_idx;
+       }
+       return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       int i, idx[2], alloc = 0;
+       u64 config1 = reg1->config;
+
+       idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+       idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+       for (i = 0; i < 2; i++) {
+               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+                       idx[i] = 0xff;
+
+               if (idx[i] == 0xff)
+                       continue;
+
+               if (!nhmex_mbox_get_shared_reg(box, idx[i],
+                               __BITS_VALUE(config1, i, 32)))
+                       goto fail;
+               alloc |= (0x1 << i);
+       }
+
+       /* for the match/mask registers */
+       if (reg2->idx != EXTRA_REG_NONE &&
+           (uncore_box_is_fake(box) || !reg2->alloc) &&
+           !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+               goto fail;
+
+       /*
+        * If it's a fake box -- as used by validate_{group,event}() -- we
+        * shouldn't touch event state, and we can avoid doing so because
+        * both will call get_event_constraints() only once per event;
+        * this avoids the need for reg->alloc.
+        */
+       if (!uncore_box_is_fake(box)) {
+               if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+                       nhmex_mbox_alter_er(event, idx[0], true);
+               reg1->alloc |= alloc;
+               if (reg2->idx != EXTRA_REG_NONE)
+                       reg2->alloc = 1;
+       }
+       return NULL;
+fail:
+       if (idx[0] != 0xff && !(alloc & 0x1) &&
+           idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               /*
+                * Events 0xd ~ 0x10 are functionally identical, but are
+                * controlled by different fields in the ZDP_CTL_FVC
+                * register. If we failed to take one field, try the
+                * remaining 3 choices.
+                */
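+               /*
+                * Worked example of the rotation below (added for
+                * illustration): if idx[0] started out as
+                * EXTRA_REG_NHMEX_M_ZDP_CTL_FVC + 2, successive failures
+                * try +3, then +0, then +1; once the rotation wraps back
+                * to the original index we give up.
+                */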
+               BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+               idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+               idx[0] = (idx[0] + 1) % 4;
+               idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+               if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+                       config1 = nhmex_mbox_alter_er(event, idx[0], false);
+                       goto again;
+               }
+       }
+
+       if (alloc & 0x1)
+               nhmex_mbox_put_shared_reg(box, idx[0]);
+       if (alloc & 0x2)
+               nhmex_mbox_put_shared_reg(box, idx[1]);
+       return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+       if (uncore_box_is_fake(box))
+               return;
+
+       if (reg1->alloc & 0x1)
+               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+       if (reg1->alloc & 0x2)
+               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+       reg1->alloc = 0;
+
+       if (reg2->alloc) {
+               nhmex_mbox_put_shared_reg(box, reg2->idx);
+               reg2->alloc = 0;
+       }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+       if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+               return er->idx;
+       return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_type *type = box->pmu->type;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       struct extra_reg *er;
+       unsigned msr;
+       int reg_idx = 0;
+       /*
+        * The mbox events may require at most 2 extra MSRs. Only the
+        * lower 32 bits of these MSRs are significant, so we can use
+        * config1 to pass both MSRs' configs.
+        */
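+       /*
+        * For illustration (this mirrors the format attributes defined
+        * below): the dsp/thr/fvc/pgt/map/iss configs live in config1
+        * bits 0-31 and the pld config in bits 32-63, so
+        * __BITS_VALUE(config1, 0, 32) and __BITS_VALUE(config1, 1, 32)
+        * recover the two per-MSR values.
+        */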
+       for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               if (event->attr.config1 & ~er->valid_mask)
+                       return -EINVAL;
+
+               msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+               if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+                       return -EINVAL;
+
+               /* always use the 32~63 bits to pass the PLD config */
+               if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+                       reg_idx = 1;
+               else if (WARN_ON_ONCE(reg_idx > 0))
+                       return -EINVAL;
+
+               reg1->idx &= ~(0xff << (reg_idx * 8));
+               reg1->reg &= ~(0xffff << (reg_idx * 16));
+               reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+               reg1->reg |= msr << (reg_idx * 16);
+               reg1->config = event->attr.config1;
+               reg_idx++;
+       }
+       /*
+        * The mbox only provides the ability to perform address
+        * matching for the PLD events.
+        */
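+       /*
+        * Layout note, added for clarity (it matches the filter_* format
+        * attributes below): the match value sits in config2 bits 0-33,
+        * the mask in bits 34-61 and the enable bit
+        * (NHMEX_M_PMON_MM_CFG_EN, i.e. filter_cfg_en) in bit 63.
+        */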
+       if (reg_idx == 2) {
+               reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+               if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+                       reg2->config = event->attr.config2;
+               else
+                       reg2->config = ~0ULL;
+               if (box->pmu->pmu_idx == 0)
+                       reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+               else
+                       reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+       }
+       return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       u64 config;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+               return box->shared_regs[idx].config;
+
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       config = er->config;
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+       return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int idx;
+
+       idx = __BITS_VALUE(reg1->idx, 0, 8);
+       if (idx != 0xff)
+               wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+                       nhmex_mbox_shared_reg_config(box, idx));
+       idx = __BITS_VALUE(reg1->idx, 1, 8);
+       if (idx != 0xff)
+               wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+                       nhmex_mbox_shared_reg_config(box, idx));
+
+       if (reg2->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg2->reg, 0);
+               if (reg2->config != ~0ULL) {
+                       wrmsrl(reg2->reg + 1,
+                               reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+                       wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+                               (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+                       wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+               }
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode,          count_mode,     "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode,                storage_mode,   "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,           wrap_mode,      "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode,           flag_mode,      "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel,             inc_sel,        "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,                set_flag_sel,   "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,       filter_cfg_en,  "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match,                filter_match,   "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask,         filter_mask,    "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp,                 dsp,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr,                 thr,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc,                 fvc,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt,                 pgt,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map,                 map,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss,                 iss,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld,                 pld,            "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+       &format_attr_count_mode.attr,
+       &format_attr_storage_mode.attr,
+       &format_attr_wrap_mode.attr,
+       &format_attr_flag_mode.attr,
+       &format_attr_inc_sel.attr,
+       &format_attr_set_flag_sel.attr,
+       &format_attr_filter_cfg_en.attr,
+       &format_attr_filter_match.attr,
+       &format_attr_filter_mask.attr,
+       &format_attr_dsp.attr,
+       &format_attr_thr.attr,
+       &format_attr_fvc.attr,
+       &format_attr_pgt.attr,
+       &format_attr_map.attr,
+       &format_attr_iss.attr,
+       &format_attr_pld.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+       .name           = "format",
+       .attrs          = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+       { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event   = nhmex_mbox_msr_enable_event,
+       .hw_config      = nhmex_mbox_hw_config,
+       .get_constraint = nhmex_mbox_get_constraint,
+       .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+       .name                   = "mbox",
+       .num_counters           = 6,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_M0_MSR_PMU_CTL0,
+       .perf_ctr               = NHMEX_M0_MSR_PMU_CNT0,
+       .event_mask             = NHMEX_M_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_M0_MSR_GLOBAL_CTL,
+       .msr_offset             = NHMEX_M_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 8,
+       .event_descs            = nhmex_uncore_mbox_events,
+       .ops                    = &nhmex_uncore_mbox_ops,
+       .format_group           = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       /* adjust the main event selector and extra register index */
+       if (reg1->idx % 2) {
+               reg1->idx--;
+               hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       } else {
+               reg1->idx++;
+               hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       }
+
+       /* adjust extra register config */
+       switch (reg1->idx % 6) {
+       case 2:
+               /* shift the 8~15 bits to the 0~7 bits */
+               reg1->config >>= 8;
+               break;
+       case 3:
+               /* shift the 0~7 bits to the 8~15 bits */
+               reg1->config <<= 8;
+               break;
+       }
+}
+
+/*
+ * Each rbox has 4 event sets which monitor QPI ports 0~3 or 4~7.
+ * An event set consists of 6 events; the 3rd and 4th events in
+ * an event set use the same extra register, so an event set uses
+ * 5 extra registers.
+ */
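+/*
+ * Added illustration of the index math used below: for reg1->idx % 6 ==
+ * 0..5 the shared register slot within the set is 0, 1, 2, 2, 3, 4
+ * (the 3rd and 4th events share a slot), and (reg1->idx / 6) * 5 picks
+ * the group of 5 registers belonging to the event set.
+ */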
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       int idx, er_idx;
+       u64 config1;
+       bool ok = false;
+
+       if (!uncore_box_is_fake(box) && reg1->alloc)
+               return NULL;
+
+       idx = reg1->idx % 6;
+       config1 = reg1->config;
+again:
+       er_idx = idx;
+       /* the 3rd and 4th events use the same extra register */
+       if (er_idx > 2)
+               er_idx--;
+       er_idx += (reg1->idx / 6) * 5;
+
+       er = &box->shared_regs[er_idx];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (idx < 2) {
+               if (!atomic_read(&er->ref) || er->config == reg1->config) {
+                       atomic_inc(&er->ref);
+                       er->config = reg1->config;
+                       ok = true;
+               }
+       } else if (idx == 2 || idx == 3) {
+               /*
+                * These two events use different fields in an extra
+                * register: bits 0~7 and bits 8~15 respectively.
+                */
+               u64 mask = 0xff << ((idx - 2) * 8);
+               if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+                               !((er->config ^ config1) & mask)) {
+                       atomic_add(1 << ((idx - 2) * 8), &er->ref);
+                       er->config &= ~mask;
+                       er->config |= config1 & mask;
+                       ok = true;
+               }
+       } else {
+               if (!atomic_read(&er->ref) ||
+                               (er->config == (hwc->config >> 32) &&
+                                er->config1 == reg1->config &&
+                                er->config2 == reg2->config)) {
+                       atomic_inc(&er->ref);
+                       er->config = (hwc->config >> 32);
+                       er->config1 = reg1->config;
+                       er->config2 = reg2->config;
+                       ok = true;
+               }
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (!ok) {
+               /*
+                * The Rbox events are always in pairs. The paired
+                * events are functionally identical, but use different
+                * extra registers. If we failed to take an extra
+                * register, try the alternative.
+                */
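+               /*
+                * Example (for illustration only): an event programmed
+                * with idx % 6 == 2 retries with idx % 6 == 3 and shifts
+                * config1 left by 8, so it targets bits 8~15 instead of
+                * bits 0~7.
+                */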
+               idx ^= 1;
+               if (idx != reg1->idx % 6) {
+                       if (idx == 2)
+                               config1 >>= 8;
+                       else if (idx == 3)
+                               config1 <<= 8;
+                       goto again;
+               }
+       } else {
+               if (!uncore_box_is_fake(box)) {
+                       if (idx != reg1->idx % 6)
+                               nhmex_rbox_alter_er(box, event);
+                       reg1->alloc = 1;
+               }
+               return NULL;
+       }
+       return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       int idx, er_idx;
+
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       idx = reg1->idx % 6;
+       er_idx = idx;
+       if (er_idx > 2)
+               er_idx--;
+       er_idx += (reg1->idx / 6) * 5;
+
+       er = &box->shared_regs[er_idx];
+       if (idx == 2 || idx == 3)
+               atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+       else
+               atomic_dec(&er->ref);
+
+       reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       int idx;
+
+       idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+               NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       if (idx >= 0x18)
+               return -EINVAL;
+
+       reg1->idx = idx;
+       reg1->config = event->attr.config1;
+
+       switch (idx % 6) {
+       case 4:
+       case 5:
+               hwc->config |= event->attr.config & (~0ULL << 32);
+               reg2->config = event->attr.config2;
+               break;
+       }
+       return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int idx, port;
+
+       idx = reg1->idx;
+       port = idx / 6 + box->pmu->pmu_idx * 4;
+
+       switch (idx % 6) {
+       case 0:
+               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+               break;
+       case 1:
+               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+               break;
+       case 2:
+       case 3:
+               wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+                       uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+               break;
+       case 4:
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+                       hwc->config >> 32);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+               break;
+       case 5:
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+                       hwc->config >> 32);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+               break;
+       }
+
+       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+               (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+       &format_attr_event5.attr,
+       &format_attr_xbr_mm_cfg.attr,
+       &format_attr_xbr_match.attr,
+       &format_attr_xbr_mask.attr,
+       &format_attr_qlx_cfg.attr,
+       &format_attr_iperf_cfg.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,         "event=0x0,iperf_cfg=0x80000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,         "event=0x6,iperf_cfg=0x80000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,         "event=0x0,iperf_cfg=0x40000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,         "event=0x6,iperf_cfg=0x40000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi0_date_response,     "event=0x0,iperf_cfg=0xc4"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_date_response,     "event=0x6,iperf_cfg=0xc4"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_rbox_msr_enable_event,
+       .hw_config              = nhmex_rbox_hw_config,
+       .get_constraint         = nhmex_rbox_get_constraint,
+       .put_constraint         = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+       .name                   = "rbox",
+       .num_counters           = 8,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_R_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_R_MSR_PMON_CNT0,
+       .event_mask             = NHMEX_R_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_R_MSR_GLOBAL_CTL,
+       .msr_offset             = NHMEX_R_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 20,
+       .event_descs            = nhmex_uncore_rbox_events,
+       .ops                    = &nhmex_uncore_rbox_ops,
+       .format_group           = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+       &nhmex_uncore_ubox,
+       &nhmex_uncore_cbox,
+       &nhmex_uncore_bbox,
+       &nhmex_uncore_sbox,
+       &nhmex_uncore_mbox,
+       &nhmex_uncore_rbox,
+       &nhmex_uncore_wbox,
+       NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+       if (boot_cpu_data.x86_model == 46)
+               uncore_nhmex = true;
+       else
+               nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+       if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
new file mode 100644 (file)
index 0000000..96531d2
--- /dev/null
@@ -0,0 +1,731 @@
+/* Nehalem/SandyBridge/Haswell uncore support */
+#include "uncore.h"
+
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC    0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC    0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
+#define PCI_DEVICE_ID_INTEL_SKL_IMC    0x191f
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK                 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET                   (1 << 18)
+#define SNB_UNC_CTL_EN                         (1 << 22)
+#define SNB_UNC_CTL_INVERT                     (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK                 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK                 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN               (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
+                                                SNB_UNC_CTL_UMASK_MASK | \
+                                                SNB_UNC_CTL_EDGE_DET | \
+                                                SNB_UNC_CTL_INVERT | \
+                                                SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
+                                                SNB_UNC_CTL_UMASK_MASK | \
+                                                SNB_UNC_CTL_EDGE_DET | \
+                                                SNB_UNC_CTL_INVERT | \
+                                                NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
+#define SNB_UNC_FIXED_CTR_CTRL                  0x394
+#define SNB_UNC_FIXED_CTR                       0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
+#define SNB_UNC_CBO_0_PER_CTR0                  0x706
+#define SNB_UNC_CBO_MSR_OFFSET                  0x10
+
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0                   0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0                        0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET                 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
+#define NHM_UNC_FIXED_CTR                       0x394
+#define NHM_UNC_FIXED_CTR_CTRL                  0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0                     0x3c0
+#define NHM_UNC_UNCORE_PMC0                     0x3b0
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+       else
+               wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->pmu_idx == 0) {
+               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+                       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+       }
+}
+
+static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->pmu_idx == 0)
+               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+       { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask5.attr,
+       NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+       .name           = "format",
+       .attrs          = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+       .init_box       = snb_uncore_msr_init_box,
+       .exit_box       = snb_uncore_msr_exit_box,
+       .disable_event  = snb_uncore_msr_disable_event,
+       .enable_event   = snb_uncore_msr_enable_event,
+       .read_counter   = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_arb_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+       .name           = "cbox",
+       .num_counters   = 2,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNB_UNC_CBO_0_PER_CTR0,
+       .event_ctl      = SNB_UNC_CBO_0_PERFEVTSEL0,
+       .fixed_ctr      = SNB_UNC_FIXED_CTR,
+       .fixed_ctl      = SNB_UNC_FIXED_CTR_CTRL,
+       .single_fixed   = 1,
+       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
+       .msr_offset     = SNB_UNC_CBO_MSR_OFFSET,
+       .ops            = &snb_uncore_msr_ops,
+       .format_group   = &snb_uncore_format_group,
+       .event_descs    = snb_uncore_events,
+};
+
+static struct intel_uncore_type snb_uncore_arb = {
+       .name           = "arb",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .perf_ctr       = SNB_UNC_ARB_PER_CTR0,
+       .event_ctl      = SNB_UNC_ARB_PERFEVTSEL0,
+       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
+       .msr_offset     = SNB_UNC_ARB_MSR_OFFSET,
+       .constraints    = snb_uncore_arb_constraints,
+       .ops            = &snb_uncore_msr_ops,
+       .format_group   = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+       &snb_uncore_cbox,
+       &snb_uncore_arb,
+       NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = snb_msr_uncores;
+       if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+
+enum {
+       SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
+       INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+       INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+       INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+       { /* end: all zeroes */ },
+};
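+
+/*
+ * Note on the scale values above (added for clarity): 6.103515625e-5 is
+ * exactly 64 / 2^20, i.e. each counter increment corresponds to one
+ * 64-byte cache line and the scaled result is reported in MiB, matching
+ * the "MiB" unit.
+ */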
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK          0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET          0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE            0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS          0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE     0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES         0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE    0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE            SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+       .name = "format",
+       .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+       resource_size_t addr;
+       u32 pci_dword;
+
+       pci_read_config_dword(pdev, where, &pci_dword);
+       addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+       pci_read_config_dword(pdev, where + 4, &pci_dword);
+       addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+       addr &= ~(PAGE_SIZE - 1);
+
+       box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+       box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_exit_box(struct intel_uncore_box *box)
+{
+       iounmap(box->io_addr);
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * Custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with the generic
+ * uncore logic. This also simplifies processing.
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+       int idx, base;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       pmu = uncore_event_to_pmu(event);
+       /* no device found for this pmu */
+       if (pmu->func_id < 0)
+               return -ENOENT;
+
+       /* Sampling not supported yet */
+       if (hwc->sample_period)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       /*
+        * Place all uncore events for a particular physical package
+        * onto a single cpu
+        */
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       /* check only supported bits are set */
+       if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+               return -EINVAL;
+
+       box = uncore_pmu_to_box(pmu, event->cpu);
+       if (!box || box->cpu < 0)
+               return -EINVAL;
+
+       event->cpu = box->cpu;
+       event->pmu_private = box;
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+       /*
+        * check event is known (whitelist, determines counter)
+        */
+       switch (cfg) {
+       case SNB_UNCORE_PCI_IMC_DATA_READS:
+               base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+               idx = UNCORE_PMC_IDX_FIXED;
+               break;
+       case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+               base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+               idx = UNCORE_PMC_IDX_FIXED + 1;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* must be done before validate_group */
+       event->hw.event_base = base;
+       event->hw.config = cfg;
+       event->hw.idx = idx;
+
+       /* no group validation needed, we have free running counters */
+
+       return 0;
+}
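+
+/*
+ * Illustrative usage, assuming the PMU is registered under the usual
+ * "uncore_imc" name with the data_reads/data_writes events declared
+ * above:
+ *
+ *   perf stat -a -e uncore_imc/data_reads/,uncore_imc/data_writes/ sleep 1
+ */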
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       u64 count;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       event->hw.state = 0;
+       box->n_active++;
+
+       list_add_tail(&event->active_entry, &box->active_list);
+
+       count = snb_uncore_imc_read_counter(box, event);
+       local64_set(&event->hw.prev_count, count);
+
+       if (box->n_active == 1)
+               uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               box->n_active--;
+
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+
+               list_del(&event->active_entry);
+
+               if (box->n_active == 0)
+                       uncore_pmu_cancel_hrtimer(box);
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of an event
+                * that we are disabling:
+                */
+               uncore_perf_event_update(box, event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!box)
+               return -ENODEV;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       snb_uncore_imc_event_start(event, 0);
+
+       box->n_events++;
+
+       return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int i;
+
+       snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < box->n_events; i++) {
+               if (event == box->event_list[i]) {
+                       --box->n_events;
+                       break;
+               }
+       }
+}
+
+int snb_pci2phy_map_init(int devid)
+{
+       struct pci_dev *dev = NULL;
+       struct pci2phy_map *map;
+       int bus, segment;
+
+       dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+       if (!dev)
+               return -ENOTTY;
+
+       bus = dev->bus->number;
+       segment = pci_domain_nr(dev->bus);
+
+       raw_spin_lock(&pci2phy_map_lock);
+       map = __find_pci2phy_map(segment);
+       if (!map) {
+               raw_spin_unlock(&pci2phy_map_lock);
+               pci_dev_put(dev);
+               return -ENOMEM;
+       }
+       map->pbus_to_physid[bus] = 0;
+       raw_spin_unlock(&pci2phy_map_lock);
+
+       pci_dev_put(dev);
+
+       return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = snb_uncore_imc_event_init,
+       .add            = snb_uncore_imc_event_add,
+       .del            = snb_uncore_imc_event_del,
+       .start          = snb_uncore_imc_event_start,
+       .stop           = snb_uncore_imc_event_stop,
+       .read           = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+       .init_box       = snb_uncore_imc_init_box,
+       .exit_box       = snb_uncore_imc_exit_box,
+       .enable_box     = snb_uncore_imc_enable_box,
+       .disable_box    = snb_uncore_imc_disable_box,
+       .disable_event  = snb_uncore_imc_disable_event,
+       .enable_event   = snb_uncore_imc_enable_event,
+       .hw_config      = snb_uncore_imc_hw_config,
+       .read_counter   = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .fixed_ctr_bits = 32,
+       .fixed_ctr      = SNB_UNCORE_PCI_IMC_CTR_BASE,
+       .event_descs    = snb_uncore_imc_events,
+       .format_group   = &snb_uncore_imc_format_group,
+       .perf_ctr       = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+       .event_mask     = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+       .ops            = &snb_uncore_imc_ops,
+       .pmu            = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+       [SNB_PCI_UNCORE_IMC]    = &snb_uncore_imc,
+       NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id skl_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+       .name           = "snb_uncore",
+       .id_table       = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+       .name           = "ivb_uncore",
+       .id_table       = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+       .name           = "hsw_uncore",
+       .id_table       = hsw_uncore_pci_ids,
+};
+
+static struct pci_driver bdw_uncore_pci_driver = {
+       .name           = "bdw_uncore",
+       .id_table       = bdw_uncore_pci_ids,
+};
+
+static struct pci_driver skl_uncore_pci_driver = {
+       .name           = "skl_uncore",
+       .id_table       = skl_uncore_pci_ids,
+};
+
+struct imc_uncore_pci_dev {
+       __u32 pci_id;
+       struct pci_driver *driver;
+};
+#define IMC_DEV(a, d) \
+       { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
+
+static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
+       IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
+       IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
+       IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
+       IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
+       IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
+       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
+       IMC_DEV(SKL_IMC, &skl_uncore_pci_driver),    /* 6th Gen Core */
+       {  /* end marker */ }
+};
+
+
+#define for_each_imc_pci_id(x, t) \
+       for (x = (t); (x)->pci_id; x++)
+
+static struct pci_driver *imc_uncore_find_dev(void)
+{
+       const struct imc_uncore_pci_dev *p;
+       int ret;
+
+       for_each_imc_pci_id(p, desktop_imc_pci_ids) {
+               ret = snb_pci2phy_map_init(p->pci_id);
+               if (ret == 0)
+                       return p->driver;
+       }
+       return NULL;
+}
+
+static int imc_uncore_pci_init(void)
+{
+       struct pci_driver *imc_drv = imc_uncore_find_dev();
+
+       if (!imc_drv)
+               return -ENODEV;
+
+       uncore_pci_uncores = snb_pci_uncores;
+       uncore_pci_driver = imc_drv;
+
+       return 0;
+}
+
+int snb_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int ivb_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+int hsw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int bdw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int skl_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+       else
+               wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask8.attr,
+       NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+       .name = "format",
+       .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
+       INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+       .disable_box    = nhm_uncore_msr_disable_box,
+       .enable_box     = nhm_uncore_msr_enable_box,
+       .disable_event  = snb_uncore_msr_disable_event,
+       .enable_event   = nhm_uncore_msr_enable_event,
+       .read_counter   = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+       .name           = "",
+       .num_counters   = 8,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .event_ctl      = NHM_UNC_PERFEVTSEL0,
+       .perf_ctr       = NHM_UNC_UNCORE_PMC0,
+       .fixed_ctr      = NHM_UNC_FIXED_CTR,
+       .fixed_ctl      = NHM_UNC_FIXED_CTR_CTRL,
+       .event_mask     = NHM_UNC_RAW_EVENT_MASK,
+       .event_descs    = nhm_uncore_events,
+       .ops            = &nhm_uncore_msr_ops,
+       .format_group   = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+       &nhm_uncore,
+       NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
new file mode 100644 (file)
index 0000000..93f6bd9
--- /dev/null
@@ -0,0 +1,3135 @@
+/* SandyBridge-EP/IvyTown uncore support */
+#include "uncore.h"
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ         (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN      (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+                                        SNBEP_PMON_BOX_CTL_RST_CTRS | \
+                                        SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK     0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK      0x0000ff00
+#define SNBEP_PMON_CTL_RST             (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET                (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT      (1 << 21)
+#define SNBEP_PMON_CTL_EN              (1 << 22)
+#define SNBEP_PMON_CTL_INVERT          (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK      0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK      (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_INVERT | \
+                                        SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK                0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK                \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_UMASK_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN              (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK      (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK    0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK      0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT      (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET    (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_RAW_EVENT_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL                 0xf4
+#define SNBEP_PCI_PMON_CTL0                    0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0                    0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0       0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1       0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH      0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL                0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR                0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0         0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1         0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0          0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1          0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0                  0xc16
+#define SNBEP_U_MSR_PMON_CTL0                  0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL                0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR                0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0                 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0                 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL              0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER           0xd14
+#define SNBEP_CBO_MSR_OFFSET                   0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID      0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID      0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE    0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC      0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {   \
+       .event = (e),                           \
+       .msr = SNBEP_C0_MSR_PMON_BOX_FILTER,    \
+       .config_mask = (m),                     \
+       .idx = (i)                              \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0                        0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0                        0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL             0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER          0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK     0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR              0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR              0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+                                        SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK              (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL            0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL          (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK        \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_UMASK_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK              (IVBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST                (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK               \
+                               (IVBEP_PMON_RAW_EVENT_MASK | \
+                                IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
+                               (IVBEP_PMON_RAW_EVENT_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+                               ((1ULL << (n)) - 1)))
+
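+/*
+ * Expansion example for __BITS_VALUE() above (illustrative only):
+ *   __BITS_VALUE(0x12345678deadbeefULL, 1, 32) == 0x12345678
+ *   __BITS_VALUE(0x12345678deadbeefULL, 0, 32) == 0xdeadbeef
+ */
+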
+/* Haswell-EP Ubox */
+#define HSWEP_U_MSR_PMON_CTR0                  0x709
+#define HSWEP_U_MSR_PMON_CTL0                  0x705
+#define HSWEP_U_MSR_PMON_FILTER                        0x707
+
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR                0x704
+
+#define HSWEP_U_MSR_PMON_BOX_FILTER_TID                (0x1 << 0)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_CID                (0x1fULL << 1)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
+                                       (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
+                                        HSWEP_U_MSR_PMON_BOX_FILTER_CID)
+
+/* Haswell-EP CBo */
+#define HSWEP_C0_MSR_PMON_CTR0                 0xe08
+#define HSWEP_C0_MSR_PMON_CTL0                 0xe01
+#define HSWEP_C0_MSR_PMON_BOX_CTL                      0xe00
+#define HSWEP_C0_MSR_PMON_BOX_FILTER0          0xe05
+#define HSWEP_CBO_MSR_OFFSET                   0x10
+
+
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x3fULL << 0)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 6)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x7fULL << 17)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
+
+
+/* Haswell-EP Sbox */
+#define HSWEP_S0_MSR_PMON_CTR0                 0x726
+#define HSWEP_S0_MSR_PMON_CTL0                 0x721
+#define HSWEP_S0_MSR_PMON_BOX_CTL                      0x720
+#define HSWEP_SBOX_MSR_OFFSET                  0xa
+#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* Haswell-EP PCU */
+#define HSWEP_PCU_MSR_PMON_CTR0                        0x717
+#define HSWEP_PCU_MSR_PMON_CTL0                        0x711
+#define HSWEP_PCU_MSR_PMON_BOX_CTL             0x710
+#define HSWEP_PCU_MSR_PMON_BOX_FILTER          0x715
+
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+                                       (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+                                               SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET                     0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR               (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+                                       (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+                                        KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID                0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE      (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP         (0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW             0x400
+#define KNL_UCLK_MSR_PMON_CTL0                 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL              0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW       0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL       0x454
+#define KNL_PMON_FIXED_CTL_EN                  0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW                0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0            0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL         0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW  0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL  0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW          0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0              0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL           0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW         0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL         0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL               0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK           0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR           (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK                0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK        \
+                               (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+                                KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_CBO_PMON_CTL_TID_EN | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
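+/*
+ * DEFINE_UNCORE_FORMAT_ATTR(var, name, spec) creates a sysfs attribute in
+ * the PMU's "format" directory whose contents are the bit-range string
+ * (e.g. "config:8-15" for umask), telling userspace where each named field
+ * lives in attr::config/config1/config2.
+ */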
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
+DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
+DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
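+/*
+ * A box is stopped and restarted by toggling the freeze (FRZ) bit in its box
+ * control register; the per-event control registers are left untouched.
+ */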
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+       u32 config = 0;
+
+       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+               config |= SNBEP_PMON_BOX_CTL_FRZ;
+               pci_write_config_dword(pdev, box_ctl, config);
+       }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+       u32 config = 0;
+
+       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+               pci_write_config_dword(pdev, box_ctl, config);
+       }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
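+/*
+ * The counters are wider than 32 bits, so the value is assembled from two
+ * consecutive 32-bit config-space reads at event_base and event_base + 4.
+ */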
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+       pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+
+       pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       u64 config;
+       unsigned msr;
+
+       msr = uncore_msr_box_ctl(box);
+       if (msr) {
+               rdmsrl(msr, config);
+               config |= SNBEP_PMON_BOX_CTL_FRZ;
+               wrmsrl(msr, config);
+       }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       u64 config;
+       unsigned msr;
+
+       msr = uncore_msr_box_ctl(box);
+       if (msr) {
+               rdmsrl(msr, config);
+               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+               wrmsrl(msr, config);
+       }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE)
+               wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+
+       if (msr)
+               wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid.attr,
+       &format_attr_filter_nid.attr,
+       &format_attr_filter_state.attr,
+       &format_attr_filter_opc.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge.attr,
+       &format_attr_filter_band0.attr,
+       &format_attr_filter_band1.attr,
+       &format_attr_filter_band2.attr,
+       &format_attr_filter_band3.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match_rds.attr,
+       &format_attr_match_rnid30.attr,
+       &format_attr_match_rnid4.attr,
+       &format_attr_match_dnid.attr,
+       &format_attr_match_mc.attr,
+       &format_attr_match_opc.attr,
+       &format_attr_match_vnw.attr,
+       &format_attr_match0.attr,
+       &format_attr_match1.attr,
+       &format_attr_mask_rds.attr,
+       &format_attr_mask_rnid30.attr,
+       &format_attr_mask_rnid4.attr,
+       &format_attr_mask_dnid.attr,
+       &format_attr_mask_mc.attr,
+       &format_attr_mask_opc.attr,
+       &format_attr_mask_vnw.attr,
+       &format_attr_mask0.attr,
+       &format_attr_mask1.attr,
+       NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+       { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
+       INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+       INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
+       INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
+       { /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_qpi_formats_attr,
+};
+
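+/*
+ * Common MSR ops: the __ variant deliberately leaves out .init_box so it can
+ * be combined with a different box-init routine where needed.
+ */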
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                   \
+       .disable_box    = snbep_uncore_msr_disable_box,         \
+       .enable_box     = snbep_uncore_msr_enable_box,          \
+       .disable_event  = snbep_uncore_msr_disable_event,       \
+       .enable_event   = snbep_uncore_msr_enable_event,        \
+       .read_counter   = uncore_msr_read_counter
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
+       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),                   \
+       .init_box       = snbep_uncore_msr_init_box
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()                     \
+       .init_box       = snbep_uncore_pci_init_box,            \
+       .disable_box    = snbep_uncore_pci_disable_box,         \
+       .enable_box     = snbep_uncore_pci_enable_box,          \
+       .disable_event  = snbep_uncore_pci_disable_event,       \
+       .read_counter   = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+       .enable_event   = snbep_uncore_pci_enable_event,
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+       EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
+       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
+       .event_mask     = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops            = &snbep_uncore_msr_ops,
+       .format_group   = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+       EVENT_EXTRA_END
+};
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       int i;
+
+       if (uncore_box_is_fake(box))
+               return;
+
+       for (i = 0; i < 5; i++) {
+               if (reg1->alloc & (0x1 << i))
+                       atomic_sub(1 << (i * 6), &er->ref);
+       }
+       reg1->alloc = 0;
+}
+
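+/*
+ * The Cbox filter register is shared by all events on a box and is split
+ * into several independently allocated fields (up to five).  A 6-bit
+ * reference count per field is packed into er->ref; a field can only be
+ * claimed if it is currently unused or already programmed with the same
+ * value, otherwise the empty constraint is returned and the event cannot be
+ * scheduled on this box.
+ */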
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+                           u64 (*cbox_filter_mask)(int fields))
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       int i, alloc = 0;
+       unsigned long flags;
+       u64 mask;
+
+       if (reg1->idx == EXTRA_REG_NONE)
+               return NULL;
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       for (i = 0; i < 5; i++) {
+               if (!(reg1->idx & (0x1 << i)))
+                       continue;
+               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+                       continue;
+
+               mask = cbox_filter_mask(0x1 << i);
+               if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+                   !((reg1->config ^ er->config) & mask)) {
+                       atomic_add(1 << (i * 6), &er->ref);
+                       er->config &= ~mask;
+                       er->config |= reg1->config & mask;
+                       alloc |= (0x1 << i);
+               } else {
+                       break;
+               }
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+       if (i < 5)
+               goto fail;
+
+       if (!uncore_box_is_fake(box))
+               reg1->alloc |= alloc;
+
+       return NULL;
+fail:
+       for (; i >= 0; i--) {
+               if (alloc & (0x1 << i))
+                       atomic_sub(1 << (i * 6), &er->ref);
+       }
+       return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x4)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+       return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
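+/*
+ * Work out which filter fields an event needs by matching it against the
+ * extra_regs table above, then record the per-box filter MSR address, the
+ * requested filter value and the field bitmask in the event's extra_reg.
+ */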
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_cbox_hw_config,
+       .get_constraint         = snbep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = snbep_uncore_cbox_constraints,
+       .ops                    = &snbep_uncore_cbox_ops,
+       .format_group           = &snbep_uncore_cbox_format_group,
+};
+
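+/*
+ * The PCU filter register holds four independent 8-bit band values; the byte
+ * lane an event uses follows from its event select (0xb-0xe, see
+ * snbep_pcu_hw_config()).  snbep_pcu_alter_er() shifts an event's band value
+ * (and adjusts its event select) to another lane so that get_constraint can
+ * retry the remaining lanes when the preferred one is already taken with a
+ * different value.
+ */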
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       u64 config = reg1->config;
+
+       if (new_idx > reg1->idx)
+               config <<= 8 * (new_idx - reg1->idx);
+       else
+               config >>= 8 * (reg1->idx - new_idx);
+
+       if (modify) {
+               hwc->config += new_idx - reg1->idx;
+               reg1->config = config;
+               reg1->idx = new_idx;
+       }
+       return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       unsigned long flags;
+       int idx = reg1->idx;
+       u64 mask, config1 = reg1->config;
+       bool ok = false;
+
+       if (reg1->idx == EXTRA_REG_NONE ||
+           (!uncore_box_is_fake(box) && reg1->alloc))
+               return NULL;
+again:
+       mask = 0xffULL << (idx * 8);
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+           !((config1 ^ er->config) & mask)) {
+               atomic_add(1 << (idx * 8), &er->ref);
+               er->config &= ~mask;
+               er->config |= config1 & mask;
+               ok = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (!ok) {
+               idx = (idx + 1) % 4;
+               if (idx != reg1->idx) {
+                       config1 = snbep_pcu_alter_er(event, idx, false);
+                       goto again;
+               }
+               return &uncore_constraint_empty;
+       }
+
+       if (!uncore_box_is_fake(box)) {
+               if (idx != reg1->idx)
+                       snbep_pcu_alter_er(event, idx, true);
+               reg1->alloc = 1;
+       }
+       return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       atomic_sub(1 << (reg1->idx * 8), &er->ref);
+       reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+       if (ev_sel >= 0xb && ev_sel <= 0xe) {
+               reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+               reg1->idx = ev_sel - 0xb;
+               reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+       &snbep_uncore_ubox,
+       &snbep_uncore_cbox,
+       &snbep_uncore_pcu,
+       NULL,
+};
+
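+/*
+ * There is one Cbox per core (each fronts an LLC slice), so don't expose
+ * more Cbox instances than the package has cores.
+ */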
+void snbep_uncore_cpu_init(void)
+{
+       if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+       SNBEP_PCI_QPI_PORT0_FILTER,
+       SNBEP_PCI_QPI_PORT1_FILTER,
+       HSWEP_PCI_PCU_3,
+};
+
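+/*
+ * QPI event 0x38 supports packet match/mask filtering.  The match/mask
+ * registers live in a separate per-port "filter" PCI device (see the
+ * UNCORE_EXTRA_PCI_DEV entries in the PCI ID tables below), looked up via
+ * uncore_extra_pci_dev[] when the event is enabled.
+ */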
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+               reg1->idx = 0;
+               reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+               reg1->config = event->attr.config1;
+               reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+               reg2->config = event->attr.config2;
+       }
+       return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+               int pkg = topology_phys_to_logical_pkg(box->pci_phys_id);
+               struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx];
+
+               if (filter_pdev) {
+                       pci_write_config_dword(filter_pdev, reg1->reg,
+                                               (u32)reg1->config);
+                       pci_write_config_dword(filter_pdev, reg1->reg + 4,
+                                               (u32)(reg1->config >> 32));
+                       pci_write_config_dword(filter_pdev, reg2->reg,
+                                               (u32)reg2->config);
+                       pci_write_config_dword(filter_pdev, reg2->reg + 4,
+                                               (u32)(reg2->config >> 32));
+               }
+       }
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+       .enable_event           = snbep_qpi_enable_event,
+       .hw_config              = snbep_qpi_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT()                         \
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,            \
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
+       .ops            = &snbep_uncore_pci_ops,                \
+       .format_group   = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = snbep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .event_descs            = snbep_uncore_qpi_events,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       SNBEP_PCI_UNCORE_HA,
+       SNBEP_PCI_UNCORE_IMC,
+       SNBEP_PCI_UNCORE_QPI,
+       SNBEP_PCI_UNCORE_R2PCIE,
+       SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+       [SNBEP_PCI_UNCORE_HA]           = &snbep_uncore_ha,
+       [SNBEP_PCI_UNCORE_IMC]          = &snbep_uncore_imc,
+       [SNBEP_PCI_UNCORE_QPI]          = &snbep_uncore_qpi,
+       [SNBEP_PCI_UNCORE_R2PCIE]       = &snbep_uncore_r2pcie,
+       [SNBEP_PCI_UNCORE_R3QPI]        = &snbep_uncore_r3qpi,
+       NULL,
+};
+
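+/*
+ * driver_data packs the uncore type index and the box (or extra device)
+ * index via UNCORE_PCI_DEV_DATA(), which is how the generic uncore PCI code
+ * maps a discovered PCI function back to its PMON unit.
+ */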
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+       { /* Home Agent */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+       },
+       { /* MC Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* QPI Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 1 filter */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+       .name           = "snbep_uncore",
+       .id_table       = snbep_uncore_pci_ids,
+};
+
+/*
+ * Build the PCI bus number -> physical socket id mapping by reading the
+ * Node ID and Node ID mapping registers from each socket's UBOX device.
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+       struct pci_dev *ubox_dev = NULL;
+       int i, bus, nodeid, segment;
+       struct pci2phy_map *map;
+       int err = 0;
+       u32 config = 0;
+
+       while (1) {
+               /* find the UBOX device */
+               ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+               if (!ubox_dev)
+                       break;
+               bus = ubox_dev->bus->number;
+               /* get the Node ID of the local register */
+               err = pci_read_config_dword(ubox_dev, 0x40, &config);
+               if (err)
+                       break;
+               nodeid = config;
+               /* get the Node ID mapping */
+               err = pci_read_config_dword(ubox_dev, 0x54, &config);
+               if (err)
+                       break;
+
+               segment = pci_domain_nr(ubox_dev->bus);
+               raw_spin_lock(&pci2phy_map_lock);
+               map = __find_pci2phy_map(segment);
+               if (!map) {
+                       raw_spin_unlock(&pci2phy_map_lock);
+                       err = -ENOMEM;
+                       break;
+               }
+
+               /*
+                * every three bits in the Node ID mapping register maps
+                * to a particular node.
+                */
+               for (i = 0; i < 8; i++) {
+                       if (nodeid == ((config >> (3 * i)) & 0x7)) {
+                               map->pbus_to_physid[bus] = i;
+                               break;
+                       }
+               }
+               raw_spin_unlock(&pci2phy_map_lock);
+       }
+
+       if (!err) {
+               /*
+                * For PCI buses with no UBOX device, fall back to the
+                * mapping of the next higher-numbered bus that has one.
+                */
+               raw_spin_lock(&pci2phy_map_lock);
+               list_for_each_entry(map, &pci2phy_map_head, list) {
+                       i = -1;
+                       for (bus = 255; bus >= 0; bus--) {
+                               if (map->pbus_to_physid[bus] >= 0)
+                                       i = map->pbus_to_physid[bus];
+                               else
+                                       map->pbus_to_physid[bus] = i;
+                       }
+               }
+               raw_spin_unlock(&pci2phy_map_lock);
+       }
+
+       pci_dev_put(ubox_dev);
+
+       return err ? pcibios_err_to_errno(err) : 0;
+}
+
+int snbep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x3ce0);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = snbep_pci_uncores;
+       uncore_pci_driver = &snbep_uncore_pci_driver;
+       return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
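+/*
+ * IVB-EP reuses most of the SNB-EP register layout and helpers; the main
+ * differences are the box-control init value, the IVBEP_* raw event masks
+ * and the extended Cbox filter fields (link, NC, C6, ISOC).
+ */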
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       if (msr)
+               wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+
+       pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
+       .init_box       = ivbep_uncore_msr_init_box,            \
+       .disable_box    = snbep_uncore_msr_disable_box,         \
+       .enable_box     = snbep_uncore_msr_enable_box,          \
+       .disable_event  = snbep_uncore_msr_disable_event,       \
+       .enable_event   = snbep_uncore_msr_enable_event,        \
+       .read_counter   = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_uncore_pci_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT()                         \
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
+       .event_mask     = IVBEP_PMON_RAW_EVENT_MASK,            \
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
+       .ops            = &ivbep_uncore_pci_ops,                        \
+       .format_group   = &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid.attr,
+       &format_attr_filter_link.attr,
+       &format_attr_filter_state2.attr,
+       &format_attr_filter_nid2.attr,
+       &format_attr_filter_opc2.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge.attr,
+       &format_attr_filter_band0.attr,
+       &format_attr_filter_band1.attr,
+       &format_attr_filter_band2.attr,
+       &format_attr_filter_band3.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match_rds.attr,
+       &format_attr_match_rnid30.attr,
+       &format_attr_match_rnid4.attr,
+       &format_attr_match_dnid.attr,
+       &format_attr_match_mc.attr,
+       &format_attr_match_opc.attr,
+       &format_attr_match_vnw.attr,
+       &format_attr_match0.attr,
+       &format_attr_match1.attr,
+       &format_attr_mask_rds.attr,
+       &format_attr_mask_rnid30.attr,
+       &format_attr_mask_rnid4.attr,
+       &format_attr_mask_dnid.attr,
+       &format_attr_mask_mc.attr,
+       &format_attr_mask_opc.attr,
+       &format_attr_mask_vnw.attr,
+       &format_attr_mask0.attr,
+       &format_attr_mask1.attr,
+       NULL,
+};
+
+static struct attribute_group ivbep_uncore_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_qpi_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
+       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
+       .event_mask     = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops            = &ivbep_uncore_msr_ops,
+       .format_group   = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+       EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x10) {
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+       }
+
+       return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               u64 filter = uncore_shared_reg_config(box, 0);
+               wrmsrl(reg1->reg, filter & 0xffffffff);
+               wrmsrl(reg1->reg + 6, filter >> 32);
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+       .init_box               = ivbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = ivbep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = ivbep_cbox_hw_config,
+       .get_constraint         = ivbep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 15,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
+       .event_mask             = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = snbep_uncore_cbox_constraints,
+       .ops                    = &ivbep_uncore_cbox_ops,
+       .format_group           = &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_pcu_ops,
+       .format_group           = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+       &ivbep_uncore_ubox,
+       &ivbep_uncore_cbox,
+       &ivbep_uncore_pcu,
+       NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+       if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = snbep_uncore_imc_events,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/*
+ * Registers in the IRP boxes are not properly aligned, so the per-counter
+ * control and counter offsets are listed explicitly rather than derived
+ * from a fixed stride.
+ */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+                              hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = ivbep_uncore_irp_disable_event,
+       .enable_event   = ivbep_uncore_irp_enable_event,
+       .read_counter   = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = IVBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &ivbep_uncore_irp_ops,
+       .format_group           = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_qpi_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+       .hw_config      = snbep_qpi_hw_config,
+       .get_constraint = uncore_get_constraint,
+       .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_qpi_ops,
+       .format_group           = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r2pcie_constraints,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r3qpi_constraints,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       IVBEP_PCI_UNCORE_HA,
+       IVBEP_PCI_UNCORE_IMC,
+       IVBEP_PCI_UNCORE_IRP,
+       IVBEP_PCI_UNCORE_QPI,
+       IVBEP_PCI_UNCORE_R2PCIE,
+       IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+       [IVBEP_PCI_UNCORE_HA]   = &ivbep_uncore_ha,
+       [IVBEP_PCI_UNCORE_IMC]  = &ivbep_uncore_imc,
+       [IVBEP_PCI_UNCORE_IRP]  = &ivbep_uncore_irp,
+       [IVBEP_PCI_UNCORE_QPI]  = &ivbep_uncore_qpi,
+       [IVBEP_PCI_UNCORE_R2PCIE]       = &ivbep_uncore_r2pcie,
+       [IVBEP_PCI_UNCORE_R3QPI]        = &ivbep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 4 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 4 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+       .name           = "ivbep_uncore",
+       .id_table       = ivbep_uncore_pci_ids,
+};
+
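+/*
+ * 0x0e1e is the PCI device ID handed to snbep_pci2phy_map_init() to build the
+ * PCI bus -> physical package mapping for IvyTown (presumably the uncore
+ * device that exposes the node-ID registers on this platform).
+ */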
+int ivbep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x0e1e);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = ivbep_pci_uncores;
+       uncore_pci_driver = &ivbep_uncore_pci_driver;
+       return 0;
+}
+/* end of IvyTown uncore support */
+
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops                    = &snbep_uncore_msr_ops,
+       .format_group           = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_qor.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid4.attr,
+       &format_attr_filter_link3.attr,
+       &format_attr_filter_state4.attr,
+       &format_attr_filter_local.attr,
+       &format_attr_filter_all_op.attr,
+       &format_attr_filter_nnm.attr,
+       &format_attr_filter_opc3.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+       EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x4)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+       return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
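+/*
+ * Match the event against knl_uncore_cha_extra_regs; if any filter fields
+ * (TID/STATE/OP) are required, point reg1 at this CHA's filter MSR and keep
+ * the relevant bits of attr.config1 so they can be programmed at enable time.
+ */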
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+                            struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+                                   struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+       .init_box               = snbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = knl_cha_hw_config,
+       .get_constraint         = knl_cha_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+       .name                   = "cha",
+       .num_counters           = 4,
+       .num_boxes              = 38,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = KNL_CHA_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = knl_uncore_cha_constraints,
+       .ops                    = &knl_uncore_cha_ops,
+       .format_group           = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+       &format_attr_event2.attr,
+       &format_attr_use_occ_ctr.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh6.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge_det.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .ops                    = &snbep_uncore_msr_ops,
+       .format_group           = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+       &knl_uncore_ubox,
+       &knl_uncore_cha,
+       &knl_uncore_pcu,
+       NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = knl_msr_uncores;
+}
+
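+/*
+ * On KNL, enabling an IMC/EDC box is done by clearing the whole box control
+ * register (and with it any freeze bits), rather than toggling individual
+ * control bits.
+ */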
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+
+       pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+                                                       == UNCORE_FIXED_EVENT)
+               pci_write_config_dword(pdev, hwc->config_base,
+                                      hwc->config | KNL_PMON_FIXED_CTL_EN);
+       else
+               pci_write_config_dword(pdev, hwc->config_base,
+                                      hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+       .init_box       = snbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = knl_uncore_imc_enable_box,
+       .read_counter   = snbep_uncore_pci_read_counter,
+       .enable_event   = knl_uncore_imc_enable_event,
+       .disable_event  = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+       .name                   = "imc_uclk",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+       .name                   = "imc",
+       .num_counters           = 4,
+       .num_boxes              = 6,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_MC0_CH0_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+       .fixed_ctl              = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+       .box_ctl                = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+       .name                   = "edc_uclk",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+       .name                   = "edc_eclk",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+       .fixed_ctl              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+       .box_ctl                = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+       .name           = "m2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = knl_uncore_m2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_qor.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = KNL_IRP_PCI_PMON_BOX_CTL,
+       .ops                    = &snbep_uncore_pci_ops,
+       .format_group           = &knl_uncore_irp_format_group,
+};
+
+enum {
+       KNL_PCI_UNCORE_MC_UCLK,
+       KNL_PCI_UNCORE_MC_DCLK,
+       KNL_PCI_UNCORE_EDC_UCLK,
+       KNL_PCI_UNCORE_EDC_ECLK,
+       KNL_PCI_UNCORE_M2PCIE,
+       KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+       [KNL_PCI_UNCORE_MC_UCLK]        = &knl_uncore_imc_uclk,
+       [KNL_PCI_UNCORE_MC_DCLK]        = &knl_uncore_imc_dclk,
+       [KNL_PCI_UNCORE_EDC_UCLK]       = &knl_uncore_edc_uclk,
+       [KNL_PCI_UNCORE_EDC_ECLK]       = &knl_uncore_edc_eclk,
+       [KNL_PCI_UNCORE_M2PCIE]         = &knl_uncore_m2pcie,
+       [KNL_PCI_UNCORE_IRP]            = &knl_uncore_irp,
+       NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. Prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ *     PCI Device ID   Uncore PMU Devices
+ *     ----------------------------------
+ *     0x7841          MC0 UClk, MC1 UClk
+ *     0x7843          MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ *                     MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ *     0x7833          EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ *                     EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ *     0x7835          EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ *                     EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ *     0x7817          M2PCIe
+ *     0x7814          IRP
+ */
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+       { /* MC UClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+       },
+       { /* MC DClk Channel */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+       },
+       { /* EDC UClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+       },
+       { /* EDC EClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+       },
+       { /* M2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+       .name           = "knl_uncore",
+       .id_table       = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+       int ret;
+
+       /* All KNL PCI-based PMON units are on the same PCI bus except IRP */
+       ret = snb_pci2phy_map_init(0x7814); /* IRP */
+       if (ret)
+               return ret;
+       ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+       if (ret)
+               return ret;
+       uncore_pci_uncores = knl_pci_uncores;
+       uncore_pci_driver = &knl_uncore_pci_driver;
+       return 0;
+}
+
+/* end of KNL uncore support */
+
+/* Haswell-EP uncore support */
+static struct attribute *hswep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_filter_tid2.attr,
+       &format_attr_filter_cid.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_ubox_formats_attr,
+};
+
+static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       reg1->reg = HSWEP_U_MSR_PMON_FILTER;
+       reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
+       reg1->idx = 0;
+       return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_ubox_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_ubox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 44,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &hswep_uncore_ubox_ops,
+       .format_group           = &hswep_uncore_ubox_format_group,
+};
+
+static struct attribute *hswep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid3.attr,
+       &format_attr_filter_link2.attr,
+       &format_attr_filter_state3.attr,
+       &format_attr_filter_nid2.attr,
+       &format_attr_filter_opc2.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_cbox_formats_attr,
+};
+
+static struct event_constraint hswep_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+       EVENT_EXTRA_END
+};
+
+static u64 hswep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+       if (fields & 0x1)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x10) {
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+       }
+       return mask;
+}
+
+static struct event_constraint *
+hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
+}
+
+static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
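+/*
+ * The 64-bit filter value from the box's shared reg is written as two 32-bit
+ * halves into consecutive filter MSRs (reg1->reg and reg1->reg + 1) before
+ * the event itself is enabled.
+ */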
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+                                 struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               u64 filter = uncore_shared_reg_config(box, 0);
+               wrmsrl(reg1->reg, filter & 0xffffffff);
+               wrmsrl(reg1->reg + 1, filter >> 32);
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops hswep_uncore_cbox_ops = {
+       .init_box               = snbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = hswep_cbox_hw_config,
+       .get_constraint         = hswep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 18,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = hswep_uncore_cbox_constraints,
+       .ops                    = &hswep_uncore_cbox_ops,
+       .format_group           = &hswep_uncore_cbox_format_group,
+};
+
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+
+       if (msr) {
+               u64 init = SNBEP_PMON_BOX_CTL_INT;
+               u64 flags = 0;
+               int i;
+
+               for_each_set_bit(i, (unsigned long *)&init, 64) {
+                       flags |= (1ULL << i);
+                       wrmsrl(msr, flags);
+               }
+       }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .init_box               = hswep_uncore_sbox_msr_init_box
+};
+
+static struct attribute *hswep_uncore_sbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_sbox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_type hswep_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 4,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
+       .ops                    = &hswep_uncore_sbox_msr_ops,
+       .format_group           = &hswep_uncore_sbox_format_group,
+};
+
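+/*
+ * PCU event selects 0xb-0xe share the PCU filter register: reg1->idx records
+ * which of the four events is being configured and reg1->config keeps the
+ * corresponding slice of attr.config1.
+ */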
+static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+       if (ev_sel >= 0xb && ev_sel <= 0xe) {
+               reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
+               reg1->idx = ev_sel - 0xb;
+               reg1->config = event->attr.config1 & (0xff << reg1->idx);
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_pcu_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &hswep_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *hswep_msr_uncores[] = {
+       &hswep_uncore_ubox,
+       &hswep_uncore_cbox,
+       &hswep_uncore_sbox,
+       &hswep_uncore_pcu,
+       NULL,
+};
+
+void hswep_uncore_cpu_init(void)
+{
+       int pkg = topology_phys_to_logical_pkg(0);
+
+       if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+       /* Detect 6-8 core systems with only two SBOXes */
+       if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
+               u32 capid4;
+
+               pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
+                                     0x94, &capid4);
+               if (((capid4 >> 6) & 0x3) == 0)
+                       hswep_uncore_sbox.num_boxes = 2;
+       }
+
+       uncore_msr_uncores = hswep_msr_uncores;
+}
+
+static struct intel_uncore_type hswep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 5,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct uncore_event_desc hswep_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0x00,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type hswep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 5,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
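+/*
+ * The IRP counters live in PCI config space at the offsets above; each one is
+ * read as two 32-bit config accesses (low dword, then high dword).
+ */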
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static struct intel_uncore_ops hswep_uncore_irp_ops = {
+       .init_box       = snbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = ivbep_uncore_irp_disable_event,
+       .enable_event   = ivbep_uncore_irp_enable_event,
+       .read_counter   = hswep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type hswep_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &hswep_uncore_irp_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type hswep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 5,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = hswep_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 4,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 44,
+       .constraints    = hswep_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       HSWEP_PCI_UNCORE_HA,
+       HSWEP_PCI_UNCORE_IMC,
+       HSWEP_PCI_UNCORE_IRP,
+       HSWEP_PCI_UNCORE_QPI,
+       HSWEP_PCI_UNCORE_R2PCIE,
+       HSWEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *hswep_pci_uncores[] = {
+       [HSWEP_PCI_UNCORE_HA]   = &hswep_uncore_ha,
+       [HSWEP_PCI_UNCORE_IMC]  = &hswep_uncore_imc,
+       [HSWEP_PCI_UNCORE_IRP]  = &hswep_uncore_irp,
+       [HSWEP_PCI_UNCORE_QPI]  = &hswep_uncore_qpi,
+       [HSWEP_PCI_UNCORE_R2PCIE]       = &hswep_uncore_r2pcie,
+       [HSWEP_PCI_UNCORE_R3QPI]        = &hswep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* PCU.3 (for Capability registers) */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  HSWEP_PCI_PCU_3),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver hswep_uncore_pci_driver = {
+       .name           = "hswep_uncore",
+       .id_table       = hswep_uncore_pci_ids,
+};
+
+int hswep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x2f1e);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = hswep_pci_uncores;
+       uncore_pci_driver = &hswep_uncore_pci_driver;
+       return 0;
+}
+/* end of Haswell-EP uncore support */
+
+/* BDX uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_msr_ops,
+       .format_group           = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 24,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = bdx_uncore_cbox_constraints,
+       .ops                    = &hswep_uncore_cbox_ops,
+       .format_group           = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 4,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
+       .ops                    = &hswep_uncore_sbox_msr_ops,
+       .format_group           = &hswep_uncore_sbox_format_group,
+};
+
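+/* Index of &bdx_uncore_sbox in bdx_msr_uncores[]; cleared for BDX-DE below. */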
+#define BDX_MSR_UNCORE_SBOX    3
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+       &bdx_uncore_ubox,
+       &bdx_uncore_cbox,
+       &hswep_uncore_pcu,
+       &bdx_uncore_sbox,
+       NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+       if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = bdx_msr_uncores;
+
+       /* BDX-DE doesn't have SBOX */
+       if (boot_cpu_data.x86_model == 86)
+               uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 5,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &hswep_uncore_irp_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = bdx_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 48,
+       .constraints    = bdx_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       BDX_PCI_UNCORE_HA,
+       BDX_PCI_UNCORE_IMC,
+       BDX_PCI_UNCORE_IRP,
+       BDX_PCI_UNCORE_QPI,
+       BDX_PCI_UNCORE_R2PCIE,
+       BDX_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+       [BDX_PCI_UNCORE_HA]     = &bdx_uncore_ha,
+       [BDX_PCI_UNCORE_IMC]    = &bdx_uncore_imc,
+       [BDX_PCI_UNCORE_IRP]    = &bdx_uncore_irp,
+       [BDX_PCI_UNCORE_QPI]    = &bdx_uncore_qpi,
+       [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+       [BDX_PCI_UNCORE_R3QPI]  = &bdx_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+       },
+       { /* QPI Port 2 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+       .name           = "bdx_uncore",
+       .id_table       = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x6f1e);
+
+       if (ret)
+               return ret;
+       uncore_pci_uncores = bdx_pci_uncores;
+       uncore_pci_driver = &bdx_uncore_pci_driver;
+       return 0;
+}
+
+/* end of BDX uncore support */
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
new file mode 100644 (file)
index 0000000..ec863b9
--- /dev/null
@@ -0,0 +1,241 @@
+#include <linux/perf_event.h>
+
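+/*
+ * Simple PMU exposing a handful of free-running MSRs (TSC, APERF, MPERF,
+ * PPERF, SMI count) as counting-only perf events; sampling and privilege
+ * filtering are rejected in msr_event_init() below.
+ */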
+enum perf_msr_id {
+       PERF_MSR_TSC                    = 0,
+       PERF_MSR_APERF                  = 1,
+       PERF_MSR_MPERF                  = 2,
+       PERF_MSR_PPERF                  = 3,
+       PERF_MSR_SMI                    = 4,
+
+       PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx)
+{
+       return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_intel(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_MSR_SMI)
+                       return true;
+               break;
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+struct perf_msr {
+       u64     msr;
+       struct  perf_pmu_events_attr *attr;
+       bool    (*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
+
+static struct perf_msr msr[] = {
+       [PERF_MSR_TSC]   = { 0,                 &evattr_tsc,    NULL,            },
+       [PERF_MSR_APERF] = { MSR_IA32_APERF,    &evattr_aperf,  test_aperfmperf, },
+       [PERF_MSR_MPERF] = { MSR_IA32_MPERF,    &evattr_mperf,  test_aperfmperf, },
+       [PERF_MSR_PPERF] = { MSR_PPERF,         &evattr_pperf,  test_intel,      },
+       [PERF_MSR_SMI]   = { MSR_SMI_COUNT,     &evattr_smi,    test_intel,      },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group events_attr_group = {
+       .name = "events",
+       .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+static struct attribute_group format_attr_group = {
+       .name = "format",
+       .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+       &events_attr_group,
+       &format_attr_group,
+       NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       if (cfg >= PERF_MSR_EVENT_MAX)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       if (!msr[cfg].attr)
+               return -EINVAL;
+
+       event->hw.idx = -1;
+       event->hw.event_base = msr[cfg].msr;
+       event->hw.config = cfg;
+
+       return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+       u64 now;
+
+       if (event->hw.event_base)
+               rdmsrl(event->hw.event_base, now);
+       else
+               rdtscll(now);
+
+       return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+       u64 prev, now;
+       s64 delta;
+
+       /* Careful, an NMI might modify the previous event value. */
+again:
+       prev = local64_read(&event->hw.prev_count);
+       now = msr_read_counter(event);
+
+       if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+               goto again;
+
+       delta = now - prev;
+       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+               delta = sign_extend64(delta, 31);
+
+       local64_add(delta, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+       u64 now;
+
+       now = msr_read_counter(event);
+       local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+       msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+       msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               msr_event_start(event, flags);
+
+       return 0;
+}
+
+static struct pmu pmu_msr = {
+       .task_ctx_nr    = perf_sw_context,
+       .attr_groups    = attr_groups,
+       .event_init     = msr_event_init,
+       .add            = msr_event_add,
+       .del            = msr_event_del,
+       .start          = msr_event_start,
+       .stop           = msr_event_stop,
+       .read           = msr_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+       int i, j = 0;
+
+       if (!boot_cpu_has(X86_FEATURE_TSC)) {
+               pr_cont("no MSR PMU driver.\n");
+               return 0;
+       }
+
+       /* Probe the MSRs. */
+       for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+               u64 val;
+
+               /*
+                * Under virtualization there is no reliable way to tell
+                * whether a read-only MSR is actually present, so probe it
+                * with rdmsrl_safe().
+                */
+               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+                       msr[i].attr = NULL;
+       }
+
+       /* List remaining MSRs in the sysfs attrs. */
+       for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+               if (msr[i].attr)
+                       events_attrs[j++] = &msr[i].attr->attr.attr;
+       }
+       events_attrs[j] = NULL;
+
+       perf_pmu_register(&pmu_msr, "msr", -1);
+
+       return 0;
+}
+device_initcall(msr_init);
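A rough userspace sketch of consuming the PMU registered above (assuming the usual dynamic-PMU sysfs layout under /sys/bus/event_source/devices/msr/ and a local wrapper for the perf_event_open() syscall, which has no glibc stub); it counts the aperf event for the current task:

/* Hypothetical standalone example, not part of the patch. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        uint64_t before, after;
        int type, fd;
        FILE *f;

        /* Dynamic PMUs export their type id via sysfs. */
        f = fopen("/sys/bus/event_source/devices/msr/type", "r");
        if (!f)
                return 1;
        if (fscanf(f, "%d", &type) != 1) {
                fclose(f);
                return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size   = sizeof(attr);
        attr.type   = type;
        attr.config = 1;        /* "aperf", i.e. event=0x01 from events/aperf */

        /* Counting only: msr_event_init() rejects any sample_period. */
        fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0)
                return 1;

        read(fd, &before, sizeof(before));
        sleep(1);
        read(fd, &after, sizeof(after));
        printf("APERF delta over ~1s: %llu\n",
               (unsigned long long)(after - before));
        return 0;
}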
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
new file mode 100644 (file)
index 0000000..68155ca
--- /dev/null
@@ -0,0 +1,960 @@
+/*
+ * Performance events x86 architecture header
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+/* To enable MSR tracing please use the generic trace points. */
+
+/*
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+       EXTRA_REG_NONE  = -1,   /* not used */
+
+       EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
+       EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
+       EXTRA_REG_LBR   = 2,    /* lbr_select */
+       EXTRA_REG_LDLAT = 3,    /* ld_lat_threshold */
+       EXTRA_REG_FE    = 4,    /* fe_* */
+
+       EXTRA_REG_MAX           /* number of entries needed */
+};
+
+struct event_constraint {
+       union {
+               unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+               u64             idxmsk64;
+       };
+       u64     code;
+       u64     cmask;
+       int     weight;
+       int     overlap;
+       int     flags;
+};
+/*
+ * struct hw_perf_event.flags flags
+ */
+#define PERF_X86_EVENT_PEBS_LDLAT      0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST         0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW     0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED       0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW     0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW     0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
+
+
+struct amd_nb {
+       int nb_id;  /* NorthBridge id */
+       int refcnt; /* reference count */
+       struct perf_event *owners[X86_PMC_IDX_MAX];
+       struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS                8
+
+/*
+ * Flags PEBS can handle without a PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ */
+#define PEBS_FREERUNNING_FLAGS \
+       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+       PERF_SAMPLE_TRANSACTION)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+       u64     bts_buffer_base;
+       u64     bts_index;
+       u64     bts_absolute_maximum;
+       u64     bts_interrupt_threshold;
+       u64     pebs_buffer_base;
+       u64     pebs_index;
+       u64     pebs_absolute_maximum;
+       u64     pebs_interrupt_threshold;
+       u64     pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+/*
+ * Per register state.
+ */
+struct er_account {
+       raw_spinlock_t          lock;   /* per-core: protect structure */
+       u64                 config;     /* extra MSR config */
+       u64                 reg;        /* extra MSR number */
+       atomic_t            ref;        /* reference count */
+};
+
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+struct intel_shared_regs {
+       struct er_account       regs[EXTRA_REG_MAX];
+       int                     refcnt;         /* per-core: #HT threads */
+       unsigned                core_id;        /* per-core: core id */
+};
+
+enum intel_excl_state_type {
+       INTEL_EXCL_UNUSED    = 0, /* counter is unused */
+       INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
+       INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+       enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+       bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+       raw_spinlock_t  lock;
+
+       struct intel_excl_states states[2];
+
+       union {
+               u16     has_exclusive[2];
+               u32     exclusive_present;
+       };
+
+       int             refcnt;         /* per-core: #HT threads */
+       unsigned        core_id;        /* per-core: core id */
+};
+
+#define MAX_LBR_ENTRIES                32
+
+enum {
+       X86_PERF_KFREE_SHARED = 0,
+       X86_PERF_KFREE_EXCL   = 1,
+       X86_PERF_KFREE_MAX
+};
+
+struct cpu_hw_events {
+       /*
+        * Generic x86 PMC bits
+        */
+       struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
+       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       int                     enabled;
+
+       int                     n_events; /* the # of events in the below arrays */
+       int                     n_added;  /* the # last events in the below arrays;
+                                            they've never been enabled yet */
+       int                     n_txn;    /* the # last events in the below arrays;
+                                            added in the current transaction */
+       int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+       u64                     tags[X86_PMC_IDX_MAX];
+
+       struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+       int                     n_excl; /* the number of exclusive events */
+
+       unsigned int            txn_flags;
+       int                     is_fake;
+
+       /*
+        * Intel DebugStore bits
+        */
+       struct debug_store      *ds;
+       u64                     pebs_enabled;
+
+       /*
+        * Intel LBR bits
+        */
+       int                             lbr_users;
+       void                            *lbr_context;
+       struct perf_branch_stack        lbr_stack;
+       struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
+       struct er_account               *lbr_sel;
+       u64                             br_sel;
+
+       /*
+        * Intel host/guest exclude bits
+        */
+       u64                             intel_ctrl_guest_mask;
+       u64                             intel_ctrl_host_mask;
+       struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
+
+       /*
+        * Intel checkpoint mask
+        */
+       u64                             intel_cp_status;
+
+       /*
+        * manage shared (per-core, per-cpu) registers
+        * used on Intel NHM/WSM/SNB
+        */
+       struct intel_shared_regs        *shared_regs;
+       /*
+        * manage exclusive counter access between hyperthread
+        */
+       struct event_constraint *constraint_list; /* in enable order */
+       struct intel_excl_cntrs         *excl_cntrs;
+       int excl_thread_id; /* 0 or 1 */
+
+       /*
+        * AMD specific bits
+        */
+       struct amd_nb                   *amd_nb;
+       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+       u64                             perf_ctr_virt_mask;
+
+       void                            *kfree_on_online[X86_PERF_KFREE_MAX];
+};
+
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+       { .idxmsk64 = (n) },            \
+       .code = (c),                    \
+       .cmask = (m),                   \
+       .weight = (w),                  \
+       .overlap = (o),                 \
+       .flags = f,                     \
+}
+
+#define EVENT_CONSTRAINT(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+       __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+                          0, PERF_X86_EVENT_EXCL)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will dramatically increase scheduling cycles on an over-committed
+ * system. The number of such EVENT_CONSTRAINT_OVERLAP() macros and
+ * their counter masks must be kept to a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n)   \
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  - in_tx
+ *  - in_tx_checkpointed
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
+#define FIXED_EVENT_CONSTRAINT(c, n)   \
+       EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define INTEL_UEVENT_CONSTRAINT(c, n)  \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)      \
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
+/* Like UEVENT_CONSTRAINT, but match flags too */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)    \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+                          HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
+#define INTEL_PLD_CONSTRAINT(c, n)     \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+
+#define INTEL_PST_CONSTRAINT(c, n)     \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
+       EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+
+/*
+ * Check for end marker with weight == -1
+ */
+#define for_each_event_constraint(e, c)        \
+       for ((e) = (c); (e)->weight != -1; (e)++)
+
+/*
+ * Extra registers for specific events.
+ *
+ * Some events need large masks and require external MSRs.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMUs of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
+ */
+struct extra_reg {
+       unsigned int            event;
+       unsigned int            msr;
+       u64                     config_mask;
+       u64                     valid_mask;
+       int                     idx;  /* per_xxx->regs[] reg index */
+       bool                    extra_msr_access;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {     \
+       .event = (e),                   \
+       .msr = (ms),                    \
+       .config_mask = (m),             \
+       .valid_mask = (vm),             \
+       .idx = EXTRA_REG_##i,           \
+       .extra_msr_access = true,       \
+       }
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)     \
+       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
+       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+                       ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
+
+#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
+       INTEL_UEVENT_EXTRA_REG(c, \
+                              MSR_PEBS_LD_LAT_THRESHOLD, \
+                              0xffff, \
+                              LDLAT)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
+
+union perf_capabilities {
+       struct {
+               u64     lbr_format:6;
+               u64     pebs_trap:1;
+               u64     pebs_arch_reg:1;
+               u64     pebs_format:4;
+               u64     smm_freeze:1;
+               /*
+                * PMU supports separate counter range for writing
+                * values > 32bit.
+                */
+               u64     full_width_write:1;
+       };
+       u64     capabilities;
+};
+
+struct x86_pmu_quirk {
+       struct x86_pmu_quirk *next;
+       void (*func)(void);
+};
+
+union x86_pmu_config {
+       struct {
+               u64 event:8,
+                   umask:8,
+                   usr:1,
+                   os:1,
+                   edge:1,
+                   pc:1,
+                   interrupt:1,
+                   __reserved1:1,
+                   en:1,
+                   inv:1,
+                   cmask:8,
+                   event2:4,
+                   __reserved2:4,
+                   go:1,
+                   ho:1;
+       } bits;
+       u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
+enum {
+       x86_lbr_exclusive_lbr,
+       x86_lbr_exclusive_bts,
+       x86_lbr_exclusive_pt,
+       x86_lbr_exclusive_max,
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+       /*
+        * Generic x86 PMC bits
+        */
+       const char      *name;
+       int             version;
+       int             (*handle_irq)(struct pt_regs *);
+       void            (*disable_all)(void);
+       void            (*enable_all)(int added);
+       void            (*enable)(struct perf_event *);
+       void            (*disable)(struct perf_event *);
+       int             (*hw_config)(struct perf_event *event);
+       int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+       unsigned        eventsel;
+       unsigned        perfctr;
+       int             (*addr_offset)(int index, bool eventsel);
+       int             (*rdpmc_index)(int index);
+       u64             (*event_map)(int);
+       int             max_events;
+       int             num_counters;
+       int             num_counters_fixed;
+       int             cntval_bits;
+       u64             cntval_mask;
+       union {
+                       unsigned long events_maskl;
+                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+       };
+       int             events_mask_len;
+       int             apic;
+       u64             max_period;
+       struct event_constraint *
+                       (*get_event_constraints)(struct cpu_hw_events *cpuc,
+                                                int idx,
+                                                struct perf_event *event);
+
+       void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
+                                                struct perf_event *event);
+
+       void            (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
+       void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
+       struct event_constraint *event_constraints;
+       struct x86_pmu_quirk *quirks;
+       int             perfctr_second_write;
+       bool            late_ack;
+       unsigned        (*limit_period)(struct perf_event *event, unsigned l);
+
+       /*
+        * sysfs attrs
+        */
+       int             attr_rdpmc_broken;
+       int             attr_rdpmc;
+       struct attribute **format_attrs;
+       struct attribute **event_attrs;
+
+       ssize_t         (*events_sysfs_show)(char *page, u64 config);
+       struct attribute **cpu_events;
+
+       /*
+        * CPU Hotplug hooks
+        */
+       int             (*cpu_prepare)(int cpu);
+       void            (*cpu_starting)(int cpu);
+       void            (*cpu_dying)(int cpu);
+       void            (*cpu_dead)(int cpu);
+
+       void            (*check_microcode)(void);
+       void            (*sched_task)(struct perf_event_context *ctx,
+                                     bool sched_in);
+
+       /*
+        * Intel Arch Perfmon v2+
+        */
+       u64                     intel_ctrl;
+       union perf_capabilities intel_cap;
+
+       /*
+        * Intel DebugStore bits
+        */
+       unsigned int    bts             :1,
+                       bts_active      :1,
+                       pebs            :1,
+                       pebs_active     :1,
+                       pebs_broken     :1,
+                       pebs_prec_dist  :1;
+       int             pebs_record_size;
+       int             pebs_buffer_size;
+       void            (*drain_pebs)(struct pt_regs *regs);
+       struct event_constraint *pebs_constraints;
+       void            (*pebs_aliases)(struct perf_event *event);
+       int             max_pebs_events;
+       unsigned long   free_running_flags;
+
+       /*
+        * Intel LBR
+        */
+       unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+       int             lbr_nr;                    /* hardware stack size */
+       u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
+       const int       *lbr_sel_map;              /* lbr_select mappings */
+       bool            lbr_double_abort;          /* duplicated lbr aborts */
+
+       /*
+        * Intel PT/LBR/BTS are exclusive
+        */
+       atomic_t        lbr_exclusive[x86_lbr_exclusive_max];
+
+       /*
+        * Extra registers for events
+        */
+       struct extra_reg *extra_regs;
+       unsigned int flags;
+
+       /*
+        * Intel host/guest support (KVM)
+        */
+       struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+};
+
+struct x86_perf_task_context {
+       u64 lbr_from[MAX_LBR_ENTRIES];
+       u64 lbr_to[MAX_LBR_ENTRIES];
+       u64 lbr_info[MAX_LBR_ENTRIES];
+       int tos;
+       int lbr_callstack_users;
+       int lbr_stack_state;
+};
+
+#define x86_add_quirk(func_)                                           \
+do {                                                                   \
+       static struct x86_pmu_quirk __quirk __initdata = {              \
+               .func = func_,                                          \
+       };                                                              \
+       __quirk.next = x86_pmu.quirks;                                  \
+       x86_pmu.quirks = &__quirk;                                      \
+} while (0)
+
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING   0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1       0x2 /* has 2 equivalent offcore_rsp regs   */
+#define PMU_FL_EXCL_CNTRS      0x4 /* has exclusive counter requirements  */
+#define PMU_FL_EXCL_ENABLED    0x8 /* exclusive counter active */
+
+#define EVENT_VAR(_id)  event_attr_##_id
+#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
+
+#define EVENT_ATTR(_name, _id)                                         \
+static struct perf_pmu_events_attr EVENT_VAR(_id) = {                  \
+       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+       .id             = PERF_COUNT_HW_##_id,                          \
+       .event_str      = NULL,                                         \
+};
+
+#define EVENT_ATTR_STR(_name, v, str)                                  \
+static struct perf_pmu_events_attr event_attr_##v = {                  \
+       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+       .id             = 0,                                            \
+       .event_str      = str,                                          \
+};
+
+extern struct x86_pmu x86_pmu __read_mostly;
+
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+       return  x86_pmu.lbr_sel_map &&
+               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
+DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+int x86_perf_event_set_period(struct perf_event *event);
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+extern u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+extern u64 __read_mostly hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+u64 x86_perf_event_update(struct perf_event *event);
+
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+       return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+                                  x86_pmu.addr_offset(index, true) : index);
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+       return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+                                 x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline int x86_pmu_rdpmc_index(int index)
+{
+       return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
+int x86_setup_perfctr(struct perf_event *event);
+
+int x86_pmu_hw_config(struct perf_event *event);
+
+void x86_pmu_disable_all(void);
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+                                         u64 enable_mask)
+{
+       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
+       if (hwc->extra_reg.reg)
+               wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
+}
+
+void x86_pmu_enable_all(int added);
+
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign);
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void x86_pmu_stop(struct perf_event *event, int flags);
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+}
+
+void x86_pmu_enable_event(struct perf_event *event);
+
+int x86_pmu_handle_irq(struct pt_regs *regs);
+
+extern struct event_constraint emptyconstraint;
+
+extern struct event_constraint unconstrained;
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+       return ip > PAGE_OFFSET;
+#else
+       return (long)ip < 0;
+#endif
+}
+
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address; there is
+ * nothing much we can do about that but treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+       regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+       if (regs->flags & X86_VM_MASK)
+               regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+       regs->ip = ip;
+}
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
+ssize_t intel_event_sysfs_show(char *page, u64 config);
+
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
+#ifdef CONFIG_CPU_SUP_AMD
+
+int amd_pmu_init(void);
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static inline int amd_pmu_init(void)
+{
+       return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+       if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+           !event->attr.freq && event->hw.sample_period == 1)
+               return true;
+
+       return false;
+}
+
+int intel_pmu_save_and_restart(struct perf_event *event);
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event);
+
+struct intel_shared_regs *allocate_shared_regs(int cpu);
+
+int intel_pmu_init(void);
+
+void init_debug_store_on_cpu(int cpu);
+
+void fini_debug_store_on_cpu(int cpu);
+
+void release_ds_buffers(void);
+
+void reserve_ds_buffers(void);
+
+extern struct event_constraint bts_constraint;
+
+void intel_pmu_enable_bts(u64 config);
+
+void intel_pmu_disable_bts(void);
+
+int intel_pmu_drain_bts_buffer(void);
+
+extern struct event_constraint intel_core2_pebs_event_constraints[];
+
+extern struct event_constraint intel_atom_pebs_event_constraints[];
+
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
+extern struct event_constraint intel_nehalem_pebs_event_constraints[];
+
+extern struct event_constraint intel_westmere_pebs_event_constraints[];
+
+extern struct event_constraint intel_snb_pebs_event_constraints[];
+
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
+extern struct event_constraint intel_bdw_pebs_event_constraints[];
+
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+
+void intel_pmu_pebs_enable(struct perf_event *event);
+
+void intel_pmu_pebs_disable(struct perf_event *event);
+
+void intel_pmu_pebs_enable_all(void);
+
+void intel_pmu_pebs_disable_all(void);
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_ds_init(void);
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_pmu_lbr_reset(void);
+
+void intel_pmu_lbr_enable(struct perf_event *event);
+
+void intel_pmu_lbr_disable(struct perf_event *event);
+
+void intel_pmu_lbr_enable_all(bool pmi);
+
+void intel_pmu_lbr_disable_all(void);
+
+void intel_pmu_lbr_read(void);
+
+void intel_pmu_lbr_init_core(void);
+
+void intel_pmu_lbr_init_nhm(void);
+
+void intel_pmu_lbr_init_atom(void);
+
+void intel_pmu_lbr_init_snb(void);
+
+void intel_pmu_lbr_init_hsw(void);
+
+void intel_pmu_lbr_init_skl(void);
+
+void intel_pmu_lbr_init_knl(void);
+
+void intel_pmu_pebs_data_source_nhm(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
+int p4_pmu_init(void);
+
+int p6_pmu_init(void);
+
+int knc_pmu_init(void);
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+                         char *page);
+
+static inline int is_ht_workaround_enabled(void)
+{
+       return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static inline void reserve_ds_buffers(void)
+{
+}
+
+static inline void release_ds_buffers(void)
+{
+}
+
+static inline int intel_pmu_init(void)
+{
+       return 0;
+}
+
+static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+       return NULL;
+}
+
+static inline int is_ht_workaround_enabled(void)
+{
+       return 0;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
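The X86_CONFIG() helper above packs designated bit-field initializers directly into a raw PERFEVTSEL-style config value. A standalone sketch that mirrors just the union (outside the kernel, for illustration only) and shows what two typical invocations evaluate to:

/* Hypothetical userspace re-implementation of the packing trick. */
#include <stdio.h>
#include <stdint.h>

union x86_pmu_config_demo {
        struct {
                uint64_t event:8, umask:8, usr:1, os:1, edge:1, pc:1,
                         interrupt:1, __reserved1:1, en:1, inv:1, cmask:8,
                         event2:4, __reserved2:4, go:1, ho:1;
        } bits;
        uint64_t value;
};

#define X86_CONFIG_DEMO(args...) \
        ((union x86_pmu_config_demo){.bits = {args}}).value

int main(void)
{
        /* Architectural "unhalted core cycles": event 0x3c, umask 0x00. */
        printf("0x%llx\n", (unsigned long long)
               X86_CONFIG_DEMO(.event = 0x3c, .umask = 0x00));

        /* "Instructions retired" (event 0xc0) with an inverted cmask of 16. */
        printf("0x%llx\n", (unsigned long long)
               X86_CONFIG_DEMO(.event = 0xc0, .inv = 1, .cmask = 16));

        return 0;
}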
index 3c56ef1ae068e8c52b829bab6719d47fba0ef8bd..5e828da2e18f6e9b4ee05e0abab7997b2fe9c6d3 100644 (file)
@@ -27,15 +27,23 @@ struct amd_l3_cache {
 };
 
 struct threshold_block {
-       unsigned int            block;
-       unsigned int            bank;
-       unsigned int            cpu;
-       u32                     address;
-       u16                     interrupt_enable;
-       bool                    interrupt_capable;
-       u16                     threshold_limit;
-       struct kobject          kobj;
-       struct list_head        miscj;
+       unsigned int     block;                 /* Number within bank */
+       unsigned int     bank;                  /* MCA bank the block belongs to */
+       unsigned int     cpu;                   /* CPU which controls MCA bank */
+       u32              address;               /* MSR address for the block */
+       u16              interrupt_enable;      /* Enable/Disable APIC interrupt */
+       bool             interrupt_capable;     /* Bank can generate an interrupt. */
+
+       u16              threshold_limit;       /*
+                                                * Value upon which threshold
+                                                * interrupt is generated.
+                                                */
+
+       struct kobject   kobj;                  /* sysfs object */
+       struct list_head miscj;                 /*
+                                                * List of threshold blocks
+                                                * within a bank.
+                                                */
 };
 
 struct threshold_bank {
index 189679aba703537393b87d699a4e11cbfe94e23c..f5063b6659eb37f1a835bc5e614cf251ac48e032 100644 (file)
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)                                 \
+# define _ASM_EXTABLE_HANDLE(from, to, handler)                        \
        .pushsection "__ex_table","a" ;                         \
-       .balign 8 ;                                             \
+       .balign 4 ;                                             \
        .long (from) - . ;                                      \
        .long (to) - . ;                                        \
+       .long (handler) - . ;                                   \
        .popsection
 
-# define _ASM_EXTABLE_EX(from,to)                              \
-       .pushsection "__ex_table","a" ;                         \
-       .balign 8 ;                                             \
-       .long (from) - . ;                                      \
-       .long (to) - . + 0x7ffffff0 ;                           \
-       .popsection
+# define _ASM_EXTABLE(from, to)                                        \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)                          \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)                             \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
 # define _ASM_NOKPROBE(entry)                                  \
        .pushsection "_kprobe_blacklist","aw" ;                 \
        .endm
 
 #else
-# define _ASM_EXTABLE(from,to)                                 \
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler)                        \
        " .pushsection \"__ex_table\",\"a\"\n"                  \
-       " .balign 8\n"                                          \
+       " .balign 4\n"                                          \
        " .long (" #from ") - .\n"                              \
        " .long (" #to ") - .\n"                                \
+       " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n"    \
        " .popsection\n"
 
-# define _ASM_EXTABLE_EX(from,to)                              \
-       " .pushsection \"__ex_table\",\"a\"\n"                  \
-       " .balign 8\n"                                          \
-       " .long (" #from ") - .\n"                              \
-       " .long (" #to ") - . + 0x7ffffff0\n"                   \
-       " .popsection\n"
+# define _ASM_EXTABLE(from, to)                                        \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)                          \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)                             \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
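Each field of the new 3-word entry is stored as a 32-bit offset relative to the field itself, so consumers recover absolute pointers with simple address arithmetic. A kernel-context sketch of that decode step (the real helpers live in arch/x86/mm/extable.c; the names below are illustrative):

/* Sketch only; assumes struct exception_table_entry from <asm/uaccess.h>. */
#include <linux/types.h>
#include <asm/ptrace.h>
#include <asm/uaccess.h>

typedef bool (*ex_handler_t)(const struct exception_table_entry *fixup,
                             struct pt_regs *regs, int trapnr);

static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
        /* The stored offset is relative to the address of the field itself. */
        return (unsigned long)&x->fixup + x->fixup;
}

static inline ex_handler_t
ex_fixup_handler(const struct exception_table_entry *x)
{
        return (ex_handler_t)((unsigned long)&x->handler + x->handler);
}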
 
index a584e1c50918406a0398cf4a013432e47abf143e..bfb28caf97b1be1f2d6aa8893bd905a39030e310 100644 (file)
@@ -6,18 +6,17 @@
 
 /*
  * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
+ * And yes, this might be required on UP too when we're talking
  * to devices.
  */
 
 #ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \
+                                     X86_FEATURE_XMM2) ::: "memory", "cc")
+#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \
+                                      X86_FEATURE_XMM2) ::: "memory", "cc")
+#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \
+                                      X86_FEATURE_XMM2) ::: "memory", "cc")
 #else
 #define mb()   asm volatile("mfence":::"memory")
 #define rmb()  asm volatile("lfence":::"memory")
index e63aa38e85fb23375ecefa40efe8cd9d76da6c43..61518cf79437679788bd41a1ee1c942aea29a8cb 100644 (file)
@@ -91,16 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size);
 
 #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
 
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
 extern const int rodata_test_data;
 extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
 
 #ifdef CONFIG_DEBUG_RODATA_TEST
 int rodata_test(void);
index 1514753fd43553e079696712b48a8d08b6966e98..15340e36ddcb3364e16eb63cd61c61a42676d756 100644 (file)
@@ -256,7 +256,7 @@ extern int force_personality32;
    instruction set this CPU supports.  This could be done in user space,
    but it's not easy, and we've already done it here.  */
 
-#define ELF_HWCAP              (boot_cpu_data.x86_capability[0])
+#define ELF_HWCAP              (boot_cpu_data.x86_capability[CPUID_1_EDX])
 
 /* This yields a string that ld.so will use to load implementation
    specific libraries for optimization.  This is more specific in
index af30fdeb140da75691e91c52446f7f1bb7fe2e93..f23cd8c80b1c818057053e831ec704ab4006fba4 100644 (file)
 
 /* Supported features which support lazy state saving */
 #define XFEATURE_MASK_LAZY     (XFEATURE_MASK_FP | \
-                                XFEATURE_MASK_SSE)
-
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER    (XFEATURE_MASK_BNDREGS | \
-                                XFEATURE_MASK_BNDCSR | \
+                                XFEATURE_MASK_SSE | \
                                 XFEATURE_MASK_YMM | \
                                 XFEATURE_MASK_OPMASK | \
                                 XFEATURE_MASK_ZMM_Hi256 | \
                                 XFEATURE_MASK_Hi16_ZMM)
 
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER    (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
+
 /* All currently supported features */
 #define XCNTXT_MASK    (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
index c1adf33fdd0d6f70f055b9a056bc7787bda7635e..bc62e7cbf1b1f883fc9acb0a145305384a3bf062 100644 (file)
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 }
 #endif /* CONFIG_KVM_GUEST */
 
-#ifdef CONFIG_DEBUG_RODATA
 #define KVM_HYPERCALL \
         ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
 
 /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
  * instruction.  The hypervisor may replace it with something else but only the
index 18d2ba9c8e44e398e2edd31c381150c515b63e37..92b6f651fa4fcc7e58d5764f317f9db8b0a5616b 100644 (file)
 #define MCI_STATUS_AR   (1ULL<<55)  /* Action required */
 
 /* AMD-specific bits */
-#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* declare an uncorrected error */
+#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* uncorrected error, deferred exception */
 #define MCI_STATUS_POISON      (1ULL<<43)  /* access poisonous data */
+#define MCI_STATUS_TCC         (1ULL<<55)  /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ *  - Deferred error interrupt type is specifiable by bank.
+ *  - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ *    But should not be used to determine MSR numbers.
+ *  - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX                0x1
+#define MCI_IPID_MCATYPE       0xFFFF0000
+#define MCI_IPID_HWID          0xFFF
 
 /*
  * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 #define MCE_LOG_LEN 32
 #define MCE_LOG_SIGNATURE      "MACHINECHECK"
 
+/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_MISC0       0xc0002003
+#define MSR_AMD64_SMCA_MC0_CONFIG      0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID                0xc0002005
+#define MSR_AMD64_SMCA_MC0_MISC1       0xc000200a
+#define MSR_AMD64_SMCA_MCx_MISC(x)     (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_CONFIG(x)   (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x)     (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+
 /*
  * This structure contains all data related to the MCE log.  Also
  * carries a signature to make it easier to find from external
@@ -288,4 +310,49 @@ struct cper_sec_mem_err;
 extern void apei_mce_report_mem_error(int corrected,
                                      struct cper_sec_mem_err *mem_err);
 
+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+       SMCA_F17H_CORE = 0,     /* Core errors */
+       SMCA_DF,                /* Data Fabric */
+       SMCA_UMC,               /* Unified Memory Controller */
+       SMCA_PB,                /* Parameter Block */
+       SMCA_PSP,               /* Platform Security Processor */
+       SMCA_SMU,               /* System Management Unit */
+       N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+       const char *name;
+       unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+       SMCA_LS = 0,    /* Load Store */
+       SMCA_IF,        /* Instruction Fetch */
+       SMCA_L2_CACHE,  /* L2 cache */
+       SMCA_DE,        /* Decoder unit */
+       RES,            /* Reserved */
+       SMCA_EX,        /* Execution unit */
+       SMCA_FP,        /* Floating Point */
+       SMCA_L3_CACHE,  /* L3 cache */
+       N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+       SMCA_CS = 0,    /* Coherent Slave */
+       SMCA_PIE,       /* Power management, Interrupts, etc */
+       N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
 #endif /* _ASM_X86_MCE_H */
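Each bank's Scalable MCA register block sits 0x10 MSRs above the previous one, so the MCx macros above are plain offset arithmetic. A standalone sketch that reproduces the macros and prints the addresses they expand to (the bank number is arbitrary, for illustration only):

/* Hypothetical userspace check of the per-bank MSR layout. */
#include <stdio.h>

#define SMCA_MC0_MISC0          0xc0002003
#define SMCA_MC0_CONFIG         0xc0002004
#define SMCA_MC0_IPID           0xc0002005
#define SMCA_MC0_MISC1          0xc000200a
#define SMCA_MCx_MISC(x)        (SMCA_MC0_MISC0 + 0x10*(x))
#define SMCA_MCx_CONFIG(x)      (SMCA_MC0_CONFIG + 0x10*(x))
#define SMCA_MCx_IPID(x)        (SMCA_MC0_IPID + 0x10*(x))
#define SMCA_MCx_MISCy(x, y)    ((SMCA_MC0_MISC1 + (y)) + (0x10*(x)))

int main(void)
{
        int bank = 2;

        printf("MISC0  of bank %d: 0x%x\n", bank, SMCA_MCx_MISC(bank));     /* 0xc0002023 */
        printf("CONFIG of bank %d: 0x%x\n", bank, SMCA_MCx_CONFIG(bank));   /* 0xc0002024 */
        printf("IPID   of bank %d: 0x%x\n", bank, SMCA_MCx_IPID(bank));     /* 0xc0002025 */
        printf("MISC1  of bank %d: 0x%x\n", bank, SMCA_MCx_MISCy(bank, 0)); /* 0xc000202a */
        return 0;
}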
index 7bcb861a04e5cf5b32c2c26919756fa29353cb07..5a2ed3ed2f261893d5de08022ea3259198c8c0fe 100644 (file)
@@ -165,6 +165,7 @@ struct x86_pmu_capability {
 #define GLOBAL_STATUS_ASIF                             BIT_ULL(60)
 #define GLOBAL_STATUS_COUNTERS_FROZEN                  BIT_ULL(59)
 #define GLOBAL_STATUS_LBRS_FROZEN                      BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI                    BIT_ULL(55)
 
 /*
  * IBS cpuid feature detection
index 50a6dc871cc08734ea3926a59f235ee3a1bf0b3f..983738ac014c6508f499b26650d44b6f5b3b1b2b 100644 (file)
@@ -128,6 +128,8 @@ struct cpuinfo_x86 {
        u16                     booted_cores;
        /* Physical processor id: */
        u16                     phys_proc_id;
+       /* Logical processor id: */
+       u16                     logical_proc_id;
        /* Core id: */
        u16                     cpu_core_id;
        /* Compute unit id */
index 0a5242428659045cfb439d3045593cc1c63aad96..13b6cdd0af57049468e47ef884e6eccba49dca0e 100644 (file)
@@ -7,7 +7,7 @@
 extern char __brk_base[], __brk_limit[];
 extern struct exception_table_entry __stop___ex_table[];
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
 extern char __end_rodata_hpage_align[];
 #endif
 
index ff8b9a17dc4b2d354971fb52b60b69e6fbb0e903..ca6ba36077054605580e8b8a6236eab7aa65733f 100644 (file)
@@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+/**
+ * memcpy_mcsafe - copy memory with indication if a machine check happened
+ *
+ * @dst:       destination address
+ * @src:       source address
+ * @cnt:       number of bytes to copy
+ *
+ * Low-level memory copy function that catches machine checks.
+ *
+ * Return true on success, false if a machine check was encountered
+ * during the copy.
+ */
+bool memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
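A minimal caller sketch for the declaration above (read_pmem() and its parameters are hypothetical, purely to show the boolean return convention):

/* Hypothetical kernel-context caller, not part of this patch. */
#include <linux/errno.h>
#include <linux/string.h>       /* pulls in memcpy_mcsafe() on x86-64 */

static int read_pmem(void *dst, const void *pmem_src, size_t len)
{
        /*
         * memcpy_mcsafe() returns false if a machine check fired during
         * the copy; report that as an I/O error instead of consuming
         * poisoned data.
         */
        if (!memcpy_mcsafe(dst, pmem_src, len))
                return -EIO;

        return 0;
}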
index 0fb46482dfde160b9dcfad6ef57841e07c3830e2..7f991bd5031b24947e0773265023ab70b934a7b8 100644 (file)
@@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { }
 
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#define topology_logical_package_id(cpu)       (cpu_data(cpu).logical_proc_id)
 #define topology_physical_package_id(cpu)      (cpu_data(cpu).phys_proc_id)
 #define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_core_cpumask(cpu)             (per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)          (per_cpu(cpu_sibling_map, cpu))
+
+extern unsigned int __max_logical_packages;
+#define topology_max_packages()                        (__max_logical_packages)
+int topology_update_package_map(unsigned int apicid, unsigned int cpu);
+extern int topology_phys_to_logical_pkg(unsigned int pkg);
+#else
+#define topology_max_packages()                        (1)
+static inline int
+topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
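A small kernel-context sketch (dump_packages() is a hypothetical debugging helper) showing how the new logical package id relates to the physical one exported above:

/* Hypothetical helper; for illustration only. */
#include <linux/cpu.h>
#include <linux/printk.h>
#include <linux/topology.h>

static void dump_packages(void)
{
        unsigned int cpu;

        pr_info("logical packages: %u\n", topology_max_packages());

        for_each_online_cpu(cpu)
                pr_info("CPU%u: phys pkg %u -> logical pkg %u\n", cpu,
                        topology_physical_package_id(cpu),
                        topology_logical_package_id(cpu));
}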
index a4a30e4b2d34321d96ddd02413d037d3f6cb95d9..c0f27d7ea7ff95eaea44987bd94417fabe68ca98 100644 (file)
@@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
        likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
- * The exception table consists of pairs of addresses relative to the
- * exception table enty itself: the first is the address of an
- * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-       int insn, fixup;
+       int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
index 8a5cddac7d444084c223e95948944b5099d15003..531b9611c51d5d4b42f721f45609a5f4ecd3cce2 100644 (file)
@@ -2077,6 +2077,20 @@ int generic_processor_info(int apicid, int version)
        } else
                cpu = cpumask_next_zero(-1, cpu_present_mask);
 
+       /*
+        * This can happen on physical hotplug. The sanity check at boot time
+        * is done from native_smp_prepare_cpus() after num_possible_cpus() is
+        * established.
+        */
+       if (topology_update_package_map(apicid, cpu) < 0) {
+               int thiscpu = max + disabled_cpus;
+
+               pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n",
+                          thiscpu, apicid);
+               disabled_cpus++;
+               return -ENOSPC;
+       }
+
        /*
         * Validate version
         */
index faa7b520412931d66ff61b69da6973b958420ea6..0d373d7affc8d48d1039f6c6d759a3b9cc191c38 100644 (file)
@@ -30,33 +30,11 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)               += centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)     += transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)           += umc.o
 
-obj-$(CONFIG_PERF_EVENTS)              += perf_event.o
-
-ifdef CONFIG_PERF_EVENTS
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd.o perf_event_amd_uncore.o
-ifdef CONFIG_AMD_IOMMU
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd_iommu.o
-endif
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_p6.o perf_event_knc.o perf_event_p4.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_rapl.o perf_event_intel_cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_pt.o perf_event_intel_bts.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_cstate.o
-
-obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
-                                          perf_event_intel_uncore_snb.o \
-                                          perf_event_intel_uncore_snbep.o \
-                                          perf_event_intel_uncore_nhmex.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_msr.o
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_msr.o
-endif
-
-
 obj-$(CONFIG_X86_MCE)                  += mcheck/
 obj-$(CONFIG_MTRR)                     += mtrr/
 obj-$(CONFIG_MICROCODE)                        += microcode/
 
-obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o perf_event_amd_ibs.o
+obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o
 
 obj-$(CONFIG_HYPERVISOR_GUEST)         += vmware.o hypervisor.o mshyperv.o
 
index a07956a08936e8ea56c1a73075d700d281895577..97c59fd60702f4ceacad7c4d46a3c8d5de7dacbf 100644 (file)
@@ -117,7 +117,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                void (*f_vide)(void);
                u64 d, d2;
 
-               printk(KERN_INFO "AMD K6 stepping B detected - ");
+               pr_info("AMD K6 stepping B detected - ");
 
                /*
                 * It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -133,10 +133,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                d = d2-d;
 
                if (d > 20*K6_BUG_LOOP)
-                       printk(KERN_CONT
-                               "system stability may be impaired when more than 32 MB are used.\n");
+                       pr_cont("system stability may be impaired when more than 32 MB are used.\n");
                else
-                       printk(KERN_CONT "probably OK (after B9730xxxx).\n");
+                       pr_cont("probably OK (after B9730xxxx).\n");
        }
 
        /* K6 with old style WHCR */
@@ -154,7 +153,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                        wbinvd();
                        wrmsr(MSR_K6_WHCR, l, h);
                        local_irq_restore(flags);
-                       printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+                       pr_info("Enabling old style K6 write allocation for %d Mb\n",
                                mbytes);
                }
                return;
@@ -175,7 +174,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                        wbinvd();
                        wrmsr(MSR_K6_WHCR, l, h);
                        local_irq_restore(flags);
-                       printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+                       pr_info("Enabling new style K6 write allocation for %d Mb\n",
                                mbytes);
                }
 
@@ -202,7 +201,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
         */
        if (c->x86_model >= 6 && c->x86_model <= 10) {
                if (!cpu_has(c, X86_FEATURE_XMM)) {
-                       printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+                       pr_info("Enabling disabled K7/SSE Support.\n");
                        msr_clear_bit(MSR_K7_HWCR, 15);
                        set_cpu_cap(c, X86_FEATURE_XMM);
                }
@@ -216,9 +215,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
        if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
                rdmsr(MSR_K7_CLK_CTL, l, h);
                if ((l & 0xfff00000) != 0x20000000) {
-                       printk(KERN_INFO
-                           "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
-                                       l, ((l & 0x000fffff)|0x20000000));
+                       pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+                               l, ((l & 0x000fffff)|0x20000000));
                        wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
                }
        }
@@ -485,7 +483,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
                if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
                        unsigned long pfn = tseg >> PAGE_SHIFT;
 
-                       printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+                       pr_debug("tseg: %010llx\n", tseg);
                        if (pfn_range_is_mapped(pfn, pfn + 1))
                                set_memory_4k((unsigned long)__va(tseg), 1);
                }
@@ -500,8 +498,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
                        rdmsrl(MSR_K7_HWCR, val);
                        if (!(val & BIT(24)))
-                               printk(KERN_WARNING FW_BUG "TSC doesn't count "
-                                       "with P0 frequency!\n");
+                               pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
                }
        }
 
index 04f0fe5af83ec34bb4fd6ec09fd3cc8db1c7ee07..a972ac4c7e7df05238e78b08e093cded47640fcd 100644 (file)
@@ -15,7 +15,7 @@ void __init check_bugs(void)
 {
        identify_boot_cpu();
 #if !defined(CONFIG_SMP)
-       printk(KERN_INFO "CPU: ");
+       pr_info("CPU: ");
        print_cpu_info(&boot_cpu_data);
 #endif
        alternative_instructions();
index 6608c03c212675bd13b895021afdfd761b6cff7e..1661d8ec92805191cd4581c365db68ea5acfdf02 100644 (file)
@@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c)
                        rdmsr(MSR_VIA_FCR, lo, hi);
                        lo |= ACE_FCR;          /* enable ACE unit */
                        wrmsr(MSR_VIA_FCR, lo, hi);
-                       printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+                       pr_info("CPU: Enabled ACE h/w crypto\n");
                }
 
                /* enable RNG unit, if present and disabled */
@@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c)
                        rdmsr(MSR_VIA_RNG, lo, hi);
                        lo |= RNG_ENABLE;       /* enable RNG unit */
                        wrmsr(MSR_VIA_RNG, lo, hi);
-                       printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+                       pr_info("CPU: Enabled h/w RNG\n");
                }
 
                /* store Centaur Extended Feature Flags as
@@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
                        name = "C6";
                        fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
                        fcr_clr = DPDC;
-                       printk(KERN_NOTICE "Disabling bugged TSC.\n");
+                       pr_notice("Disabling bugged TSC.\n");
                        clear_cpu_cap(c, X86_FEATURE_TSC);
                        break;
                case 8:
@@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c)
                newlo = (lo|fcr_set) & (~fcr_clr);
 
                if (newlo != lo) {
-                       printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+                       pr_info("Centaur FCR was 0x%X now 0x%X\n",
                                lo, newlo);
                        wrmsr(MSR_IDT_FCR1, newlo, hi);
                } else {
-                       printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+                       pr_info("Centaur FCR is 0x%X\n", lo);
                }
                /* Emulate MTRRs using Centaur's MCR. */
                set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
index d8337f34b5f40b304ad76a0682369d8a91f59ecc..4e8d25d395eeca383248d0ed76cc0fe28d0134d2 100644 (file)
@@ -228,7 +228,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
        lo |= 0x200000;
        wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
 
-       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       pr_notice("CPU serial number disabled.\n");
        clear_cpu_cap(c, X86_FEATURE_PN);
 
        /* Disabling the serial number may affect the cpuid level */
@@ -329,9 +329,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
                if (!warn)
                        continue;
 
-               printk(KERN_WARNING
-                      "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
-                               x86_cap_flag(df->feature), df->level);
+               pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+                       x86_cap_flag(df->feature), df->level);
        }
 }
 
@@ -510,7 +509,7 @@ void detect_ht(struct cpuinfo_x86 *c)
        smp_num_siblings = (ebx & 0xff0000) >> 16;
 
        if (smp_num_siblings == 1) {
-               printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
+               pr_info_once("CPU0: Hyper-Threading is disabled\n");
                goto out;
        }
 
@@ -531,10 +530,10 @@ void detect_ht(struct cpuinfo_x86 *c)
 
 out:
        if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-                      c->phys_proc_id);
-               printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-                      c->cpu_core_id);
+               pr_info("CPU: Physical Processor ID: %d\n",
+                       c->phys_proc_id);
+               pr_info("CPU: Processor Core ID: %d\n",
+                       c->cpu_core_id);
                printed = 1;
        }
 #endif
@@ -559,9 +558,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c)
                }
        }
 
-       printk_once(KERN_ERR
-                       "CPU: vendor_id '%s' unknown, using generic init.\n" \
-                       "CPU: Your system may be unstable.\n", v);
+       pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+                   "CPU: Your system may be unstable.\n", v);
 
        c->x86_vendor = X86_VENDOR_UNKNOWN;
        this_cpu = &default_cpu;
@@ -760,7 +758,7 @@ void __init early_cpu_init(void)
        int count = 0;
 
 #ifdef CONFIG_PROCESSOR_SELECT
-       printk(KERN_INFO "KERNEL supported cpus:\n");
+       pr_info("KERNEL supported cpus:\n");
 #endif
 
        for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
@@ -778,7 +776,7 @@ void __init early_cpu_init(void)
                        for (j = 0; j < 2; j++) {
                                if (!cpudev->c_ident[j])
                                        continue;
-                               printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
+                               pr_info("  %s %s\n", cpudev->c_vendor,
                                        cpudev->c_ident[j]);
                        }
                }
@@ -1002,6 +1000,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
        numa_add_cpu(smp_processor_id());
 #endif
+       /* The boot/hotplug time assigment got cleared, restore it */
+       /* The boot/hotplug time assignment got cleared, restore it */
+       c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id);
 }
 
 /*
@@ -1086,7 +1086,7 @@ static void __print_cpu_msr(void)
                for (index = index_min; index < index_max; index++) {
                        if (rdmsrl_safe(index, &val))
                                continue;
-                       printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+                       pr_info(" MSR%08x: %016llx\n", index, val);
                }
        }
 }
@@ -1125,19 +1125,19 @@ void print_cpu_info(struct cpuinfo_x86 *c)
        }
 
        if (vendor && !strstr(c->x86_model_id, vendor))
-               printk(KERN_CONT "%s ", vendor);
+               pr_cont("%s ", vendor);
 
        if (c->x86_model_id[0])
-               printk(KERN_CONT "%s", c->x86_model_id);
+               pr_cont("%s", c->x86_model_id);
        else
-               printk(KERN_CONT "%d86", c->x86);
+               pr_cont("%d86", c->x86);
 
-       printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
+       pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
 
        if (c->x86_mask || c->cpuid_level >= 0)
-               printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
+               pr_cont(", stepping: 0x%x)\n", c->x86_mask);
        else
-               printk(KERN_CONT ")\n");
+               pr_cont(")\n");
 
        print_cpu_msr(c);
 }
@@ -1463,7 +1463,7 @@ void cpu_init(void)
 
        show_ucode_info_early();
 
-       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+       pr_info("Initializing CPU#%d\n", cpu);
 
        if (cpu_feature_enabled(X86_FEATURE_VME) ||
            cpu_has_tsc ||
index 15e47c1cd41265443ad1e521baba45081239f39c..6adef9cac23ee99c96924e2789abeb2d1ad123f3 100644 (file)
@@ -104,7 +104,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c)
                local_irq_restore(flags);
 
                if (ccr5 & 2) { /* possible wrong calibration done */
-                       printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+                       pr_info("Recalibrating delay loop with SLOP bit reset\n");
                        calibrate_delay();
                        c->loops_per_jiffy = loops_per_jiffy;
                }
@@ -116,7 +116,7 @@ static void set_cx86_reorder(void)
 {
        u8 ccr3;
 
-       printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+       pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
        ccr3 = getCx86(CX86_CCR3);
        setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
 
@@ -129,7 +129,7 @@ static void set_cx86_reorder(void)
 
 static void set_cx86_memwb(void)
 {
-       printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+       pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
 
        /* CCR2 bit 2: unlock NW bit */
        setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
@@ -269,7 +269,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
                 *  VSA1 we work around however.
                 */
 
-               printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+               pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
                isa_dma_bridge_buggy = 2;
 
                /* We do this before the PCI layer is running. However we
@@ -427,7 +427,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c)
                if (dir0 == 5 || dir0 == 3) {
                        unsigned char ccr3;
                        unsigned long flags;
-                       printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+                       pr_info("Enabling CPUID on Cyrix processor.\n");
                        local_irq_save(flags);
                        ccr3 = getCx86(CX86_CCR3);
                        /* enable MAPEN  */
index d820d8eae96be0b3daa0ec01d9f4a22bf1ad930e..73d391ae452f82a7bfeca4be2276eed628dcd452 100644 (file)
@@ -56,7 +56,7 @@ detect_hypervisor_vendor(void)
        }
 
        if (max_pri)
-               printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
+               pr_info("Hypervisor detected: %s\n", x86_hyper->name);
 }
 
 void init_hypervisor(struct cpuinfo_x86 *c)
index 9299e3bdfad6bacf4a8d8341567e79682b67f241..1f7fdb91a818bc10d4b975a070d6b2c1a947b15b 100644 (file)
@@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
         */
        if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
            c->microcode < 0x20e) {
-               printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
+               pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
                clear_cpu_cap(c, X86_FEATURE_PSE);
        }
 
@@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
        if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
                rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
-                       printk(KERN_INFO "Disabled fast string operations\n");
+                       pr_info("Disabled fast string operations\n");
                        setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
                        setup_clear_cpu_cap(X86_FEATURE_ERMS);
                }
@@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                pr_info("Disabling PGE capability bit\n");
                setup_clear_cpu_cap(X86_FEATURE_PGE);
        }
+
+       if (c->cpuid_level >= 0x00000001) {
+               u32 eax, ebx, ecx, edx;
+
+               cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+               /*
+                * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+                * apicids which are reserved per package. Store the resulting
+                * shift value for the package management code.
+                */
+               if (edx & (1U << 28))
+                       c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+       }
 }
 
 #ifdef CONFIG_X86_32
@@ -176,7 +189,7 @@ int ppro_with_ram_bug(void)
            boot_cpu_data.x86 == 6 &&
            boot_cpu_data.x86_model == 1 &&
            boot_cpu_data.x86_mask < 8) {
-               printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+               pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
                return 1;
        }
        return 0;
@@ -225,7 +238,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
 
                set_cpu_bug(c, X86_BUG_F00F);
                if (!f00f_workaround_enabled) {
-                       printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+                       pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
                        f00f_workaround_enabled = 1;
                }
        }
@@ -244,7 +257,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
         * Forcefully enable PAE if kernel parameter "forcepae" is present.
         */
        if (forcepae) {
-               printk(KERN_WARNING "PAE forced!\n");
+               pr_warn("PAE forced!\n");
                set_cpu_cap(c, X86_FEATURE_PAE);
                add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
        }
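
One addition in the early_init_intel() hunk above is not a printk conversion: when CPUID leaf 0x1 reports HTT (EDX bit 28), EBX[23:16] gives the number of addressable APIC ids per package, and its count order becomes x86_coreid_bits, the shift separating core bits from package bits in the APIC id. A small user-space sketch of the same decode, assuming GCC's <cpuid.h>; count_order() here is a stand-in for the kernel's get_count_order():

#include <cpuid.h>
#include <stdio.h>

static unsigned int count_order(unsigned int n)
{
        unsigned int order = 0;

        while ((1u << order) < n)               /* smallest order with 2^order >= n */
                order++;
        return order;
}

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return 1;

        if (edx & (1u << 28)) {                 /* HTT set */
                unsigned int apicids = (ebx >> 16) & 0xff;

                printf("APIC ids per package: %u (shift %u)\n",
                       apicids, count_order(apicids));
        }
        return 0;
}
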
index 341449c49f34317674a2edac0a60132ed43bd716..de6626c18e427b771ea902451ae77ab52e233915 100644 (file)
@@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
        err = amd_set_l3_disable_slot(nb, cpu, slot, val);
        if (err) {
                if (err == -EEXIST)
-                       pr_warning("L3 slot %d in use/index already disabled!\n",
+                       pr_warn("L3 slot %d in use/index already disabled!\n",
                                   slot);
                return err;
        }
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
deleted file mode 100644 (file)
index 336878a..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Intel(R) Processor Trace PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Intel PT is specified in the Intel Architecture Instruction Set Extensions
- * Programming Reference:
- * http://software.intel.com/en-us/intel-isa-extensions
- */
-
-#ifndef __INTEL_PT_H__
-#define __INTEL_PT_H__
-
-/*
- * Single-entry ToPA: when this close to region boundary, switch
- * buffers to avoid losing data.
- */
-#define TOPA_PMI_MARGIN 512
-
-#define TOPA_SHIFT 12
-
-static inline unsigned int sizes(unsigned int tsz)
-{
-       return 1 << (tsz + TOPA_SHIFT);
-};
-
-struct topa_entry {
-       u64     end     : 1;
-       u64     rsvd0   : 1;
-       u64     intr    : 1;
-       u64     rsvd1   : 1;
-       u64     stop    : 1;
-       u64     rsvd2   : 1;
-       u64     size    : 4;
-       u64     rsvd3   : 2;
-       u64     base    : 36;
-       u64     rsvd4   : 16;
-};
-
-#define PT_CPUID_LEAVES                2
-#define PT_CPUID_REGS_NUM      4 /* number of regsters (eax, ebx, ecx, edx) */
-
-enum pt_capabilities {
-       PT_CAP_max_subleaf = 0,
-       PT_CAP_cr3_filtering,
-       PT_CAP_psb_cyc,
-       PT_CAP_mtc,
-       PT_CAP_topa_output,
-       PT_CAP_topa_multiple_entries,
-       PT_CAP_single_range_output,
-       PT_CAP_payloads_lip,
-       PT_CAP_mtc_periods,
-       PT_CAP_cycle_thresholds,
-       PT_CAP_psb_periods,
-};
-
-struct pt_pmu {
-       struct pmu              pmu;
-       u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
-};
-
-/**
- * struct pt_buffer - buffer configuration; one buffer per task_struct or
- *             cpu, depending on perf event configuration
- * @cpu:       cpu for per-cpu allocation
- * @tables:    list of ToPA tables in this buffer
- * @first:     shorthand for first topa table
- * @last:      shorthand for last topa table
- * @cur:       current topa table
- * @nr_pages:  buffer size in pages
- * @cur_idx:   current output region's index within @cur table
- * @output_off:        offset within the current output region
- * @data_size: running total of the amount of data in this buffer
- * @lost:      if data was lost/truncated
- * @head:      logical write offset inside the buffer
- * @snapshot:  if this is for a snapshot/overwrite counter
- * @stop_pos:  STOP topa entry in the buffer
- * @intr_pos:  INT topa entry in the buffer
- * @data_pages:        array of pages from perf
- * @topa_index:        table of topa entries indexed by page offset
- */
-struct pt_buffer {
-       int                     cpu;
-       struct list_head        tables;
-       struct topa             *first, *last, *cur;
-       unsigned int            cur_idx;
-       size_t                  output_off;
-       unsigned long           nr_pages;
-       local_t                 data_size;
-       local_t                 lost;
-       local64_t               head;
-       bool                    snapshot;
-       unsigned long           stop_pos, intr_pos;
-       void                    **data_pages;
-       struct topa_entry       *topa_index[0];
-};
-
-/**
- * struct pt - per-cpu pt context
- * @handle:    perf output handle
- * @handle_nmi:        do handle PT PMI on this cpu, there's an active event
- */
-struct pt {
-       struct perf_output_handle handle;
-       int                     handle_nmi;
-};
-
-#endif /* __INTEL_PT_H__ */
index 4cfba4371a71f28c4615610475e3eec9e8d9790c..517619ea6498b41839f87f28bce4a76b1e43c7ab 100644 (file)
@@ -115,7 +115,7 @@ static int raise_local(void)
        int cpu = m->extcpu;
 
        if (m->inject_flags & MCJ_EXCEPTION) {
-               printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+               pr_info("Triggering MCE exception on CPU %d\n", cpu);
                switch (context) {
                case MCJ_CTX_IRQ:
                        /*
@@ -128,15 +128,15 @@ static int raise_local(void)
                        raise_exception(m, NULL);
                        break;
                default:
-                       printk(KERN_INFO "Invalid MCE context\n");
+                       pr_info("Invalid MCE context\n");
                        ret = -EINVAL;
                }
-               printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+               pr_info("MCE exception done on CPU %d\n", cpu);
        } else if (m->status) {
-               printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+               pr_info("Starting machine check poll CPU %d\n", cpu);
                raise_poll(m);
                mce_notify_irq();
-               printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
+               pr_info("Machine check poll done on CPU %d\n", cpu);
        } else
                m->finished = 0;
 
@@ -183,8 +183,7 @@ static void raise_mce(struct mce *m)
                start = jiffies;
                while (!cpumask_empty(mce_inject_cpumask)) {
                        if (!time_before(jiffies, start + 2*HZ)) {
-                               printk(KERN_ERR
-                               "Timeout waiting for mce inject %lx\n",
+                               pr_err("Timeout waiting for mce inject %lx\n",
                                        *cpumask_bits(mce_inject_cpumask));
                                break;
                        }
@@ -241,7 +240,7 @@ static int inject_init(void)
 {
        if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
                return -ENOMEM;
-       printk(KERN_INFO "Machine check injector initialized\n");
+       pr_info("Machine check injector initialized\n");
        register_mce_write_callback(mce_write);
        register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
                                "mce_notify");
index 9c682c222071db1960ba848c24c76567306d0542..5119766d988925a4c8eb9df23a3ff7b1b626d174 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/debugfs.h>
 #include <asm/mce.h>
+#include <asm/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -29,7 +30,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +49,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL                .context = IN_KERNEL
 #define  USER          .context = IN_USER
+#define  KERNEL_RECOV  .context = IN_KERNEL_RECOV
 #define  SER           .ser = SER_REQUIRED
 #define  NOSER         .ser = NO_SER
 #define  EXCP          .excp = EXCP_CONTEXT
@@ -86,6 +88,10 @@ static struct severity {
                PANIC, "In kernel and no restart IP",
                EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
                ),
+       MCESEV(
+               PANIC, "In kernel and no restart IP",
+               EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+               ),
        MCESEV(
                DEFERRED, "Deferred error",
                NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
@@ -122,6 +128,11 @@ static struct severity {
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
                MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
                ),
+       MCESEV(
+               AR, "Action required: data load in error recoverable area of kernel",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               KERNEL_RECOV
+               ),
        MCESEV(
                AR, "Action required: data load error in a user process",
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
@@ -170,6 +181,9 @@ static struct severity {
                )       /* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+                               (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +197,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-       return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+       if ((m->cs & 3) == 3)
+               return IN_USER;
+       if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+               return IN_KERNEL_RECOV;
+       return IN_KERNEL;
 }
 
 /*
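
With the severity-table and error_context() changes above, a machine check that hits kernel code can be graded IN_KERNEL_RECOV when RIPV and EIPV are valid (mc_recoverable()) and the faulting IP is covered by an exception table entry carrying a fault handler, which is what the ex_has_fault_handler() declaration in the earlier uaccess.h hunk advertises. A rough sketch of that lookup, with an illustrative helper name and simplified logic:

/*
 * Sketch of the idea behind ex_has_fault_handler(): look up the faulting IP
 * in the sorted exception tables. The real kernel code additionally checks
 * that the matching entry's handler field points at the fault-aware fixup
 * routine rather than the default one; this sketch only shows the lookup.
 */
static bool ip_has_recovery_fixup(unsigned long ip)
{
        const struct exception_table_entry *e;

        e = search_exception_tables(ip);        /* existing kernel helper */
        return e != NULL;
}
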
index b5b187c8cc072a68ebf28e80c4e2bdc5caaaedd3..f0c921b03e4245e1f7a4b16687c0d9e7600b4256 100644 (file)
@@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
        }
 }
 
+static int do_memory_failure(struct mce *m)
+{
+       int flags = MF_ACTION_REQUIRED;
+       int ret;
+
+       pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+       if (!(m->mcgstatus & MCG_STATUS_RIPV))
+               flags |= MF_MUST_KILL;
+       ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+       if (ret)
+               pr_err("Memory error not recovered");
+       return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
        char *msg = "Unknown";
-       u64 recover_paddr = ~0ull;
-       int flags = MF_ACTION_REQUIRED;
        int lmce = 0;
 
        /* If this CPU is offline, just bail out. */
@@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        }
 
        /*
-        * At insane "tolerant" levels we take no action. Otherwise
-        * we only die if we have no other choice. For less serious
-        * issues we try to recover, or limit damage to the current
-        * process.
+        * If tolerant is at an insane level we drop requests to kill
+        * processes and continue even when there is no way out.
         */
-       if (cfg->tolerant < 3) {
-               if (no_way_out)
-                       mce_panic("Fatal machine check on current CPU", &m, msg);
-               if (worst == MCE_AR_SEVERITY) {
-                       recover_paddr = m.addr;
-                       if (!(m.mcgstatus & MCG_STATUS_RIPV))
-                               flags |= MF_MUST_KILL;
-               } else if (kill_it) {
-                       force_sig(SIGBUS, current);
-               }
-       }
+       if (cfg->tolerant == 3)
+               kill_it = 0;
+       else if (no_way_out)
+               mce_panic("Fatal machine check on current CPU", &m, msg);
 
        if (worst > 0)
                mce_report_event(regs);
@@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
        sync_core();
 
-       if (recover_paddr == ~0ull)
-               goto done;
+       if (worst != MCE_AR_SEVERITY && !kill_it)
+               goto out_ist;
 
-       pr_err("Uncorrected hardware memory error in user-access at %llx",
-                recover_paddr);
-       /*
-        * We must call memory_failure() here even if the current process is
-        * doomed. We still need to mark the page as poisoned and alert any
-        * other users of the page.
-        */
-       ist_begin_non_atomic(regs);
-       local_irq_enable();
-       if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-               pr_err("Memory error not recovered");
-               force_sig(SIGBUS, current);
+       /* Fault was in user mode and we need to take some action */
+       if ((m.cs & 3) == 3) {
+               ist_begin_non_atomic(regs);
+               local_irq_enable();
+
+               if (kill_it || do_memory_failure(&m))
+                       force_sig(SIGBUS, current);
+               local_irq_disable();
+               ist_end_non_atomic();
+       } else {
+               if (!fixup_exception(regs, X86_TRAP_MC))
+                       mce_panic("Failed kernel mode recovery", &m, NULL);
        }
-       local_irq_disable();
-       ist_end_non_atomic();
-done:
+
+out_ist:
        ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1628,10 +1630,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
        case X86_VENDOR_AMD: {
                u32 ebx = cpuid_ebx(0x80000007);
 
-               mce_amd_feature_init(c);
                mce_flags.overflow_recov = !!(ebx & BIT(0));
                mce_flags.succor         = !!(ebx & BIT(1));
                mce_flags.smca           = !!(ebx & BIT(3));
+               mce_amd_feature_init(c);
 
                break;
                }
index e99b15077e9464b9c9f337873ae58101285e3215..9d656fd436efd6c90ad148a3cfbbd673248000f5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- *  (c) 2005-2015 Advanced Micro Devices, Inc.
+ *  (c) 2005-2016 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -28,7 +28,7 @@
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
 
-#define NR_BLOCKS         9
+#define NR_BLOCKS         5
 #define THRESHOLD_MAX     0xFFF
 #define INT_TYPE_APIC     0x00020000
 #define MASK_VALID_HI     0x80000000
 #define DEF_LVT_OFF            0x2
 #define DEF_INT_TYPE_APIC      0x2
 
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF       0xF000
+
+/*
+ * OS is required to set the MCAX bit to acknowledge that it is now using the
+ * new MSR ranges and new registers under each bank. It also means that the OS
+ * will configure deferred errors in the new MCx_CONFIG register. If the bit is
+ * not set, uncorrectable errors will cause a system panic.
+ */
+#define SMCA_MCAX_EN_OFF       0x1
+
 static const char * const th_names[] = {
        "load_store",
        "insn_fetch",
@@ -58,6 +71,35 @@ static const char * const th_names[] = {
        "execution_unit",
 };
 
+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] = {
+       [SMCA_F17H_CORE]        = { "f17h_core",        0xB0 },
+       [SMCA_DF]               = { "data_fabric",      0x2E },
+       [SMCA_UMC]              = { "umc",              0x96 },
+       [SMCA_PB]               = { "param_block",      0x5 },
+       [SMCA_PSP]              = { "psp",              0xFF },
+       [SMCA_SMU]              = { "smu",              0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+       [SMCA_LS]               = "load_store",
+       [SMCA_IF]               = "insn_fetch",
+       [SMCA_L2_CACHE]         = "l2_cache",
+       [SMCA_DE]               = "decode_unit",
+       [RES]                   = "",
+       [SMCA_EX]               = "execution_unit",
+       [SMCA_FP]               = "floating_point",
+       [SMCA_L3_CACHE]         = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+       [SMCA_CS]               = "coherent_slave",
+       [SMCA_PIE]              = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
 static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
 
@@ -84,6 +126,13 @@ struct thresh_restart {
 
 static inline bool is_shared_bank(int bank)
 {
+       /*
+        * Scalable MCA provides for only one core to have access to the MSRs of
+        * a shared bank.
+        */
+       if (mce_flags.smca)
+               return false;
+
        /* Bank 4 is for northbridge reporting and is thus shared */
        return (bank == 4);
 }
@@ -135,6 +184,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
        }
 
        if (apic != msr) {
+               /*
+                * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+                * the BIOS provides the value. The original field where LVT offset
+                * was set is reserved. Return early here:
+                */
+               if (mce_flags.smca)
+                       return 0;
+
                pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
                       "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
                       b->cpu, apic, b->bank, b->block, b->address, hi, lo);
@@ -144,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
        return 1;
 };
 
-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
 static void threshold_restart_bank(void *_tr)
 {
        struct thresh_restart *tr = _tr;
@@ -247,27 +301,116 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
        wrmsr(MSR_CU_DEF_ERR, low, high);
 }
 
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+                            unsigned int bank, unsigned int block)
+{
+       u32 addr = 0, offset = 0;
+
+       if (mce_flags.smca) {
+               if (!block) {
+                       addr = MSR_AMD64_SMCA_MCx_MISC(bank);
+               } else {
+                       /*
+                        * For SMCA enabled processors, BLKPTR field of the
+                        * first MISC register (MCx_MISC0) indicates presence of
+                        * additional MISC register set (MISC1-4).
+                        */
+                       u32 low, high;
+
+                       if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+                               return addr;
+
+                       if (!(low & MCI_CONFIG_MCAX))
+                               return addr;
+
+                       if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+                           (low & MASK_BLKPTR_LO))
+                               addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+               }
+               return addr;
+       }
+
+       /* Fall back to method we used for older processors: */
+       switch (block) {
+       case 0:
+               addr = MSR_IA32_MCx_MISC(bank);
+               break;
+       case 1:
+               offset = ((low & MASK_BLKPTR_LO) >> 21);
+               if (offset)
+                       addr = MCG_XBLK_ADDR + offset;
+               break;
+       default:
+               addr = ++current_addr;
+       }
+       return addr;
+}
+
+static int
+prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+                       int offset, u32 misc_high)
+{
+       unsigned int cpu = smp_processor_id();
+       struct threshold_block b;
+       int new;
+
+       if (!block)
+               per_cpu(bank_map, cpu) |= (1 << bank);
+
+       memset(&b, 0, sizeof(b));
+       b.cpu                   = cpu;
+       b.bank                  = bank;
+       b.block                 = block;
+       b.address               = addr;
+       b.interrupt_capable     = lvt_interrupt_supported(bank, misc_high);
+
+       if (!b.interrupt_capable)
+               goto done;
+
+       b.interrupt_enable = 1;
+
+       if (mce_flags.smca) {
+               u32 smca_low, smca_high;
+               u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+               if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
+                       smca_high |= SMCA_MCAX_EN_OFF;
+                       wrmsr(smca_addr, smca_low, smca_high);
+               }
+
+               /* Gather LVT offset for thresholding: */
+               if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+                       goto out;
+
+               new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+       } else {
+               new = (misc_high & MASK_LVTOFF_HI) >> 20;
+       }
+
+       offset = setup_APIC_mce_threshold(offset, new);
+
+       if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt))
+               mce_threshold_vector = amd_threshold_interrupt;
+
+done:
+       mce_threshold_block_init(&b, offset);
+
+out:
+       return offset;
+}
+
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
-       struct threshold_block b;
-       unsigned int cpu = smp_processor_id();
        u32 low = 0, high = 0, address = 0;
        unsigned int bank, block;
-       int offset = -1, new;
+       int offset = -1;
 
        for (bank = 0; bank < mca_cfg.banks; ++bank) {
                for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0)
-                               address = MSR_IA32_MCx_MISC(bank);
-                       else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-
-                               address += MCG_XBLK_ADDR;
-                       } else
-                               ++address;
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
 
                        if (rdmsr_safe(address, &low, &high))
                                break;
@@ -279,29 +422,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                             (high & MASK_LOCKED_HI))
                                continue;
 
-                       if (!block)
-                               per_cpu(bank_map, cpu) |= (1 << bank);
-
-                       memset(&b, 0, sizeof(b));
-                       b.cpu                   = cpu;
-                       b.bank                  = bank;
-                       b.block                 = block;
-                       b.address               = address;
-                       b.interrupt_capable     = lvt_interrupt_supported(bank, high);
-
-                       if (!b.interrupt_capable)
-                               goto init;
-
-                       b.interrupt_enable = 1;
-                       new     = (high & MASK_LVTOFF_HI) >> 20;
-                       offset  = setup_APIC_mce_threshold(offset, new);
-
-                       if ((offset == new) &&
-                           (mce_threshold_vector != amd_threshold_interrupt))
-                               mce_threshold_vector = amd_threshold_interrupt;
-
-init:
-                       mce_threshold_block_init(&b, offset);
+                       offset = prepare_threshold_block(bank, block, address, offset, high);
                }
        }
 
@@ -394,16 +515,9 @@ static void amd_threshold_interrupt(void)
                if (!(per_cpu(bank_map, cpu) & (1 << bank)))
                        continue;
                for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0) {
-                               address = MSR_IA32_MCx_MISC(bank);
-                       } else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-                               address += MCG_XBLK_ADDR;
-                       } else {
-                               ++address;
-                       }
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
 
                        if (rdmsr_safe(address, &low, &high))
                                break;
@@ -623,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
        if (err)
                goto out_free;
 recurse:
-       if (!block) {
-               address = (low & MASK_BLKPTR_LO) >> 21;
-               if (!address)
-                       return 0;
-               address += MCG_XBLK_ADDR;
-       } else {
-               ++address;
-       }
+       address = get_block_address(address, low, high, bank, ++block);
+       if (!address)
+               return 0;
 
-       err = allocate_threshold_blocks(cpu, bank, ++block, address);
+       err = allocate_threshold_blocks(cpu, bank, block, address);
        if (err)
                goto out_free;
 
index 12402e10aeffda428821dd6dcb88597203e48054..2a0717bf803372d3968dca9d2b2e6c81740d946d 100644 (file)
@@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
        rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
        rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
 
-       printk(KERN_EMERG
-               "CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
-               smp_processor_id(), loaddr, lotype);
+       pr_emerg("CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
+                smp_processor_id(), loaddr, lotype);
 
        if (lotype & (1<<5)) {
-               printk(KERN_EMERG
-                       "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
-                       smp_processor_id());
+               pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+                        smp_processor_id());
        }
 
        add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
        /* Read registers before enabling: */
        rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
        rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
-       printk(KERN_INFO
-              "Intel old style machine check architecture supported.\n");
+       pr_info("Intel old style machine check architecture supported.\n");
 
        /* Enable MCE: */
        cr4_set_bits(X86_CR4_MCE);
-       printk(KERN_INFO
-              "Intel old style machine check reporting enabled on CPU#%d.\n",
-              smp_processor_id());
+       pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+               smp_processor_id());
 }
index 2c5aaf8c2e2f3dcc94d348dfe91da6d7d5000ae9..0b445c2ff735d44fedc469fd8ce0c1b8b1f1633b 100644 (file)
@@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level)
        /* if we just entered the thermal event */
        if (new_event) {
                if (event == THERMAL_THROTTLING_EVENT)
-                       printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+                       pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
                                this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package",
                                state->count);
@@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level)
        }
        if (old_event) {
                if (event == THERMAL_THROTTLING_EVENT)
-                       printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
-                               this_cpu,
+                       pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package");
                return 1;
        }
@@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void)
 
 static void unexpected_thermal_interrupt(void)
 {
-       printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
-                       smp_processor_id());
+       pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+               smp_processor_id());
 }
 
 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 
        if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
                if (system_state == SYSTEM_BOOTING)
-                       printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+                       pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
                return;
        }
 
@@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
        l = apic_read(APIC_LVTTHMR);
        apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 
-       printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
-                      tm2 ? "TM2" : "TM1");
+       pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+                     tm2 ? "TM2" : "TM1");
 
        /* enable thermal throttle processing */
        atomic_set(&therm_throt_en, 1);
index 7245980186eea047e643af5010e1509bfc991632..fcf9ae9384f4cb4693d67cc8ee6433562dfc9f34 100644 (file)
@@ -12,8 +12,8 @@
 
 static void default_threshold_interrupt(void)
 {
-       printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
-                        THRESHOLD_APIC_VECTOR);
+       pr_err("Unexpected threshold interrupt at vector %x\n",
+               THRESHOLD_APIC_VECTOR);
 }
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
index 01dd8702880b7f2db9fcdc3874ba8809c2ce9b80..c6a722e1d011458fa30ec5ad3ad4e78c58d2fa82 100644 (file)
@@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
        ist_enter(regs);
 
-       printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
+       pr_emerg("CPU0: Machine Check Exception.\n");
        add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
        ist_exit(regs);
@@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
 
        cr4_set_bits(X86_CR4_MCE);
 
-       printk(KERN_INFO
-              "Winchip machine check reporting enabled on CPU#0.\n");
+       pr_info("Winchip machine check reporting enabled on CPU#0.\n");
 }
index 2233f8a766156891a52b9a7658f04efeaf4f86c8..75d3aab5f7b243623ca311c4c23bf07ef2e26ca4 100644 (file)
@@ -953,7 +953,7 @@ struct microcode_ops * __init init_amd_microcode(void)
        struct cpuinfo_x86 *c = &boot_cpu_data;
 
        if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-               pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+               pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
                return NULL;
        }
 
index 20e242ea1bc46b5f5828c7b95071d920853b7609..4e7c6933691cc8de42aa4a82473482127bd0faab 100644 (file)
@@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void)
        ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
        ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-       printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
-              ms_hyperv.features, ms_hyperv.hints);
+       pr_info("HyperV: features 0x%x, hints 0x%x\n",
+               ms_hyperv.features, ms_hyperv.hints);
 
 #ifdef CONFIG_X86_LOCAL_APIC
        if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
@@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void)
                rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
                hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
                lapic_timer_frequency = hv_lapic_frequency;
-               printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
-                               lapic_timer_frequency);
+               pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
+                       lapic_timer_frequency);
        }
 #endif
 
index 316fe3e60a9764e479d4e49d01346c7921763ee8..3d689937fc1b13974a0a8a5edd31ddda0f6401b2 100644 (file)
@@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
         */
        if (type != MTRR_TYPE_WRCOMB &&
            (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
-               pr_warning("mtrr: only write-combining%s supported\n",
+               pr_warn("mtrr: only write-combining%s supported\n",
                           centaur_mcr_type ? " and uncacheable are" : " is");
                return -EINVAL;
        }
index 0d98503c2245aab81283526c062f746650b99c32..31e951ce6dff33782c9610c7b43898181d59d637 100644 (file)
@@ -57,9 +57,9 @@ static int __initdata                         nr_range;
 static struct var_mtrr_range_state __initdata  range_state[RANGE_NUM];
 
 static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
+#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
 
-#define BIOS_BUG_MSG KERN_WARNING \
+#define BIOS_BUG_MSG \
        "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
 
 static int __init
@@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                                                base, base + size);
        }
        if (debug_print) {
-               printk(KERN_DEBUG "After WB checking\n");
+               pr_debug("After WB checking\n");
                for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                 range[i].start, range[i].end);
        }
 
@@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                    (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
                    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
                        /* Var MTRR contains UC entry below 1M? Skip it: */
-                       printk(BIOS_BUG_MSG, i);
+                       pr_warn(BIOS_BUG_MSG, i);
                        if (base + size <= (1<<(20-PAGE_SHIFT)))
                                continue;
                        size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                                 extra_remove_base + extra_remove_size);
 
        if  (debug_print) {
-               printk(KERN_DEBUG "After UC checking\n");
+               pr_debug("After UC checking\n");
                for (i = 0; i < RANGE_NUM; i++) {
                        if (!range[i].end)
                                continue;
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                 range[i].start, range[i].end);
                }
        }
@@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
        /* sort the ranges */
        nr_range = clean_sort_range(range, RANGE_NUM);
        if  (debug_print) {
-               printk(KERN_DEBUG "After sorting\n");
+               pr_debug("After sorting\n");
                for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                 range[i].start, range[i].end);
        }
 
@@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void)
                start_base = to_size_factor(start_base, &start_factor),
                type = range_state[i].type;
 
-               printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+               pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
                        i, start_base, start_factor,
                        size_base, size_factor,
                        (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                return 0;
 
        /* Print original var MTRRs at first, for debugging: */
-       printk(KERN_DEBUG "original variable MTRRs\n");
+       pr_debug("original variable MTRRs\n");
        print_out_mtrr_range_state();
 
        memset(range, 0, sizeof(range));
@@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                                          x_remove_base, x_remove_size);
 
        range_sums = sum_ranges(range, nr_range);
-       printk(KERN_INFO "total RAM covered: %ldM\n",
+       pr_info("total RAM covered: %ldM\n",
               range_sums >> (20 - PAGE_SHIFT));
 
        if (mtrr_chunk_size && mtrr_gran_size) {
@@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits)
 
                if (!result[i].bad) {
                        set_var_mtrr_all(address_bits);
-                       printk(KERN_DEBUG "New variable MTRRs\n");
+                       pr_debug("New variable MTRRs\n");
                        print_out_mtrr_range_state();
                        return 1;
                }
-               printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-                      "will find optimal one\n");
+               pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
        }
 
        i = 0;
@@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                                      x_remove_base, x_remove_size, i);
                        if (debug_print) {
                                mtrr_print_out_one_result(i);
-                               printk(KERN_INFO "\n");
+                               pr_info("\n");
                        }
 
                        i++;
@@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits)
        index_good = mtrr_search_optimal_index();
 
        if (index_good != -1) {
-               printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+               pr_info("Found optimal setting for mtrr clean up\n");
                i = index_good;
                mtrr_print_out_one_result(i);
 
@@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                gran_size <<= 10;
                x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
                set_var_mtrr_all(address_bits);
-               printk(KERN_DEBUG "New variable MTRRs\n");
+               pr_debug("New variable MTRRs\n");
                print_out_mtrr_range_state();
                return 1;
        } else {
@@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits)
                        mtrr_print_out_one_result(i);
        }
 
-       printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
-       printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+       pr_info("mtrr_cleanup: can not find optimal value\n");
+       pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
 
        return 0;
 }
@@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 
        /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
        if (!highest_pfn) {
-               printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+               pr_info("CPU MTRRs all blank - virtualized system.\n");
                return 0;
        }
 
@@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
                                                         end_pfn);
 
        if (total_trim_size) {
-               pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
+               pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+                       total_trim_size >> 20);
 
                if (!changed_by_mtrr_cleanup)
                        WARN_ON(1);
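As a stand-alone illustration of the printk() to pr_*() conversion applied throughout these MTRR files (not kernel code: printk is replaced by fprintf, and the "mtrr: " pr_fmt() prefix is only a hypothetical example of what pr_fmt() permits, not necessarily what these files define):

#include <stdio.h>

#define KERN_WARNING "[warn] "
#define KERN_INFO    "[info] "
#define KERN_DEBUG   "[dbg]  "

/* pr_fmt() lets a file prepend a common prefix to every message */
#define pr_fmt(fmt) "mtrr: " fmt

#define pr_warn(fmt, ...)  fprintf(stderr, KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...)  fprintf(stderr, KERN_INFO    pr_fmt(fmt), ##__VA_ARGS__)
#define pr_debug(fmt, ...) fprintf(stderr, KERN_DEBUG   pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("total RAM covered: %ldM\n", 4096L);
	pr_warn("base(0x%lx000) is not 4 MiB aligned\n", 0x123L);
	pr_debug("After UC checking\n");
	return 0;
}

One behavioural note: in the kernel, pr_debug() normally compiles to nothing (or routes through dynamic debug) unless DEBUG or CONFIG_DYNAMIC_DEBUG is in effect, whereas printk(KERN_DEBUG ...) always lands in the log buffer, so the debug_print-guarded messages above become doubly gated after this conversion.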
index c870af1610083ec3dda7cb61b966860c9a224374..fcbcb2f678ca47360d23623887d216165969d6e8 100644 (file)
@@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void)
 
        rdmsr(MSR_K8_SYSCFG, lo, hi);
        if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
-               printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+               pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
                       " not cleared by BIOS, clearing this bit\n",
                       smp_processor_id());
                lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
@@ -501,14 +501,14 @@ void __init mtrr_state_warn(void)
        if (!mask)
                return;
        if (mask & MTRR_CHANGE_MASK_FIXED)
-               pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
        if (mask & MTRR_CHANGE_MASK_VARIABLE)
-               pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
        if (mask & MTRR_CHANGE_MASK_DEFTYPE)
-               pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
 
-       printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
-       printk(KERN_INFO "mtrr: corrected configuration.\n");
+       pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+       pr_info("mtrr: corrected configuration.\n");
 }
 
 /*
@@ -519,8 +519,7 @@ void __init mtrr_state_warn(void)
 void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
 {
        if (wrmsr_safe(msr, a, b) < 0) {
-               printk(KERN_ERR
-                       "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+               pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
                        smp_processor_id(), msr, a, b);
        }
 }
@@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
                tmp |= ~((1ULL<<(hi - 1)) - 1);
 
                if (tmp != mask) {
-                       printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+                       pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
                        add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
                        mask = tmp;
                }
@@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
            boot_cpu_data.x86_model == 1 &&
            boot_cpu_data.x86_mask <= 7) {
                if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
-                       pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+                       pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
                        return -EINVAL;
                }
                if (!(base + size < 0x70000 || base > 0x7003F) &&
                    (type == MTRR_TYPE_WRCOMB
                     || type == MTRR_TYPE_WRBACK)) {
-                       pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+                       pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
                        return -EINVAL;
                }
        }
@@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
             lbase = lbase >> 1, last = last >> 1)
                ;
        if (lbase != last) {
-               pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+               pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
                return -EINVAL;
        }
        return 0;
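The open-coded loop in generic_validate_add_page() above checks that a request is a power-of-two number of pages sitting on a matching boundary. A stand-alone sketch of the same test (not kernel code; base and size are page frame counts):

#include <stdio.h>

static int range_ok(unsigned long base, unsigned long size)
{
	unsigned long lbase, last;

	/* shift both ends down while base looks aligned and the end looks
	 * like the last page of a power-of-two block */
	for (lbase = base, last = base + size - 1;
	     !(lbase & 1) && (last & 1);
	     lbase >>= 1, last >>= 1)
		;
	return lbase == last;
}

int main(void)
{
	/* 4 MiB at a 4 MiB boundary: accepted */
	printf("base 0x400 size 0x400: %s\n", range_ok(0x400, 0x400) ? "ok" : "rejected");
	/* size is not a power of two: rejected */
	printf("base 0x400 size 0x300: %s\n", range_ok(0x400, 0x300) ? "ok" : "rejected");
	/* 4 MiB block on a 2 MiB boundary: rejected */
	printf("base 0x200 size 0x400: %s\n", range_ok(0x200, 0x400) ? "ok" : "rejected");
	return 0;
}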
index 74f1d90f9c298d7051cff46c24736b8b6cfda7fc..10f8d4796240709cea61c0e56522d75eeff8ffdd 100644 (file)
@@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                return error;
 
        if (type >= MTRR_NUM_TYPES) {
-               pr_warning("mtrr: type: %u invalid\n", type);
+               pr_warn("mtrr: type: %u invalid\n", type);
                return -EINVAL;
        }
 
        /* If the type is WC, check that this processor supports it */
        if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
-               pr_warning("mtrr: your processor doesn't support write-combining\n");
+               pr_warn("mtrr: your processor doesn't support write-combining\n");
                return -ENOSYS;
        }
 
        if (!size) {
-               pr_warning("mtrr: zero sized request\n");
+               pr_warn("mtrr: zero sized request\n");
                return -EINVAL;
        }
 
        if ((base | (base + size - 1)) >>
            (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
-               pr_warning("mtrr: base or size exceeds the MTRR width\n");
+               pr_warn("mtrr: base or size exceeds the MTRR width\n");
                return -EINVAL;
        }
 
@@ -348,7 +348,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                                } else if (types_compatible(type, ltype))
                                        continue;
                        }
-                       pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
+                       pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
                                " 0x%lx000,0x%lx000\n", base, size, lbase,
                                lsize);
                        goto out;
@@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                if (ltype != type) {
                        if (types_compatible(type, ltype))
                                continue;
-                       pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+                       pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
                                base, size, mtrr_attrib_to_str(ltype),
                                mtrr_attrib_to_str(type));
                        goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 static int mtrr_check(unsigned long base, unsigned long size)
 {
        if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-               pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
+               pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
                pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
                dump_stack();
                return -1;
@@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
                }
        }
        if (reg >= max) {
-               pr_warning("mtrr: register: %d too big\n", reg);
+               pr_warn("mtrr: register: %d too big\n", reg);
                goto out;
        }
        mtrr_if->get(reg, &lbase, &lsize, &ltype);
        if (lsize < 1) {
-               pr_warning("mtrr: MTRR %d not used\n", reg);
+               pr_warn("mtrr: MTRR %d not used\n", reg);
                goto out;
        }
        if (mtrr_usage_table[reg] < 1) {
-               pr_warning("mtrr: reg: %d has count=0\n", reg);
+               pr_warn("mtrr: reg: %d has count=0\n", reg);
                goto out;
        }
        if (--mtrr_usage_table[reg] < 1)
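The "base or size exceeds the MTRR width" test in mtrr_add_page() above works by OR-ing the first and the last page frame of the request and shifting away the CPU's physical address width. A stand-alone sketch (not kernel code; x86_phys_bits is assumed to be 36 here):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PHYS_BITS	36	/* assumption for the example: 64 GiB of physical address space */

static int fits(unsigned long base, unsigned long size)	/* both in pages */
{
	return !((base | (base + size - 1)) >> (PHYS_BITS - PAGE_SHIFT));
}

int main(void)
{
	/* 1 GiB starting at 60 GiB: last PFN is below 1 << 24, accepted */
	printf("base 0xF00000, 1 GiB: %s\n",
	       fits(0xF00000, 1UL << (30 - PAGE_SHIFT)) ? "ok" : "exceeds width");
	/* one page starting exactly at 64 GiB: rejected */
	printf("base 0x1000000, 4 KiB: %s\n",
	       fits(0x1000000, 1) ? "ok" : "exceeds width");
	return 0;
}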
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
deleted file mode 100644 (file)
index 1b443db..0000000
+++ /dev/null
@@ -1,2428 +0,0 @@
-/*
- * Performance events x86 architecture code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *  Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/bitops.h>
-#include <linux/device.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/alternative.h>
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-#include <asm/timer.h>
-#include <asm/desc.h>
-#include <asm/ldt.h>
-
-#include "perf_event.h"
-
-struct x86_pmu x86_pmu __read_mostly;
-
-DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
-       .enabled = 1,
-};
-
-struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
-
-u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-u64 __read_mostly hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-/*
- * Propagate event elapsed time into the generic event.
- * Can only be executed on the CPU where the event is active.
- * Returns the delta events processed.
- */
-u64 x86_perf_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int shift = 64 - x86_pmu.cntval_bits;
-       u64 prev_raw_count, new_raw_count;
-       int idx = hwc->idx;
-       s64 delta;
-
-       if (idx == INTEL_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * Careful: an NMI might modify the previous event value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic event atomically:
-        */
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       rdpmcl(hwc->event_base_rdpmc, new_raw_count);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               goto again;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-       local64_sub(delta, &hwc->period_left);
-
-       return new_raw_count;
-}
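/*
 * Stand-alone illustration (not kernel code) of the shift trick used in
 * x86_perf_event_update() above: with cntval_bits == 48 the two raw reads are
 * shifted up by 16 so that a counter wrap between them still yields the
 * correct, positive delta after the arithmetic shift back down (an arithmetic
 * right shift of a negative value is assumed, as the kernel does).
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int shift = 64 - 48;			/* 64 - cntval_bits */
	uint64_t prev_raw = 0x0000fffffffffff0ULL;	/* read just before the wrap */
	uint64_t new_raw  = 0x0000000000000010ULL;	/* read just after the wrap  */
	int64_t delta = (int64_t)((new_raw << shift) - (prev_raw << shift));

	delta >>= shift;		/* arithmetic shift restores the sign */
	printf("delta = %lld events\n", (long long)delta);	/* prints 32 */
	return 0;
}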
-
-/*
- * Find and validate any extra registers to set up.
- */
-static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-       struct extra_reg *er;
-
-       reg = &event->hw.extra_reg;
-
-       if (!x86_pmu.extra_regs)
-               return 0;
-
-       for (er = x86_pmu.extra_regs; er->msr; er++) {
-               if (er->event != (config & er->config_mask))
-                       continue;
-               if (event->attr.config1 & ~er->valid_mask)
-                       return -EINVAL;
-               /* Check if the extra MSRs can be safely accessed */
-               if (!er->extra_msr_access)
-                       return -ENXIO;
-
-               reg->idx = er->idx;
-               reg->config = event->attr.config1;
-               reg->reg = er->msr;
-               break;
-       }
-       return 0;
-}
-
-static atomic_t active_events;
-static atomic_t pmc_refcount;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-static bool reserve_pmc_hardware(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
-                       goto perfctr_fail;
-       }
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
-                       goto eventsel_fail;
-       }
-
-       return true;
-
-eventsel_fail:
-       for (i--; i >= 0; i--)
-               release_evntsel_nmi(x86_pmu_config_addr(i));
-
-       i = x86_pmu.num_counters;
-
-perfctr_fail:
-       for (i--; i >= 0; i--)
-               release_perfctr_nmi(x86_pmu_event_addr(i));
-
-       return false;
-}
-
-static void release_pmc_hardware(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               release_perfctr_nmi(x86_pmu_event_addr(i));
-               release_evntsel_nmi(x86_pmu_config_addr(i));
-       }
-}
-
-#else
-
-static bool reserve_pmc_hardware(void) { return true; }
-static void release_pmc_hardware(void) {}
-
-#endif
-
-static bool check_hw_exists(void)
-{
-       u64 val, val_fail, val_new= ~0;
-       int i, reg, reg_fail, ret = 0;
-       int bios_fail = 0;
-       int reg_safe = -1;
-
-       /*
-        * Check to see if the BIOS enabled any of the counters, if so
-        * complain and bail.
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               reg = x86_pmu_config_addr(i);
-               ret = rdmsrl_safe(reg, &val);
-               if (ret)
-                       goto msr_fail;
-               if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
-                       bios_fail = 1;
-                       val_fail = val;
-                       reg_fail = reg;
-               } else {
-                       reg_safe = i;
-               }
-       }
-
-       if (x86_pmu.num_counters_fixed) {
-               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               ret = rdmsrl_safe(reg, &val);
-               if (ret)
-                       goto msr_fail;
-               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
-                       if (val & (0x03 << i*4)) {
-                               bios_fail = 1;
-                               val_fail = val;
-                               reg_fail = reg;
-                       }
-               }
-       }
-
-       /*
-        * If all the counters are enabled, the below test will always
-        * fail.  The tools will also become useless in this scenario.
-        * Just fail and disable the hardware counters.
-        */
-
-       if (reg_safe == -1) {
-               reg = reg_safe;
-               goto msr_fail;
-       }
-
-       /*
-        * Read the current value, change it and read it back to see if it
-        * matches, this is needed to detect certain hardware emulators
-        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
-        */
-       reg = x86_pmu_event_addr(reg_safe);
-       if (rdmsrl_safe(reg, &val))
-               goto msr_fail;
-       val ^= 0xffffUL;
-       ret = wrmsrl_safe(reg, val);
-       ret |= rdmsrl_safe(reg, &val_new);
-       if (ret || val != val_new)
-               goto msr_fail;
-
-       /*
-        * We still allow the PMU driver to operate:
-        */
-       if (bios_fail) {
-               printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
-               printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
-       }
-
-       return true;
-
-msr_fail:
-       printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
-       printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
-               boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
-               reg, val_new);
-
-       return false;
-}
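/*
 * Stand-alone sketch (not kernel code) of the read/flip/read-back probe that
 * check_hw_exists() above uses to spot emulated counter MSRs which silently
 * ignore writes; fake_msr and msr_ignores_writes are inventions for the
 * example, standing in for a perfctr MSR and a hypervisor that drops writes.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t fake_msr;
static int msr_ignores_writes;

static void wrmsr_fake(uint64_t val) { if (!msr_ignores_writes) fake_msr = val; }
static uint64_t rdmsr_fake(void)     { return fake_msr; }

static int counter_works(void)
{
	uint64_t val = rdmsr_fake(), val_new;

	val ^= 0xffffUL;		/* flip some low bits ...           */
	wrmsr_fake(val);
	val_new = rdmsr_fake();		/* ... and check whether they stuck */
	return val == val_new;
}

int main(void)
{
	printf("writable counter: %s\n", counter_works() ? "ok" : "broken");
	msr_ignores_writes = 1;
	fake_msr = 0;
	printf("emulated counter: %s\n", counter_works() ? "ok" : "broken");
	return 0;
}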
-
-static void hw_perf_event_destroy(struct perf_event *event)
-{
-       x86_release_hardware();
-       atomic_dec(&active_events);
-}
-
-void hw_perf_lbr_event_destroy(struct perf_event *event)
-{
-       hw_perf_event_destroy(event);
-
-       /* undo the lbr/bts event accounting */
-       x86_del_exclusive(x86_lbr_exclusive_lbr);
-}
-
-static inline int x86_pmu_initialized(void)
-{
-       return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
-{
-       struct perf_event_attr *attr = &event->attr;
-       unsigned int cache_type, cache_op, cache_result;
-       u64 config, val;
-
-       config = attr->config;
-
-       cache_type = (config >>  0) & 0xff;
-       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
-               return -EINVAL;
-
-       cache_op = (config >>  8) & 0xff;
-       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
-               return -EINVAL;
-
-       cache_result = (config >> 16) & 0xff;
-       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-               return -EINVAL;
-
-       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
-       if (val == 0)
-               return -ENOENT;
-
-       if (val == -1)
-               return -EINVAL;
-
-       hwc->config |= val;
-       attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
-       return x86_pmu_extra_regs(val, event);
-}
-
-int x86_reserve_hardware(void)
-{
-       int err = 0;
-
-       if (!atomic_inc_not_zero(&pmc_refcount)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&pmc_refcount) == 0) {
-                       if (!reserve_pmc_hardware())
-                               err = -EBUSY;
-                       else
-                               reserve_ds_buffers();
-               }
-               if (!err)
-                       atomic_inc(&pmc_refcount);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-
-       return err;
-}
-
-void x86_release_hardware(void)
-{
-       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
-               release_pmc_hardware();
-               release_ds_buffers();
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-}
-
-/*
- * Check whether we can create an event of a certain type (i.e. that no
- * conflicting events are present).

- */
-int x86_add_exclusive(unsigned int what)
-{
-       int i;
-
-       if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
-               mutex_lock(&pmc_reserve_mutex);
-               for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
-                       if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
-                               goto fail_unlock;
-               }
-               atomic_inc(&x86_pmu.lbr_exclusive[what]);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-
-       atomic_inc(&active_events);
-       return 0;
-
-fail_unlock:
-       mutex_unlock(&pmc_reserve_mutex);
-       return -EBUSY;
-}
-
-void x86_del_exclusive(unsigned int what)
-{
-       atomic_dec(&x86_pmu.lbr_exclusive[what]);
-       atomic_dec(&active_events);
-}
-
-int x86_setup_perfctr(struct perf_event *event)
-{
-       struct perf_event_attr *attr = &event->attr;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       if (!is_sampling_event(event)) {
-               hwc->sample_period = x86_pmu.max_period;
-               hwc->last_period = hwc->sample_period;
-               local64_set(&hwc->period_left, hwc->sample_period);
-       }
-
-       if (attr->type == PERF_TYPE_RAW)
-               return x86_pmu_extra_regs(event->attr.config, event);
-
-       if (attr->type == PERF_TYPE_HW_CACHE)
-               return set_ext_hw_attr(hwc, event);
-
-       if (attr->config >= x86_pmu.max_events)
-               return -EINVAL;
-
-       /*
-        * The generic map:
-        */
-       config = x86_pmu.event_map(attr->config);
-
-       if (config == 0)
-               return -ENOENT;
-
-       if (config == -1LL)
-               return -EINVAL;
-
-       /*
-        * Branch tracing:
-        */
-       if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-           !attr->freq && hwc->sample_period == 1) {
-               /* BTS is not supported by this architecture. */
-               if (!x86_pmu.bts_active)
-                       return -EOPNOTSUPP;
-
-               /* BTS is currently only allowed for user-mode. */
-               if (!attr->exclude_kernel)
-                       return -EOPNOTSUPP;
-
-               /* disallow bts if conflicting events are present */
-               if (x86_add_exclusive(x86_lbr_exclusive_lbr))
-                       return -EBUSY;
-
-               event->destroy = hw_perf_lbr_event_destroy;
-       }
-
-       hwc->config |= config;
-
-       return 0;
-}
-
-/*
- * Check that branch_sample_type is compatible with the
- * settings needed for precise_ip > 1, which implies
- * using the LBR to capture ALL taken branches at the
- * priv levels of the measurement.
- */
-static inline int precise_br_compat(struct perf_event *event)
-{
-       u64 m = event->attr.branch_sample_type;
-       u64 b = 0;
-
-       /* must capture all branches */
-       if (!(m & PERF_SAMPLE_BRANCH_ANY))
-               return 0;
-
-       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
-
-       if (!event->attr.exclude_user)
-               b |= PERF_SAMPLE_BRANCH_USER;
-
-       if (!event->attr.exclude_kernel)
-               b |= PERF_SAMPLE_BRANCH_KERNEL;
-
-       /*
-        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
-        */
-
-       return m == b;
-}
-
-int x86_pmu_hw_config(struct perf_event *event)
-{
-       if (event->attr.precise_ip) {
-               int precise = 0;
-
-               /* Support for constant skid */
-               if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
-                       precise++;
-
-                       /* Support for IP fixup */
-                       if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
-                               precise++;
-
-                       if (x86_pmu.pebs_prec_dist)
-                               precise++;
-               }
-
-               if (event->attr.precise_ip > precise)
-                       return -EOPNOTSUPP;
-       }
-       /*
-        * check that PEBS LBR correction does not conflict with
-        * whatever the user is asking with attr->branch_sample_type
-        */
-       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
-               u64 *br_type = &event->attr.branch_sample_type;
-
-               if (has_branch_stack(event)) {
-                       if (!precise_br_compat(event))
-                               return -EOPNOTSUPP;
-
-                       /* branch_sample_type is compatible */
-
-               } else {
-                       /*
-                        * user did not specify  branch_sample_type
-                        *
-                        * For PEBS fixups, we capture all
-                        * the branches at the priv level of the
-                        * event.
-                        */
-                       *br_type = PERF_SAMPLE_BRANCH_ANY;
-
-                       if (!event->attr.exclude_user)
-                               *br_type |= PERF_SAMPLE_BRANCH_USER;
-
-                       if (!event->attr.exclude_kernel)
-                               *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
-               }
-       }
-
-       if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
-               event->attach_state |= PERF_ATTACH_TASK_DATA;
-
-       /*
-        * Generate PMC IRQs:
-        * (keep 'enabled' bit clear for now)
-        */
-       event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
-
-       /*
-        * Count user and OS events unless requested not to
-        */
-       if (!event->attr.exclude_user)
-               event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
-       if (!event->attr.exclude_kernel)
-               event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
-
-       if (event->attr.type == PERF_TYPE_RAW)
-               event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
-
-       if (event->attr.sample_period && x86_pmu.limit_period) {
-               if (x86_pmu.limit_period(event, event->attr.sample_period) >
-                               event->attr.sample_period)
-                       return -EINVAL;
-       }
-
-       return x86_setup_perfctr(event);
-}
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __x86_pmu_event_init(struct perf_event *event)
-{
-       int err;
-
-       if (!x86_pmu_initialized())
-               return -ENODEV;
-
-       err = x86_reserve_hardware();
-       if (err)
-               return err;
-
-       atomic_inc(&active_events);
-       event->destroy = hw_perf_event_destroy;
-
-       event->hw.idx = -1;
-       event->hw.last_cpu = -1;
-       event->hw.last_tag = ~0ULL;
-
-       /* mark unused */
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
-       return x86_pmu.hw_config(event);
-}
-
-void x86_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               u64 val;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               rdmsrl(x86_pmu_config_addr(idx), val);
-               if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
-                       continue;
-               val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-               wrmsrl(x86_pmu_config_addr(idx), val);
-       }
-}
-
-static void x86_pmu_disable(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (!x86_pmu_initialized())
-               return;
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->n_added = 0;
-       cpuc->enabled = 0;
-       barrier();
-
-       x86_pmu.disable_all();
-}
-
-void x86_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-       }
-}
-
-static struct pmu pmu;
-
-static inline int is_x86_event(struct perf_event *event)
-{
-       return event->pmu == &pmu;
-}
-
-/*
- * Event scheduler state:
- *
- * Assign events iterating over all events and counters, beginning
- * with events with least weights first. Keep the current iterator
- * state in struct sched_state.
- */
-struct sched_state {
-       int     weight;
-       int     event;          /* event index */
-       int     counter;        /* counter index */
-       int     unassigned;     /* number of events to be assigned left */
-       int     nr_gp;          /* number of GP counters used */
-       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-};
-
-/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
-#define        SCHED_STATES_MAX        2
-
-struct perf_sched {
-       int                     max_weight;
-       int                     max_events;
-       int                     max_gp;
-       int                     saved_states;
-       struct event_constraint **constraints;
-       struct sched_state      state;
-       struct sched_state      saved[SCHED_STATES_MAX];
-};
-
-/*
- * Initialize the iterator that runs through all events and counters.
- */
-static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
-                           int num, int wmin, int wmax, int gpmax)
-{
-       int idx;
-
-       memset(sched, 0, sizeof(*sched));
-       sched->max_events       = num;
-       sched->max_weight       = wmax;
-       sched->max_gp           = gpmax;
-       sched->constraints      = constraints;
-
-       for (idx = 0; idx < num; idx++) {
-               if (constraints[idx]->weight == wmin)
-                       break;
-       }
-
-       sched->state.event      = idx;          /* start with min weight */
-       sched->state.weight     = wmin;
-       sched->state.unassigned = num;
-}
-
-static void perf_sched_save_state(struct perf_sched *sched)
-{
-       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
-               return;
-
-       sched->saved[sched->saved_states] = sched->state;
-       sched->saved_states++;
-}
-
-static bool perf_sched_restore_state(struct perf_sched *sched)
-{
-       if (!sched->saved_states)
-               return false;
-
-       sched->saved_states--;
-       sched->state = sched->saved[sched->saved_states];
-
-       /* continue with next counter: */
-       clear_bit(sched->state.counter++, sched->state.used);
-
-       return true;
-}
-
-/*
- * Select a counter for the current event to schedule. Return true on
- * success.
- */
-static bool __perf_sched_find_counter(struct perf_sched *sched)
-{
-       struct event_constraint *c;
-       int idx;
-
-       if (!sched->state.unassigned)
-               return false;
-
-       if (sched->state.event >= sched->max_events)
-               return false;
-
-       c = sched->constraints[sched->state.event];
-       /* Prefer fixed purpose counters */
-       if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
-               idx = INTEL_PMC_IDX_FIXED;
-               for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
-                       if (!__test_and_set_bit(idx, sched->state.used))
-                               goto done;
-               }
-       }
-
-       /* Grab the first unused counter starting with idx */
-       idx = sched->state.counter;
-       for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
-               if (!__test_and_set_bit(idx, sched->state.used)) {
-                       if (sched->state.nr_gp++ >= sched->max_gp)
-                               return false;
-
-                       goto done;
-               }
-       }
-
-       return false;
-
-done:
-       sched->state.counter = idx;
-
-       if (c->overlap)
-               perf_sched_save_state(sched);
-
-       return true;
-}
-
-static bool perf_sched_find_counter(struct perf_sched *sched)
-{
-       while (!__perf_sched_find_counter(sched)) {
-               if (!perf_sched_restore_state(sched))
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * Go through all unassigned events and find the next one to schedule.
- * Take events with the least weight first. Return true on success.
- */
-static bool perf_sched_next_event(struct perf_sched *sched)
-{
-       struct event_constraint *c;
-
-       if (!sched->state.unassigned || !--sched->state.unassigned)
-               return false;
-
-       do {
-               /* next event */
-               sched->state.event++;
-               if (sched->state.event >= sched->max_events) {
-                       /* next weight */
-                       sched->state.event = 0;
-                       sched->state.weight++;
-                       if (sched->state.weight > sched->max_weight)
-                               return false;
-               }
-               c = sched->constraints[sched->state.event];
-       } while (c->weight != sched->state.weight);
-
-       sched->state.counter = 0;       /* start with first counter */
-
-       return true;
-}
-
-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int gpmax, int *assign)
-{
-       struct perf_sched sched;
-
-       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
-
-       do {
-               if (!perf_sched_find_counter(&sched))
-                       break;  /* failed */
-               if (assign)
-                       assign[sched.state.event] = sched.state.counter;
-       } while (perf_sched_next_event(&sched));
-
-       return sched.state.unassigned;
-}
-EXPORT_SYMBOL_GPL(perf_assign_events);
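/*
 * Stand-alone sketch (not kernel code) of the least-weight-first policy that
 * perf_assign_events() above implements: events whose constraints allow the
 * fewest counters are placed first, so flexible events do not occupy the
 * scarce slots.  The real scheduler additionally saves and restores its state
 * to backtrack over overlapping constraints, which this toy omits.
 */
#include <stdio.h>

#define NCTR 4

struct toy_event {
	const char *name;
	unsigned int mask;	/* bit n set => counter n is allowed */
};

int main(void)
{
	struct toy_event ev[] = {
		{ "cycles",       0xf },	/* any of the four counters */
		{ "constrained",  0x2 },	/* only counter 1 is legal  */
		{ "instructions", 0xf },
	};
	unsigned int used = 0;
	int assign[3] = { -1, -1, -1 };
	int w, i, c;

	for (w = 1; w <= NCTR; w++) {		/* weights from low to high */
		for (i = 0; i < 3; i++) {
			if (__builtin_popcount(ev[i].mask) != w)
				continue;
			for (c = 0; c < NCTR; c++) {
				if ((ev[i].mask & (1u << c)) && !(used & (1u << c))) {
					used |= 1u << c;
					assign[i] = c;
					break;
				}
			}
		}
	}
	for (i = 0; i < 3; i++)
		printf("%-13s -> counter %d\n", ev[i].name, assign[i]);
	return 0;
}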
-
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
-       struct event_constraint *c;
-       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       struct perf_event *e;
-       int i, wmin, wmax, unsched = 0;
-       struct hw_perf_event *hwc;
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
-       if (x86_pmu.start_scheduling)
-               x86_pmu.start_scheduling(cpuc);
-
-       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               cpuc->event_constraint[i] = NULL;
-               c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-               cpuc->event_constraint[i] = c;
-
-               wmin = min(wmin, c->weight);
-               wmax = max(wmax, c->weight);
-       }
-
-       /*
-        * fastpath, try to reuse previous register
-        */
-       for (i = 0; i < n; i++) {
-               hwc = &cpuc->event_list[i]->hw;
-               c = cpuc->event_constraint[i];
-
-               /* never assigned */
-               if (hwc->idx == -1)
-                       break;
-
-               /* constraint still honored */
-               if (!test_bit(hwc->idx, c->idxmsk))
-                       break;
-
-               /* not already used */
-               if (test_bit(hwc->idx, used_mask))
-                       break;
-
-               __set_bit(hwc->idx, used_mask);
-               if (assign)
-                       assign[i] = hwc->idx;
-       }
-
-       /* slow path */
-       if (i != n) {
-               int gpmax = x86_pmu.num_counters;
-
-               /*
-                * Do not allow scheduling of more than half the available
-                * generic counters.
-                *
-                * This helps avoid counter starvation of sibling thread by
-                * ensuring at most half the counters cannot be in exclusive
-                * mode. There are no designated counters for the limits. Any
-                * N/2 counters can be used. This helps with events with
-                * specific counter constraints.
-                */
-               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
-                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
-                       gpmax /= 2;
-
-               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-                                            wmax, gpmax, assign);
-       }
-
-       /*
-        * In case of success (unsched = 0), mark events as committed,
-        * so we do not put_constraint() in case new events are added
-        * and fail to be scheduled
-        *
-        * We invoke the lower level commit callback to lock the resource
-        *
-        * We do not need to do all of this in case we are called to
-        * validate an event group (assign == NULL)
-        */
-       if (!unsched && assign) {
-               for (i = 0; i < n; i++) {
-                       e = cpuc->event_list[i];
-                       e->hw.flags |= PERF_X86_EVENT_COMMITTED;
-                       if (x86_pmu.commit_scheduling)
-                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
-               }
-       } else {
-               for (i = 0; i < n; i++) {
-                       e = cpuc->event_list[i];
-                       /*
-                        * do not put_constraint() on committed events,
-                        * because they are good to go
-                        */
-                       if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
-                               continue;
-
-                       /*
-                        * release events that failed scheduling
-                        */
-                       if (x86_pmu.put_event_constraints)
-                               x86_pmu.put_event_constraints(cpuc, e);
-               }
-       }
-
-       if (x86_pmu.stop_scheduling)
-               x86_pmu.stop_scheduling(cpuc);
-
-       return unsched ? -EINVAL : 0;
-}
-
-/*
- * dogrp: true if must collect siblings events (group)
- * returns total number of events and error code
- */
-static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
-{
-       struct perf_event *event;
-       int n, max_count;
-
-       max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
-
-       /* current number of events already accepted */
-       n = cpuc->n_events;
-
-       if (is_x86_event(leader)) {
-               if (n >= max_count)
-                       return -EINVAL;
-               cpuc->event_list[n] = leader;
-               n++;
-       }
-       if (!dogrp)
-               return n;
-
-       list_for_each_entry(event, &leader->sibling_list, group_entry) {
-               if (!is_x86_event(event) ||
-                   event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-
-               if (n >= max_count)
-                       return -EINVAL;
-
-               cpuc->event_list[n] = event;
-               n++;
-       }
-       return n;
-}
-
-static inline void x86_assign_hw_event(struct perf_event *event,
-                               struct cpu_hw_events *cpuc, int i)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hwc->idx = cpuc->assign[i];
-       hwc->last_cpu = smp_processor_id();
-       hwc->last_tag = ++cpuc->tags[i];
-
-       if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
-               hwc->config_base = 0;
-               hwc->event_base = 0;
-       } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
-               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
-               hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
-       } else {
-               hwc->config_base = x86_pmu_config_addr(hwc->idx);
-               hwc->event_base  = x86_pmu_event_addr(hwc->idx);
-               hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
-       }
-}
-
-static inline int match_prev_assignment(struct hw_perf_event *hwc,
-                                       struct cpu_hw_events *cpuc,
-                                       int i)
-{
-       return hwc->idx == cpuc->assign[i] &&
-               hwc->last_cpu == smp_processor_id() &&
-               hwc->last_tag == cpuc->tags[i];
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags);
-
-static void x86_pmu_enable(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_event *event;
-       struct hw_perf_event *hwc;
-       int i, added = cpuc->n_added;
-
-       if (!x86_pmu_initialized())
-               return;
-
-       if (cpuc->enabled)
-               return;
-
-       if (cpuc->n_added) {
-               int n_running = cpuc->n_events - cpuc->n_added;
-               /*
-                * apply assignment obtained either from
-                * hw_perf_group_sched_in() or x86_pmu_enable()
-                *
-                * step1: save events moving to new counters
-                */
-               for (i = 0; i < n_running; i++) {
-                       event = cpuc->event_list[i];
-                       hwc = &event->hw;
-
-                       /*
-                        * we can avoid reprogramming counter if:
-                        * - assigned same counter as last time
-                        * - running on same CPU as last time
-                        * - no other event has used the counter since
-                        */
-                       if (hwc->idx == -1 ||
-                           match_prev_assignment(hwc, cpuc, i))
-                               continue;
-
-                       /*
-                        * Ensure we don't accidentally enable a stopped
-                        * counter simply because we rescheduled.
-                        */
-                       if (hwc->state & PERF_HES_STOPPED)
-                               hwc->state |= PERF_HES_ARCH;
-
-                       x86_pmu_stop(event, PERF_EF_UPDATE);
-               }
-
-               /*
-                * step2: reprogram moved events into new counters
-                */
-               for (i = 0; i < cpuc->n_events; i++) {
-                       event = cpuc->event_list[i];
-                       hwc = &event->hw;
-
-                       if (!match_prev_assignment(hwc, cpuc, i))
-                               x86_assign_hw_event(event, cpuc, i);
-                       else if (i < n_running)
-                               continue;
-
-                       if (hwc->state & PERF_HES_ARCH)
-                               continue;
-
-                       x86_pmu_start(event, PERF_EF_RELOAD);
-               }
-               cpuc->n_added = 0;
-               perf_events_lapic_init();
-       }
-
-       cpuc->enabled = 1;
-       barrier();
-
-       x86_pmu.enable_all(added);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the event disabled in hw:
- */
-int x86_perf_event_set_period(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       s64 left = local64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int ret = 0, idx = hwc->idx;
-
-       if (idx == INTEL_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * If we are way outside a reasonable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-
-       if (unlikely(left <= 0)) {
-               left += period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-       /*
-        * Quirk: certain CPUs don't like it if just 1 hw_event is left:
-        */
-       if (unlikely(left < 2))
-               left = 2;
-
-       if (left > x86_pmu.max_period)
-               left = x86_pmu.max_period;
-
-       if (x86_pmu.limit_period)
-               left = x86_pmu.limit_period(event, left);
-
-       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
-       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
-           local64_read(&hwc->prev_count) != (u64)-left) {
-               /*
-                * The hw event starts counting from this event offset,
-        * mark it to be able to extract future deltas:
-                */
-               local64_set(&hwc->prev_count, (u64)-left);
-
-               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
-       }
-
-       /*
-        * Due to an erratum on certain CPUs we need
-        * a second write to be sure the register
-        * is updated properly
-        */
-       if (x86_pmu.perfctr_second_write) {
-               wrmsrl(hwc->event_base,
-                       (u64)(-left) & x86_pmu.cntval_mask);
-       }
-
-       perf_event_update_userpage(event);
-
-       return ret;
-}
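/*
 * Stand-alone illustration (not kernel code) of why x86_perf_event_set_period()
 * above programs the counter with -left: a 48-bit counter loaded with
 * (u64)-left & cntval_mask wraps to zero (raising the PMI) after exactly
 * 'left' increments, and until then its top bit (cntval_bits - 1) stays set,
 * which is the overflow test used later in x86_pmu_handle_irq().
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t cntval_mask = (1ULL << 48) - 1;
	int64_t left = 100000;				/* events until the next sample */
	uint64_t counter = (uint64_t)(-left) & cntval_mask;

	printf("programmed value:  0x%012llx\n", (unsigned long long)counter);

	counter = (counter + (uint64_t)left) & cntval_mask;	/* simulate 'left' events */
	printf("after %lld events: 0x%012llx (wrapped => overflow PMI)\n",
	       (long long)left, (unsigned long long)counter);
	return 0;
}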
-
-void x86_pmu_enable_event(struct perf_event *event)
-{
-       if (__this_cpu_read(cpu_hw_events.enabled))
-               __x86_pmu_enable_event(&event->hw,
-                                      ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Add a single event to the PMU.
- *
- * The event is added to the group of enabled events
- * but only if it can be scheduled with existing events.
- */
-static int x86_pmu_add(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc;
-       int assign[X86_PMC_IDX_MAX];
-       int n, n0, ret;
-
-       hwc = &event->hw;
-
-       n0 = cpuc->n_events;
-       ret = n = collect_events(cpuc, event, false);
-       if (ret < 0)
-               goto out;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       /*
-        * If group events scheduling transaction was started,
-        * skip the schedulability test here, it will be performed
-        * at commit time (->commit_txn) as a whole.
-        */
-       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               goto done_collect;
-
-       ret = x86_pmu.schedule_events(cpuc, n, assign);
-       if (ret)
-               goto out;
-       /*
-        * copy the new assignment; now that we know it is possible,
-        * it will be used by hw_perf_enable()
-        */
-       memcpy(cpuc->assign, assign, n*sizeof(int));
-
-done_collect:
-       /*
-        * Commit the collect_events() state. See x86_pmu_del() and
-        * x86_pmu_*_txn().
-        */
-       cpuc->n_events = n;
-       cpuc->n_added += n - n0;
-       cpuc->n_txn += n - n0;
-
-       ret = 0;
-out:
-       return ret;
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx = event->hw.idx;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       if (WARN_ON_ONCE(idx == -1))
-               return;
-
-       if (flags & PERF_EF_RELOAD) {
-               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
-               x86_perf_event_set_period(event);
-       }
-
-       event->hw.state = 0;
-
-       cpuc->events[idx] = event;
-       __set_bit(idx, cpuc->active_mask);
-       __set_bit(idx, cpuc->running);
-       x86_pmu.enable(event);
-       perf_event_update_userpage(event);
-}
-
-void perf_event_print_debug(void)
-{
-       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-       u64 pebs, debugctl;
-       struct cpu_hw_events *cpuc;
-       unsigned long flags;
-       int cpu, idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       if (x86_pmu.version >= 2) {
-               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-               pr_info("\n");
-               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
-               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
-               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-               if (x86_pmu.pebs_constraints) {
-                       rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
-                       pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
-               }
-               if (x86_pmu.lbr_nr) {
-                       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-                       pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
-               }
-       }
-       pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
-               rdmsrl(x86_pmu_event_addr(idx), pmc_count);
-
-               prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
-               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
-                       cpu, idx, pmc_ctrl);
-               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
-                       cpu, idx, prev_left);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
-               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-       }
-       local_irq_restore(flags);
-}
-
-void x86_pmu_stop(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
-               x86_pmu.disable(event);
-               cpuc->events[hwc->idx] = NULL;
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of an event
-                * that we are disabling:
-                */
-               x86_perf_event_update(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static void x86_pmu_del(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int i;
-
-       /*
-        * event is descheduled
-        */
-       event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
-
-       /*
-        * If we're called during a txn, we don't need to do anything.
-        * The events never got scheduled and ->cancel_txn will truncate
-        * the event_list.
-        *
-        * XXX assumes any ->del() called during a TXN will only be on
-        * an event added during that same TXN.
-        */
-       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               return;
-
-       /*
-        * Not a TXN, therefore cleanup properly.
-        */
-       x86_pmu_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < cpuc->n_events; i++) {
-               if (event == cpuc->event_list[i])
-                       break;
-       }
-
-       if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
-               return;
-
-       /* If we have a newly added event; make sure to decrease n_added. */
-       if (i >= cpuc->n_events - cpuc->n_added)
-               --cpuc->n_added;
-
-       if (x86_pmu.put_event_constraints)
-               x86_pmu.put_event_constraints(cpuc, event);
-
-       /* Delete the array entry. */
-       while (++i < cpuc->n_events) {
-               cpuc->event_list[i-1] = cpuc->event_list[i];
-               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
-       }
-       --cpuc->n_events;
-
-       perf_event_update_userpage(event);
-}
-
-int x86_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       struct perf_event *event;
-       int idx, handled = 0;
-       u64 val;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * Some chipsets need to unmask the LVTPC in a particular spot
-        * inside the nmi handler.  As a result, the unmasking was pushed
-        * into all the nmi handlers.
-        *
-        * This generic handler doesn't seem to have any issues where the
-        * unmasking occurs so it was left at the top.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               if (!test_bit(idx, cpuc->active_mask)) {
-                       /*
-                        * Though we deactivated the counter, some CPUs
-                        * might still deliver spurious interrupts that are
-                        * still in flight. Catch them:
-                        */
-                       if (__test_and_clear_bit(idx, cpuc->running))
-                               handled++;
-                       continue;
-               }
-
-               event = cpuc->events[idx];
-
-               val = x86_perf_event_update(event);
-               if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
-                       continue;
-
-               /*
-                * event overflow
-                */
-               handled++;
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (!x86_perf_event_set_period(event))
-                       continue;
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-
-void perf_events_lapic_init(void)
-{
-       if (!x86_pmu.apic || !x86_pmu_initialized())
-               return;
-
-       /*
-        * Always use NMI for PMU
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static int
-perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
-       u64 start_clock;
-       u64 finish_clock;
-       int ret;
-
-       /*
-        * All PMUs/events that share this PMI handler should make sure to
-        * increment active_events for their events.
-        */
-       if (!atomic_read(&active_events))
-               return NMI_DONE;
-
-       start_clock = sched_clock();
-       ret = x86_pmu.handle_irq(regs);
-       finish_clock = sched_clock();
-
-       perf_sample_event_took(finish_clock - start_clock);
-
-       return ret;
-}
-NOKPROBE_SYMBOL(perf_event_nmi_handler);
-
-struct event_constraint emptyconstraint;
-struct event_constraint unconstrained;
-
-static int
-x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       int i, ret = NOTIFY_OK;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
-                       cpuc->kfree_on_online[i] = NULL;
-               if (x86_pmu.cpu_prepare)
-                       ret = x86_pmu.cpu_prepare(cpu);
-               break;
-
-       case CPU_STARTING:
-               if (x86_pmu.cpu_starting)
-                       x86_pmu.cpu_starting(cpu);
-               break;
-
-       case CPU_ONLINE:
-               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
-                       kfree(cpuc->kfree_on_online[i]);
-                       cpuc->kfree_on_online[i] = NULL;
-               }
-               break;
-
-       case CPU_DYING:
-               if (x86_pmu.cpu_dying)
-                       x86_pmu.cpu_dying(cpu);
-               break;
-
-       case CPU_UP_CANCELED:
-       case CPU_DEAD:
-               if (x86_pmu.cpu_dead)
-                       x86_pmu.cpu_dead(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return ret;
-}
-
-static void __init pmu_check_apic(void)
-{
-       if (cpu_has_apic)
-               return;
-
-       x86_pmu.apic = 0;
-       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-       pr_info("no hardware sampling interrupt available.\n");
-
-       /*
-        * If we have a PMU initialized but no APIC
-        * interrupts, we cannot sample hardware
-        * events (user-space has to fall back and
-        * sample via a hrtimer based software event):
-        */
-       pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
-
-}
-
-static struct attribute_group x86_pmu_format_group = {
-       .name = "format",
-       .attrs = NULL,
-};
-
-/*
- * Remove all undefined events (x86_pmu.event_map(id) == 0)
- * out of events_attr attributes.
- */
-static void __init filter_events(struct attribute **attrs)
-{
-       struct device_attribute *d;
-       struct perf_pmu_events_attr *pmu_attr;
-       int offset = 0;
-       int i, j;
-
-       for (i = 0; attrs[i]; i++) {
-               d = (struct device_attribute *)attrs[i];
-               pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
-               /* str trumps id */
-               if (pmu_attr->event_str)
-                       continue;
-               if (x86_pmu.event_map(i + offset))
-                       continue;
-
-               for (j = i; attrs[j]; j++)
-                       attrs[j] = attrs[j + 1];
-
-               /* Check the shifted attr. */
-               i--;
-
-               /*
-                * event_map() is index-based, while the attrs array is
-                * organized by increasing event index. If we shift the
-                * events, then we need to compensate for the event_map(),
-                * otherwise we are looking up the wrong event in the map.
-                */
-               offset++;
-       }
-}
-
-/* Merge two pointer arrays */
-__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
-{
-       struct attribute **new;
-       int j, i;
-
-       for (j = 0; a[j]; j++)
-               ;
-       for (i = 0; b[i]; i++)
-               j++;
-       j++;
-
-       new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
-       if (!new)
-               return NULL;
-
-       j = 0;
-       for (i = 0; a[i]; i++)
-               new[j++] = a[i];
-       for (i = 0; b[i]; i++)
-               new[j++] = b[i];
-       new[j] = NULL;
-
-       return new;
-}
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-                         char *page)
-{
-       struct perf_pmu_events_attr *pmu_attr = \
-               container_of(attr, struct perf_pmu_events_attr, attr);
-       u64 config = x86_pmu.event_map(pmu_attr->id);
-
-       /* string trumps id */
-       if (pmu_attr->event_str)
-               return sprintf(page, "%s", pmu_attr->event_str);
-
-       return x86_pmu.events_sysfs_show(page, config);
-}
-
-EVENT_ATTR(cpu-cycles,                 CPU_CYCLES              );
-EVENT_ATTR(instructions,               INSTRUCTIONS            );
-EVENT_ATTR(cache-references,           CACHE_REFERENCES        );
-EVENT_ATTR(cache-misses,               CACHE_MISSES            );
-EVENT_ATTR(branch-instructions,                BRANCH_INSTRUCTIONS     );
-EVENT_ATTR(branch-misses,              BRANCH_MISSES           );
-EVENT_ATTR(bus-cycles,                 BUS_CYCLES              );
-EVENT_ATTR(stalled-cycles-frontend,    STALLED_CYCLES_FRONTEND );
-EVENT_ATTR(stalled-cycles-backend,     STALLED_CYCLES_BACKEND  );
-EVENT_ATTR(ref-cycles,                 REF_CPU_CYCLES          );
-
-static struct attribute *empty_attrs;
-
-static struct attribute *events_attr[] = {
-       EVENT_PTR(CPU_CYCLES),
-       EVENT_PTR(INSTRUCTIONS),
-       EVENT_PTR(CACHE_REFERENCES),
-       EVENT_PTR(CACHE_MISSES),
-       EVENT_PTR(BRANCH_INSTRUCTIONS),
-       EVENT_PTR(BRANCH_MISSES),
-       EVENT_PTR(BUS_CYCLES),
-       EVENT_PTR(STALLED_CYCLES_FRONTEND),
-       EVENT_PTR(STALLED_CYCLES_BACKEND),
-       EVENT_PTR(REF_CPU_CYCLES),
-       NULL,
-};
-
-static struct attribute_group x86_pmu_events_group = {
-       .name = "events",
-       .attrs = events_attr,
-};
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
-{
-       u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-       u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
-       bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
-       bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
-       bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
-       bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
-       ssize_t ret;
-
-       /*
-        * We have a whole page to spend and only a little data
-        * to write, so we can safely use sprintf().
-        */
-       ret = sprintf(page, "event=0x%02llx", event);
-
-       if (umask)
-               ret += sprintf(page + ret, ",umask=0x%02llx", umask);
-
-       if (edge)
-               ret += sprintf(page + ret, ",edge");
-
-       if (pc)
-               ret += sprintf(page + ret, ",pc");
-
-       if (any)
-               ret += sprintf(page + ret, ",any");
-
-       if (inv)
-               ret += sprintf(page + ret, ",inv");
-
-       if (cmask)
-               ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
-
-       ret += sprintf(page + ret, "\n");
-
-       return ret;
-}
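Editor's illustration (not part of the original file): a minimal user-space sketch of how a raw config decodes into the string this helper writes to sysfs. The mask values mirror the ARCH_PERFMON_EVENTSEL_* layout used above (event code in bits 0-7, unit mask in bits 8-15).

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t config = 0x013c;                  /* umask=0x01, event=0x3c */
            uint64_t event  = config & 0xffULL;        /* low byte: event select  */
            uint64_t umask  = (config >> 8) & 0xffULL; /* next byte: unit mask    */

            /* Same "event=...,umask=..." layout as x86_event_sysfs_show() above. */
            printf("event=0x%02llx,umask=0x%02llx\n",
                   (unsigned long long)event, (unsigned long long)umask);
            return 0;
    }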
-
-static int __init init_hw_perf_events(void)
-{
-       struct x86_pmu_quirk *quirk;
-       int err;
-
-       pr_info("Performance Events: ");
-
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_INTEL:
-               err = intel_pmu_init();
-               break;
-       case X86_VENDOR_AMD:
-               err = amd_pmu_init();
-               break;
-       default:
-               err = -ENOTSUPP;
-       }
-       if (err != 0) {
-               pr_cont("no PMU driver, software events only.\n");
-               return 0;
-       }
-
-       pmu_check_apic();
-
-       /* sanity check that the hardware exists or is emulated */
-       if (!check_hw_exists())
-               return 0;
-
-       pr_cont("%s PMU driver.\n", x86_pmu.name);
-
-       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
-
-       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
-               quirk->func();
-
-       if (!x86_pmu.intel_ctrl)
-               x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-       perf_events_lapic_init();
-       register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
-
-       unconstrained = (struct event_constraint)
-               __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
-                                  0, x86_pmu.num_counters, 0, 0);
-
-       x86_pmu_format_group.attrs = x86_pmu.format_attrs;
-
-       if (x86_pmu.event_attrs)
-               x86_pmu_events_group.attrs = x86_pmu.event_attrs;
-
-       if (!x86_pmu.events_sysfs_show)
-               x86_pmu_events_group.attrs = &empty_attrs;
-       else
-               filter_events(x86_pmu_events_group.attrs);
-
-       if (x86_pmu.cpu_events) {
-               struct attribute **tmp;
-
-               tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
-               if (!WARN_ON(!tmp))
-                       x86_pmu_events_group.attrs = tmp;
-       }
-
-       pr_info("... version:                %d\n",     x86_pmu.version);
-       pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
-       pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
-       pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
-       pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-       pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
-       pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
-
-       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
-       perf_cpu_notifier(x86_pmu_notifier);
-
-       return 0;
-}
-early_initcall(init_hw_perf_events);
-
-static inline void x86_pmu_read(struct perf_event *event)
-{
-       x86_perf_event_update(event);
-}
-
-/*
- * Start group events scheduling transaction
- * Set the flag to make pmu::enable() not perform the
- * schedulability test, it will be performed at commit time
- *
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
- * transactions.
- */
-static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
-
-       cpuc->txn_flags = txn_flags;
-       if (txn_flags & ~PERF_PMU_TXN_ADD)
-               return;
-
-       perf_pmu_disable(pmu);
-       __this_cpu_write(cpu_hw_events.n_txn, 0);
-}
-
-/*
- * Stop group events scheduling transaction
- * Clear the flag and pmu::enable() will perform the
- * schedulability test.
- */
-static void x86_pmu_cancel_txn(struct pmu *pmu)
-{
-       unsigned int txn_flags;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
-
-       txn_flags = cpuc->txn_flags;
-       cpuc->txn_flags = 0;
-       if (txn_flags & ~PERF_PMU_TXN_ADD)
-               return;
-
-       /*
-        * Truncate collected array by the number of events added in this
-        * transaction. See x86_pmu_add() and x86_pmu_*_txn().
-        */
-       __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
-       __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
-       perf_pmu_enable(pmu);
-}
-
-/*
- * Commit group events scheduling transaction
- * Perform the group schedulability test as a whole
- * Return 0 if success
- *
- * Does not cancel the transaction on failure; expects the caller to do this.
- */
-static int x86_pmu_commit_txn(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int assign[X86_PMC_IDX_MAX];
-       int n, ret;
-
-       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
-
-       if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
-               cpuc->txn_flags = 0;
-               return 0;
-       }
-
-       n = cpuc->n_events;
-
-       if (!x86_pmu_initialized())
-               return -EAGAIN;
-
-       ret = x86_pmu.schedule_events(cpuc, n, assign);
-       if (ret)
-               return ret;
-
-       /*
-        * Copy the new assignment now that we know it is possible;
-        * it will be used by hw_perf_enable().
-        */
-       memcpy(cpuc->assign, assign, n*sizeof(int));
-
-       cpuc->txn_flags = 0;
-       perf_pmu_enable(pmu);
-       return 0;
-}
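A rough sketch (editor's addition) of the order in which the perf core is expected to drive these three callbacks when scheduling an event group; the call sequence is simplified and should be read as an assumption, not a quote of kernel/events/core.c.

    /*
     *   pmu->start_txn(pmu, PERF_PMU_TXN_ADD);    // defer the schedulability test
     *   for each event in the group:
     *           pmu->add(event, PERF_EF_START);   // collect into cpuc, no test yet
     *   if (pmu->commit_txn(pmu) == 0)
     *           return 0;                         // group fits, assignment copied
     *   pmu->cancel_txn(pmu);                     // caller rolls back n_added/n_events
     */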
-/*
- * a fake_cpuc is used to validate event groups. Due to
- * the extra reg logic, we need to also allocate a fake
- * per_core and per_cpu structure. Otherwise, group events
- * using extra reg may conflict without the kernel being
- * able to catch this when the last event gets added to
- * the group.
- */
-static void free_fake_cpuc(struct cpu_hw_events *cpuc)
-{
-       kfree(cpuc->shared_regs);
-       kfree(cpuc);
-}
-
-static struct cpu_hw_events *allocate_fake_cpuc(void)
-{
-       struct cpu_hw_events *cpuc;
-       int cpu = raw_smp_processor_id();
-
-       cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
-       if (!cpuc)
-               return ERR_PTR(-ENOMEM);
-
-       /* only needed if we have extra_regs */
-       if (x86_pmu.extra_regs) {
-               cpuc->shared_regs = allocate_shared_regs(cpu);
-               if (!cpuc->shared_regs)
-                       goto error;
-       }
-       cpuc->is_fake = 1;
-       return cpuc;
-error:
-       free_fake_cpuc(cpuc);
-       return ERR_PTR(-ENOMEM);
-}
-
-/*
- * validate that we can schedule this event
- */
-static int validate_event(struct perf_event *event)
-{
-       struct cpu_hw_events *fake_cpuc;
-       struct event_constraint *c;
-       int ret = 0;
-
-       fake_cpuc = allocate_fake_cpuc();
-       if (IS_ERR(fake_cpuc))
-               return PTR_ERR(fake_cpuc);
-
-       c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
-
-       if (!c || !c->weight)
-               ret = -EINVAL;
-
-       if (x86_pmu.put_event_constraints)
-               x86_pmu.put_event_constraints(fake_cpuc, event);
-
-       free_fake_cpuc(fake_cpuc);
-
-       return ret;
-}
-
-/*
- * validate a single event group
- *
- * validation includes:
- *     - check events are compatible with each other
- *     - events do not compete for the same counter
- *     - number of events <= number of counters
- *
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int validate_group(struct perf_event *event)
-{
-       struct perf_event *leader = event->group_leader;
-       struct cpu_hw_events *fake_cpuc;
-       int ret = -EINVAL, n;
-
-       fake_cpuc = allocate_fake_cpuc();
-       if (IS_ERR(fake_cpuc))
-               return PTR_ERR(fake_cpuc);
-       /*
-        * The event is not yet connected with its
-        * siblings; therefore we must first collect
-        * the existing siblings, then add the new event
-        * before we can simulate the scheduling.
-        */
-       n = collect_events(fake_cpuc, leader, true);
-       if (n < 0)
-               goto out;
-
-       fake_cpuc->n_events = n;
-       n = collect_events(fake_cpuc, event, false);
-       if (n < 0)
-               goto out;
-
-       fake_cpuc->n_events = n;
-
-       ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-
-out:
-       free_fake_cpuc(fake_cpuc);
-       return ret;
-}
-
-static int x86_pmu_event_init(struct perf_event *event)
-{
-       struct pmu *tmp;
-       int err;
-
-       switch (event->attr.type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               break;
-
-       default:
-               return -ENOENT;
-       }
-
-       err = __x86_pmu_event_init(event);
-       if (!err) {
-               /*
-                * we temporarily connect event to its pmu
-                * such that validate_group() can classify
-                * it as an x86 event using is_x86_event()
-                */
-               tmp = event->pmu;
-               event->pmu = &pmu;
-
-               if (event->group_leader != event)
-                       err = validate_group(event);
-               else
-                       err = validate_event(event);
-
-               event->pmu = tmp;
-       }
-       if (err) {
-               if (event->destroy)
-                       event->destroy(event);
-       }
-
-       if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
-               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
-
-       return err;
-}
-
-static void refresh_pce(void *ignored)
-{
-       if (current->mm)
-               load_mm_cr4(current->mm);
-}
-
-static void x86_pmu_event_mapped(struct perf_event *event)
-{
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return;
-
-       if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
-               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
-}
-
-static void x86_pmu_event_unmapped(struct perf_event *event)
-{
-       if (!current->mm)
-               return;
-
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return;
-
-       if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
-               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
-}
-
-static int x86_pmu_event_idx(struct perf_event *event)
-{
-       int idx = event->hw.idx;
-
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return 0;
-
-       if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
-               idx -= INTEL_PMC_IDX_FIXED;
-               idx |= 1 << 30;
-       }
-
-       return idx + 1;
-}
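Background sketch (editor's addition, not from this file): user space consumes the index published here via the event's mmap'ed perf_event_mmap_page. A non-zero ->index means RDPMC is permitted and the counter is read with ECX = index - 1 (bit 30 selects the fixed-counter range, matching the encoding above). The seqlock handshake against ->lock is omitted for brevity.

    #include <stdint.h>
    #include <linux/perf_event.h>

    static inline uint64_t rdpmc(uint32_t counter)
    {
            uint32_t low, high;

            __asm__ __volatile__("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
            return low | ((uint64_t)high << 32);
    }

    /* 'pc' is the first mmap'ed page of the event (struct perf_event_mmap_page). */
    static uint64_t read_counter(volatile struct perf_event_mmap_page *pc)
    {
            uint32_t idx = pc->index;       /* value produced by x86_pmu_event_idx() */

            if (!idx)                       /* 0: RDPMC not allowed for this event */
                    return 0;
            return rdpmc(idx - 1);
    }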
-
-static ssize_t get_attr_rdpmc(struct device *cdev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
-}
-
-static ssize_t set_attr_rdpmc(struct device *cdev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t count)
-{
-       unsigned long val;
-       ssize_t ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-
-       if (val > 2)
-               return -EINVAL;
-
-       if (x86_pmu.attr_rdpmc_broken)
-               return -ENOTSUPP;
-
-       if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
-               /*
-                * Changing into or out of always available, aka
-                * perf-event-bypassing mode.  This path is extremely slow,
-                * but only root can trigger it, so it's okay.
-                */
-               if (val == 2)
-                       static_key_slow_inc(&rdpmc_always_available);
-               else
-                       static_key_slow_dec(&rdpmc_always_available);
-               on_each_cpu(refresh_pce, NULL, 1);
-       }
-
-       x86_pmu.attr_rdpmc = val;
-
-       return count;
-}
-
-static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
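For orientation (editor's note): because this attribute group is attached to the "cpu" PMU registered in init_hw_perf_events() above, the knob typically appears as /sys/bus/event_source/devices/cpu/rdpmc, where 0 disables RDPMC, 1 (the default) allows it for tasks that have an event mapped, and 2 makes it always available via the slow path above. A minimal sketch of flipping it from user space:

    #include <stdio.h>

    int main(void)
    {
            /* Path assumed from the "cpu" PMU registration; requires root. */
            FILE *f = fopen("/sys/bus/event_source/devices/cpu/rdpmc", "w");

            if (!f) {
                    perror("rdpmc attribute");
                    return 1;
            }
            fputs("2\n", f);        /* lands in set_attr_rdpmc() with val == 2 */
            return fclose(f) ? 1 : 0;
    }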
-
-static struct attribute *x86_pmu_attrs[] = {
-       &dev_attr_rdpmc.attr,
-       NULL,
-};
-
-static struct attribute_group x86_pmu_attr_group = {
-       .attrs = x86_pmu_attrs,
-};
-
-static const struct attribute_group *x86_pmu_attr_groups[] = {
-       &x86_pmu_attr_group,
-       &x86_pmu_format_group,
-       &x86_pmu_events_group,
-       NULL,
-};
-
-static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       if (x86_pmu.sched_task)
-               x86_pmu.sched_task(ctx, sched_in);
-}
-
-void perf_check_microcode(void)
-{
-       if (x86_pmu.check_microcode)
-               x86_pmu.check_microcode();
-}
-EXPORT_SYMBOL_GPL(perf_check_microcode);
-
-static struct pmu pmu = {
-       .pmu_enable             = x86_pmu_enable,
-       .pmu_disable            = x86_pmu_disable,
-
-       .attr_groups            = x86_pmu_attr_groups,
-
-       .event_init             = x86_pmu_event_init,
-
-       .event_mapped           = x86_pmu_event_mapped,
-       .event_unmapped         = x86_pmu_event_unmapped,
-
-       .add                    = x86_pmu_add,
-       .del                    = x86_pmu_del,
-       .start                  = x86_pmu_start,
-       .stop                   = x86_pmu_stop,
-       .read                   = x86_pmu_read,
-
-       .start_txn              = x86_pmu_start_txn,
-       .cancel_txn             = x86_pmu_cancel_txn,
-       .commit_txn             = x86_pmu_commit_txn,
-
-       .event_idx              = x86_pmu_event_idx,
-       .sched_task             = x86_pmu_sched_task,
-       .task_ctx_size          = sizeof(struct x86_perf_task_context),
-};
-
-void arch_perf_update_userpage(struct perf_event *event,
-                              struct perf_event_mmap_page *userpg, u64 now)
-{
-       struct cyc2ns_data *data;
-
-       userpg->cap_user_time = 0;
-       userpg->cap_user_time_zero = 0;
-       userpg->cap_user_rdpmc =
-               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
-       userpg->pmc_width = x86_pmu.cntval_bits;
-
-       if (!sched_clock_stable())
-               return;
-
-       data = cyc2ns_read_begin();
-
-       /*
-        * Internal timekeeping for enabled/running/stopped times
-        * is always in the local_clock domain.
-        */
-       userpg->cap_user_time = 1;
-       userpg->time_mult = data->cyc2ns_mul;
-       userpg->time_shift = data->cyc2ns_shift;
-       userpg->time_offset = data->cyc2ns_offset - now;
-
-       /*
-        * cap_user_time_zero doesn't make sense when we're using a different
-        * time base for the records.
-        */
-       if (event->clock == &local_clock) {
-               userpg->cap_user_time_zero = 1;
-               userpg->time_zero = data->cyc2ns_offset;
-       }
-
-       cyc2ns_read_end(data);
-}
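For context (editor's sketch): user space turns a TSC reading into the same local_clock() domain with the mult/shift/offset exported above, using the split multiply documented for cap_user_time in the perf mmap-page ABI.

    #include <stdint.h>

    /* Convert a TSC value to nanoseconds in the local_clock() domain. */
    static uint64_t tsc_to_local_ns(uint64_t cyc, uint64_t time_offset,
                                    uint32_t time_mult, uint16_t time_shift)
    {
            uint64_t quot = cyc >> time_shift;
            uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

            /* Split the multiplication so large cyc values don't overflow 64 bits. */
            return time_offset + quot * time_mult +
                   ((rem * time_mult) >> time_shift);
    }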
-
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
-       return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
-       struct perf_callchain_entry *entry = data;
-
-       perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-       .stack                  = backtrace_stack,
-       .address                = backtrace_address,
-       .walk_stack             = print_context_stack_bp,
-};
-
-void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest OS callchains yet */
-               return;
-       }
-
-       perf_callchain_store(entry, regs->ip);
-
-       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-static inline int
-valid_user_frame(const void __user *fp, unsigned long size)
-{
-       return (__range_not_ok(fp, size, TASK_SIZE) == 0);
-}
-
-static unsigned long get_segment_base(unsigned int segment)
-{
-       struct desc_struct *desc;
-       int idx = segment >> 3;
-
-       if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
-               struct ldt_struct *ldt;
-
-               if (idx >= LDT_ENTRIES)
-                       return 0;
-
-               /* IRQs are off, so this synchronizes with smp_store_release */
-               ldt = lockless_dereference(current->active_mm->context.ldt);
-               if (!ldt || idx >= ldt->size)
-                       return 0;
-
-               desc = &ldt->entries[idx];
-#else
-               return 0;
-#endif
-       } else {
-               if (idx >= GDT_ENTRIES)
-                       return 0;
-
-               desc = raw_cpu_ptr(gdt_page.gdt) + idx;
-       }
-
-       return get_desc_base(desc);
-}
-
-#ifdef CONFIG_IA32_EMULATION
-
-#include <asm/compat.h>
-
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       /* 32-bit process in 64-bit kernel. */
-       unsigned long ss_base, cs_base;
-       struct stack_frame_ia32 frame;
-       const void __user *fp;
-
-       if (!test_thread_flag(TIF_IA32))
-               return 0;
-
-       cs_base = get_segment_base(regs->cs);
-       ss_base = get_segment_base(regs->ss);
-
-       fp = compat_ptr(ss_base + regs->bp);
-       pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               unsigned long bytes;
-               frame.next_frame     = 0;
-               frame.return_address = 0;
-
-               if (!access_ok(VERIFY_READ, fp, 8))
-                       break;
-
-               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
-               if (bytes != 0)
-                       break;
-               bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
-               if (bytes != 0)
-                       break;
-
-               if (!valid_user_frame(fp, sizeof(frame)))
-                       break;
-
-               perf_callchain_store(entry, cs_base + frame.return_address);
-               fp = compat_ptr(ss_base + frame.next_frame);
-       }
-       pagefault_enable();
-       return 1;
-}
-#else
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-    return 0;
-}
-#endif
-
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
-       struct stack_frame frame;
-       const void __user *fp;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest OS callchains yet */
-               return;
-       }
-
-       /*
-        * We don't know what to do with VM86 stacks... ignore them for now.
-        */
-       if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
-               return;
-
-       fp = (void __user *)regs->bp;
-
-       perf_callchain_store(entry, regs->ip);
-
-       if (!current->mm)
-               return;
-
-       if (perf_callchain_user32(regs, entry))
-               return;
-
-       pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               unsigned long bytes;
-               frame.next_frame             = NULL;
-               frame.return_address = 0;
-
-               if (!access_ok(VERIFY_READ, fp, 16))
-                       break;
-
-               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
-               if (bytes != 0)
-                       break;
-               bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
-               if (bytes != 0)
-                       break;
-
-               if (!valid_user_frame(fp, sizeof(frame)))
-                       break;
-
-               perf_callchain_store(entry, frame.return_address);
-               fp = (void __user *)frame.next_frame;
-       }
-       pagefault_enable();
-}
-
-/*
- * Deal with code segment offsets for the various execution modes:
- *
- *   VM86 - the good olde 16 bit days, where the linear address is
- *          20 bits and we use regs->ip + 0x10 * regs->cs.
- *
- *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
- *          to figure out what the 32bit base address is.
- *
- *    X32 - has TIF_X32 set, but is running in x86_64
- *
- * X86_64 - CS,DS,SS,ES are all zero based.
- */
-static unsigned long code_segment_base(struct pt_regs *regs)
-{
-       /*
-        * For IA32 we look at the GDT/LDT segment base to convert the
-        * effective IP to a linear address.
-        */
-
-#ifdef CONFIG_X86_32
-       /*
-        * If we are in VM86 mode, add the segment offset to convert to a
-        * linear address.
-        */
-       if (regs->flags & X86_VM_MASK)
-               return 0x10 * regs->cs;
-
-       if (user_mode(regs) && regs->cs != __USER_CS)
-               return get_segment_base(regs->cs);
-#else
-       if (user_mode(regs) && !user_64bit_mode(regs) &&
-           regs->cs != __USER32_CS)
-               return get_segment_base(regs->cs);
-#endif
-       return 0;
-}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-               return perf_guest_cbs->get_guest_ip();
-
-       return regs->ip + code_segment_base(regs);
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
-       int misc = 0;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               if (perf_guest_cbs->is_user_mode())
-                       misc |= PERF_RECORD_MISC_GUEST_USER;
-               else
-                       misc |= PERF_RECORD_MISC_GUEST_KERNEL;
-       } else {
-               if (user_mode(regs))
-                       misc |= PERF_RECORD_MISC_USER;
-               else
-                       misc |= PERF_RECORD_MISC_KERNEL;
-       }
-
-       if (regs->flags & PERF_EFLAGS_EXACT)
-               misc |= PERF_RECORD_MISC_EXACT_IP;
-
-       return misc;
-}
-
-void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
-{
-       cap->version            = x86_pmu.version;
-       cap->num_counters_gp    = x86_pmu.num_counters;
-       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
-       cap->bit_width_gp       = x86_pmu.cntval_bits;
-       cap->bit_width_fixed    = x86_pmu.cntval_bits;
-       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
-       cap->events_mask_len    = x86_pmu.events_mask_len;
-}
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
deleted file mode 100644 (file)
index 7bb61e3..0000000
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * Performance events x86 architecture header
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *  Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-/* To enable MSR tracing please use the generic trace points. */
-
-/*
- *          |   NHM/WSM    |      SNB     |
- * register -------------------------------
- *          |  HT  | no HT |  HT  | no HT |
- *-----------------------------------------
- * offcore  | core | core  | cpu  | core  |
- * lbr_sel  | core | core  | cpu  | core  |
- * ld_lat   | cpu  | core  | cpu  | core  |
- *-----------------------------------------
- *
- * Given that there is a small number of shared regs,
- * we can pre-allocate their slot in the per-cpu
- * per-core reg tables.
- */
-enum extra_reg_type {
-       EXTRA_REG_NONE  = -1,   /* not used */
-
-       EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
-       EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
-       EXTRA_REG_LBR   = 2,    /* lbr_select */
-       EXTRA_REG_LDLAT = 3,    /* ld_lat_threshold */
-       EXTRA_REG_FE    = 4,    /* fe_* */
-
-       EXTRA_REG_MAX           /* number of entries needed */
-};
-
-struct event_constraint {
-       union {
-               unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-               u64             idxmsk64;
-       };
-       u64     code;
-       u64     cmask;
-       int     weight;
-       int     overlap;
-       int     flags;
-};
-/*
- * struct hw_perf_event.flags flags
- */
-#define PERF_X86_EVENT_PEBS_LDLAT      0x0001 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST         0x0002 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW     0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED       0x0008 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW     0x0010 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW     0x0020 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
-#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
-
-
-struct amd_nb {
-       int nb_id;  /* NorthBridge id */
-       int refcnt; /* reference count */
-       struct perf_event *owners[X86_PMC_IDX_MAX];
-       struct event_constraint event_constraints[X86_PMC_IDX_MAX];
-};
-
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS                8
-
-/*
- * Flags PEBS can handle without a PMI.
- *
- * TID can only be handled by flushing at context switch.
- *
- */
-#define PEBS_FREERUNNING_FLAGS \
-       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
-       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
-       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-       PERF_SAMPLE_TRANSACTION)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-       u64     bts_buffer_base;
-       u64     bts_index;
-       u64     bts_absolute_maximum;
-       u64     bts_interrupt_threshold;
-       u64     pebs_buffer_base;
-       u64     pebs_index;
-       u64     pebs_absolute_maximum;
-       u64     pebs_interrupt_threshold;
-       u64     pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
-/*
- * Per register state.
- */
-struct er_account {
-       raw_spinlock_t          lock;   /* per-core: protect structure */
-       u64                 config;     /* extra MSR config */
-       u64                 reg;        /* extra MSR number */
-       atomic_t            ref;        /* reference count */
-};
-
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-struct intel_shared_regs {
-       struct er_account       regs[EXTRA_REG_MAX];
-       int                     refcnt;         /* per-core: #HT threads */
-       unsigned                core_id;        /* per-core: core id */
-};
-
-enum intel_excl_state_type {
-       INTEL_EXCL_UNUSED    = 0, /* counter is unused */
-       INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
-       INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
-};
-
-struct intel_excl_states {
-       enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-       bool sched_started; /* true if scheduling has started */
-};
-
-struct intel_excl_cntrs {
-       raw_spinlock_t  lock;
-
-       struct intel_excl_states states[2];
-
-       union {
-               u16     has_exclusive[2];
-               u32     exclusive_present;
-       };
-
-       int             refcnt;         /* per-core: #HT threads */
-       unsigned        core_id;        /* per-core: core id */
-};
-
-#define MAX_LBR_ENTRIES                32
-
-enum {
-       X86_PERF_KFREE_SHARED = 0,
-       X86_PERF_KFREE_EXCL   = 1,
-       X86_PERF_KFREE_MAX
-};
-
-struct cpu_hw_events {
-       /*
-        * Generic x86 PMC bits
-        */
-       struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
-       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       int                     enabled;
-
-       int                     n_events; /* the # of events in the below arrays */
-       int                     n_added;  /* the # last events in the below arrays;
-                                            they've never been enabled yet */
-       int                     n_txn;    /* the # last events in the below arrays;
-                                            added in the current transaction */
-       int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
-       u64                     tags[X86_PMC_IDX_MAX];
-
-       struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
-       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
-
-       int                     n_excl; /* the number of exclusive events */
-
-       unsigned int            txn_flags;
-       int                     is_fake;
-
-       /*
-        * Intel DebugStore bits
-        */
-       struct debug_store      *ds;
-       u64                     pebs_enabled;
-
-       /*
-        * Intel LBR bits
-        */
-       int                             lbr_users;
-       void                            *lbr_context;
-       struct perf_branch_stack        lbr_stack;
-       struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
-       struct er_account               *lbr_sel;
-       u64                             br_sel;
-
-       /*
-        * Intel host/guest exclude bits
-        */
-       u64                             intel_ctrl_guest_mask;
-       u64                             intel_ctrl_host_mask;
-       struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
-
-       /*
-        * Intel checkpoint mask
-        */
-       u64                             intel_cp_status;
-
-       /*
-        * manage shared (per-core, per-cpu) registers
-        * used on Intel NHM/WSM/SNB
-        */
-       struct intel_shared_regs        *shared_regs;
-       /*
-        * manage exclusive counter access between hyperthreads
-        */
-       struct event_constraint *constraint_list; /* in enable order */
-       struct intel_excl_cntrs         *excl_cntrs;
-       int excl_thread_id; /* 0 or 1 */
-
-       /*
-        * AMD specific bits
-        */
-       struct amd_nb                   *amd_nb;
-       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
-       u64                             perf_ctr_virt_mask;
-
-       void                            *kfree_on_online[X86_PERF_KFREE_MAX];
-};
-
-#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
-       { .idxmsk64 = (n) },            \
-       .code = (c),                    \
-       .cmask = (m),                   \
-       .weight = (w),                  \
-       .overlap = (o),                 \
-       .flags = f,                     \
-}
-
-#define EVENT_CONSTRAINT(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
-
-#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
-       __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
-                          0, PERF_X86_EVENT_EXCL)
-
-/*
- * The overlap flag marks event constraints with overlapping counter
- * masks. This is the case if the counter mask of such an event is not
- * a subset of any other counter mask of a constraint with an equal or
- * higher weight, e.g.:
- *
- *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
- *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
- *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
- *
- * The event scheduler may not select the correct counter in the first
- * cycle because it needs to know which subsequent events will be
- * scheduled. It may fail to schedule the events then. So we set the
- * overlap flag for such constraints to give the scheduler a hint which
- * events to select for counter rescheduling.
- *
- * Care must be taken as the rescheduling algorithm is O(n!), which
- * will increase scheduling cycles for an over-committed system
- * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
- * and its counter masks must be kept at a minimum.
- */
-#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
-
-/*
- * Constraint on the Event code.
- */
-#define INTEL_EVENT_CONSTRAINT(c, n)   \
-       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
-
-/*
- * Constraint on the Event code + UMask + fixed-mask
- *
- * Filter mask used to validate fixed-counter events;
- * the following filters disqualify an event from using a fixed counter:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  - in_tx
- *  - in_tx_checkpointed
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
-#define FIXED_EVENT_CONSTRAINT(c, n)   \
-       EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
-
-/*
- * Constraint on the Event code + UMask
- */
-#define INTEL_UEVENT_CONSTRAINT(c, n)  \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-
-/* Constraint on specific umask bit only + event */
-#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)      \
-       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
-
-/* Like UEVENT_CONSTRAINT, but match flags too */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)    \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
-
-#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
-                          HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
-
-#define INTEL_PLD_CONSTRAINT(c, n)     \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
-
-#define INTEL_PST_CONSTRAINT(c, n)     \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
-
-/* Event constraint, but match on all event flags too. */
-#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
-
-/* Check only flags, but allow all event/umask */
-#define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
-       EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
-
-/* Check flags and event code, and set the HSW store flag */
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-/* Check flags and event code, and set the HSW load flag */
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
-
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW store flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW load flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
-
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW N/A flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
-
-
-/*
- * We define the end marker as having a weight of -1
- * to enable blacklisting of events using a counter bitmask
- * of zero and thus a weight of zero.
- * The end marker has a weight that cannot possibly be
- * obtained from counting the bits in the bitmask.
- */
-#define EVENT_CONSTRAINT_END { .weight = -1 }
-
-/*
- * Check for end marker with weight == -1
- */
-#define for_each_event_constraint(e, c)        \
-       for ((e) = (c); (e)->weight != -1; (e)++)
-
-/*
- * Extra registers for specific events.
- *
- * Some events need large masks and require external MSRs.
- * Those extra MSRs end up being shared for all events on
- * a PMU and sometimes between PMU of sibling HT threads.
- * In either case, the kernel needs to handle conflicting
- * accesses to those extra, shared regs. The data structure
- * to manage those registers is stored in cpu_hw_event.
- */
-struct extra_reg {
-       unsigned int            event;
-       unsigned int            msr;
-       u64                     config_mask;
-       u64                     valid_mask;
-       int                     idx;  /* per_xxx->regs[] reg index */
-       bool                    extra_msr_access;
-};
-
-#define EVENT_EXTRA_REG(e, ms, m, vm, i) {     \
-       .event = (e),                   \
-       .msr = (ms),                    \
-       .config_mask = (m),             \
-       .valid_mask = (vm),             \
-       .idx = EXTRA_REG_##i,           \
-       .extra_msr_access = true,       \
-       }
-
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)     \
-       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
-
-#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
-       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
-                       ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
-
-#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
-       INTEL_UEVENT_EXTRA_REG(c, \
-                              MSR_PEBS_LD_LAT_THRESHOLD, \
-                              0xffff, \
-                              LDLAT)
-
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
-
-union perf_capabilities {
-       struct {
-               u64     lbr_format:6;
-               u64     pebs_trap:1;
-               u64     pebs_arch_reg:1;
-               u64     pebs_format:4;
-               u64     smm_freeze:1;
-               /*
-                * PMU supports separate counter range for writing
-                * values > 32bit.
-                */
-               u64     full_width_write:1;
-       };
-       u64     capabilities;
-};
-
-struct x86_pmu_quirk {
-       struct x86_pmu_quirk *next;
-       void (*func)(void);
-};
-
-union x86_pmu_config {
-       struct {
-               u64 event:8,
-                   umask:8,
-                   usr:1,
-                   os:1,
-                   edge:1,
-                   pc:1,
-                   interrupt:1,
-                   __reserved1:1,
-                   en:1,
-                   inv:1,
-                   cmask:8,
-                   event2:4,
-                   __reserved2:4,
-                   go:1,
-                   ho:1;
-       } bits;
-       u64 value;
-};
-
-#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
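Usage sketch (editor's addition): the designated-initializer form lets raw PERFEVTSEL values be written field by field, e.g. the architectural "instructions retired" encoding (event 0xc0, umask 0x00):

    /* Illustrative only: architectural INST_RETIRED.ANY encoding. */
    u64 cfg = X86_CONFIG(.event = 0xc0, .umask = 0x00);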
-
-enum {
-       x86_lbr_exclusive_lbr,
-       x86_lbr_exclusive_bts,
-       x86_lbr_exclusive_pt,
-       x86_lbr_exclusive_max,
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
-       /*
-        * Generic x86 PMC bits
-        */
-       const char      *name;
-       int             version;
-       int             (*handle_irq)(struct pt_regs *);
-       void            (*disable_all)(void);
-       void            (*enable_all)(int added);
-       void            (*enable)(struct perf_event *);
-       void            (*disable)(struct perf_event *);
-       int             (*hw_config)(struct perf_event *event);
-       int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
-       unsigned        eventsel;
-       unsigned        perfctr;
-       int             (*addr_offset)(int index, bool eventsel);
-       int             (*rdpmc_index)(int index);
-       u64             (*event_map)(int);
-       int             max_events;
-       int             num_counters;
-       int             num_counters_fixed;
-       int             cntval_bits;
-       u64             cntval_mask;
-       union {
-                       unsigned long events_maskl;
-                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
-       };
-       int             events_mask_len;
-       int             apic;
-       u64             max_period;
-       struct event_constraint *
-                       (*get_event_constraints)(struct cpu_hw_events *cpuc,
-                                                int idx,
-                                                struct perf_event *event);
-
-       void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
-                                                struct perf_event *event);
-
-       void            (*start_scheduling)(struct cpu_hw_events *cpuc);
-
-       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
-
-       void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
-
-       struct event_constraint *event_constraints;
-       struct x86_pmu_quirk *quirks;
-       int             perfctr_second_write;
-       bool            late_ack;
-       unsigned        (*limit_period)(struct perf_event *event, unsigned l);
-
-       /*
-        * sysfs attrs
-        */
-       int             attr_rdpmc_broken;
-       int             attr_rdpmc;
-       struct attribute **format_attrs;
-       struct attribute **event_attrs;
-
-       ssize_t         (*events_sysfs_show)(char *page, u64 config);
-       struct attribute **cpu_events;
-
-       /*
-        * CPU Hotplug hooks
-        */
-       int             (*cpu_prepare)(int cpu);
-       void            (*cpu_starting)(int cpu);
-       void            (*cpu_dying)(int cpu);
-       void            (*cpu_dead)(int cpu);
-
-       void            (*check_microcode)(void);
-       void            (*sched_task)(struct perf_event_context *ctx,
-                                     bool sched_in);
-
-       /*
-        * Intel Arch Perfmon v2+
-        */
-       u64                     intel_ctrl;
-       union perf_capabilities intel_cap;
-
-       /*
-        * Intel DebugStore bits
-        */
-       unsigned int    bts             :1,
-                       bts_active      :1,
-                       pebs            :1,
-                       pebs_active     :1,
-                       pebs_broken     :1,
-                       pebs_prec_dist  :1;
-       int             pebs_record_size;
-       void            (*drain_pebs)(struct pt_regs *regs);
-       struct event_constraint *pebs_constraints;
-       void            (*pebs_aliases)(struct perf_event *event);
-       int             max_pebs_events;
-       unsigned long   free_running_flags;
-
-       /*
-        * Intel LBR
-        */
-       unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
-       int             lbr_nr;                    /* hardware stack size */
-       u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
-       const int       *lbr_sel_map;              /* lbr_select mappings */
-       bool            lbr_double_abort;          /* duplicated lbr aborts */
-
-       /*
-        * Intel PT/LBR/BTS are exclusive
-        */
-       atomic_t        lbr_exclusive[x86_lbr_exclusive_max];
-
-       /*
-        * Extra registers for events
-        */
-       struct extra_reg *extra_regs;
-       unsigned int flags;
-
-       /*
-        * Intel host/guest support (KVM)
-        */
-       struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
-};
-
-struct x86_perf_task_context {
-       u64 lbr_from[MAX_LBR_ENTRIES];
-       u64 lbr_to[MAX_LBR_ENTRIES];
-       u64 lbr_info[MAX_LBR_ENTRIES];
-       int tos;
-       int lbr_callstack_users;
-       int lbr_stack_state;
-};
-
-#define x86_add_quirk(func_)                                           \
-do {                                                                   \
-       static struct x86_pmu_quirk __quirk __initdata = {              \
-               .func = func_,                                          \
-       };                                                              \
-       __quirk.next = x86_pmu.quirks;                                  \
-       x86_pmu.quirks = &__quirk;                                      \
-} while (0)
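Usage sketch (editor's addition, with a hypothetical quirk function): vendor init code registers model-specific fixups this way, and init_hw_perf_events() walks the resulting x86_pmu.quirks list as shown earlier.

    static __init void example_model_quirk(void)    /* hypothetical */
    {
            /* e.g. patch an event table entry for a particular stepping */
    }

    /* ... inside a vendor's *_pmu_init() path: */
    x86_add_quirk(example_model_quirk);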
-
-/*
- * x86_pmu flags
- */
-#define PMU_FL_NO_HT_SHARING   0x1 /* no hyper-threading resource sharing */
-#define PMU_FL_HAS_RSP_1       0x2 /* has 2 equivalent offcore_rsp regs   */
-#define PMU_FL_EXCL_CNTRS      0x4 /* has exclusive counter requirements  */
-#define PMU_FL_EXCL_ENABLED    0x8 /* exclusive counter active */
-
-#define EVENT_VAR(_id)  event_attr_##_id
-#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
-
-#define EVENT_ATTR(_name, _id)                                         \
-static struct perf_pmu_events_attr EVENT_VAR(_id) = {                  \
-       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
-       .id             = PERF_COUNT_HW_##_id,                          \
-       .event_str      = NULL,                                         \
-};
-
-#define EVENT_ATTR_STR(_name, v, str)                                  \
-static struct perf_pmu_events_attr event_attr_##v = {                  \
-       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
-       .id             = 0,                                            \
-       .event_str      = str,                                          \
-};
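Usage sketch (editor's addition): a string-based attribute overrides the id lookup in events_sysfs_show() above, which is how model-specific encodings get published; the values below are illustrative only.

    EVENT_ATTR_STR(mem-loads, mem_ld_example, "event=0xcd,umask=0x1,ldlat=3");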
-
-extern struct x86_pmu x86_pmu __read_mostly;
-
-static inline bool x86_pmu_has_lbr_callstack(void)
-{
-       return  x86_pmu.lbr_sel_map &&
-               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
-}
-
-DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
-
-int x86_perf_event_set_period(struct perf_event *event);
-
-/*
- * Generalized hw-cache-related hw_event table, filled
- * in on a per-model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-extern u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-extern u64 __read_mostly hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
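Indexing sketch (editor's addition) using the C() shorthand defined just above, e.g. from within a driver function; per the comment, 0 means the combination is unsupported and -1 means it is meaningless on this CPU:

    /* Raw event id for last-level-cache read misses on the current model. */
    u64 raw = hw_cache_event_ids[C(LL)][C(OP_READ)][C(RESULT_MISS)];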
-
-u64 x86_perf_event_update(struct perf_event *event);
-
-static inline unsigned int x86_pmu_config_addr(int index)
-{
-       return x86_pmu.eventsel + (x86_pmu.addr_offset ?
-                                  x86_pmu.addr_offset(index, true) : index);
-}
-
-static inline unsigned int x86_pmu_event_addr(int index)
-{
-       return x86_pmu.perfctr + (x86_pmu.addr_offset ?
-                                 x86_pmu.addr_offset(index, false) : index);
-}
-
-static inline int x86_pmu_rdpmc_index(int index)
-{
-       return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
-}
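For orientation (editor's note): with no ->addr_offset or ->rdpmc_index hook installed (the common Intel case), these helpers reduce to simple base-plus-index arithmetic:

    /*
     *   x86_pmu.eventsel == MSR_ARCH_PERFMON_EVENTSEL0 (0x186)
     *   x86_pmu.perfctr  == MSR_ARCH_PERFMON_PERFCTR0  (0xc1)
     *
     *   x86_pmu_config_addr(2) -> 0x188   (EVENTSEL2)
     *   x86_pmu_event_addr(2)  -> 0xc3    (PMC2)
     *   x86_pmu_rdpmc_index(2) -> 2
     */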
-
-int x86_add_exclusive(unsigned int what);
-
-void x86_del_exclusive(unsigned int what);
-
-int x86_reserve_hardware(void);
-
-void x86_release_hardware(void);
-
-void hw_perf_lbr_event_destroy(struct perf_event *event);
-
-int x86_setup_perfctr(struct perf_event *event);
-
-int x86_pmu_hw_config(struct perf_event *event);
-
-void x86_pmu_disable_all(void);
-
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
-                                         u64 enable_mask)
-{
-       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
-
-       if (hwc->extra_reg.reg)
-               wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
-}
-
-void x86_pmu_enable_all(int added);
-
-int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int gpmax, int *assign);
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
-
-void x86_pmu_stop(struct perf_event *event, int flags);
-
-static inline void x86_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-}
-
-void x86_pmu_enable_event(struct perf_event *event);
-
-int x86_pmu_handle_irq(struct pt_regs *regs);
-
-extern struct event_constraint emptyconstraint;
-
-extern struct event_constraint unconstrained;
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-       return ip > PAGE_OFFSET;
-#else
-       return (long)ip < 0;
-#endif
-}
-
-/*
- * Not all PMUs provide the right context information to place the reported IP
- * into full context. Specifically segment registers are typically not
- * supplied.
- *
- * Assuming the address is a linear address (it is for IBS), we fake the CS and
- * vm86 mode using the known zero-based code segment and 'fix up' the registers
- * to reflect this.
- *
- * Intel PEBS/LBR appear to typically provide the effective address; there is
- * not much we can do about that but pray and treat it like a linear address.
- */
-static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
-{
-       regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
-       if (regs->flags & X86_VM_MASK)
-               regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
-       regs->ip = ip;
-}
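
A minimal standalone sketch (plain userspace C, not kernel code; the example addresses are made up) of the sign test that the 64-bit branch of kernel_ip() above relies on, where kernel text sits in the upper half of the address space:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static bool sketch_kernel_ip(uint64_t ip)
	{
		/* Kernel addresses occupy the upper half of the 64-bit address
		 * space, so they are negative when viewed as signed values. */
		return (int64_t)ip < 0;
	}

	int main(void)
	{
		printf("%d\n", sketch_kernel_ip(0xffffffff81000000ULL)); /* 1: kernel text */
		printf("%d\n", sketch_kernel_ip(0x0000000000400000ULL)); /* 0: user space  */
		return 0;
	}
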
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
-ssize_t intel_event_sysfs_show(char *page, u64 config);
-
-struct attribute **merge_attr(struct attribute **a, struct attribute **b);
-
-#ifdef CONFIG_CPU_SUP_AMD
-
-int amd_pmu_init(void);
-
-#else /* CONFIG_CPU_SUP_AMD */
-
-static inline int amd_pmu_init(void)
-{
-       return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_AMD */
-
-#ifdef CONFIG_CPU_SUP_INTEL
-
-static inline bool intel_pmu_has_bts(struct perf_event *event)
-{
-       if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-           !event->attr.freq && event->hw.sample_period == 1)
-               return true;
-
-       return false;
-}
-
-int intel_pmu_save_and_restart(struct perf_event *event);
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event);
-
-struct intel_shared_regs *allocate_shared_regs(int cpu);
-
-int intel_pmu_init(void);
-
-void init_debug_store_on_cpu(int cpu);
-
-void fini_debug_store_on_cpu(int cpu);
-
-void release_ds_buffers(void);
-
-void reserve_ds_buffers(void);
-
-extern struct event_constraint bts_constraint;
-
-void intel_pmu_enable_bts(u64 config);
-
-void intel_pmu_disable_bts(void);
-
-int intel_pmu_drain_bts_buffer(void);
-
-extern struct event_constraint intel_core2_pebs_event_constraints[];
-
-extern struct event_constraint intel_atom_pebs_event_constraints[];
-
-extern struct event_constraint intel_slm_pebs_event_constraints[];
-
-extern struct event_constraint intel_nehalem_pebs_event_constraints[];
-
-extern struct event_constraint intel_westmere_pebs_event_constraints[];
-
-extern struct event_constraint intel_snb_pebs_event_constraints[];
-
-extern struct event_constraint intel_ivb_pebs_event_constraints[];
-
-extern struct event_constraint intel_hsw_pebs_event_constraints[];
-
-extern struct event_constraint intel_skl_pebs_event_constraints[];
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event);
-
-void intel_pmu_pebs_enable(struct perf_event *event);
-
-void intel_pmu_pebs_disable(struct perf_event *event);
-
-void intel_pmu_pebs_enable_all(void);
-
-void intel_pmu_pebs_disable_all(void);
-
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
-
-void intel_ds_init(void);
-
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
-
-void intel_pmu_lbr_reset(void);
-
-void intel_pmu_lbr_enable(struct perf_event *event);
-
-void intel_pmu_lbr_disable(struct perf_event *event);
-
-void intel_pmu_lbr_enable_all(bool pmi);
-
-void intel_pmu_lbr_disable_all(void);
-
-void intel_pmu_lbr_read(void);
-
-void intel_pmu_lbr_init_core(void);
-
-void intel_pmu_lbr_init_nhm(void);
-
-void intel_pmu_lbr_init_atom(void);
-
-void intel_pmu_lbr_init_snb(void);
-
-void intel_pmu_lbr_init_hsw(void);
-
-void intel_pmu_lbr_init_skl(void);
-
-void intel_pmu_lbr_init_knl(void);
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event);
-
-void intel_pt_interrupt(void);
-
-int intel_bts_interrupt(void);
-
-void intel_bts_enable_local(void);
-
-void intel_bts_disable_local(void);
-
-int p4_pmu_init(void);
-
-int p6_pmu_init(void);
-
-int knc_pmu_init(void);
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-                         char *page);
-
-static inline int is_ht_workaround_enabled(void)
-{
-       return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
-}
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static inline void reserve_ds_buffers(void)
-{
-}
-
-static inline void release_ds_buffers(void)
-{
-}
-
-static inline int intel_pmu_init(void)
-{
-       return 0;
-}
-
-static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-       return NULL;
-}
-
-static inline int is_ht_workaround_enabled(void)
-{
-       return 0;
-}
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
deleted file mode 100644 (file)
index 5861053..0000000
+++ /dev/null
@@ -1,731 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/apicdef.h>
-
-#include "perf_event.h"
-
-static __initconst const u64 amd_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
-               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
-               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DTLB_MISS.ALL */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
-               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
-               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
-               [ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
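
How an entry of this table is meant to be interpreted follows the convention stated in the header comment earlier in this diff (0 means not supported, -1 means the hw_event makes no sense on this CPU, anything else is the raw event ID). A small standalone sketch (plain C, example value taken from the L1D/OP_READ/RESULT_MISS row above):

	#include <stdint.h>
	#include <stdio.h>

	static const char *sketch_describe(uint64_t id)
	{
		if (id == 0)
			return "not supported";
		if (id == (uint64_t)-1)
			return "hw_event makes no sense on this CPU";
		return "raw hw_event ID to program";
	}

	int main(void)
	{
		uint64_t l1d_read_miss = 0x0141;	/* Data Cache Misses */

		printf("0x%04llx: %s\n",
		       (unsigned long long)l1d_read_miss, sketch_describe(l1d_read_miss));
		return 0;
	}
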
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]                   = 0x0076,
-  [PERF_COUNT_HW_INSTRUCTIONS]                 = 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]             = 0x0080,
-  [PERF_COUNT_HW_CACHE_MISSES]                 = 0x0081,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]          = 0x00c2,
-  [PERF_COUNT_HW_BRANCH_MISSES]                        = 0x00c3,
-  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]      = 0x00d0, /* "Decoder empty" event */
-  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]       = 0x00d1, /* "Dispatch stalls" event */
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
-       return amd_perfmon_event_map[hw_event];
-}
-
-/*
- * Previously calculated offsets
- */
-static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
-static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
-
-/*
- * Legacy CPUs:
- *   4 counters starting at 0xc0010000 each offset by 1
- *
- * CPUs with core performance counter extensions:
- *   6 counters starting at 0xc0010200 each offset by 2
- */
-static inline int amd_pmu_addr_offset(int index, bool eventsel)
-{
-       int offset;
-
-       if (!index)
-               return index;
-
-       if (eventsel)
-               offset = event_offsets[index];
-       else
-               offset = count_offsets[index];
-
-       if (offset)
-               return offset;
-
-       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               offset = index;
-       else
-               offset = index << 1;
-
-       if (eventsel)
-               event_offsets[index] = offset;
-       else
-               count_offsets[index] = offset;
-
-       return offset;
-}
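
As an aside, the two MSR layouts described in the comment above (4 counters starting at 0xc0010000 with stride 1, 6 counters starting at 0xc0010200 with stride 2) can be seen in a standalone sketch. This is plain C for illustration only; the base addresses come from that comment, and the caching of computed offsets done by the real function is omitted:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t sketch_eventsel_msr(int index, bool core_ext)
	{
		if (core_ext)
			return 0xc0010200 + (index << 1);	/* 6 counters, stride 2 */
		return 0xc0010000 + index;			/* 4 counters, stride 1 */
	}

	int main(void)
	{
		printf("legacy   counter 3 -> %#x\n", sketch_eventsel_msr(3, false));
		printf("core-ext counter 3 -> %#x\n", sketch_eventsel_msr(3, true));
		return 0;
	}
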
-
-static int amd_core_hw_config(struct perf_event *event)
-{
-       if (event->attr.exclude_host && event->attr.exclude_guest)
-               /*
-                * When HO == GO == 1 the hardware treats that as GO == HO == 0
-                * and will count in both modes. We don't want to count in that
-                * case so we emulate no-counting by setting US = OS = 0.
-                */
-               event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
-                                     ARCH_PERFMON_EVENTSEL_OS);
-       else if (event->attr.exclude_host)
-               event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
-       else if (event->attr.exclude_guest)
-               event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
-
-       return 0;
-}
-
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
-{
-       return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
-}
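
A short worked example of the bit reassembly done by amd_get_event_code() above: the low byte of the event code lives in config bits [7:0] and the high nibble in bits [35:32]. The config value below is made up for the demonstration (a standalone C sketch, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	static unsigned int sketch_event_code(uint64_t config)
	{
		/* bits [35:32] carry the high nibble, bits [7:0] the low byte */
		return ((config >> 24) & 0x0f00) | (config & 0x00ff);
	}

	int main(void)
	{
		uint64_t config = (1ULL << 32) | 0xd6;	/* encodes event code 0x1D6 */

		printf("event code = %#x\n", sketch_event_code(config));
		return 0;
	}
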
-
-static inline int amd_is_nb_event(struct hw_perf_event *hwc)
-{
-       return (hwc->config & 0xe0) == 0xe0;
-}
-
-static inline int amd_has_nb(struct cpu_hw_events *cpuc)
-{
-       struct amd_nb *nb = cpuc->amd_nb;
-
-       return nb && nb->nb_id != -1;
-}
-
-static int amd_pmu_hw_config(struct perf_event *event)
-{
-       int ret;
-
-       /* pass precise event sampling to ibs: */
-       if (event->attr.precise_ip && get_ibs_caps())
-               return -ENOENT;
-
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       ret = x86_pmu_hw_config(event);
-       if (ret)
-               return ret;
-
-       if (event->attr.type == PERF_TYPE_RAW)
-               event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
-
-       return amd_core_hw_config(event);
-}
-
-static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
-                                          struct perf_event *event)
-{
-       struct amd_nb *nb = cpuc->amd_nb;
-       int i;
-
-       /*
-        * need to scan whole list because event may not have
-        * been assigned during scheduling
-        *
-        * no race condition possible because event can only
-        * be removed on one CPU at a time AND PMU is disabled
-        * when we come here
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (cmpxchg(nb->owners + i, event, NULL) == event)
-                       break;
-       }
-}
-
- /*
-  * AMD64 NorthBridge events need special treatment because
-  * counter access needs to be synchronized across all cores
-  * of a package. Refer to BKDG section 3.12
-  *
-  * NB events are events measuring L3 cache, Hypertransport
-  * traffic. They are identified by an event code >= 0xe00.
-  * They measure events on the NorthBridge, which is shared
-  * by all cores on a package. NB events are counted on a
-  * shared set of counters. When a NB event is programmed
-  * in a counter, the data actually comes from a shared
-  * counter. Thus, access to those counters needs to be
-  * synchronized.
-  *
-  * We implement the synchronization such that no two cores
-  * can be measuring NB events using the same counters. Thus,
-  * we maintain a per-NB allocation table. The available slot
-  * is propagated using the event_constraint structure.
-  *
-  * We provide only one choice for each NB event based on
-  * the fact that only NB events have restrictions. Consequently,
-  * if a counter is available, there is a guarantee the NB event
-  * will be assigned to it. If no slot is available, an empty
-  * constraint is returned and scheduling will eventually fail
-  * for this event.
-  *
-  * Note that all cores attached to the same NB compete for the same
-  * counters to host NB events, this is why we use atomic ops. Some
-  * multi-chip CPUs may have more than one NB.
-  *
-  * Given that resources are allocated (cmpxchg), they must be
-  * eventually freed for others to use. This is accomplished by
-  * calling __amd_put_nb_event_constraints()
-  *
-  * Non NB events are not impacted by this restriction.
-  */
-static struct event_constraint *
-__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
-                              struct event_constraint *c)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct amd_nb *nb = cpuc->amd_nb;
-       struct perf_event *old;
-       int idx, new = -1;
-
-       if (!c)
-               c = &unconstrained;
-
-       if (cpuc->is_fake)
-               return c;
-
-       /*
-        * detect if already present, if so reuse
-        *
-        * cannot merge with actual allocation
-        * because of possible holes
-        *
-        * event can already be present yet not assigned (in hwc->idx)
-        * because of successive calls to x86_schedule_events() from
-        * hw_perf_group_sched_in() without hw_perf_enable()
-        */
-       for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
-               if (new == -1 || hwc->idx == idx)
-                       /* assign free slot, prefer hwc->idx */
-                       old = cmpxchg(nb->owners + idx, NULL, event);
-               else if (nb->owners[idx] == event)
-                       /* event already present */
-                       old = event;
-               else
-                       continue;
-
-               if (old && old != event)
-                       continue;
-
-               /* reassign to this slot */
-               if (new != -1)
-                       cmpxchg(nb->owners + new, event, NULL);
-               new = idx;
-
-               /* already present, reuse */
-               if (old == event)
-                       break;
-       }
-
-       if (new == -1)
-               return &emptyconstraint;
-
-       return &nb->event_constraints[new];
-}
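
The core of the slot-claiming scheme above is an atomic compare-and-swap from NULL to the claiming event on a shared per-NB owner table. A simplified standalone sketch using C11 atomics follows; names and the slot count are invented for the example, and the real function's extra behaviour (preferring hwc->idx and releasing a previously claimed slot when it moves) is deliberately left out:

	#include <stdatomic.h>
	#include <stdio.h>

	#define NB_SLOTS 4

	static _Atomic(void *) nb_owners[NB_SLOTS];

	/* Returns the claimed slot index, or -1 when every slot belongs to someone else. */
	static int sketch_claim_slot(void *event)
	{
		for (int idx = 0; idx < NB_SLOTS; idx++) {
			void *expected = NULL;

			if (atomic_compare_exchange_strong(&nb_owners[idx], &expected, event))
				return idx;		/* we won this free slot */
			if (expected == event)
				return idx;		/* already ours, reuse it */
		}
		return -1;				/* maps to &emptyconstraint above */
	}

	int main(void)
	{
		int ev1, ev2;

		printf("event1 -> slot %d\n", sketch_claim_slot(&ev1));
		printf("event2 -> slot %d\n", sketch_claim_slot(&ev2));
		return 0;
	}
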
-
-static struct amd_nb *amd_alloc_nb(int cpu)
-{
-       struct amd_nb *nb;
-       int i;
-
-       nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
-       if (!nb)
-               return NULL;
-
-       nb->nb_id = -1;
-
-       /*
-        * initialize all possible NB constraints
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               __set_bit(i, nb->event_constraints[i].idxmsk);
-               nb->event_constraints[i].weight = 1;
-       }
-       return nb;
-}
-
-static int amd_pmu_cpu_prepare(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       WARN_ON_ONCE(cpuc->amd_nb);
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return NOTIFY_OK;
-
-       cpuc->amd_nb = amd_alloc_nb(cpu);
-       if (!cpuc->amd_nb)
-               return NOTIFY_BAD;
-
-       return NOTIFY_OK;
-}
-
-static void amd_pmu_cpu_starting(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
-       struct amd_nb *nb;
-       int i, nb_id;
-
-       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return;
-
-       nb_id = amd_get_nb_id(cpu);
-       WARN_ON_ONCE(nb_id == BAD_APICID);
-
-       for_each_online_cpu(i) {
-               nb = per_cpu(cpu_hw_events, i).amd_nb;
-               if (WARN_ON_ONCE(!nb))
-                       continue;
-
-               if (nb->nb_id == nb_id) {
-                       *onln = cpuc->amd_nb;
-                       cpuc->amd_nb = nb;
-                       break;
-               }
-       }
-
-       cpuc->amd_nb->nb_id = nb_id;
-       cpuc->amd_nb->refcnt++;
-}
-
-static void amd_pmu_cpu_dead(int cpu)
-{
-       struct cpu_hw_events *cpuhw;
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return;
-
-       cpuhw = &per_cpu(cpu_hw_events, cpu);
-
-       if (cpuhw->amd_nb) {
-               struct amd_nb *nb = cpuhw->amd_nb;
-
-               if (nb->nb_id == -1 || --nb->refcnt == 0)
-                       kfree(nb);
-
-               cpuhw->amd_nb = NULL;
-       }
-}
-
-static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       /*
-        * if not NB event or no NB, then no constraints
-        */
-       if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
-               return &unconstrained;
-
-       return __amd_get_nb_event_constraints(cpuc, event, NULL);
-}
-
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
-                                     struct perf_event *event)
-{
-       if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
-               __amd_put_nb_event_constraints(cpuc, event);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *amd_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-/* AMD Family 15h */
-
-#define AMD_EVENT_TYPE_MASK    0x000000F0ULL
-
-#define AMD_EVENT_FP           0x00000000ULL ... 0x00000010ULL
-#define AMD_EVENT_LS           0x00000020ULL ... 0x00000030ULL
-#define AMD_EVENT_DC           0x00000040ULL ... 0x00000050ULL
-#define AMD_EVENT_CU           0x00000060ULL ... 0x00000070ULL
-#define AMD_EVENT_IC_DE                0x00000080ULL ... 0x00000090ULL
-#define AMD_EVENT_EX_LS                0x000000C0ULL
-#define AMD_EVENT_DE           0x000000D0ULL
-#define AMD_EVENT_NB           0x000000E0ULL ... 0x000000F0ULL
-
-/*
- * AMD family 15h event code/PMC mappings:
- *
- * type = event_code & 0x0F0:
- *
- * 0x000       FP      PERF_CTL[5:3]
- * 0x010       FP      PERF_CTL[5:3]
- * 0x020       LS      PERF_CTL[5:0]
- * 0x030       LS      PERF_CTL[5:0]
- * 0x040       DC      PERF_CTL[5:0]
- * 0x050       DC      PERF_CTL[5:0]
- * 0x060       CU      PERF_CTL[2:0]
- * 0x070       CU      PERF_CTL[2:0]
- * 0x080       IC/DE   PERF_CTL[2:0]
- * 0x090       IC/DE   PERF_CTL[2:0]
- * 0x0A0       ---
- * 0x0B0       ---
- * 0x0C0       EX/LS   PERF_CTL[5:0]
- * 0x0D0       DE      PERF_CTL[2:0]
- * 0x0E0       NB      NB_PERF_CTL[3:0]
- * 0x0F0       NB      NB_PERF_CTL[3:0]
- *
- * Exceptions:
- *
- * 0x000       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x003       FP      PERF_CTL[3]
- * 0x004       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x00B       FP      PERF_CTL[3]
- * 0x00D       FP      PERF_CTL[3]
- * 0x023       DE      PERF_CTL[2:0]
- * 0x02D       LS      PERF_CTL[3]
- * 0x02E       LS      PERF_CTL[3,0]
- * 0x031       LS      PERF_CTL[2:0] (**)
- * 0x043       CU      PERF_CTL[2:0]
- * 0x045       CU      PERF_CTL[2:0]
- * 0x046       CU      PERF_CTL[2:0]
- * 0x054       CU      PERF_CTL[2:0]
- * 0x055       CU      PERF_CTL[2:0]
- * 0x08F       IC      PERF_CTL[0]
- * 0x187       DE      PERF_CTL[0]
- * 0x188       DE      PERF_CTL[0]
- * 0x0DB       EX      PERF_CTL[5:0]
- * 0x0DC       LS      PERF_CTL[5:0]
- * 0x0DD       LS      PERF_CTL[5:0]
- * 0x0DE       LS      PERF_CTL[5:0]
- * 0x0DF       LS      PERF_CTL[5:0]
- * 0x1C0       EX      PERF_CTL[5:3]
- * 0x1D6       EX      PERF_CTL[5:0]
- * 0x1D8       EX      PERF_CTL[5:0]
- *
- * (*)  depending on the umask all FPU counters may be used
- * (**) only one unitmask enabled at a time
- */
-
-static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
-static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
-static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
-static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
-static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
-
-static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
-                              struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned int event_code = amd_get_event_code(hwc);
-
-       switch (event_code & AMD_EVENT_TYPE_MASK) {
-       case AMD_EVENT_FP:
-               switch (event_code) {
-               case 0x000:
-                       if (!(hwc->config & 0x0000F000ULL))
-                               break;
-                       if (!(hwc->config & 0x00000F00ULL))
-                               break;
-                       return &amd_f15_PMC3;
-               case 0x004:
-                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
-                               break;
-                       return &amd_f15_PMC3;
-               case 0x003:
-               case 0x00B:
-               case 0x00D:
-                       return &amd_f15_PMC3;
-               }
-               return &amd_f15_PMC53;
-       case AMD_EVENT_LS:
-       case AMD_EVENT_DC:
-       case AMD_EVENT_EX_LS:
-               switch (event_code) {
-               case 0x023:
-               case 0x043:
-               case 0x045:
-               case 0x046:
-               case 0x054:
-               case 0x055:
-                       return &amd_f15_PMC20;
-               case 0x02D:
-                       return &amd_f15_PMC3;
-               case 0x02E:
-                       return &amd_f15_PMC30;
-               case 0x031:
-                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
-                               return &amd_f15_PMC20;
-                       return &emptyconstraint;
-               case 0x1C0:
-                       return &amd_f15_PMC53;
-               default:
-                       return &amd_f15_PMC50;
-               }
-       case AMD_EVENT_CU:
-       case AMD_EVENT_IC_DE:
-       case AMD_EVENT_DE:
-               switch (event_code) {
-               case 0x08F:
-               case 0x187:
-               case 0x188:
-                       return &amd_f15_PMC0;
-               case 0x0DB ... 0x0DF:
-               case 0x1D6:
-               case 0x1D8:
-                       return &amd_f15_PMC50;
-               default:
-                       return &amd_f15_PMC20;
-               }
-       case AMD_EVENT_NB:
-               /* moved to perf_event_amd_uncore.c */
-               return &emptyconstraint;
-       default:
-               return &emptyconstraint;
-       }
-}
-
-static ssize_t amd_event_sysfs_show(char *page, u64 config)
-{
-       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
-                   (config & AMD64_EVENTSEL_EVENT) >> 24;
-
-       return x86_event_sysfs_show(page, config, event);
-}
-
-static __initconst const struct x86_pmu amd_pmu = {
-       .name                   = "AMD",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = x86_pmu_disable_all,
-       .enable_all             = x86_pmu_enable_all,
-       .enable                 = x86_pmu_enable_event,
-       .disable                = x86_pmu_disable_event,
-       .hw_config              = amd_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_K7_EVNTSEL0,
-       .perfctr                = MSR_K7_PERFCTR0,
-       .addr_offset            = amd_pmu_addr_offset,
-       .event_map              = amd_pmu_event_map,
-       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
-       .num_counters           = AMD64_NUM_COUNTERS,
-       .cntval_bits            = 48,
-       .cntval_mask            = (1ULL << 48) - 1,
-       .apic                   = 1,
-       /* use highest bit to detect overflow */
-       .max_period             = (1ULL << 47) - 1,
-       .get_event_constraints  = amd_get_event_constraints,
-       .put_event_constraints  = amd_put_event_constraints,
-
-       .format_attrs           = amd_format_attr,
-       .events_sysfs_show      = amd_event_sysfs_show,
-
-       .cpu_prepare            = amd_pmu_cpu_prepare,
-       .cpu_starting           = amd_pmu_cpu_starting,
-       .cpu_dead               = amd_pmu_cpu_dead,
-};
-
-static int __init amd_core_pmu_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               return 0;
-
-       switch (boot_cpu_data.x86) {
-       case 0x15:
-               pr_cont("Fam15h ");
-               x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
-               break;
-
-       default:
-               pr_err("core perfctr but no constraints; unknown hardware!\n");
-               return -ENODEV;
-       }
-
-       /*
-        * If core performance counter extensions exist, we must use
-        * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
-        * amd_pmu_addr_offset().
-        */
-       x86_pmu.eventsel        = MSR_F15H_PERF_CTL;
-       x86_pmu.perfctr         = MSR_F15H_PERF_CTR;
-       x86_pmu.num_counters    = AMD64_NUM_COUNTERS_CORE;
-
-       pr_cont("core perfctr, ");
-       return 0;
-}
-
-__init int amd_pmu_init(void)
-{
-       int ret;
-
-       /* Performance-monitoring supported from K7 and later: */
-       if (boot_cpu_data.x86 < 6)
-               return -ENODEV;
-
-       x86_pmu = amd_pmu;
-
-       ret = amd_core_pmu_init();
-       if (ret)
-               return ret;
-
-       /* Events are common for all AMDs */
-       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
-              sizeof(hw_cache_event_ids));
-
-       return 0;
-}
-
-void amd_pmu_enable_virt(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       cpuc->perf_ctr_virt_mask = 0;
-
-       /* Reload all events */
-       x86_pmu_disable_all();
-       x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
-
-void amd_pmu_disable_virt(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * We only mask out the Host-only bit so that host-only counting works
-        * when SVM is disabled. If someone sets up a guest-only counter when
-        * SVM is disabled, the Guest-only bit still gets set and the counter
-        * will not count anything.
-        */
-       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
-       /* Reload all events */
-       x86_pmu_disable_all();
-       x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
deleted file mode 100644 (file)
index 989d3c2..0000000
+++ /dev/null
@@ -1,959 +0,0 @@
-/*
- * Performance events - AMD IBS
- *
- *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ptrace.h>
-#include <linux/syscore_ops.h>
-
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-static u32 ibs_caps;
-
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-
-#include <linux/kprobes.h>
-#include <linux/hardirq.h>
-
-#include <asm/nmi.h>
-
-#define IBS_FETCH_CONFIG_MASK  (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
-#define IBS_OP_CONFIG_MASK     IBS_OP_MAX_CNT
-
-enum ibs_states {
-       IBS_ENABLED     = 0,
-       IBS_STARTED     = 1,
-       IBS_STOPPING    = 2,
-
-       IBS_MAX_STATES,
-};
-
-struct cpu_perf_ibs {
-       struct perf_event       *event;
-       unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
-};
-
-struct perf_ibs {
-       struct pmu                      pmu;
-       unsigned int                    msr;
-       u64                             config_mask;
-       u64                             cnt_mask;
-       u64                             enable_mask;
-       u64                             valid_mask;
-       u64                             max_period;
-       unsigned long                   offset_mask[1];
-       int                             offset_max;
-       struct cpu_perf_ibs __percpu    *pcpu;
-
-       struct attribute                **format_attrs;
-       struct attribute_group          format_group;
-       const struct attribute_group    *attr_groups[2];
-
-       u64                             (*get_count)(u64 config);
-};
-
-struct perf_ibs_data {
-       u32             size;
-       union {
-               u32     data[0];        /* data buffer starts here */
-               u32     caps;
-       };
-       u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
-};
-
-static int
-perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
-{
-       s64 left = local64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int overflow = 0;
-
-       /*
-        * If we are way outside a reasonable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               overflow = 1;
-       }
-
-       if (unlikely(left < (s64)min)) {
-               left += period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               overflow = 1;
-       }
-
-       /*
-        * If the hw period that triggers the sw overflow is too short
-        * we might hit the irq handler. This biases the results.
-        * Thus we shorten the next-to-last period and set the last
-        * period to the max period.
-        */
-       if (left > max) {
-               left -= max;
-               if (left > max)
-                       left = max;
-               else if (left < min)
-                       left = min;
-       }
-
-       *hw_period = (u64)left;
-
-       return overflow;
-}
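
A worked example of the final clamping step above, reduced to a standalone sketch (plain C; the minimum of 16 matches the IBS granularity used by the callers, the maximum and requested period are example numbers):

	#include <stdint.h>
	#include <stdio.h>

	static int64_t sketch_clamp(int64_t left, int64_t min, int64_t max)
	{
		if (left > max) {
			left -= max;		/* shorten the next-to-last period */
			if (left > max)
				left = max;
			else if (left < min)
				left = min;
		}
		return left;
	}

	int main(void)
	{
		/* a requested period of 100000 against a hardware maximum of 65536 */
		printf("%lld\n", (long long)sketch_clamp(100000, 16, 65536));
		return 0;
	}
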
-
-static  int
-perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int shift = 64 - width;
-       u64 prev_raw_count;
-       u64 delta;
-
-       /*
-        * Careful: an NMI might modify the previous event value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic event atomically:
-        */
-       prev_raw_count = local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               return 0;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-       local64_sub(delta, &hwc->period_left);
-
-       return 1;
-}
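
The shift trick used by perf_event_try_update() above makes the subtraction wrap correctly when the hardware counter is narrower than 64 bits and rolls over. A standalone worked example for a 48-bit counter (plain C, example values made up) prints a delta of 32:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t sketch_delta(uint64_t prev, uint64_t now, int width)
	{
		int shift = 64 - width;
		uint64_t delta = (now << shift) - (prev << shift);

		return delta >> shift;	/* logical shift: the delta is never negative */
	}

	int main(void)
	{
		/* the 48-bit counter wrapped from near its limit back to a small value */
		printf("delta = %llu\n",
		       (unsigned long long)sketch_delta(0xFFFFFFFFFFF0ULL, 0x10ULL, 48));
		return 0;
	}
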
-
-static struct perf_ibs perf_ibs_fetch;
-static struct perf_ibs perf_ibs_op;
-
-static struct perf_ibs *get_ibs_pmu(int type)
-{
-       if (perf_ibs_fetch.pmu.type == type)
-               return &perf_ibs_fetch;
-       if (perf_ibs_op.pmu.type == type)
-               return &perf_ibs_op;
-       return NULL;
-}
-
-/*
- * Use IBS for precise event sampling:
- *
- *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
- *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
- *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
- *
- * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
- * MSRC001_1033) is used to select either cycle or micro-ops counting
- * mode.
- *
- * The rip of IBS samples has skid 0. Thus, IBS supports precise
- * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
- * rip is invalid when IBS was not able to record the rip correctly.
- * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
- *
- */
-static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
-{
-       switch (event->attr.precise_ip) {
-       case 0:
-               return -ENOENT;
-       case 1:
-       case 2:
-               break;
-       default:
-               return -EOPNOTSUPP;
-       }
-
-       switch (event->attr.type) {
-       case PERF_TYPE_HARDWARE:
-               switch (event->attr.config) {
-               case PERF_COUNT_HW_CPU_CYCLES:
-                       *config = 0;
-                       return 0;
-               }
-               break;
-       case PERF_TYPE_RAW:
-               switch (event->attr.config) {
-               case 0x0076:
-                       *config = 0;
-                       return 0;
-               case 0x00C1:
-                       *config = IBS_OP_CNT_CTL;
-                       return 0;
-               }
-               break;
-       default:
-               return -ENOENT;
-       }
-
-       return -EOPNOTSUPP;
-}
-
-static const struct perf_event_attr ibs_notsupp = {
-       .exclude_user   = 1,
-       .exclude_kernel = 1,
-       .exclude_hv     = 1,
-       .exclude_idle   = 1,
-       .exclude_host   = 1,
-       .exclude_guest  = 1,
-};
-
-static int perf_ibs_init(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs;
-       u64 max_cnt, config;
-       int ret;
-
-       perf_ibs = get_ibs_pmu(event->attr.type);
-       if (perf_ibs) {
-               config = event->attr.config;
-       } else {
-               perf_ibs = &perf_ibs_op;
-               ret = perf_ibs_precise_event(event, &config);
-               if (ret)
-                       return ret;
-       }
-
-       if (event->pmu != &perf_ibs->pmu)
-               return -ENOENT;
-
-       if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
-               return -EINVAL;
-
-       if (config & ~perf_ibs->config_mask)
-               return -EINVAL;
-
-       if (hwc->sample_period) {
-               if (config & perf_ibs->cnt_mask)
-                       /* raw max_cnt may not be set */
-                       return -EINVAL;
-               if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
-                       /*
-                        * the lower 4 bits cannot be set in ibs max cnt,
-                        * but we allow it in case we adjust the
-                        * sample period to set a frequency.
-                        */
-                       return -EINVAL;
-               hwc->sample_period &= ~0x0FULL;
-               if (!hwc->sample_period)
-                       hwc->sample_period = 0x10;
-       } else {
-               max_cnt = config & perf_ibs->cnt_mask;
-               config &= ~perf_ibs->cnt_mask;
-               event->attr.sample_period = max_cnt << 4;
-               hwc->sample_period = event->attr.sample_period;
-       }
-
-       if (!hwc->sample_period)
-               return -EINVAL;
-
-       /*
-        * If we modify hwc->sample_period, we also need to update
-        * hwc->last_period and hwc->period_left.
-        */
-       hwc->last_period = hwc->sample_period;
-       local64_set(&hwc->period_left, hwc->sample_period);
-
-       hwc->config_base = perf_ibs->msr;
-       hwc->config = config;
-
-       return 0;
-}
-
-static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
-                              struct hw_perf_event *hwc, u64 *period)
-{
-       int overflow;
-
-       /* ignore lower 4 bits in min count: */
-       overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
-       local64_set(&hwc->prev_count, 0);
-
-       return overflow;
-}
-
-static u64 get_ibs_fetch_count(u64 config)
-{
-       return (config & IBS_FETCH_CNT) >> 12;
-}
-
-static u64 get_ibs_op_count(u64 config)
-{
-       u64 count = 0;
-
-       if (config & IBS_OP_VAL)
-               count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
-
-       if (ibs_caps & IBS_CAPS_RDWROPCNT)
-               count += (config & IBS_OP_CUR_CNT) >> 32;
-
-       return count;
-}
-
-static void
-perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
-                     u64 *config)
-{
-       u64 count = perf_ibs->get_count(*config);
-
-       /*
-        * Set width to 64 since we do not overflow on max width but
-        * instead on max count. In perf_ibs_set_period() we clear
-        * prev count manually on overflow.
-        */
-       while (!perf_event_try_update(event, count, 64)) {
-               rdmsrl(event->hw.config_base, *config);
-               count = perf_ibs->get_count(*config);
-       }
-}
-
-static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
-                                        struct hw_perf_event *hwc, u64 config)
-{
-       wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
-}
-
-/*
- * Erratum #420 Instruction-Based Sampling Engine May Generate
- * Interrupt that Cannot Be Cleared:
- *
- * Must clear counter mask first, then clear the enable bit. See
- * Revision Guide for AMD Family 10h Processors, Publication #41322.
- */
-static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
-                                         struct hw_perf_event *hwc, u64 config)
-{
-       config &= ~perf_ibs->cnt_mask;
-       wrmsrl(hwc->config_base, config);
-       config &= ~perf_ibs->enable_mask;
-       wrmsrl(hwc->config_base, config);
-}
-
-/*
- * We cannot restore the ibs pmu state, so we always need to update
- * the event while stopping it and then reset the state when starting
- * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags
- * in perf_ibs_start()/perf_ibs_stop() and instead always do it.
- */
-static void perf_ibs_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       u64 period;
-
-       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-               return;
-
-       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-       hwc->state = 0;
-
-       perf_ibs_set_period(perf_ibs, hwc, &period);
-       set_bit(IBS_STARTED, pcpu->state);
-       perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
-       perf_event_update_userpage(event);
-}
-
-static void perf_ibs_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       u64 config;
-       int stopping;
-
-       stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
-
-       if (!stopping && (hwc->state & PERF_HES_UPTODATE))
-               return;
-
-       rdmsrl(hwc->config_base, config);
-
-       if (stopping) {
-               set_bit(IBS_STOPPING, pcpu->state);
-               perf_ibs_disable_event(perf_ibs, hwc, config);
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       /*
-        * Clear the valid bit so we do not count rollovers on update;
-        * rollovers are only accounted for in the irq handler.
-        */
-       config &= ~perf_ibs->valid_mask;
-
-       perf_ibs_event_update(perf_ibs, event, &config);
-       hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_ibs_add(struct perf_event *event, int flags)
-{
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
-       if (test_and_set_bit(IBS_ENABLED, pcpu->state))
-               return -ENOSPC;
-
-       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       pcpu->event = event;
-
-       if (flags & PERF_EF_START)
-               perf_ibs_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void perf_ibs_del(struct perf_event *event, int flags)
-{
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
-       if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
-               return;
-
-       perf_ibs_stop(event, PERF_EF_UPDATE);
-
-       pcpu->event = NULL;
-
-       perf_event_update_userpage(event);
-}
-
-static void perf_ibs_read(struct perf_event *event) { }
-
-PMU_FORMAT_ATTR(rand_en,       "config:57");
-PMU_FORMAT_ATTR(cnt_ctl,       "config:19");
-
-static struct attribute *ibs_fetch_format_attrs[] = {
-       &format_attr_rand_en.attr,
-       NULL,
-};
-
-static struct attribute *ibs_op_format_attrs[] = {
-       NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
-       NULL,
-};
-
-static struct perf_ibs perf_ibs_fetch = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-
-               .event_init     = perf_ibs_init,
-               .add            = perf_ibs_add,
-               .del            = perf_ibs_del,
-               .start          = perf_ibs_start,
-               .stop           = perf_ibs_stop,
-               .read           = perf_ibs_read,
-       },
-       .msr                    = MSR_AMD64_IBSFETCHCTL,
-       .config_mask            = IBS_FETCH_CONFIG_MASK,
-       .cnt_mask               = IBS_FETCH_MAX_CNT,
-       .enable_mask            = IBS_FETCH_ENABLE,
-       .valid_mask             = IBS_FETCH_VAL,
-       .max_period             = IBS_FETCH_MAX_CNT << 4,
-       .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
-       .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
-       .format_attrs           = ibs_fetch_format_attrs,
-
-       .get_count              = get_ibs_fetch_count,
-};
-
-static struct perf_ibs perf_ibs_op = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-
-               .event_init     = perf_ibs_init,
-               .add            = perf_ibs_add,
-               .del            = perf_ibs_del,
-               .start          = perf_ibs_start,
-               .stop           = perf_ibs_stop,
-               .read           = perf_ibs_read,
-       },
-       .msr                    = MSR_AMD64_IBSOPCTL,
-       .config_mask            = IBS_OP_CONFIG_MASK,
-       .cnt_mask               = IBS_OP_MAX_CNT,
-       .enable_mask            = IBS_OP_ENABLE,
-       .valid_mask             = IBS_OP_VAL,
-       .max_period             = IBS_OP_MAX_CNT << 4,
-       .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
-       .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
-       .format_attrs           = ibs_op_format_attrs,
-
-       .get_count              = get_ibs_op_count,
-};
-
-static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
-{
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       struct perf_event *event = pcpu->event;
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_sample_data data;
-       struct perf_raw_record raw;
-       struct pt_regs regs;
-       struct perf_ibs_data ibs_data;
-       int offset, size, check_rip, offset_max, throttle = 0;
-       unsigned int msr;
-       u64 *buf, *config, period;
-
-       if (!test_bit(IBS_STARTED, pcpu->state)) {
-               /*
-                * Catch spurious interrupts after stopping IBS: After
-                * disabling IBS there could still be incoming NMIs
-                * with samples that even have the valid bit cleared.
-                * Mark all these NMIs as handled.
-                */
-               return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
-       }
-
-       msr = hwc->config_base;
-       buf = ibs_data.regs;
-       rdmsrl(msr, *buf);
-       if (!(*buf++ & perf_ibs->valid_mask))
-               return 0;
-
-       config = &ibs_data.regs[0];
-       perf_ibs_event_update(perf_ibs, event, config);
-       perf_sample_data_init(&data, 0, hwc->last_period);
-       if (!perf_ibs_set_period(perf_ibs, hwc, &period))
-               goto out;       /* no sw counter overflow */
-
-       ibs_data.caps = ibs_caps;
-       size = 1;
-       offset = 1;
-       check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
-       if (event->attr.sample_type & PERF_SAMPLE_RAW)
-               offset_max = perf_ibs->offset_max;
-       else if (check_rip)
-               offset_max = 2;
-       else
-               offset_max = 1;
-       do {
-               rdmsrl(msr + offset, *buf++);
-               size++;
-               offset = find_next_bit(perf_ibs->offset_mask,
-                                      perf_ibs->offset_max,
-                                      offset + 1);
-       } while (offset < offset_max);
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               /*
-                * Read IbsBrTarget and IbsOpData4 separately
-                * depending on their availability.
-                * Can't add to offset_max as they are staggered
-                */
-               if (ibs_caps & IBS_CAPS_BRNTRGT) {
-                       rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
-                       size++;
-               }
-               if (ibs_caps & IBS_CAPS_OPDATA4) {
-                       rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
-                       size++;
-               }
-       }
-       ibs_data.size = sizeof(u64) * size;
-
-       regs = *iregs;
-       if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
-               regs.flags &= ~PERF_EFLAGS_EXACT;
-       } else {
-               set_linear_ip(&regs, ibs_data.regs[1]);
-               regs.flags |= PERF_EFLAGS_EXACT;
-       }
-
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               raw.size = sizeof(u32) + ibs_data.size;
-               raw.data = ibs_data.data;
-               data.raw = &raw;
-       }
-
-       throttle = perf_event_overflow(event, &data, &regs);
-out:
-       if (throttle)
-               perf_ibs_disable_event(perf_ibs, hwc, *config);
-       else
-               perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
-       perf_event_update_userpage(event);
-
-       return 1;
-}
-
-static int
-perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
-       int handled = 0;
-
-       handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
-       handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
-
-static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
-{
-       struct cpu_perf_ibs __percpu *pcpu;
-       int ret;
-
-       pcpu = alloc_percpu(struct cpu_perf_ibs);
-       if (!pcpu)
-               return -ENOMEM;
-
-       perf_ibs->pcpu = pcpu;
-
-       /* register attributes */
-       if (perf_ibs->format_attrs[0]) {
-               memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
-               perf_ibs->format_group.name     = "format";
-               perf_ibs->format_group.attrs    = perf_ibs->format_attrs;
-
-               memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
-               perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
-               perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
-       }
-
-       ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
-       if (ret) {
-               perf_ibs->pcpu = NULL;
-               free_percpu(pcpu);
-       }
-
-       return ret;
-}
-
-static __init int perf_event_ibs_init(void)
-{
-       struct attribute **attr = ibs_op_format_attrs;
-
-       if (!ibs_caps)
-               return -ENODEV; /* ibs not supported by the cpu */
-
-       perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
-
-       if (ibs_caps & IBS_CAPS_OPCNT) {
-               perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
-               *attr++ = &format_attr_cnt_ctl.attr;
-       }
-       perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
-
-       register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
-       printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
-
-       return 0;
-}
-
-#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
-
-static __init int perf_event_ibs_init(void) { return 0; }
-
-#endif
-
-/* IBS - apic initialization, for perf and oprofile */
-
-static __init u32 __get_ibs_caps(void)
-{
-       u32 caps;
-       unsigned int max_level;
-
-       if (!boot_cpu_has(X86_FEATURE_IBS))
-               return 0;
-
-       /* check IBS cpuid feature flags */
-       max_level = cpuid_eax(0x80000000);
-       if (max_level < IBS_CPUID_FEATURES)
-               return IBS_CAPS_DEFAULT;
-
-       caps = cpuid_eax(IBS_CPUID_FEATURES);
-       if (!(caps & IBS_CAPS_AVAIL))
-               /* cpuid flags not valid */
-               return IBS_CAPS_DEFAULT;
-
-       return caps;
-}
-
-u32 get_ibs_caps(void)
-{
-       return ibs_caps;
-}
-
-EXPORT_SYMBOL(get_ibs_caps);
-
-static inline int get_eilvt(int offset)
-{
-       return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
-}
-
-static inline int put_eilvt(int offset)
-{
-       return !setup_APIC_eilvt(offset, 0, 0, 1);
-}
-
-/*
- * Check and reserve APIC extended interrupt LVT offset for IBS if available.
- */
-static inline int ibs_eilvt_valid(void)
-{
-       int offset;
-       u64 val;
-       int valid = 0;
-
-       preempt_disable();
-
-       rdmsrl(MSR_AMD64_IBSCTL, val);
-       offset = val & IBSCTL_LVT_OFFSET_MASK;
-
-       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
-               pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
-                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-               goto out;
-       }
-
-       if (!get_eilvt(offset)) {
-               pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
-                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-               goto out;
-       }
-
-       valid = 1;
-out:
-       preempt_enable();
-
-       return valid;
-}
-
-static int setup_ibs_ctl(int ibs_eilvt_off)
-{
-       struct pci_dev *cpu_cfg;
-       int nodes;
-       u32 value = 0;
-
-       nodes = 0;
-       cpu_cfg = NULL;
-       do {
-               cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
-                                        PCI_DEVICE_ID_AMD_10H_NB_MISC,
-                                        cpu_cfg);
-               if (!cpu_cfg)
-                       break;
-               ++nodes;
-               pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
-                                      | IBSCTL_LVT_OFFSET_VALID);
-               pci_read_config_dword(cpu_cfg, IBSCTL, &value);
-               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
-                       pci_dev_put(cpu_cfg);
-                       printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
-                              "IBSCTL = 0x%08x\n", value);
-                       return -EINVAL;
-               }
-       } while (1);
-
-       if (!nodes) {
-               printk(KERN_DEBUG "No CPU node configured for IBS\n");
-               return -ENODEV;
-       }
-
-       return 0;
-}
-
-/*
- * This runs only on the current cpu. We try to find an LVT offset and
- * set up the local APIC. For this we must disable preemption. On
- * success we initialize all nodes with this offset. This then updates
- * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
- * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
- * uses the new offset.
- */
-static void force_ibs_eilvt_setup(void)
-{
-       int offset;
-       int ret;
-
-       preempt_disable();
-       /* find the next free available EILVT entry, skip offset 0 */
-       for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
-               if (get_eilvt(offset))
-                       break;
-       }
-       preempt_enable();
-
-       if (offset == APIC_EILVT_NR_MAX) {
-               printk(KERN_DEBUG "No EILVT entry available\n");
-               return;
-       }
-
-       ret = setup_ibs_ctl(offset);
-       if (ret)
-               goto out;
-
-       if (!ibs_eilvt_valid())
-               goto out;
-
-       pr_info("IBS: LVT offset %d assigned\n", offset);
-
-       return;
-out:
-       preempt_disable();
-       put_eilvt(offset);
-       preempt_enable();
-       return;
-}
-
-static void ibs_eilvt_setup(void)
-{
-       /*
-        * Force LVT offset assignment for family 10h: The offsets are
-        * not assigned by the BIOS for this family, so the OS is
-        * responsible for doing it. If the OS assignment fails, fall
-        * back to the BIOS settings and try to set it up from those.
-        */
-       if (boot_cpu_data.x86 == 0x10)
-               force_ibs_eilvt_setup();
-}
-
-static inline int get_ibs_lvt_offset(void)
-{
-       u64 val;
-
-       rdmsrl(MSR_AMD64_IBSCTL, val);
-       if (!(val & IBSCTL_LVT_OFFSET_VALID))
-               return -EINVAL;
-
-       return val & IBSCTL_LVT_OFFSET_MASK;
-}
-
-static void setup_APIC_ibs(void *dummy)
-{
-       int offset;
-
-       offset = get_ibs_lvt_offset();
-       if (offset < 0)
-               goto failed;
-
-       if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
-               return;
-failed:
-       pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
-               smp_processor_id());
-}
-
-static void clear_APIC_ibs(void *dummy)
-{
-       int offset;
-
-       offset = get_ibs_lvt_offset();
-       if (offset >= 0)
-               setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
-}
-
-#ifdef CONFIG_PM
-
-static int perf_ibs_suspend(void)
-{
-       clear_APIC_ibs(NULL);
-       return 0;
-}
-
-static void perf_ibs_resume(void)
-{
-       ibs_eilvt_setup();
-       setup_APIC_ibs(NULL);
-}
-
-static struct syscore_ops perf_ibs_syscore_ops = {
-       .resume         = perf_ibs_resume,
-       .suspend        = perf_ibs_suspend,
-};
-
-static void perf_ibs_pm_init(void)
-{
-       register_syscore_ops(&perf_ibs_syscore_ops);
-}
-
-#else
-
-static inline void perf_ibs_pm_init(void) { }
-
-#endif
-
-static int
-perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_STARTING:
-               setup_APIC_ibs(NULL);
-               break;
-       case CPU_DYING:
-               clear_APIC_ibs(NULL);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static __init int amd_ibs_init(void)
-{
-       u32 caps;
-       int ret = -EINVAL;
-
-       caps = __get_ibs_caps();
-       if (!caps)
-               return -ENODEV; /* ibs not supported by the cpu */
-
-       ibs_eilvt_setup();
-
-       if (!ibs_eilvt_valid())
-               goto out;
-
-       perf_ibs_pm_init();
-       cpu_notifier_register_begin();
-       ibs_caps = caps;
-       /* make ibs_caps visible to other cpus: */
-       smp_mb();
-       smp_call_function(setup_APIC_ibs, NULL, 1);
-       __perf_cpu_notifier(perf_ibs_cpu_notifier);
-       cpu_notifier_register_done();
-
-       ret = perf_event_ibs_init();
-out:
-       if (ret)
-               pr_err("Failed to setup IBS, %d\n", ret);
-       return ret;
-}
-
-/* Since we need the PCI subsystem to init IBS, we can't do this earlier: */
-device_initcall(amd_ibs_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
deleted file mode 100644 (file)
index 97242a9..0000000
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-
-#include "perf_event.h"
-#include "perf_event_amd_iommu.h"
-
-#define COUNTER_SHIFT          16
-
-#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
-
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
-#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
-
-static struct perf_amd_iommu __perf_iommu;
-
-struct perf_amd_iommu {
-       struct pmu pmu;
-       u8 max_banks;
-       u8 max_counters;
-       u64 cntr_assign_mask;
-       raw_spinlock_t lock;
-       const struct attribute_group *attr_groups[4];
-};
-
-#define format_group   attr_groups[0]
-#define cpumask_group  attr_groups[1]
-#define events_group   attr_groups[2]
-#define null_group     attr_groups[3]
-
-/*---------------------------------------------
- * sysfs format attributes
- *---------------------------------------------*/
-PMU_FORMAT_ATTR(csource,    "config:0-7");
-PMU_FORMAT_ATTR(devid,      "config:8-23");
-PMU_FORMAT_ATTR(pasid,      "config:24-39");
-PMU_FORMAT_ATTR(domid,      "config:40-55");
-PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
-
-static struct attribute *iommu_format_attrs[] = {
-       &format_attr_csource.attr,
-       &format_attr_devid.attr,
-       &format_attr_pasid.attr,
-       &format_attr_domid.attr,
-       &format_attr_devid_mask.attr,
-       &format_attr_pasid_mask.attr,
-       &format_attr_domid_mask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_iommu_format_group = {
-       .name = "format",
-       .attrs = iommu_format_attrs,
-};
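
The PMU_FORMAT_ATTR() strings above are the only documentation of the config bit layout, and they mirror the _GET_*() macros near the top of the file. As a minimal, illustrative userspace sketch (not part of the deleted file; the field values are made up), packing and unpacking a config word the way an invocation such as perf stat -e amd_iommu/csource=0x05,devid=0x100/ would is plain shifting and masking:

/*
 * Illustrative only: how the sysfs format strings map onto the
 * _GET_CSOURCE()/_GET_DEVID()/_GET_PASID()/_GET_DOMID() decoding.
 * Standalone userspace sketch, not driver code.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t csource = 0x05;        /* config:0-7   */
        uint64_t devid   = 0x100;       /* config:8-23  */
        uint64_t pasid   = 0x0;         /* config:24-39 */
        uint64_t domid   = 0x1;         /* config:40-55 */

        uint64_t config = csource | (devid << 8) | (pasid << 24) | (domid << 40);

        /* mirrors the _GET_*() macros */
        printf("csource=%#llx devid=%#llx pasid=%#llx domid=%#llx\n",
               (unsigned long long)(config & 0xFF),
               (unsigned long long)((config >> 8) & 0xFFFF),
               (unsigned long long)((config >> 24) & 0xFFFF),
               (unsigned long long)((config >> 40) & 0xFFFF));
        return 0;
}
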
-
-/*---------------------------------------------
- * sysfs events attributes
- *---------------------------------------------*/
-struct amd_iommu_event_desc {
-       struct kobj_attribute attr;
-       const char *event;
-};
-
-static ssize_t _iommu_event_show(struct kobject *kobj,
-                               struct kobj_attribute *attr, char *buf)
-{
-       struct amd_iommu_event_desc *event =
-               container_of(attr, struct amd_iommu_event_desc, attr);
-       return sprintf(buf, "%s\n", event->event);
-}
-
-#define AMD_IOMMU_EVENT_DESC(_name, _event)                    \
-{                                                              \
-       .attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),  \
-       .event = _event,                                        \
-}
-
-static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
-       AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
-       AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
-       AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
-       AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
-       AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
-       AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
-       AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
-       AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
-       AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
-       AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
-       AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
-       AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
-       { /* end: all zeroes */ },
-};
-
-/*---------------------------------------------
- * sysfs cpumask attributes
- *---------------------------------------------*/
-static cpumask_t iommu_cpumask;
-
-static ssize_t _iommu_cpumask_show(struct device *dev,
-                                  struct device_attribute *attr,
-                                  char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
-
-static struct attribute *iommu_cpumask_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_iommu_cpumask_group = {
-       .attrs = iommu_cpumask_attrs,
-};
-
-/*---------------------------------------------*/
-
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
-{
-       unsigned long flags;
-       int shift, bank, cntr, retval;
-       int max_banks = perf_iommu->max_banks;
-       int max_cntrs = perf_iommu->max_counters;
-
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-
-       for (bank = 0, shift = 0; bank < max_banks; bank++) {
-               for (cntr = 0; cntr < max_cntrs; cntr++) {
-                       shift = bank + (bank*3) + cntr;
-                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
-                               continue;
-                       } else {
-                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
-                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
-                               goto out;
-                       }
-               }
-       }
-       retval = -ENOSPC;
-out:
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-       return retval;
-}
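
The bit bookkeeping in get_next_avail_iommu_bnk_cntr() is compact but easy to misread: each bank owns four consecutive bits of cntr_assign_mask (shift = bank*4 + cntr), and the winning pair is packed as bank<<8 | cntr for later decoding by _GET_BANK()/_GET_CNTR(). A minimal standalone sketch of that arithmetic (illustrative userspace C with made-up bank/counter numbers, not driver code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t assign_mask = 0;
        int bank = 2, cntr = 3;

        /* bank + bank*3 + cntr == bank*4 + cntr: four mask bits per bank */
        int shift = bank + (bank * 3) + cntr;           /* 2*4 + 3 = 11 */
        assign_mask |= 1ULL << shift;

        /* the same encoding the driver stores in hw.extra_reg.reg */
        uint16_t reg = (uint16_t)(bank << 8) | (uint8_t)cntr;

        printf("shift=%d mask=%#llx bank=%u cntr=%u\n",
               shift, (unsigned long long)assign_mask,
               (unsigned)(reg >> 8), (unsigned)(reg & 0xff));
        return 0;
}
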
-
-static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
-                                       u8 bank, u8 cntr)
-{
-       unsigned long flags;
-       int max_banks, max_cntrs;
-       int shift = 0;
-
-       max_banks = perf_iommu->max_banks;
-       max_cntrs = perf_iommu->max_counters;
-
-       if ((bank > max_banks) || (cntr > max_cntrs))
-               return -EINVAL;
-
-       shift = bank + cntr + (bank*3);
-
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-       perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-
-       return 0;
-}
-
-static int perf_iommu_event_init(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_amd_iommu *perf_iommu;
-       u64 config, config1;
-
-       /* check that the event's attr type matches this PMU (PMU enumeration) */
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /*
-        * IOMMU counters are shared across all cores.
-        * Therefore, they support neither per-process mode
-        * nor event sampling mode.
-        */
-       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
-               return -EINVAL;
-
-       /* IOMMU counters do not have usr/os/guest/host bits */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-           event->attr.exclude_host || event->attr.exclude_guest)
-               return -EINVAL;
-
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       perf_iommu = &__perf_iommu;
-
-       if (event->pmu != &perf_iommu->pmu)
-               return -ENOENT;
-
-       if (perf_iommu) {
-               config = event->attr.config;
-               config1 = event->attr.config1;
-       } else {
-               return -EINVAL;
-       }
-
-       /* integrate with iommu base devid (0000), assume one iommu */
-       perf_iommu->max_banks =
-               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
-       perf_iommu->max_counters =
-               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
-       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
-               return -EINVAL;
-
-       /* update the hw_perf_event struct with the iommu config data */
-       hwc->config = config;
-       hwc->extra_reg.config = config1;
-
-       return 0;
-}
-
-static void perf_iommu_enable_event(struct perf_event *ev)
-{
-       u8 csource = _GET_CSOURCE(ev);
-       u16 devid = _GET_DEVID(ev);
-       u64 reg = 0ULL;
-
-       reg = csource;
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev),
-                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-
-       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev),
-                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
-
-       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev),
-                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
-
-       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev),
-                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
-}
-
-static void perf_iommu_disable_event(struct perf_event *event)
-{
-       u64 reg = 0ULL;
-
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                       _GET_BANK(event), _GET_CNTR(event),
-                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-}
-
-static void perf_iommu_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       pr_debug("perf: amd_iommu:perf_iommu_start\n");
-       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-               return;
-
-       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-       hwc->state = 0;
-
-       if (flags & PERF_EF_RELOAD) {
-               u64 prev_raw_count =  local64_read(&hwc->prev_count);
-               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
-       }
-
-       perf_iommu_enable_event(event);
-       perf_event_update_userpage(event);
-
-}
-
-static void perf_iommu_read(struct perf_event *event)
-{
-       u64 count = 0ULL;
-       u64 prev_raw_count = 0ULL;
-       u64 delta = 0ULL;
-       struct hw_perf_event *hwc = &event->hw;
-       pr_debug("perf: amd_iommu:perf_iommu_read\n");
-
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &count, false);
-
-       /* IOMMU pc counter register is only 48 bits */
-       count &= 0xFFFFFFFFFFFFULL;
-
-       prev_raw_count =  local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       count) != prev_raw_count)
-               return;
-
-       /* Handle overflow of the 48-bit counter */
-       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
-
-}
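
The shift-up/shift-down dance in perf_iommu_read() is the standard trick for differencing a counter narrower than 64 bits: shifting both samples left by COUNTER_SHIFT places the 48-bit values at the top of a 64-bit word, so the subtraction wraps away the borrow, and shifting back yields the modular delta. A small standalone sketch (illustrative only; delta48() is a made-up helper, not part of the driver):

#include <stdint.h>
#include <stdio.h>

#define COUNTER_SHIFT 16

static uint64_t delta48(uint64_t prev, uint64_t now)
{
        /* place the 48-bit values in the top bits so 64-bit wraparound
         * subtraction discards the borrow automatically */
        uint64_t delta = (now << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);

        return delta >> COUNTER_SHIFT;
}

int main(void)
{
        uint64_t prev = 0xFFFFFFFFFFFFULL;      /* counter about to wrap */
        uint64_t now  = 0x5;                    /* value read after the wrap */

        /* prints 6: ...FFFF -> 0 is one tick, 0 -> 5 is five more */
        printf("delta = %llu\n", (unsigned long long)delta48(prev, now));
        return 0;
}
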
-
-static void perf_iommu_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       perf_iommu_disable_event(event);
-       WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-       hwc->state |= PERF_HES_STOPPED;
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       config = hwc->config;
-       perf_iommu_read(event);
-       hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_iommu_add(struct perf_event *event, int flags)
-{
-       int retval;
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
-
-       pr_debug("perf: amd_iommu:perf_iommu_add\n");
-       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       /* request an iommu bank/counter */
-       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
-       if (retval != -ENOSPC)
-               event->hw.extra_reg.reg = (u16)retval;
-       else
-               return retval;
-
-       if (flags & PERF_EF_START)
-               perf_iommu_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void perf_iommu_del(struct perf_event *event, int flags)
-{
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
-
-       pr_debug("perf: amd_iommu:perf_iommu_del\n");
-       perf_iommu_stop(event, PERF_EF_UPDATE);
-
-       /* clear the assigned iommu bank/counter */
-       clear_avail_iommu_bnk_cntr(perf_iommu,
-                                    _GET_BANK(event),
-                                    _GET_CNTR(event));
-
-       perf_event_update_userpage(event);
-}
-
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
-{
-       struct attribute **attrs;
-       struct attribute_group *attr_group;
-       int i = 0, j;
-
-       while (amd_iommu_v2_event_descs[i].attr.attr.name)
-               i++;
-
-       attr_group = kzalloc(sizeof(struct attribute *)
-               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
-       if (!attr_group)
-               return -ENOMEM;
-
-       attrs = (struct attribute **)(attr_group + 1);
-       for (j = 0; j < i; j++)
-               attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
-
-       attr_group->name = "events";
-       attr_group->attrs = attrs;
-       perf_iommu->events_group = attr_group;
-
-       return 0;
-}
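
_init_events_attrs() uses a single kzalloc() for the attribute_group plus its trailing, NULL-terminated pointer array, so one kfree() in amd_iommu_pc_exit() releases both. A rough userspace analogue of that allocation pattern (illustrative only; struct group and the names here are invented for the sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct group {
        const char *name;
        const char **attrs;     /* points into the same allocation */
};

int main(void)
{
        const char *names[] = { "ev_a", "ev_b", "ev_c" };
        size_t n = sizeof(names) / sizeof(names[0]);

        /* struct followed by (n + 1) pointers, the last left NULL */
        struct group *g = calloc(1, sizeof(*g) + (n + 1) * sizeof(char *));
        if (!g)
                return 1;

        g->attrs = (const char **)(g + 1);
        memcpy(g->attrs, names, n * sizeof(char *));
        g->name = "events";

        for (size_t i = 0; g->attrs[i]; i++)
                printf("%s/%s\n", g->name, g->attrs[i]);

        free(g);                /* frees the trailing array too */
        return 0;
}
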
-
-static __init void amd_iommu_pc_exit(void)
-{
-       if (__perf_iommu.events_group != NULL) {
-               kfree(__perf_iommu.events_group);
-               __perf_iommu.events_group = NULL;
-       }
-}
-
-static __init int _init_perf_amd_iommu(
-       struct perf_amd_iommu *perf_iommu, char *name)
-{
-       int ret;
-
-       raw_spin_lock_init(&perf_iommu->lock);
-
-       /* Init format attributes */
-       perf_iommu->format_group = &amd_iommu_format_group;
-
-       /* Init cpumask attributes to only core 0 */
-       cpumask_set_cpu(0, &iommu_cpumask);
-       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
-       /* Init events attributes */
-       if (_init_events_attrs(perf_iommu) != 0)
-               pr_err("perf: amd_iommu: Only raw events are supported.\n");
-
-       /* Init null attributes */
-       perf_iommu->null_group = NULL;
-       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
-
-       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
-       if (ret) {
-               pr_err("perf: amd_iommu: Failed to initialize.\n");
-               amd_iommu_pc_exit();
-       } else {
-               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
-                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
-                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
-       }
-
-       return ret;
-}
-
-static struct perf_amd_iommu __perf_iommu = {
-       .pmu = {
-               .event_init     = perf_iommu_event_init,
-               .add            = perf_iommu_add,
-               .del            = perf_iommu_del,
-               .start          = perf_iommu_start,
-               .stop           = perf_iommu_stop,
-               .read           = perf_iommu_read,
-       },
-       .max_banks              = 0x00,
-       .max_counters           = 0x00,
-       .cntr_assign_mask       = 0ULL,
-       .format_group           = NULL,
-       .cpumask_group          = NULL,
-       .events_group           = NULL,
-       .null_group             = NULL,
-};
-
-static __init int amd_iommu_pc_init(void)
-{
-       /* Make sure the IOMMU PC resource is available */
-       if (!amd_iommu_pc_supported())
-               return -ENODEV;
-
-       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
-
-       return 0;
-}
-
-device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
deleted file mode 100644 (file)
index 845d173..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _PERF_EVENT_AMD_IOMMU_H_
-#define _PERF_EVENT_AMD_IOMMU_H_
-
-/* iommu pc mmio region register indexes */
-#define IOMMU_PC_COUNTER_REG                   0x00
-#define IOMMU_PC_COUNTER_SRC_REG               0x08
-#define IOMMU_PC_PASID_MATCH_REG               0x10
-#define IOMMU_PC_DOMID_MATCH_REG               0x18
-#define IOMMU_PC_DEVID_MATCH_REG               0x20
-#define IOMMU_PC_COUNTER_REPORT_REG            0x28
-
-/* maximum number of specified banks/counters */
-#define PC_MAX_SPEC_BNKS                       64
-#define PC_MAX_SPEC_CNTRS                      16
-
-/* iommu pc reg masks */
-#define IOMMU_BASE_DEVID                       0x0000
-
-/* amd_iommu_init.c external support functions */
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
-                       u8 fxn, u64 *value, bool is_write);
-
-#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
deleted file mode 100644 (file)
index 8836fc9..0000000
+++ /dev/null
@@ -1,603 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-
-#include <asm/cpufeature.h>
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-
-#define NUM_COUNTERS_NB                4
-#define NUM_COUNTERS_L2                4
-#define MAX_COUNTERS           NUM_COUNTERS_NB
-
-#define RDPMC_BASE_NB          6
-#define RDPMC_BASE_L2          10
-
-#define COUNTER_SHIFT          16
-
-struct amd_uncore {
-       int id;
-       int refcnt;
-       int cpu;
-       int num_counters;
-       int rdpmc_base;
-       u32 msr_base;
-       cpumask_t *active_mask;
-       struct pmu *pmu;
-       struct perf_event *events[MAX_COUNTERS];
-       struct amd_uncore *free_when_cpu_online;
-};
-
-static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_l2;
-
-static struct pmu amd_nb_pmu;
-static struct pmu amd_l2_pmu;
-
-static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_l2_active_mask;
-
-static bool is_nb_event(struct perf_event *event)
-{
-       return event->pmu->type == amd_nb_pmu.type;
-}
-
-static bool is_l2_event(struct perf_event *event)
-{
-       return event->pmu->type == amd_l2_pmu.type;
-}
-
-static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
-{
-       if (is_nb_event(event) && amd_uncore_nb)
-               return *per_cpu_ptr(amd_uncore_nb, event->cpu);
-       else if (is_l2_event(event) && amd_uncore_l2)
-               return *per_cpu_ptr(amd_uncore_l2, event->cpu);
-
-       return NULL;
-}
-
-static void amd_uncore_read(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev, new;
-       s64 delta;
-
-       /*
-        * since we do not enable counter overflow interrupts,
-        * we do not have to worry about prev_count changing on us
-        */
-
-       prev = local64_read(&hwc->prev_count);
-       rdpmcl(hwc->event_base_rdpmc, new);
-       local64_set(&hwc->prev_count, new);
-       delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
-}
-
-static void amd_uncore_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (flags & PERF_EF_RELOAD)
-               wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
-
-       hwc->state = 0;
-       wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
-       perf_event_update_userpage(event);
-}
-
-static void amd_uncore_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-       hwc->state |= PERF_HES_STOPPED;
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               amd_uncore_read(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int amd_uncore_add(struct perf_event *event, int flags)
-{
-       int i;
-       struct amd_uncore *uncore = event_to_amd_uncore(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       /* are we already assigned? */
-       if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
-               goto out;
-
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (uncore->events[i] == event) {
-                       hwc->idx = i;
-                       goto out;
-               }
-       }
-
-       /* if not, take the first available counter */
-       hwc->idx = -1;
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
-                       hwc->idx = i;
-                       break;
-               }
-       }
-
-out:
-       if (hwc->idx == -1)
-               return -EBUSY;
-
-       hwc->config_base = uncore->msr_base + (2 * hwc->idx);
-       hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
-       hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       if (flags & PERF_EF_START)
-               amd_uncore_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
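
Because one amd_uncore instance is shared by all cores behind the same northbridge or L2 complex, amd_uncore_add() claims a free counter slot with cmpxchg() rather than taking a lock. A rough userspace analogue of that claim loop using C11 atomics (illustrative sketch; claim_slot() and NUM_COUNTERS are invented stand-ins, not the driver's symbols):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define NUM_COUNTERS 4

static _Atomic(void *) slots[NUM_COUNTERS];

/* returns the claimed index, or -1 if every counter is busy */
static int claim_slot(void *event)
{
        for (int i = 0; i < NUM_COUNTERS; i++) {
                void *expected = NULL;

                /* succeeds only if the slot is still empty */
                if (atomic_compare_exchange_strong(&slots[i], &expected, event))
                        return i;
        }
        return -1;
}

int main(void)
{
        static int ev_a, ev_b;  /* stand-ins for two perf_event structs */

        printf("ev_a -> counter %d\n", claim_slot(&ev_a));
        printf("ev_b -> counter %d\n", claim_slot(&ev_b));
        return 0;
}

The compare-and-swap only succeeds while the slot is still NULL, so two cores racing for the same shared bank cannot end up on the same counter.
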
-
-static void amd_uncore_del(struct perf_event *event, int flags)
-{
-       int i;
-       struct amd_uncore *uncore = event_to_amd_uncore(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       amd_uncore_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (cmpxchg(&uncore->events[i], event, NULL) == event)
-                       break;
-       }
-
-       hwc->idx = -1;
-}
-
-static int amd_uncore_event_init(struct perf_event *event)
-{
-       struct amd_uncore *uncore;
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /*
-        * NB and L2 counters (MSRs) are shared across all cores that share the
-        * same NB / L2 cache. Interrupts can be directed to a single target
-        * core, however, event counts generated by processes running on other
-        * cores cannot be masked out. So we do not support sampling and
-        * per-thread events.
-        */
-       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
-               return -EINVAL;
-
-       /* NB and L2 counters do not have usr/os/guest/host bits */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-           event->attr.exclude_host || event->attr.exclude_guest)
-               return -EINVAL;
-
-       /* and we do not enable counter overflow interrupts */
-       hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
-       hwc->idx = -1;
-
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       uncore = event_to_amd_uncore(event);
-       if (!uncore)
-               return -ENODEV;
-
-       /*
-        * since requests can come in on any of the shared cores, we remap
-        * them to a single common cpu.
-        */
-       event->cpu = uncore->cpu;
-
-       return 0;
-}
-
-static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
-{
-       cpumask_t *active_mask;
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       if (pmu->type == amd_nb_pmu.type)
-               active_mask = &amd_nb_active_mask;
-       else if (pmu->type == amd_l2_pmu.type)
-               active_mask = &amd_l2_active_mask;
-       else
-               return 0;
-
-       return cpumap_print_to_pagebuf(true, buf, active_mask);
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
-
-static struct attribute *amd_uncore_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_uncore_attr_group = {
-       .attrs = amd_uncore_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15");
-
-static struct attribute *amd_uncore_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_uncore_format_group = {
-       .name = "format",
-       .attrs = amd_uncore_format_attr,
-};
-
-static const struct attribute_group *amd_uncore_attr_groups[] = {
-       &amd_uncore_attr_group,
-       &amd_uncore_format_group,
-       NULL,
-};
-
-static struct pmu amd_nb_pmu = {
-       .attr_groups    = amd_uncore_attr_groups,
-       .name           = "amd_nb",
-       .event_init     = amd_uncore_event_init,
-       .add            = amd_uncore_add,
-       .del            = amd_uncore_del,
-       .start          = amd_uncore_start,
-       .stop           = amd_uncore_stop,
-       .read           = amd_uncore_read,
-};
-
-static struct pmu amd_l2_pmu = {
-       .attr_groups    = amd_uncore_attr_groups,
-       .name           = "amd_l2",
-       .event_init     = amd_uncore_event_init,
-       .add            = amd_uncore_add,
-       .del            = amd_uncore_del,
-       .start          = amd_uncore_start,
-       .stop           = amd_uncore_stop,
-       .read           = amd_uncore_read,
-};
-
-static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
-{
-       return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
-                       cpu_to_node(cpu));
-}
-
-static int amd_uncore_cpu_up_prepare(unsigned int cpu)
-{
-       struct amd_uncore *uncore_nb = NULL, *uncore_l2;
-
-       if (amd_uncore_nb) {
-               uncore_nb = amd_uncore_alloc(cpu);
-               if (!uncore_nb)
-                       goto fail;
-               uncore_nb->cpu = cpu;
-               uncore_nb->num_counters = NUM_COUNTERS_NB;
-               uncore_nb->rdpmc_base = RDPMC_BASE_NB;
-               uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
-               uncore_nb->active_mask = &amd_nb_active_mask;
-               uncore_nb->pmu = &amd_nb_pmu;
-               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
-       }
-
-       if (amd_uncore_l2) {
-               uncore_l2 = amd_uncore_alloc(cpu);
-               if (!uncore_l2)
-                       goto fail;
-               uncore_l2->cpu = cpu;
-               uncore_l2->num_counters = NUM_COUNTERS_L2;
-               uncore_l2->rdpmc_base = RDPMC_BASE_L2;
-               uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
-               uncore_l2->active_mask = &amd_l2_active_mask;
-               uncore_l2->pmu = &amd_l2_pmu;
-               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
-       }
-
-       return 0;
-
-fail:
-       if (amd_uncore_nb)
-               *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
-       kfree(uncore_nb);
-       return -ENOMEM;
-}
-
-static struct amd_uncore *
-amd_uncore_find_online_sibling(struct amd_uncore *this,
-                              struct amd_uncore * __percpu *uncores)
-{
-       unsigned int cpu;
-       struct amd_uncore *that;
-
-       for_each_online_cpu(cpu) {
-               that = *per_cpu_ptr(uncores, cpu);
-
-               if (!that)
-                       continue;
-
-               if (this == that)
-                       continue;
-
-               if (this->id == that->id) {
-                       that->free_when_cpu_online = this;
-                       this = that;
-                       break;
-               }
-       }
-
-       this->refcnt++;
-       return this;
-}
-
-static void amd_uncore_cpu_starting(unsigned int cpu)
-{
-       unsigned int eax, ebx, ecx, edx;
-       struct amd_uncore *uncore;
-
-       if (amd_uncore_nb) {
-               uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
-               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-               uncore->id = ecx & 0xff;
-
-               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
-               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
-       }
-
-       if (amd_uncore_l2) {
-               unsigned int apicid = cpu_data(cpu).apicid;
-               unsigned int nshared;
-
-               uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
-               cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
-               nshared = ((eax >> 14) & 0xfff) + 1;
-               uncore->id = apicid - (apicid % nshared);
-
-               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
-               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
-       }
-}
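
The grouping key for the L2 uncores is derived as apicid - (apicid % nshared): all CPUs whose APIC IDs fall into the same nshared-sized block get the same id and therefore share one amd_uncore instance. A tiny worked example (illustrative userspace sketch with made-up APIC IDs and nshared = 2):

#include <stdio.h>

int main(void)
{
        unsigned int nshared = 2;       /* e.g. two cores per L2 complex */

        for (unsigned int apicid = 0; apicid < 6; apicid++) {
                unsigned int id = apicid - (apicid % nshared);

                /* apicid 0,1 -> id 0; 2,3 -> id 2; 4,5 -> id 4 */
                printf("apicid %u -> l2 id %u\n", apicid, id);
        }
        return 0;
}
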
-
-static void uncore_online(unsigned int cpu,
-                         struct amd_uncore * __percpu *uncores)
-{
-       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
-       kfree(uncore->free_when_cpu_online);
-       uncore->free_when_cpu_online = NULL;
-
-       if (cpu == uncore->cpu)
-               cpumask_set_cpu(cpu, uncore->active_mask);
-}
-
-static void amd_uncore_cpu_online(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_online(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_online(cpu, amd_uncore_l2);
-}
-
-static void uncore_down_prepare(unsigned int cpu,
-                               struct amd_uncore * __percpu *uncores)
-{
-       unsigned int i;
-       struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
-
-       if (this->cpu != cpu)
-               return;
-
-       /* this cpu is going down, migrate to a shared sibling if possible */
-       for_each_online_cpu(i) {
-               struct amd_uncore *that = *per_cpu_ptr(uncores, i);
-
-               if (cpu == i)
-                       continue;
-
-               if (this == that) {
-                       perf_pmu_migrate_context(this->pmu, cpu, i);
-                       cpumask_clear_cpu(cpu, that->active_mask);
-                       cpumask_set_cpu(i, that->active_mask);
-                       that->cpu = i;
-                       break;
-               }
-       }
-}
-
-static void amd_uncore_cpu_down_prepare(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_down_prepare(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_down_prepare(cpu, amd_uncore_l2);
-}
-
-static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
-{
-       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
-       if (cpu == uncore->cpu)
-               cpumask_clear_cpu(cpu, uncore->active_mask);
-
-       if (!--uncore->refcnt)
-               kfree(uncore);
-       *per_cpu_ptr(uncores, cpu) = NULL;
-}
-
-static void amd_uncore_cpu_dead(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_dead(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_dead(cpu, amd_uncore_l2);
-}
-
-static int
-amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
-                       void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               if (amd_uncore_cpu_up_prepare(cpu))
-                       return notifier_from_errno(-ENOMEM);
-               break;
-
-       case CPU_STARTING:
-               amd_uncore_cpu_starting(cpu);
-               break;
-
-       case CPU_ONLINE:
-               amd_uncore_cpu_online(cpu);
-               break;
-
-       case CPU_DOWN_PREPARE:
-               amd_uncore_cpu_down_prepare(cpu);
-               break;
-
-       case CPU_UP_CANCELED:
-       case CPU_DEAD:
-               amd_uncore_cpu_dead(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block amd_uncore_cpu_notifier_block = {
-       .notifier_call  = amd_uncore_cpu_notifier,
-       .priority       = CPU_PRI_PERF + 1,
-};
-
-static void __init init_cpu_already_online(void *dummy)
-{
-       unsigned int cpu = smp_processor_id();
-
-       amd_uncore_cpu_starting(cpu);
-       amd_uncore_cpu_online(cpu);
-}
-
-static void cleanup_cpu_online(void *dummy)
-{
-       unsigned int cpu = smp_processor_id();
-
-       amd_uncore_cpu_dead(cpu);
-}
-
-static int __init amd_uncore_init(void)
-{
-       unsigned int cpu, cpu2;
-       int ret = -ENODEV;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-               goto fail_nodev;
-
-       if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
-               goto fail_nodev;
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
-               amd_uncore_nb = alloc_percpu(struct amd_uncore *);
-               if (!amd_uncore_nb) {
-                       ret = -ENOMEM;
-                       goto fail_nb;
-               }
-               ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
-               if (ret)
-                       goto fail_nb;
-
-               printk(KERN_INFO "perf: AMD NB counters detected\n");
-               ret = 0;
-       }
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
-               amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
-               if (!amd_uncore_l2) {
-                       ret = -ENOMEM;
-                       goto fail_l2;
-               }
-               ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
-               if (ret)
-                       goto fail_l2;
-
-               printk(KERN_INFO "perf: AMD L2I counters detected\n");
-               ret = 0;
-       }
-
-       if (ret)
-               goto fail_nodev;
-
-       cpu_notifier_register_begin();
-
-       /* init cpus already online before registering for hotplug notifier */
-       for_each_online_cpu(cpu) {
-               ret = amd_uncore_cpu_up_prepare(cpu);
-               if (ret)
-                       goto fail_online;
-               smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
-       }
-
-       __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
-       cpu_notifier_register_done();
-
-       return 0;
-
-
-fail_online:
-       for_each_online_cpu(cpu2) {
-               if (cpu2 == cpu)
-                       break;
-               smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
-       }
-       cpu_notifier_register_done();
-
-       /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
-       amd_uncore_nb = amd_uncore_l2 = NULL;
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
-               perf_pmu_unregister(&amd_l2_pmu);
-fail_l2:
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
-               perf_pmu_unregister(&amd_nb_pmu);
-       if (amd_uncore_l2)
-               free_percpu(amd_uncore_l2);
-fail_nb:
-       if (amd_uncore_nb)
-               free_percpu(amd_uncore_nb);
-
-fail_nodev:
-       return ret;
-}
-device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
deleted file mode 100644 (file)
index fed2ab1..0000000
+++ /dev/null
@@ -1,3773 +0,0 @@
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/nmi.h>
-
-#include <asm/cpufeature.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-/*
- * Intel PerfMon, used on Core and later.
- */
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
-       [PERF_COUNT_HW_CPU_CYCLES]              = 0x003c,
-       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x00c0,
-       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4f2e,
-       [PERF_COUNT_HW_CACHE_MISSES]            = 0x412e,
-       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x00c4,
-       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x00c5,
-       [PERF_COUNT_HW_BUS_CYCLES]              = 0x013c,
-       [PERF_COUNT_HW_REF_CPU_CYCLES]          = 0x0300, /* pseudo-encoding */
-};
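
The intel_perfmon_event_map[] entries are raw umask:eventsel encodings rather than opaque magic numbers; for instance 0x412e for PERF_COUNT_HW_CACHE_MISSES decodes to event select 0x2e with umask 0x41, the architectural longest-latency-cache-miss event. A trivial decoding sketch (illustrative userspace C, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t cache_misses = 0x412e; /* PERF_COUNT_HW_CACHE_MISSES */

        printf("eventsel=%#llx umask=%#llx\n",
               (unsigned long long)(cache_misses & 0xff),
               (unsigned long long)((cache_misses >> 8) & 0xff));
        return 0;
}
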
-
-static struct event_constraint intel_core_event_constraints[] __read_mostly =
-{
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
-       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
-       INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_core2_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
-       INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
-       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
-       INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
-       INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
-       INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
-       INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
-       INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
-       INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
-       INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
-       INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
-       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
-       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
-       EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
-       INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
-       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
-       INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_snb_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
-       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
-       INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
-       EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_v1_event_constraints[] __read_mostly =
-{
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_gen_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_slm_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_skl_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
-       INTEL_UEVENT_EXTRA_REG(0x01b7,
-                              MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7,
-                              MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       /*
-        * Note: the low 8 bits of the eventsel code are not a contiguous
-        * field; some of the bits would #GP if set and are masked out.
-        */
-       INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
-       EVENT_EXTRA_END
-};
-
-EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
-
-struct attribute *nhm_events_attrs[] = {
-       EVENT_PTR(mem_ld_nhm),
-       NULL,
-};
-
-struct attribute *snb_events_attrs[] = {
-       EVENT_PTR(mem_ld_snb),
-       EVENT_PTR(mem_st_snb),
-       NULL,
-};
-
-static struct event_constraint intel_hsw_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
-       /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
-       /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
-       /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_bdw_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
-       INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),        /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
-       EVENT_CONSTRAINT_END
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
-       return intel_perfmon_event_map[hw_event];
-}
-
-/*
- * Notes on the events:
- * - data reads do not include code reads (comparable to earlier tables)
- * - data counts include speculative execution (except L1 write, dtlb, bpu)
- * - remote node access includes remote memory, remote cache, remote mmio.
- * - prefetches are not included in the counts.
- * - icache miss does not include decoded icache
- */
-
-#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
-#define SKL_DEMAND_RFO                 BIT_ULL(1)
-#define SKL_ANY_RESPONSE               BIT_ULL(16)
-#define SKL_SUPPLIER_NONE              BIT_ULL(17)
-#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
-#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
-#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
-#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
-#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
-#define SKL_SPL_HIT                    BIT_ULL(30)
-#define SKL_SNOOP_NONE                 BIT_ULL(31)
-#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
-#define SKL_SNOOP_MISS                 BIT_ULL(33)
-#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
-#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
-#define SKL_SNOOP_HITM                 BIT_ULL(36)
-#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
-#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
-                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
-                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
-                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
-#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD
-#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
-                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
-                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
-                                        SKL_SNOOP_HITM|SKL_SPL_HIT)
-#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO
-#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE
-#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
-
-static __initconst const u64 skl_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
-               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
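A table like skl_hw_cache_event_ids above is indexed by the generic PERF_TYPE_HW_CACHE attribute encoding, config = type | (op << 8) | (result << 16); by convention an entry of 0 means the combination is not supported and -1 means it is invalid. A minimal, illustrative sketch of that lookup; the helper name is hypothetical and not part of this file:

static u64 example_skl_cache_lookup(u64 config)
{
        unsigned int type   =  config        & 0xff;   /* PERF_COUNT_HW_CACHE_*        */
        unsigned int op     = (config >>  8) & 0xff;   /* PERF_COUNT_HW_CACHE_OP_*     */
        unsigned int result = (config >> 16) & 0xff;   /* PERF_COUNT_HW_CACHE_RESULT_* */

        if (type   >= PERF_COUNT_HW_CACHE_MAX ||
            op     >= PERF_COUNT_HW_CACHE_OP_MAX ||
            result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
                return 0;

        /* 0: not supported, -1: invalid op/result combination */
        return skl_hw_cache_event_ids[type][op][result];
}

For example, config 0x10000 (L1D, OP_READ, RESULT_MISS) resolves to 0x151, L1D.REPLACEMENT, in the table above.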
-
-static __initconst const u64 skl_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
-                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS|SKL_ANY_SNOOP|
-                                      SKL_SUPPLIER_NONE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
-                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS|SKL_ANY_SNOOP|
-                                      SKL_SUPPLIER_NONE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
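For the LL and NODE rows, the 0x1b7 entries in the event-ids table and the cells of skl_hw_cache_extra_regs work as a pair: 0x1b7 selects the OFFCORE_RESPONSE event, and the extra-regs value supplies the request/response filter written to MSR_OFFCORE_RSP_0/1 through the extra_reg machinery (see intel_fixup_er() further down). A purely illustrative pairing for the Skylake LL / OP_READ / RESULT_MISS cell; the struct and variable are hypothetical:

struct offcore_pairing {
        u64 event_code;         /* programmed into a PERFEVTSELx register  */
        u64 rsp_filter;         /* programmed into MSR_OFFCORE_RSP_0 or _1 */
};

static const struct offcore_pairing skl_ll_read_miss = {
        .event_code = 0x01b7,                           /* OFFCORE_RESPONSE */
        .rsp_filter = SKL_DEMAND_READ | SKL_L3_MISS |
                      SKL_ANY_SNOOP | SKL_SUPPLIER_NONE,
};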
-
-#define SNB_DMND_DATA_RD       (1ULL << 0)
-#define SNB_DMND_RFO           (1ULL << 1)
-#define SNB_DMND_IFETCH                (1ULL << 2)
-#define SNB_DMND_WB            (1ULL << 3)
-#define SNB_PF_DATA_RD         (1ULL << 4)
-#define SNB_PF_RFO             (1ULL << 5)
-#define SNB_PF_IFETCH          (1ULL << 6)
-#define SNB_LLC_DATA_RD                (1ULL << 7)
-#define SNB_LLC_RFO            (1ULL << 8)
-#define SNB_LLC_IFETCH         (1ULL << 9)
-#define SNB_BUS_LOCKS          (1ULL << 10)
-#define SNB_STRM_ST            (1ULL << 11)
-#define SNB_OTHER              (1ULL << 15)
-#define SNB_RESP_ANY           (1ULL << 16)
-#define SNB_NO_SUPP            (1ULL << 17)
-#define SNB_LLC_HITM           (1ULL << 18)
-#define SNB_LLC_HITE           (1ULL << 19)
-#define SNB_LLC_HITS           (1ULL << 20)
-#define SNB_LLC_HITF           (1ULL << 21)
-#define SNB_LOCAL              (1ULL << 22)
-#define SNB_REMOTE             (0xffULL << 23)
-#define SNB_SNP_NONE           (1ULL << 31)
-#define SNB_SNP_NOT_NEEDED     (1ULL << 32)
-#define SNB_SNP_MISS           (1ULL << 33)
-#define SNB_NO_FWD             (1ULL << 34)
-#define SNB_SNP_FWD            (1ULL << 35)
-#define SNB_HITM               (1ULL << 36)
-#define SNB_NON_DRAM           (1ULL << 37)
-
-#define SNB_DMND_READ          (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
-#define SNB_DMND_WRITE         (SNB_DMND_RFO|SNB_LLC_RFO)
-#define SNB_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SNB_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
-                                SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
-                                SNB_HITM)
-
-#define SNB_DRAM_ANY           (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
-#define SNB_DRAM_REMOTE                (SNB_REMOTE|SNB_SNP_ANY)
-
-#define SNB_L3_ACCESS          SNB_RESP_ANY
-#define SNB_L3_MISS            (SNB_DRAM_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 snb_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
-       },
- },
-};
-
-static __initconst const u64 snb_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
-               [ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
-               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-
-};
-
-/*
- * Notes on the events:
- * - data reads do not include code reads (comparable to earlier tables)
- * - data counts include speculative execution (except L1 write, dtlb, bpu)
- * - remote node access includes remote memory, remote cache, remote mmio.
- * - prefetches are not included in the counts because they are not
- *   reliably counted.
- */
-
-#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
-#define HSW_DEMAND_RFO                 BIT_ULL(1)
-#define HSW_ANY_RESPONSE               BIT_ULL(16)
-#define HSW_SUPPLIER_NONE              BIT_ULL(17)
-#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
-#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
-#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
-#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
-#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
-                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
-                                        HSW_L3_MISS_REMOTE_HOP2P)
-#define HSW_SNOOP_NONE                 BIT_ULL(31)
-#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
-#define HSW_SNOOP_MISS                 BIT_ULL(33)
-#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
-#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
-#define HSW_SNOOP_HITM                 BIT_ULL(36)
-#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
-#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
-                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
-                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
-                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
-#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
-#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
-#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
-#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
-                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
-#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
-
-#define BDW_L3_MISS_LOCAL              BIT(26)
-#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
-                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
-                                        HSW_L3_MISS_REMOTE_HOP2P)
-
-
-static __initconst const u64 hsw_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
-               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 hsw_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
-                                      HSW_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS|HSW_ANY_SNOOP,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
-                                      HSW_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS|HSW_ANY_SNOOP,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS_LOCAL_DRAM|
-                                      HSW_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS_REMOTE|
-                                      HSW_SNOOP_DRAM,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS_LOCAL_DRAM|
-                                      HSW_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS_REMOTE|
-                                      HSW_SNOOP_DRAM,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 westmere_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       /*
-        * Use RFO, not WRITEBACK, because a write miss would typically occur
-        * on RFO.
-        */
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.ANY              */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-};
-
-/*
- * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
- * See IA32 SDM Vol 3B 30.6.1.3
- */
-
-#define NHM_DMND_DATA_RD       (1 << 0)
-#define NHM_DMND_RFO           (1 << 1)
-#define NHM_DMND_IFETCH                (1 << 2)
-#define NHM_DMND_WB            (1 << 3)
-#define NHM_PF_DATA_RD         (1 << 4)
-#define NHM_PF_DATA_RFO                (1 << 5)
-#define NHM_PF_IFETCH          (1 << 6)
-#define NHM_OFFCORE_OTHER      (1 << 7)
-#define NHM_UNCORE_HIT         (1 << 8)
-#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
-#define NHM_OTHER_CORE_HITM    (1 << 10)
-                               /* reserved */
-#define NHM_REMOTE_CACHE_FWD   (1 << 12)
-#define NHM_REMOTE_DRAM                (1 << 13)
-#define NHM_LOCAL_DRAM         (1 << 14)
-#define NHM_NON_DRAM           (1 << 15)
-
-#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_REMOTE             (NHM_REMOTE_DRAM)
-
-#define NHM_DMND_READ          (NHM_DMND_DATA_RD)
-#define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
-#define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
-
-#define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)
-
-static __initconst const u64 nehalem_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
-       },
- },
-};
-
-static __initconst const u64 nehalem_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       /*
-        * Use RFO, not WRITEBACK, because a write miss would typically occur
-        * on RFO.
-        */
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-};
-
-static __initconst const u64 core2_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static __initconst const u64 atom_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static struct extra_reg intel_slm_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
-       EVENT_EXTRA_END
-};
-
-#define SLM_DMND_READ          SNB_DMND_DATA_RD
-#define SLM_DMND_WRITE         SNB_DMND_RFO
-#define SLM_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SLM_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
-#define SLM_LLC_ACCESS         SNB_RESP_ANY
-#define SLM_LLC_MISS           (SLM_SNP_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 slm_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = SLM_DMND_WRITE|SLM_LLC_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
-       },
- },
-};
-
-static __initconst const u64 slm_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0104, /* LD_DCU_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
-               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0804, /* LD_DTLB_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
-               [ C(RESULT_MISS)   ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-#define KNL_OT_L2_HITE         BIT_ULL(19) /* Other Tile L2 Hit */
-#define KNL_OT_L2_HITF         BIT_ULL(20) /* Other Tile L2 Hit */
-#define KNL_MCDRAM_LOCAL       BIT_ULL(21)
-#define KNL_MCDRAM_FAR         BIT_ULL(22)
-#define KNL_DDR_LOCAL          BIT_ULL(23)
-#define KNL_DDR_FAR            BIT_ULL(24)
-#define KNL_DRAM_ANY           (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
-                                   KNL_DDR_LOCAL | KNL_DDR_FAR)
-#define KNL_L2_READ            SLM_DMND_READ
-#define KNL_L2_WRITE           SLM_DMND_WRITE
-#define KNL_L2_PREFETCH                SLM_DMND_PREFETCH
-#define KNL_L2_ACCESS          SLM_LLC_ACCESS
-#define KNL_L2_MISS            (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
-                                  KNL_DRAM_ANY | SNB_SNP_ANY | \
-                                                 SNB_NON_DRAM)
-
-static __initconst const u64 knl_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-       [C(LL)] = {
-               [C(OP_READ)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = 0,
-               },
-               [C(OP_WRITE)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
-               },
-               [C(OP_PREFETCH)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
-               },
-       },
-};
-
-/*
- * Use from PMIs where the LBRs are already disabled.
- */
-static void __intel_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               intel_pmu_disable_bts();
-       else
-               intel_bts_disable_local();
-
-       intel_pmu_pebs_disable_all();
-}
-
-static void intel_pmu_disable_all(void)
-{
-       __intel_pmu_disable_all();
-       intel_pmu_lbr_disable_all();
-}
-
-static void __intel_pmu_enable_all(int added, bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       intel_pmu_pebs_enable_all();
-       intel_pmu_lbr_enable_all(pmi);
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
-                       x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
-               struct perf_event *event =
-                       cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-
-               if (WARN_ON_ONCE(!event))
-                       return;
-
-               intel_pmu_enable_bts(event->hw.config);
-       } else
-               intel_bts_enable_local();
-}
-
-static void intel_pmu_enable_all(int added)
-{
-       __intel_pmu_enable_all(added, false);
-}
-
-/*
- * Workaround for:
- *   Intel Errata AAK100 (model 26)
- *   Intel Errata AAP53  (model 30)
- *   Intel Errata BD53   (model 44)
- *
- * The official story:
- *   These chips need to be 'reset' when adding counters by programming the
- *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
- *   in sequence on the same PMC or on different PMCs.
- *
- * In practice it appears some of these events do in fact count, and
- * we need to program all 4 events.
- */
-static void intel_pmu_nhm_workaround(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       static const unsigned long nhm_magic[4] = {
-               0x4300B5,
-               0x4300D2,
-               0x4300B1,
-               0x4300B1
-       };
-       struct perf_event *event;
-       int i;
-
-       /*
-        * The errata requires the following steps:
-        * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
-        * 2) Configure 4 PERFEVTSELx with the magic events and clear
-        *    the corresponding PMCx;
-        * 3) Set bits 0~3 of MSR_CORE_PERF_GLOBAL_CTRL;
-        * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
-        * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
-        */
-
-       /*
-        * The real steps we take differ slightly from the above:
-        * A) To reduce MSR operations, we skip step 1) because the MSRs
-        *    are already cleared before this function is called;
-        * B) Call x86_perf_event_update to save PMCx before configuring
-        *    PERFEVTSELx with the magic events;
-        * C) For step 5), we clear a PERFEVTSELx only when it is not
-        *    currently in use;
-        * D) Call x86_perf_event_set_period to restore PMCx.
-        */
-
-       /* We always operate on 4 pairs of PERF counters */
-       for (i = 0; i < 4; i++) {
-               event = cpuc->events[i];
-               if (event)
-                       x86_perf_event_update(event);
-       }
-
-       for (i = 0; i < 4; i++) {
-               wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
-               wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
-       }
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
-
-       for (i = 0; i < 4; i++) {
-               event = cpuc->events[i];
-
-               if (event) {
-                       x86_perf_event_set_period(event);
-                       __x86_pmu_enable_event(&event->hw,
-                                       ARCH_PERFMON_EVENTSEL_ENABLE);
-               } else
-                       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
-       }
-}
-
-static void intel_pmu_nhm_enable_all(int added)
-{
-       if (added)
-               intel_pmu_nhm_workaround();
-       intel_pmu_enable_all(added);
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
-{
-       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
-       u64 ctrl_val, mask;
-
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static inline bool event_is_checkpointed(struct perf_event *event)
-{
-       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
-}
-
-static void intel_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-               intel_pmu_disable_bts();
-               intel_pmu_drain_bts_buffer();
-               return;
-       }
-
-       cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
-       cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
-       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
-
-       /*
-        * LBR must be disabled before disabling the actual event,
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_disable(event);
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_disable_fixed(hwc);
-               return;
-       }
-
-       x86_pmu_disable_event(event);
-
-       if (unlikely(event->attr.precise_ip))
-               intel_pmu_pebs_disable(event);
-}
-
-static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
-{
-       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
-       u64 ctrl_val, bits, mask;
-
-       /*
-        * Enable IRQ generation (0x8),
-        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
-        * if requested:
-        */
-       bits = 0x8ULL;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
-               bits |= 0x2;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-               bits |= 0x1;
-
-       /*
-        * ANY bit is supported in v3 and up
-        */
-       if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
-               bits |= 0x4;
-
-       bits <<= (idx * 4);
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       ctrl_val |= bits;
-       wrmsrl(hwc->config_base, ctrl_val);
-}
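The layout assumed by intel_pmu_disable_fixed() and intel_pmu_enable_fixed() is four control bits per fixed counter in MSR_ARCH_PERFMON_FIXED_CTR_CTRL: bit 0 enables ring-0 counting, bit 1 ring-3 counting, bit 2 the ANY-thread qualifier (architectural perfmon v3 and later), and bit 3 the PMI on overflow. A small sketch (hypothetical helper, illustration only) mirroring the computation above:

static u64 example_fixed_ctrl_field(int idx, bool usr, bool os, bool any)
{
        u64 bits = 0x8;                 /* bit 3: PMI on overflow       */

        if (usr)
                bits |= 0x2;            /* bit 1: count ring 3          */
        if (os)
                bits |= 0x1;            /* bit 0: count ring 0          */
        if (any)
                bits |= 0x4;            /* bit 2: ANY thread, v3+ only  */

        /* each fixed counter owns a 4-bit field: idx 1 covers bits 7:4 */
        return bits << (idx * 4);
}

For instance, example_fixed_ctrl_field(1, true, true, false) yields 0xb0, which pairs with the 0xf0 mask that the helpers above compute for idx == 1.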
-
-static void intel_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-               if (!__this_cpu_read(cpu_hw_events.enabled))
-                       return;
-
-               intel_pmu_enable_bts(hwc->config);
-               return;
-       }
-       /*
-        * LBR must be enabled before enabling the actual event,
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_enable(event);
-
-       if (event->attr.exclude_host)
-               cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
-       if (event->attr.exclude_guest)
-               cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
-
-       if (unlikely(event_is_checkpointed(event)))
-               cpuc->intel_cp_status |= (1ull << hwc->idx);
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_enable_fixed(hwc);
-               return;
-       }
-
-       if (unlikely(event->attr.precise_ip))
-               intel_pmu_pebs_enable(event);
-
-       __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Save and restart an expired event. Called from NMI context,
- * so it has to be careful about preempting normal event ops:
- */
-int intel_pmu_save_and_restart(struct perf_event *event)
-{
-       x86_perf_event_update(event);
-       /*
-        * For a checkpointed counter always reset back to 0.  This
-        * avoids a situation where the counter overflows, aborts the
-        * transaction and is then set back to shortly before the
-        * overflow, and overflows and aborts again.
-        */
-       if (unlikely(event_is_checkpointed(event))) {
-               /* No race with NMIs because the counter should not be armed */
-               wrmsrl(event->hw.event_base, 0);
-               local64_set(&event->hw.prev_count, 0);
-       }
-       return x86_perf_event_set_period(event);
-}
-
-static void intel_pmu_reset(void)
-{
-       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-       unsigned long flags;
-       int idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
-               wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
-               wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-
-       if (ds)
-               ds->bts_index = ds->bts_buffer_base;
-
-       /* Ack all overflows and disable fixed counters */
-       if (x86_pmu.version >= 2) {
-               intel_pmu_ack_status(intel_pmu_get_status());
-               wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-       }
-
-       /* Reset LBRs and LBR freezing */
-       if (x86_pmu.lbr_nr) {
-               update_debugctlmsr(get_debugctlmsr() &
-                       ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
-       }
-
-       local_irq_restore(flags);
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       int bit, loops;
-       u64 status;
-       int handled;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * There is no known reason not to always do the late ACK,
-        * but just in case keep it opt-in.
-        */
-       if (!x86_pmu.late_ack)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       __intel_pmu_disable_all();
-       handled = intel_pmu_drain_bts_buffer();
-       handled += intel_bts_interrupt();
-       status = intel_pmu_get_status();
-       if (!status)
-               goto done;
-
-       loops = 0;
-again:
-       intel_pmu_lbr_read();
-       intel_pmu_ack_status(status);
-       if (++loops > 100) {
-               static bool warned = false;
-               if (!warned) {
-                       WARN(1, "perfevents: irq loop stuck!\n");
-                       perf_event_print_debug();
-                       warned = true;
-               }
-               intel_pmu_reset();
-               goto done;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-
-
-       /*
-        * Ignore a range of extra bits in status that do not indicate
-        * overflow by themselves.
-        */
-       status &= ~(GLOBAL_STATUS_COND_CHG |
-                   GLOBAL_STATUS_ASIF |
-                   GLOBAL_STATUS_LBRS_FROZEN);
-       if (!status)
-               goto done;
-
-       /*
-        * PEBS overflow sets bit 62 in the global status register
-        */
-       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
-               handled++;
-               x86_pmu.drain_pebs(regs);
-       }
-
-       /*
-        * Intel PT
-        */
-       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
-               handled++;
-               intel_pt_interrupt();
-       }
-
-       /*
-        * Checkpointed counters can lead to 'spurious' PMIs because the
-        * rollback caused by the PMI will have cleared the overflow status
-        * bit. Therefore always force probe these counters.
-        */
-       status |= cpuc->intel_cp_status;
-
-       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_event *event = cpuc->events[bit];
-
-               handled++;
-
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (has_branch_stack(event))
-                       data.br_stack = &cpuc->lbr_stack;
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = intel_pmu_get_status();
-       if (status)
-               goto again;
-
-done:
-       __intel_pmu_enable_all(0, true);
-       /*
-        * Only unmask the NMI after the overflow counters
-        * have been reset. This avoids spurious NMIs on
-        * Haswell CPUs.
-        */
-       if (x86_pmu.late_ack)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       return handled;
-}
-
-static struct event_constraint *
-intel_bts_constraints(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned int hw_event, bts_event;
-
-       if (event->attr.freq)
-               return NULL;
-
-       hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
-       bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-
-       if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
-               return &bts_constraint;
-
-       return NULL;
-}
-
-static int intel_alt_er(int idx, u64 config)
-{
-       int alt_idx = idx;
-
-       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
-               return idx;
-
-       if (idx == EXTRA_REG_RSP_0)
-               alt_idx = EXTRA_REG_RSP_1;
-
-       if (idx == EXTRA_REG_RSP_1)
-               alt_idx = EXTRA_REG_RSP_0;
-
-       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
-               return idx;
-
-       return alt_idx;
-}
-
-static void intel_fixup_er(struct perf_event *event, int idx)
-{
-       event->hw.extra_reg.idx = idx;
-
-       if (idx == EXTRA_REG_RSP_0) {
-               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
-               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
-       } else if (idx == EXTRA_REG_RSP_1) {
-               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
-               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
-       }
-}
-
-/*
- * manage allocation of shared extra msr for certain events
- *
- * sharing can be:
- * per-cpu: to be shared between the various events on a single PMU
- * per-core: per-cpu + shared by HT threads
- */
-static struct event_constraint *
-__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-                                  struct perf_event *event,
-                                  struct hw_perf_event_extra *reg)
-{
-       struct event_constraint *c = &emptyconstraint;
-       struct er_account *era;
-       unsigned long flags;
-       int idx = reg->idx;
-
-       /*
-        * reg->alloc can be set due to existing state, so for fake cpuc we
-        * need to ignore this, otherwise we might fail to allocate proper fake
-        * state for this extra reg constraint. Also see the comment below.
-        */
-       if (reg->alloc && !cpuc->is_fake)
-               return NULL; /* call x86_get_event_constraint() */
-
-again:
-       era = &cpuc->shared_regs->regs[idx];
-       /*
-        * we use raw_spin_lock_irqsave() to avoid lockdep issues when
-        * passing a fake cpuc
-        */
-       raw_spin_lock_irqsave(&era->lock, flags);
-
-       if (!atomic_read(&era->ref) || era->config == reg->config) {
-
-               /*
-                * If it's a fake cpuc -- as per validate_{group,event}() we
-                * shouldn't touch event state, and we can avoid doing so
-                * since both will only call get_event_constraints() once
-                * on each event; this avoids the need for reg->alloc.
-                *
-                * Not doing the ER fixup will only result in era->reg being
-                * wrong, but since we won't actually try and program hardware
-                * this isn't a problem either.
-                */
-               if (!cpuc->is_fake) {
-                       if (idx != reg->idx)
-                               intel_fixup_er(event, idx);
-
-                       /*
-                        * x86_schedule_events() can call get_event_constraints()
-                        * multiple times on events in the case of incremental
-                        * scheduling. reg->alloc ensures we only do the ER
-                        * allocation once.
-                        */
-                       reg->alloc = 1;
-               }
-
-               /* lock in msr value */
-               era->config = reg->config;
-               era->reg = reg->reg;
-
-               /* one more user */
-               atomic_inc(&era->ref);
-
-               /*
-                * need to call x86_get_event_constraint()
-                * to check if the associated event has constraints
-                */
-               c = NULL;
-       } else {
-               idx = intel_alt_er(idx, reg->config);
-               if (idx != reg->idx) {
-                       raw_spin_unlock_irqrestore(&era->lock, flags);
-                       goto again;
-               }
-       }
-       raw_spin_unlock_irqrestore(&era->lock, flags);
-
-       return c;
-}
-
-static void
-__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
-                                  struct hw_perf_event_extra *reg)
-{
-       struct er_account *era;
-
-       /*
-        * Only put constraint if extra reg was actually allocated. Also takes
-        * care of events which do not use an extra shared reg.
-        *
-        * Also, if this is a fake cpuc we shouldn't touch any event state
-        * (reg->alloc) and we don't care about leaving inconsistent cpuc state
-        * either since it'll be thrown out.
-        */
-       if (!reg->alloc || cpuc->is_fake)
-               return;
-
-       era = &cpuc->shared_regs->regs[reg->idx];
-
-       /* one fewer user */
-       atomic_dec(&era->ref);
-
-       /* allocate again next time */
-       reg->alloc = 0;
-}
-
-static struct event_constraint *
-intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
-                             struct perf_event *event)
-{
-       struct event_constraint *c = NULL, *d;
-       struct hw_perf_event_extra *xreg, *breg;
-
-       xreg = &event->hw.extra_reg;
-       if (xreg->idx != EXTRA_REG_NONE) {
-               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
-               if (c == &emptyconstraint)
-                       return c;
-       }
-       breg = &event->hw.branch_reg;
-       if (breg->idx != EXTRA_REG_NONE) {
-               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
-               if (d == &emptyconstraint) {
-                       __intel_shared_reg_put_constraints(cpuc, xreg);
-                       c = d;
-               }
-       }
-       return c;
-}
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       if (x86_pmu.event_constraints) {
-               for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if ((event->hw.config & c->cmask) == c->code) {
-                               event->hw.flags |= c->flags;
-                               return c;
-                       }
-               }
-       }
-
-       return &unconstrained;
-}
-
-static struct event_constraint *
-__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                           struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       c = intel_bts_constraints(event);
-       if (c)
-               return c;
-
-       c = intel_shared_regs_constraints(cpuc, event);
-       if (c)
-               return c;
-
-       c = intel_pebs_constraints(event);
-       if (c)
-               return c;
-
-       return x86_get_event_constraints(cpuc, idx, event);
-}
-
-static void
-intel_start_scheduling(struct cpu_hw_events *cpuc)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       xl->sched_started = true;
-       /*
-        * lock shared state until we are done scheduling
-        * in intel_stop_scheduling(); this makes
-        * scheduling appear as a transaction
-        */
-       raw_spin_lock(&excl_cntrs->lock);
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct event_constraint *c = cpuc->event_constraint[idx];
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       lockdep_assert_held(&excl_cntrs->lock);
-
-       if (c->flags & PERF_X86_EVENT_EXCL)
-               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
-       else
-               xl->state[cntr] = INTEL_EXCL_SHARED;
-}
-
-static void
-intel_stop_scheduling(struct cpu_hw_events *cpuc)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       xl->sched_started = false;
-       /*
-        * release shared state lock (acquired in intel_start_scheduling())
-        */
-       raw_spin_unlock(&excl_cntrs->lock);
-}
-
-static struct event_constraint *
-intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
-                          int idx, struct event_constraint *c)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xlo;
-       int tid = cpuc->excl_thread_id;
-       int is_excl, i;
-
-       /*
-        * validating a group does not require
-        * enforcing cross-thread exclusion
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return c;
-
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return c;
-
-       /*
-        * because we modify the constraint, we need
-        * to make a copy. Static constraints come
-        * from static const tables.
-        *
-        * only needed when constraint has not yet
-        * been cloned (marked dynamic)
-        */
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-               struct event_constraint *cx;
-
-               /*
-                * grab pre-allocated constraint entry
-                */
-               cx = &cpuc->constraint_list[idx];
-
-               /*
-                * initialize dynamic constraint
-                * with static constraint
-                */
-               *cx = *c;
-
-               /*
-                * mark constraint as dynamic, so we
-                * can free it later on
-                */
-               cx->flags |= PERF_X86_EVENT_DYNAMIC;
-               c = cx;
-       }
-
-       /*
-        * From here on, the constraint is dynamic.
-        * Either it was just allocated above, or it
-        * was allocated during an earlier invocation
-        * of this function
-        */
-
-       /*
-        * state of sibling HT
-        */
-       xlo = &excl_cntrs->states[tid ^ 1];
-
-       /*
-        * event requires exclusive counter access
-        * across HT threads
-        */
-       is_excl = c->flags & PERF_X86_EVENT_EXCL;
-       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
-               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
-               if (!cpuc->n_excl++)
-                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
-       }
-
-       /*
-        * Modify static constraint with current dynamic
-        * state of thread
-        *
-        * EXCLUSIVE: sibling counter measuring exclusive event
-        * SHARED   : sibling counter measuring non-exclusive event
-        * UNUSED   : sibling counter unused
-        */
-       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
-               /*
-                * exclusive event in sibling counter:
-                * our corresponding counter cannot be used,
-                * regardless of our event
-                */
-               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
-                       __clear_bit(i, c->idxmsk);
-               /*
-                * if we are measuring an exclusive event and the
-                * sibling is measuring a non-exclusive one, then
-                * the counter cannot be used
-                */
-               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
-                       __clear_bit(i, c->idxmsk);
-       }
-
-       /*
-        * recompute actual bit weight for scheduling algorithm
-        */
-       c->weight = hweight64(c->idxmsk64);
-
-       /*
-        * if we return an empty mask, then switch
-        * back to static empty constraint to avoid
-        * the cost of freeing later on
-        */
-       if (c->weight == 0)
-               c = &emptyconstraint;
-
-       return c;
-}
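
To make the EXCLUSIVE/SHARED/UNUSED filtering above concrete, here is a minimal standalone sketch (not kernel code; only the two masking rules are taken from intel_get_excl_constraints() above, while the state names, array size and sample values are illustrative) that applies the same rules to a toy sibling-state array:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's INTEL_EXCL_* states. */
enum excl_state { UNUSED, SHARED, EXCLUSIVE };

int main(void)
{
	/* Hypothetical sibling-thread counter states and constraint mask. */
	enum excl_state sibling[4] = { EXCLUSIVE, SHARED, UNUSED, SHARED };
	unsigned int idxmsk = 0xf;	/* constraint initially allows counters 0-3 */
	int is_excl = 1;		/* our event requests exclusive access */
	int i;

	for (i = 0; i < 4; i++) {
		/* sibling counter runs an exclusive event: never usable by us */
		if (sibling[i] == EXCLUSIVE)
			idxmsk &= ~(1u << i);
		/* we are exclusive but the sibling counter is shared: not usable */
		if (is_excl && sibling[i] == SHARED)
			idxmsk &= ~(1u << i);
	}

	printf("usable counter mask: 0x%x\n", idxmsk);	/* prints 0x4 */
	return 0;
}

With a sibling state of {EXCLUSIVE, SHARED, UNUSED, SHARED} and an exclusive event, only counter 2 survives, which is exactly the weight recomputation the function above performs before falling back to the empty constraint.
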
-
-static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                           struct perf_event *event)
-{
-       struct event_constraint *c1 = NULL;
-       struct event_constraint *c2;
-
-       if (idx >= 0) /* fake does < 0 */
-               c1 = cpuc->event_constraint[idx];
-
-       /*
-        * first time only
-        * - static constraint: no change across incremental scheduling calls
-        * - dynamic constraint: handled by intel_get_excl_constraints()
-        */
-       c2 = __intel_get_event_constraints(cpuc, idx, event);
-       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
-               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
-               c1->weight = c2->weight;
-               c2 = c1;
-       }
-
-       if (cpuc->excl_cntrs)
-               return intel_get_excl_constraints(cpuc, event, idx, c2);
-
-       return c2;
-}
-
-static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
-               struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       int tid = cpuc->excl_thread_id;
-       struct intel_excl_states *xl;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake)
-               return;
-
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
-               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
-               if (!--cpuc->n_excl)
-                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
-       }
-
-       /*
-        * If event was actually assigned, then mark the counter state as
-        * unused now.
-        */
-       if (hwc->idx >= 0) {
-               xl = &excl_cntrs->states[tid];
-
-               /*
-                * put_constraint may be called from x86_schedule_events()
-                * which already has the lock held, so make locking
-                * conditional here.
-                */
-               if (!xl->sched_started)
-                       raw_spin_lock(&excl_cntrs->lock);
-
-               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
-
-               if (!xl->sched_started)
-                       raw_spin_unlock(&excl_cntrs->lock);
-       }
-}
-
-static void
-intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
-                                       struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-
-       reg = &event->hw.extra_reg;
-       if (reg->idx != EXTRA_REG_NONE)
-               __intel_shared_reg_put_constraints(cpuc, reg);
-
-       reg = &event->hw.branch_reg;
-       if (reg->idx != EXTRA_REG_NONE)
-               __intel_shared_reg_put_constraints(cpuc, reg);
-}
-
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
-                                       struct perf_event *event)
-{
-       intel_put_shared_regs_event_constraints(cpuc, event);
-
-       /*
-        * if the PMU has exclusive counter restrictions, then
-        * all events are subject to and must call the
-        * put_excl_constraints() routine
-        */
-       if (cpuc->excl_cntrs)
-               intel_put_excl_constraints(cpuc, event);
-}
-
-static void intel_pebs_aliases_core2(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use INST_RETIRED.ANY_P
-                * (0x00c0), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * INST_RETIRED.ANY_P counts the number of cycles that retire
-                * CNTMASK instructions. By setting CNTMASK to a value (16)
-                * larger than the maximum number of instructions that can be
-                * retired per cycle (4) and then inverting the condition, we
-                * count all cycles that retire 16 or fewer instructions, which
-                * is every cycle.
-                *
-                * Thereby we gain a PEBS capable cycle counter.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
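
As a worked example of the alternative encoding described above, and assuming the config bit layout shown by the format attributes later in this file (event in config:0-7, umask in config:8-15, inv in config:23, cmask in config:24-31), here is a minimal standalone sketch (not kernel code) that packs .event=0xc0, .inv=1, .cmask=16 into a raw config value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/*
	 * Bit positions follow the format attributes defined below:
	 * event=config:0-7, inv=config:23, cmask=config:24-31.
	 */
	uint64_t event = 0xc0, inv = 1, cmask = 16;
	uint64_t config = event | (inv << 23) | (cmask << 24);

	/* prints 0x108000c0 */
	printf("PEBS cycle alias raw config: 0x%llx\n",
	       (unsigned long long)config);
	return 0;
}

The same packing, with the umask field added in config:8-15, applies to the SNB (0x01c2) and PREC_DIST (0x01c0) aliases below.
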
-
-static void intel_pebs_aliases_snb(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use UOPS_RETIRED.ALL
-                * (0x01c2), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * UOPS_RETIRED.ALL counts the number of cycles that retire
-                * CNTMASK micro-ops. By setting CNTMASK to a value (16)
-                * larger than the maximum number of micro-ops that can be
-                * retired per cycle (4) and then inverting the condition, we
-                * count all cycles that retire 16 or fewer micro-ops, which
-                * is every cycle.
-                *
-                * Thereby we gain a PEBS capable cycle counter.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_precdist(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use INST_RETIRED.PREC_DIST
-                * (0x01c0), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * The PREC_DIST event has special support to minimize sample
-                * shadowing effects. One drawback is that it can only be
-                * programmed on counter 1, but that seems like an
-                * acceptable trade-off.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_ivb(struct perf_event *event)
-{
-       if (event->attr.precise_ip < 3)
-               return intel_pebs_aliases_snb(event);
-       return intel_pebs_aliases_precdist(event);
-}
-
-static void intel_pebs_aliases_skl(struct perf_event *event)
-{
-       if (event->attr.precise_ip < 3)
-               return intel_pebs_aliases_core2(event);
-       return intel_pebs_aliases_precdist(event);
-}
-
-static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
-{
-       unsigned long flags = x86_pmu.free_running_flags;
-
-       if (event->attr.use_clockid)
-               flags &= ~PERF_SAMPLE_TIME;
-       return flags;
-}
-
-static int intel_pmu_hw_config(struct perf_event *event)
-{
-       int ret = x86_pmu_hw_config(event);
-
-       if (ret)
-               return ret;
-
-       if (event->attr.precise_ip) {
-               if (!event->attr.freq) {
-                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
-                       if (!(event->attr.sample_type &
-                             ~intel_pmu_free_running_flags(event)))
-                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
-               }
-               if (x86_pmu.pebs_aliases)
-                       x86_pmu.pebs_aliases(event);
-       }
-
-       if (needs_branch_stack(event)) {
-               ret = intel_pmu_setup_lbr_filter(event);
-               if (ret)
-                       return ret;
-
-               /*
-                * BTS is set up earlier in this path, so don't account twice
-                */
-               if (!intel_pmu_has_bts(event)) {
-                       /* disallow lbr if conflicting events are present */
-                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
-                               return -EBUSY;
-
-                       event->destroy = hw_perf_lbr_event_destroy;
-               }
-       }
-
-       if (event->attr.type != PERF_TYPE_RAW)
-               return 0;
-
-       if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
-               return 0;
-
-       if (x86_pmu.version < 3)
-               return -EINVAL;
-
-       if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
-
-       return 0;
-}
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-       if (x86_pmu.guest_get_msrs)
-               return x86_pmu.guest_get_msrs(nr);
-       *nr = 0;
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-
-       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
-       arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
-       arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-       /*
-        * If a PMU counter has PEBS enabled, it is not enough to disable the
-        * counter on guest entry, since a PEBS memory write can overshoot
-        * guest entry and corrupt guest memory. Disabling PEBS solves the problem.
-        */
-       arr[1].msr = MSR_IA32_PEBS_ENABLE;
-       arr[1].host = cpuc->pebs_enabled;
-       arr[1].guest = 0;
-
-       *nr = 2;
-       return arr;
-}
-
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
-               struct perf_event *event = cpuc->events[idx];
-
-               arr[idx].msr = x86_pmu_config_addr(idx);
-               arr[idx].host = arr[idx].guest = 0;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               arr[idx].host = arr[idx].guest =
-                       event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
-
-               if (event->attr.exclude_host)
-                       arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-               else if (event->attr.exclude_guest)
-                       arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-       }
-
-       *nr = x86_pmu.num_counters;
-       return arr;
-}
-
-static void core_pmu_enable_event(struct perf_event *event)
-{
-       if (!event->attr.exclude_host)
-               x86_pmu_enable_event(event);
-}
-
-static void core_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
-               if (!test_bit(idx, cpuc->active_mask) ||
-                               cpuc->events[idx]->attr.exclude_host)
-                       continue;
-
-               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-       }
-}
-
-static int hsw_hw_config(struct perf_event *event)
-{
-       int ret = intel_pmu_hw_config(event);
-
-       if (ret)
-               return ret;
-       if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
-               return 0;
-       event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
-
-       /*
-        * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
-        * PEBS or in ANY thread mode. Since the results are nonsensical, forbid
-        * this combination.
-        */
-       if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
-            ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
-             event->attr.precise_ip > 0))
-               return -EOPNOTSUPP;
-
-       if (event_is_checkpointed(event)) {
-               /*
-                * Sampling of checkpointed events can cause situations where
-                * the CPU constantly aborts because of an overflow, which is
-                * then checkpointed back and ignored. Forbid checkpointing
-                * for sampling.
-                *
-                * But still allow a long sampling period, so that perf stat
-                * from KVM works.
-                */
-               if (event->attr.sample_period > 0 &&
-                   event->attr.sample_period < 0x7fffffff)
-                       return -EOPNOTSUPP;
-       }
-       return 0;
-}
-
-static struct event_constraint counter2_constraint =
-                       EVENT_CONSTRAINT(0, 0x4, 0);
-
-static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       c = intel_get_event_constraints(cpuc, idx, event);
-
-       /* Handle special quirk on in_tx_checkpointed only in counter 2 */
-       if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
-               if (c->idxmsk64 & (1U << 2))
-                       return &counter2_constraint;
-               return &emptyconstraint;
-       }
-
-       return c;
-}
-
-/*
- * Broadwell:
- *
- * The INST_RETIRED.ALL period always needs to have the lowest 6 bits cleared
- * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
- * the two to enforce a minimum period of 128 (the smallest value that has bits
- * 0-5 cleared and >= 100).
- *
- * Because of how the code in x86_perf_event_set_period() works, the truncation
- * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
- * to make up for the 'lost' events due to carrying the 'error' in period_left.
- *
- * Therefore the effective (average) period matches the requested period,
- * despite coarser hardware granularity.
- */
-static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
-{
-       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
-                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
-               if (left < 128)
-                       left = 128;
-               left &= ~0x3fu;
-       }
-       return left;
-}
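
A quick worked example of the clamping above; the helper below is an illustrative standalone copy of the arithmetic only (the kernel applies it solely to the INST_RETIRED.ALL encoding), not the kernel function itself:

#include <stdio.h>

/*
 * Illustrative copy of the clamping rule described above: enforce a
 * minimum of 128 and clear the low six bits (BDM55 + BDM11).
 */
static unsigned int bdw_period(unsigned int left)
{
	if (left < 128)
		left = 128;
	return left & ~0x3fu;
}

int main(void)
{
	printf("100 -> %u\n", bdw_period(100));	/* 100 -> 128 */
	printf("200 -> %u\n", bdw_period(200));	/* 200 -> 192 */
	printf("128 -> %u\n", bdw_period(128));	/* 128 -> 128 */
	return 0;
}

A requested period of 100 becomes 128, and 200 becomes 192; both results have bits 0-5 clear and stay at or above the 100-cycle floor, so the average period still matches the request as the comment explains.
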
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(pc,    "config:19"     );
-PMU_FORMAT_ATTR(any,   "config:21"     ); /* v3 + */
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-PMU_FORMAT_ATTR(in_tx,  "config:32");
-PMU_FORMAT_ATTR(in_tx_cp, "config:33");
-
-static struct attribute *intel_arch_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-ssize_t intel_event_sysfs_show(char *page, u64 config)
-{
-       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
-
-       return x86_event_sysfs_show(page, config, event);
-}
-
-struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-       struct intel_shared_regs *regs;
-       int i;
-
-       regs = kzalloc_node(sizeof(struct intel_shared_regs),
-                           GFP_KERNEL, cpu_to_node(cpu));
-       if (regs) {
-               /*
-                * initialize the locks to keep lockdep happy
-                */
-               for (i = 0; i < EXTRA_REG_MAX; i++)
-                       raw_spin_lock_init(&regs->regs[i].lock);
-
-               regs->core_id = -1;
-       }
-       return regs;
-}
-
-static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
-{
-       struct intel_excl_cntrs *c;
-
-       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
-                        GFP_KERNEL, cpu_to_node(cpu));
-       if (c) {
-               raw_spin_lock_init(&c->lock);
-               c->core_id = -1;
-       }
-       return c;
-}
-
-static int intel_pmu_cpu_prepare(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
-               cpuc->shared_regs = allocate_shared_regs(cpu);
-               if (!cpuc->shared_regs)
-                       goto err;
-       }
-
-       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
-
-               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
-               if (!cpuc->constraint_list)
-                       goto err_shared_regs;
-
-               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
-               if (!cpuc->excl_cntrs)
-                       goto err_constraint_list;
-
-               cpuc->excl_thread_id = 0;
-       }
-
-       return NOTIFY_OK;
-
-err_constraint_list:
-       kfree(cpuc->constraint_list);
-       cpuc->constraint_list = NULL;
-
-err_shared_regs:
-       kfree(cpuc->shared_regs);
-       cpuc->shared_regs = NULL;
-
-err:
-       return NOTIFY_BAD;
-}
-
-static void intel_pmu_cpu_starting(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       int core_id = topology_core_id(cpu);
-       int i;
-
-       init_debug_store_on_cpu(cpu);
-       /*
-        * Deal with CPUs that don't clear their LBRs on power-up.
-        */
-       intel_pmu_lbr_reset();
-
-       cpuc->lbr_sel = NULL;
-
-       if (!cpuc->shared_regs)
-               return;
-
-       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
-               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
-                       struct intel_shared_regs *pc;
-
-                       pc = per_cpu(cpu_hw_events, i).shared_regs;
-                       if (pc && pc->core_id == core_id) {
-                               cpuc->kfree_on_online[0] = cpuc->shared_regs;
-                               cpuc->shared_regs = pc;
-                               break;
-                       }
-               }
-               cpuc->shared_regs->core_id = core_id;
-               cpuc->shared_regs->refcnt++;
-       }
-
-       if (x86_pmu.lbr_sel_map)
-               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
-
-       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
-                       struct intel_excl_cntrs *c;
-
-                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
-                       if (c && c->core_id == core_id) {
-                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
-                               cpuc->excl_cntrs = c;
-                               cpuc->excl_thread_id = 1;
-                               break;
-                       }
-               }
-               cpuc->excl_cntrs->core_id = core_id;
-               cpuc->excl_cntrs->refcnt++;
-       }
-}
-
-static void free_excl_cntrs(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       struct intel_excl_cntrs *c;
-
-       c = cpuc->excl_cntrs;
-       if (c) {
-               if (c->core_id == -1 || --c->refcnt == 0)
-                       kfree(c);
-               cpuc->excl_cntrs = NULL;
-               kfree(cpuc->constraint_list);
-               cpuc->constraint_list = NULL;
-       }
-}
-
-static void intel_pmu_cpu_dying(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       struct intel_shared_regs *pc;
-
-       pc = cpuc->shared_regs;
-       if (pc) {
-               if (pc->core_id == -1 || --pc->refcnt == 0)
-                       kfree(pc);
-               cpuc->shared_regs = NULL;
-       }
-
-       free_excl_cntrs(cpu);
-
-       fini_debug_store_on_cpu(cpu);
-}
-
-static void intel_pmu_sched_task(struct perf_event_context *ctx,
-                                bool sched_in)
-{
-       if (x86_pmu.pebs_active)
-               intel_pmu_pebs_sched_task(ctx, sched_in);
-       if (x86_pmu.lbr_nr)
-               intel_pmu_lbr_sched_task(ctx, sched_in);
-}
-
-PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
-
-PMU_FORMAT_ATTR(ldlat, "config1:0-15");
-
-PMU_FORMAT_ATTR(frontend, "config1:0-23");
-
-static struct attribute *intel_arch3_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_any.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       &format_attr_in_tx.attr,
-       &format_attr_in_tx_cp.attr,
-
-       &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
-       &format_attr_ldlat.attr, /* PEBS load latency */
-       NULL,
-};
-
-static struct attribute *skl_format_attr[] = {
-       &format_attr_frontend.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu core_pmu = {
-       .name                   = "core",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = x86_pmu_disable_all,
-       .enable_all             = core_pmu_enable_all,
-       .enable                 = core_pmu_enable_event,
-       .disable                = x86_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
-
-       /*
-        * Intel PMCs cannot be accessed sanely above 32-bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic event period:
-        */
-       .max_period             = (1ULL<<31) - 1,
-       .get_event_constraints  = intel_get_event_constraints,
-       .put_event_constraints  = intel_put_event_constraints,
-       .event_constraints      = intel_core_event_constraints,
-       .guest_get_msrs         = core_guest_get_msrs,
-       .format_attrs           = intel_arch_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-       /*
-        * A virtual (or funny metal) CPU can define x86_pmu.extra_regs
-        * together with PMU version 1 and thus be using core_pmu with
-        * shared_regs. We need the following callbacks here to allocate
-        * it properly.
-        */
-       .cpu_prepare            = intel_pmu_cpu_prepare,
-       .cpu_starting           = intel_pmu_cpu_starting,
-       .cpu_dying              = intel_pmu_cpu_dying,
-};
-
-static __initconst const struct x86_pmu intel_pmu = {
-       .name                   = "Intel",
-       .handle_irq             = intel_pmu_handle_irq,
-       .disable_all            = intel_pmu_disable_all,
-       .enable_all             = intel_pmu_enable_all,
-       .enable                 = intel_pmu_enable_event,
-       .disable                = intel_pmu_disable_event,
-       .hw_config              = intel_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
-       /*
-        * Intel PMCs cannot be accessed sanely above 32 bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic event period:
-        */
-       .max_period             = (1ULL << 31) - 1,
-       .get_event_constraints  = intel_get_event_constraints,
-       .put_event_constraints  = intel_put_event_constraints,
-       .pebs_aliases           = intel_pebs_aliases_core2,
-
-       .format_attrs           = intel_arch3_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-       .cpu_prepare            = intel_pmu_cpu_prepare,
-       .cpu_starting           = intel_pmu_cpu_starting,
-       .cpu_dying              = intel_pmu_cpu_dying,
-       .guest_get_msrs         = intel_guest_get_msrs,
-       .sched_task             = intel_pmu_sched_task,
-};
-
-static __init void intel_clovertown_quirk(void)
-{
-       /*
-        * PEBS is unreliable due to:
-        *
-        *   AJ67  - PEBS may experience CPL leaks
-        *   AJ68  - PEBS PMI may be delayed by one event
-        *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
-        *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
-        *
-        * AJ67 could be worked around by restricting the OS/USR flags.
-        * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
-        *
-        * AJ106 could possibly be worked around by not allowing LBR
-        *       usage from PEBS, including the fixup.
-        * AJ68  could possibly be worked around by always programming
-        *       a pebs_event_reset[0] value and coping with the lost events.
-        *
-        * But taken together it might just make sense to not enable PEBS on
-        * these chips.
-        */
-       pr_warn("PEBS disabled due to CPU errata\n");
-       x86_pmu.pebs = 0;
-       x86_pmu.pebs_constraints = NULL;
-}
-
-static int intel_snb_pebs_broken(int cpu)
-{
-       u32 rev = UINT_MAX; /* default to broken for unknown models */
-
-       switch (cpu_data(cpu).x86_model) {
-       case 42: /* SNB */
-               rev = 0x28;
-               break;
-
-       case 45: /* SNB-EP */
-               switch (cpu_data(cpu).x86_mask) {
-               case 6: rev = 0x618; break;
-               case 7: rev = 0x70c; break;
-               }
-       }
-
-       return (cpu_data(cpu).microcode < rev);
-}
-
-static void intel_snb_check_microcode(void)
-{
-       int pebs_broken = 0;
-       int cpu;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu) {
-               if ((pebs_broken = intel_snb_pebs_broken(cpu)))
-                       break;
-       }
-       put_online_cpus();
-
-       if (pebs_broken == x86_pmu.pebs_broken)
-               return;
-
-       /*
-        * Serialized by the microcode lock.
-        */
-       if (x86_pmu.pebs_broken) {
-               pr_info("PEBS enabled due to microcode update\n");
-               x86_pmu.pebs_broken = 0;
-       } else {
-               pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
-               x86_pmu.pebs_broken = 1;
-       }
-}
-
-/*
- * Under certain circumstances, accessing certain MSRs may cause #GP.
- * This function tests whether the input MSR can be safely accessed.
- */
-static bool check_msr(unsigned long msr, u64 mask)
-{
-       u64 val_old, val_new, val_tmp;
-
-       /*
-        * Read the current value, change it and read it back to see if it
-        * matches, this is needed to detect certain hardware emulators
-        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
-        */
-       if (rdmsrl_safe(msr, &val_old))
-               return false;
-
-       /*
-        * Only change the bits which can be updated by wrmsrl.
-        */
-       val_tmp = val_old ^ mask;
-       if (wrmsrl_safe(msr, val_tmp) ||
-           rdmsrl_safe(msr, &val_new))
-               return false;
-
-       if (val_new != val_tmp)
-               return false;
-
-       /* At this point it is certain that the MSR can be safely accessed.
-        * Restore the old value and return.
-        */
-       wrmsrl(msr, val_old);
-
-       return true;
-}
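
The read/flip/read-back pattern used by check_msr() can be shown with a small standalone sketch; rd() and wr() below are hypothetical stand-ins for rdmsrl_safe()/wrmsrl_safe(), used only to demonstrate the verification logic, not to access real MSRs:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical register backing store standing in for a real MSR. */
static uint64_t fake_reg;

static int rd(uint64_t *val) { *val = fake_reg; return 0; }
static int wr(uint64_t val)  { fake_reg = val;  return 0; }

/*
 * Same idea as check_msr(): flip the bits covered by 'mask', read the
 * value back and compare, then restore the original contents.
 */
static int reg_is_writable(uint64_t mask)
{
	uint64_t old, tmp, cur;

	if (rd(&old))
		return 0;
	tmp = old ^ mask;
	if (wr(tmp) || rd(&cur))
		return 0;
	if (cur != tmp)
		return 0;	/* emulated or stuck register */
	wr(old);		/* restore the original value */
	return 1;
}

int main(void)
{
	fake_reg = 0x5;
	printf("writable: %d\n", reg_is_writable(0xff));	/* prints 1 */
	return 0;
}

An emulated register that ignores writes and always reads back zero would fail the cur != tmp comparison, which is exactly the case the kernel comment calls out for qemu/kvm.
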
-
-static __init void intel_sandybridge_quirk(void)
-{
-       x86_pmu.check_microcode = intel_snb_check_microcode;
-       intel_snb_check_microcode();
-}
-
-static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
-       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
-       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
-       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
-       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
-       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
-       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
-       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
-};
-
-static __init void intel_arch_events_quirk(void)
-{
-       int bit;
-
-       /* disable events that CPUID reports as not present */
-       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
-               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
-               pr_warn("CPUID marked event: \'%s\' unavailable\n",
-                       intel_arch_events_map[bit].name);
-       }
-}
-
-static __init void intel_nehalem_quirk(void)
-{
-       union cpuid10_ebx ebx;
-
-       ebx.full = x86_pmu.events_maskl;
-       if (ebx.split.no_branch_misses_retired) {
-               /*
-                * Erratum AAJ80 detected; we work around it by using
-                * the BR_MISP_EXEC.ANY event. This will over-count
-                * branch-misses, but it's still much better than the
-                * architectural event which is often completely bogus:
-                */
-               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
-               ebx.split.no_branch_misses_retired = 0;
-               x86_pmu.events_maskl = ebx.full;
-               pr_info("CPU erratum AAJ80 worked around\n");
-       }
-}
-
-/*
- * enable software workaround for errata:
- * SNB: BJ122
- * IVB: BV98
- * HSW: HSD29
- *
- * Only needed when HT is enabled. However, detecting whether HT is
- * enabled is difficult (model specific). So instead, we enable the
- * workaround during early boot and verify whether it is needed in a
- * later initcall phase, once we have valid topology information to
- * check if HT is actually enabled.
- */
-static __init void intel_ht_bug(void)
-{
-       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
-
-       x86_pmu.start_scheduling = intel_start_scheduling;
-       x86_pmu.commit_scheduling = intel_commit_scheduling;
-       x86_pmu.stop_scheduling = intel_stop_scheduling;
-}
-
-EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
-
-/* Haswell special events */
-EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
-EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
-EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
-EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
-EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
-EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
-EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
-EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
-EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
-EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
-EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
-EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
-
-static struct attribute *hsw_events_attrs[] = {
-       EVENT_PTR(tx_start),
-       EVENT_PTR(tx_commit),
-       EVENT_PTR(tx_abort),
-       EVENT_PTR(tx_capacity),
-       EVENT_PTR(tx_conflict),
-       EVENT_PTR(el_start),
-       EVENT_PTR(el_commit),
-       EVENT_PTR(el_abort),
-       EVENT_PTR(el_capacity),
-       EVENT_PTR(el_conflict),
-       EVENT_PTR(cycles_t),
-       EVENT_PTR(cycles_ct),
-       EVENT_PTR(mem_ld_hsw),
-       EVENT_PTR(mem_st_hsw),
-       NULL
-};
-
-__init int intel_pmu_init(void)
-{
-       union cpuid10_edx edx;
-       union cpuid10_eax eax;
-       union cpuid10_ebx ebx;
-       struct event_constraint *c;
-       unsigned int unused;
-       struct extra_reg *er;
-       int version, i;
-
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-               switch (boot_cpu_data.x86) {
-               case 0x6:
-                       return p6_pmu_init();
-               case 0xb:
-                       return knc_pmu_init();
-               case 0xf:
-                       return p4_pmu_init();
-               }
-               return -ENODEV;
-       }
-
-       /*
-        * Check whether the Architectural PerfMon supports
-        * Check whether the Architectural PerfMon supports
-        * the Branch Misses Retired hw_event.
-       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
-       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
-               return -ENODEV;
-
-       version = eax.split.version_id;
-       if (version < 2)
-               x86_pmu = core_pmu;
-       else
-               x86_pmu = intel_pmu;
-
-       x86_pmu.version                 = version;
-       x86_pmu.num_counters            = eax.split.num_counters;
-       x86_pmu.cntval_bits             = eax.split.bit_width;
-       x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
-
-       x86_pmu.events_maskl            = ebx.full;
-       x86_pmu.events_mask_len         = eax.split.mask_length;
-
-       x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
-
-       /*
-        * Quirk: v2 perfmon does not report fixed-purpose events, so
-        * assume at least 3 events:
-        */
-       if (version > 1)
-               x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-       if (boot_cpu_has(X86_FEATURE_PDCM)) {
-               u64 capabilities;
-
-               rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-               x86_pmu.intel_cap.capabilities = capabilities;
-       }
-
-       intel_ds_init();
-
-       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
-
-       /*
-        * Install the hw-cache-events table:
-        */
-       switch (boot_cpu_data.x86_model) {
-       case 14: /* 65nm Core "Yonah" */
-               pr_cont("Core events, ");
-               break;
-
-       case 15: /* 65nm Core2 "Merom"          */
-               x86_add_quirk(intel_clovertown_quirk);
-       case 22: /* 65nm Core2 "Merom-L"        */
-       case 23: /* 45nm Core2 "Penryn"         */
-       case 29: /* 45nm Core2 "Dunnington (MP)" */
-               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               intel_pmu_lbr_init_core();
-
-               x86_pmu.event_constraints = intel_core2_event_constraints;
-               x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
-               pr_cont("Core2 events, ");
-               break;
-
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_nhm();
-
-               x86_pmu.event_constraints = intel_nehalem_event_constraints;
-               x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
-               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
-               x86_pmu.extra_regs = intel_nehalem_extra_regs;
-
-               x86_pmu.cpu_events = nhm_events_attrs;
-
-               /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
-               x86_add_quirk(intel_nehalem_quirk);
-
-               pr_cont("Nehalem events, ");
-               break;
-
-       case 28: /* 45nm Atom "Pineview"   */
-       case 38: /* 45nm Atom "Lincroft"   */
-       case 39: /* 32nm Atom "Penwell"    */
-       case 53: /* 32nm Atom "Cloverview" */
-       case 54: /* 32nm Atom "Cedarview"  */
-               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               intel_pmu_lbr_init_atom();
-
-               x86_pmu.event_constraints = intel_gen_event_constraints;
-               x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
-               pr_cont("Atom events, ");
-               break;
-
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 76: /* 14nm Atom "Airmont"                   */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-               memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
-                       sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_atom();
-
-               x86_pmu.event_constraints = intel_slm_event_constraints;
-               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_slm_extra_regs;
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               pr_cont("Silvermont events, ");
-               break;
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_nhm();
-
-               x86_pmu.event_constraints = intel_westmere_event_constraints;
-               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
-               x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_westmere_extra_regs;
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-
-               x86_pmu.cpu_events = nhm_events_attrs;
-
-               /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
-               pr_cont("Westmere events, ");
-               break;
-
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-               x86_add_quirk(intel_sandybridge_quirk);
-               x86_add_quirk(intel_ht_bug);
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_snb();
-
-               x86_pmu.event_constraints = intel_snb_event_constraints;
-               x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
-               if (boot_cpu_data.x86_model == 45)
-                       x86_pmu.extra_regs = intel_snbep_extra_regs;
-               else
-                       x86_pmu.extra_regs = intel_snb_extra_regs;
-
-
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.cpu_events = snb_events_attrs;
-
-               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
-
-               pr_cont("SandyBridge events, ");
-               break;
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-               x86_add_quirk(intel_ht_bug);
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               /* dTLB-load-misses on IVB is different than SNB */
-               hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
-
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_snb();
-
-               x86_pmu.event_constraints = intel_ivb_event_constraints;
-               x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               if (boot_cpu_data.x86_model == 62)
-                       x86_pmu.extra_regs = intel_snbep_extra_regs;
-               else
-                       x86_pmu.extra_regs = intel_snb_extra_regs;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.cpu_events = snb_events_attrs;
-
-               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-
-               pr_cont("IvyBridge events, ");
-               break;
-
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-               x86_add_quirk(intel_ht_bug);
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_hsw();
-
-               x86_pmu.event_constraints = intel_hsw_event_constraints;
-               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_snbep_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.cpu_events = hsw_events_attrs;
-               x86_pmu.lbr_double_abort = true;
-               pr_cont("Haswell events, ");
-               break;
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
-               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
-                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
-                                                                         HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
-                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
-                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
-
-               intel_pmu_lbr_init_hsw();
-
-               x86_pmu.event_constraints = intel_bdw_event_constraints;
-               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_snbep_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.cpu_events = hsw_events_attrs;
-               x86_pmu.limit_period = bdw_limit_period;
-               pr_cont("Broadwell events, ");
-               break;
-
-       case 87: /* Knights Landing Xeon Phi */
-               memcpy(hw_cache_event_ids,
-                      slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs,
-                      knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-               intel_pmu_lbr_init_knl();
-
-               x86_pmu.event_constraints = intel_slm_event_constraints;
-               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_knl_extra_regs;
-
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               pr_cont("Knights Landing events, ");
-               break;
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-               intel_pmu_lbr_init_skl();
-
-               x86_pmu.event_constraints = intel_skl_event_constraints;
-               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_skl_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
-                                                 skl_format_attr);
-               WARN_ON(!x86_pmu.format_attrs);
-               x86_pmu.cpu_events = hsw_events_attrs;
-               pr_cont("Skylake events, ");
-               break;
-
-       default:
-               switch (x86_pmu.version) {
-               case 1:
-                       x86_pmu.event_constraints = intel_v1_event_constraints;
-                       pr_cont("generic architected perfmon v1, ");
-                       break;
-               default:
-                       /*
-                        * default constraints for v2 and up
-                        */
-                       x86_pmu.event_constraints = intel_gen_event_constraints;
-                       pr_cont("generic architected perfmon, ");
-                       break;
-               }
-       }
-
-       if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
-               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-                    x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
-               x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
-       }
-       x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-       if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
-               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-                    x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
-               x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
-       }
-
-       x86_pmu.intel_ctrl |=
-               ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
-
-       if (x86_pmu.event_constraints) {
-               /*
-                * event on fixed counter2 (REF_CYCLES) only works on this
-                * counter, so do not extend mask to generic counters
-                */
-               for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if (c->cmask == FIXED_EVENT_FLAGS
-                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
-                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-                       }
-                       c->idxmsk64 &=
-                               ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
-                       c->weight = hweight64(c->idxmsk64);
-               }
-       }
-
-       /*
-        * Accessing LBR MSRs may cause a #GP under certain circumstances,
-        * e.g. KVM doesn't support the LBR MSRs.
-        * Check all LBR MSRs here.
-        * Disable LBR access if any LBR MSR cannot be accessed.
-        */
-       if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
-               x86_pmu.lbr_nr = 0;
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
-                     check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
-                       x86_pmu.lbr_nr = 0;
-       }
-
-       /*
-        * Accessing an extra MSR may cause a #GP under certain circumstances,
-        * e.g. KVM doesn't support offcore events.
-        * Check all extra_regs here.
-        */
-       if (x86_pmu.extra_regs) {
-               for (er = x86_pmu.extra_regs; er->msr; er++) {
-                       er->extra_msr_access = check_msr(er->msr, 0x11UL);
-                       /* Disable LBR select mapping */
-                       if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
-                               x86_pmu.lbr_sel_map = NULL;
-               }
-       }
-
-       /* Support full width counters using alternative MSR range */
-       if (x86_pmu.intel_cap.full_width_write) {
-               x86_pmu.max_period = x86_pmu.cntval_mask;
-               x86_pmu.perfctr = MSR_IA32_PMC0;
-               pr_cont("full-width counters, ");
-       }
-
-       return 0;
-}
-
-/*
- * HT bug: phase 2 init
- * Called once we have valid topology information to check
- * whether or not HT is enabled
- * If HT is off, then we disable the workaround
- */
-static __init int fixup_ht_bug(void)
-{
-       int cpu = smp_processor_id();
-       int w, c;
-       /*
-        * problem not present on this CPU model, nothing to do
-        */
-       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
-               return 0;
-
-       w = cpumask_weight(topology_sibling_cpumask(cpu));
-       if (w > 1) {
-               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
-               return 0;
-       }
-
-       if (lockup_detector_suspend() != 0) {
-               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-               return 0;
-       }
-
-       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
-
-       x86_pmu.start_scheduling = NULL;
-       x86_pmu.commit_scheduling = NULL;
-       x86_pmu.stop_scheduling = NULL;
-
-       lockup_detector_resume();
-
-       get_online_cpus();
-
-       for_each_online_cpu(c) {
-               free_excl_cntrs(c);
-       }
-
-       put_online_cpus();
-       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
-       return 0;
-}
-subsys_initcall(fixup_ht_bug)
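
The intel_ctrl value computed in the removed intel_pmu_init() tail above is just a global-enable bitmap: one bit per generic counter in the low bits, plus one bit per fixed counter starting at INTEL_PMC_IDX_FIXED (bit 32 in the kernel headers). A minimal standalone sketch of that arithmetic, using hypothetical counter counts (4 generic, 3 fixed) rather than any real model's values:

/* Sketch only, not part of the patch: reproduces the intel_ctrl mask math. */
#include <stdio.h>
#include <stdint.h>

#define INTEL_PMC_IDX_FIXED 32		/* fixed counters live at bits 32+ */

int main(void)
{
	int num_counters = 4;		/* hypothetical generic counters */
	int num_counters_fixed = 3;	/* hypothetical fixed counters */
	uint64_t intel_ctrl;

	/* one enable bit per generic counter: bits 0..3 */
	intel_ctrl = (1ULL << num_counters) - 1;
	/* one enable bit per fixed counter: bits 32..34 */
	intel_ctrl |= ((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;

	printf("intel_ctrl = %#llx\n", (unsigned long long)intel_ctrl);
	return 0;
}

With those counts the mask comes out to 0x70000000f, i.e. bits 0-3 and 32-34 set, matching the two assignments removed above.
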
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
deleted file mode 100644 (file)
index 2cad71d..0000000
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * BTS PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-
-#undef DEBUG
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/coredump.h>
-
-#include <asm-generic/sizes.h>
-#include <asm/perf_event.h>
-
-#include "perf_event.h"
-
-struct bts_ctx {
-       struct perf_output_handle       handle;
-       struct debug_store              ds_back;
-       int                             started;
-};
-
-static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
-
-#define BTS_RECORD_SIZE                24
-#define BTS_SAFETY_MARGIN      4080
-
-struct bts_phys {
-       struct page     *page;
-       unsigned long   size;
-       unsigned long   offset;
-       unsigned long   displacement;
-};
-
-struct bts_buffer {
-       size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
-       unsigned int    nr_pages;
-       unsigned int    nr_bufs;
-       unsigned int    cur_buf;
-       bool            snapshot;
-       local_t         data_size;
-       local_t         lost;
-       local_t         head;
-       unsigned long   end;
-       void            **data_pages;
-       struct bts_phys buf[0];
-};
-
-struct pmu bts_pmu;
-
-static size_t buf_size(struct page *page)
-{
-       return 1 << (PAGE_SHIFT + page_private(page));
-}
-
-static void *
-bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
-{
-       struct bts_buffer *buf;
-       struct page *page;
-       int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
-       unsigned long offset;
-       size_t size = nr_pages << PAGE_SHIFT;
-       int pg, nbuf, pad;
-
-       /* count all the high order buffers */
-       for (pg = 0, nbuf = 0; pg < nr_pages;) {
-               page = virt_to_page(pages[pg]);
-               if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
-                       return NULL;
-               pg += 1 << page_private(page);
-               nbuf++;
-       }
-
-       /*
-        * to avoid interrupts in overwrite mode, only allow one physical buffer
-        */
-       if (overwrite && nbuf > 1)
-               return NULL;
-
-       buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
-       if (!buf)
-               return NULL;
-
-       buf->nr_pages = nr_pages;
-       buf->nr_bufs = nbuf;
-       buf->snapshot = overwrite;
-       buf->data_pages = pages;
-       buf->real_size = size - size % BTS_RECORD_SIZE;
-
-       for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
-               unsigned int __nr_pages;
-
-               page = virt_to_page(pages[pg]);
-               __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
-               buf->buf[nbuf].page = page;
-               buf->buf[nbuf].offset = offset;
-               buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
-               buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
-               pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
-               buf->buf[nbuf].size -= pad;
-
-               pg += __nr_pages;
-               offset += __nr_pages << PAGE_SHIFT;
-       }
-
-       return buf;
-}
-
-static void bts_buffer_free_aux(void *data)
-{
-       kfree(data);
-}
-
-static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
-{
-       return buf->buf[idx].offset + buf->buf[idx].displacement;
-}
-
-static void
-bts_config_buffer(struct bts_buffer *buf)
-{
-       int cpu = raw_smp_processor_id();
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       struct bts_phys *phys = &buf->buf[buf->cur_buf];
-       unsigned long index, thresh = 0, end = phys->size;
-       struct page *page = phys->page;
-
-       index = local_read(&buf->head);
-
-       if (!buf->snapshot) {
-               if (buf->end < phys->offset + buf_size(page))
-                       end = buf->end - phys->offset - phys->displacement;
-
-               index -= phys->offset + phys->displacement;
-
-               if (end - index > BTS_SAFETY_MARGIN)
-                       thresh = end - BTS_SAFETY_MARGIN;
-               else if (end - index > BTS_RECORD_SIZE)
-                       thresh = end - BTS_RECORD_SIZE;
-               else
-                       thresh = end;
-       }
-
-       ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
-       ds->bts_index = ds->bts_buffer_base + index;
-       ds->bts_absolute_maximum = ds->bts_buffer_base + end;
-       ds->bts_interrupt_threshold = !buf->snapshot
-               ? ds->bts_buffer_base + thresh
-               : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
-}
-
-static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
-{
-       unsigned long index = head - phys->offset;
-
-       memset(page_address(phys->page) + index, 0, phys->size - index);
-}
-
-static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
-{
-       if (buf->snapshot)
-               return false;
-
-       if (local_read(&buf->data_size) >= bts->handle.size ||
-           bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
-               return true;
-
-       return false;
-}
-
-static void bts_update(struct bts_ctx *bts)
-{
-       int cpu = raw_smp_processor_id();
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-       unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
-
-       if (!buf)
-               return;
-
-       head = index + bts_buffer_offset(buf, buf->cur_buf);
-       old = local_xchg(&buf->head, head);
-
-       if (!buf->snapshot) {
-               if (old == head)
-                       return;
-
-               if (ds->bts_index >= ds->bts_absolute_maximum)
-                       local_inc(&buf->lost);
-
-               /*
-                * old and head are always in the same physical buffer, so we
-                * can subtract them to get the data size.
-                */
-               local_add(head - old, &buf->data_size);
-       } else {
-               local_set(&buf->data_size, head);
-       }
-}
-
-static void __bts_event_start(struct perf_event *event)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-       u64 config = 0;
-
-       if (!buf || bts_buffer_is_full(buf, bts))
-               return;
-
-       event->hw.itrace_started = 1;
-       event->hw.state = 0;
-
-       if (!buf->snapshot)
-               config |= ARCH_PERFMON_EVENTSEL_INT;
-       if (!event->attr.exclude_kernel)
-               config |= ARCH_PERFMON_EVENTSEL_OS;
-       if (!event->attr.exclude_user)
-               config |= ARCH_PERFMON_EVENTSEL_USR;
-
-       bts_config_buffer(buf);
-
-       /*
-        * local barrier to make sure that ds configuration made it
-        * before we enable BTS
-        */
-       wmb();
-
-       intel_pmu_enable_bts(config);
-}
-
-static void bts_event_start(struct perf_event *event, int flags)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       __bts_event_start(event);
-
-       /* PMI handler: this counter is running and likely generating PMIs */
-       ACCESS_ONCE(bts->started) = 1;
-}
-
-static void __bts_event_stop(struct perf_event *event)
-{
-       /*
-        * No extra synchronization is mandated by the documentation to have
-        * BTS data stores globally visible.
-        */
-       intel_pmu_disable_bts();
-
-       if (event->hw.state & PERF_HES_STOPPED)
-               return;
-
-       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
-}
-
-static void bts_event_stop(struct perf_event *event, int flags)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       /* PMI handler: don't restart this counter */
-       ACCESS_ONCE(bts->started) = 0;
-
-       __bts_event_stop(event);
-
-       if (flags & PERF_EF_UPDATE)
-               bts_update(bts);
-}
-
-void intel_bts_enable_local(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       if (bts->handle.event && bts->started)
-               __bts_event_start(bts->handle.event);
-}
-
-void intel_bts_disable_local(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       if (bts->handle.event)
-               __bts_event_stop(bts->handle.event);
-}
-
-static int
-bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
-{
-       unsigned long head, space, next_space, pad, gap, skip, wakeup;
-       unsigned int next_buf;
-       struct bts_phys *phys, *next_phys;
-       int ret;
-
-       if (buf->snapshot)
-               return 0;
-
-       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
-       if (WARN_ON_ONCE(head != local_read(&buf->head)))
-               return -EINVAL;
-
-       phys = &buf->buf[buf->cur_buf];
-       space = phys->offset + phys->displacement + phys->size - head;
-       pad = space;
-       if (space > handle->size) {
-               space = handle->size;
-               space -= space % BTS_RECORD_SIZE;
-       }
-       if (space <= BTS_SAFETY_MARGIN) {
-               /* See if next phys buffer has more space */
-               next_buf = buf->cur_buf + 1;
-               if (next_buf >= buf->nr_bufs)
-                       next_buf = 0;
-               next_phys = &buf->buf[next_buf];
-               gap = buf_size(phys->page) - phys->displacement - phys->size +
-                     next_phys->displacement;
-               skip = pad + gap;
-               if (handle->size >= skip) {
-                       next_space = next_phys->size;
-                       if (next_space + skip > handle->size) {
-                               next_space = handle->size - skip;
-                               next_space -= next_space % BTS_RECORD_SIZE;
-                       }
-                       if (next_space > space || !space) {
-                               if (pad)
-                                       bts_buffer_pad_out(phys, head);
-                               ret = perf_aux_output_skip(handle, skip);
-                               if (ret)
-                                       return ret;
-                               /* Advance to next phys buffer */
-                               phys = next_phys;
-                               space = next_space;
-                               head = phys->offset + phys->displacement;
-                               /*
-                                * After this, cur_buf and head won't match ds
-                                * anymore, so we must not be racing with
-                                * bts_update().
-                                */
-                               buf->cur_buf = next_buf;
-                               local_set(&buf->head, head);
-                       }
-               }
-       }
-
-       /* Don't go far beyond wakeup watermark */
-       wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
-                handle->head;
-       if (space > wakeup) {
-               space = wakeup;
-               space -= space % BTS_RECORD_SIZE;
-       }
-
-       buf->end = head + space;
-
-       /*
-        * If we have no space, the lost notification would have been sent when
-        * we hit absolute_maximum - see bts_update()
-        */
-       if (!space)
-               return -ENOSPC;
-
-       return 0;
-}
-
-int intel_bts_interrupt(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct perf_event *event = bts->handle.event;
-       struct bts_buffer *buf;
-       s64 old_head;
-       int err;
-
-       if (!event || !bts->started)
-               return 0;
-
-       buf = perf_get_aux(&bts->handle);
-       /*
-        * Skip snapshot counters: they don't use the interrupt, but
-        * there's no other way of telling, because the pointer will
-        * keep moving
-        */
-       if (!buf || buf->snapshot)
-               return 0;
-
-       old_head = local_read(&buf->head);
-       bts_update(bts);
-
-       /* no new data */
-       if (old_head == local_read(&buf->head))
-               return 0;
-
-       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                           !!local_xchg(&buf->lost, 0));
-
-       buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return 1;
-
-       err = bts_buffer_reset(buf, &bts->handle);
-       if (err)
-               perf_aux_output_end(&bts->handle, 0, false);
-
-       return 1;
-}
-
-static void bts_event_del(struct perf_event *event, int mode)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-
-       bts_event_stop(event, PERF_EF_UPDATE);
-
-       if (buf) {
-               if (buf->snapshot)
-                       bts->handle.head =
-                               local_xchg(&buf->data_size,
-                                          buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                                   !!local_xchg(&buf->lost, 0));
-       }
-
-       cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
-       cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
-       cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
-       cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
-}
-
-static int bts_event_add(struct perf_event *event, int mode)
-{
-       struct bts_buffer *buf;
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       int ret = -EBUSY;
-
-       event->hw.state = PERF_HES_STOPPED;
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               return -EBUSY;
-
-       if (bts->handle.event)
-               return -EBUSY;
-
-       buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return -EINVAL;
-
-       ret = bts_buffer_reset(buf, &bts->handle);
-       if (ret) {
-               perf_aux_output_end(&bts->handle, 0, false);
-               return ret;
-       }
-
-       bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
-       bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
-       bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
-
-       if (mode & PERF_EF_START) {
-               bts_event_start(event, 0);
-               if (hwc->state & PERF_HES_STOPPED) {
-                       bts_event_del(event, 0);
-                       return -EBUSY;
-               }
-       }
-
-       return 0;
-}
-
-static void bts_event_destroy(struct perf_event *event)
-{
-       x86_release_hardware();
-       x86_del_exclusive(x86_lbr_exclusive_bts);
-}
-
-static int bts_event_init(struct perf_event *event)
-{
-       int ret;
-
-       if (event->attr.type != bts_pmu.type)
-               return -ENOENT;
-
-       if (x86_add_exclusive(x86_lbr_exclusive_bts))
-               return -EBUSY;
-
-       /*
-        * BTS leaks kernel addresses even when CPL0 tracing is
-        * disabled, so disallow intel_bts driver for unprivileged
-        * users on paranoid systems since it provides trace data
-        * to the user in a zero-copy fashion.
-        *
-        * Note that the default paranoia setting permits unprivileged
-        * users to profile the kernel.
-        */
-       if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
-           !capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       ret = x86_reserve_hardware();
-       if (ret) {
-               x86_del_exclusive(x86_lbr_exclusive_bts);
-               return ret;
-       }
-
-       event->destroy = bts_event_destroy;
-
-       return 0;
-}
-
-static void bts_event_read(struct perf_event *event)
-{
-}
-
-static __init int bts_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
-               return -ENODEV;
-
-       bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
-       bts_pmu.task_ctx_nr     = perf_sw_context;
-       bts_pmu.event_init      = bts_event_init;
-       bts_pmu.add             = bts_event_add;
-       bts_pmu.del             = bts_event_del;
-       bts_pmu.start           = bts_event_start;
-       bts_pmu.stop            = bts_event_stop;
-       bts_pmu.read            = bts_event_read;
-       bts_pmu.setup_aux       = bts_buffer_setup_aux;
-       bts_pmu.free_aux        = bts_buffer_free_aux;
-
-       return perf_pmu_register(&bts_pmu, "intel_bts", -1);
-}
-arch_initcall(bts_init);
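
Since bts_init() above registers the driver via perf_pmu_register() as a dynamic PMU named "intel_bts", its type id is exported through sysfs like that of any other dynamic PMU. A minimal userspace sketch of opening such an event, assuming the standard /sys/bus/event_source layout and deliberately ignoring the AUX-buffer mmap a real BTS consumer would also need:

/* Sketch only, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	unsigned int type;
	FILE *f;
	int fd;

	/* dynamic PMUs export their type id via sysfs */
	f = fopen("/sys/bus/event_source/devices/intel_bts/type", "r");
	if (!f) {
		perror("intel_bts PMU not available");
		return 1;
	}
	if (fscanf(f, "%u", &type) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;		/* the id bts_event_init() matches against */
	attr.exclude_kernel = 1;	/* trace user space only */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	printf("opened intel_bts event, fd=%d\n", fd);
	close(fd);
	return 0;
}
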
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
deleted file mode 100644 (file)
index a316ca9..0000000
+++ /dev/null
@@ -1,1391 +0,0 @@
-/*
- * Intel Cache Quality-of-Service Monitoring (CQM) support.
- *
- * Based very, very heavily on work by Peter Zijlstra.
- */
-
-#include <linux/perf_event.h>
-#include <linux/slab.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-#define MSR_IA32_PQR_ASSOC     0x0c8f
-#define MSR_IA32_QM_CTR                0x0c8e
-#define MSR_IA32_QM_EVTSEL     0x0c8d
-
-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid:              The cached Resource Monitoring ID
- * @closid:            The cached Class Of Service ID
- * @rmid_usecnt:       The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
-       u32                     rmid;
-       u32                     closid;
-       int                     rmid_usecnt;
-};
-
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
-
-/*
- * Protects cache_groups, cqm_rmid_free_lru and cqm_rmid_limbo_lru.
- * Also protects event->hw.cqm_rmid.
- *
- * Hold either for stability, both for modification of ->hw.cqm_rmid.
- */
-static DEFINE_MUTEX(cache_mutex);
-static DEFINE_RAW_SPINLOCK(cache_lock);
-
-/*
- * Groups of events that have the same target(s), one RMID per group.
- */
-static LIST_HEAD(cache_groups);
-
-/*
- * Mask of CPUs for reading CQM values. We only need one per-socket.
- */
-static cpumask_t cqm_cpumask;
-
-#define RMID_VAL_ERROR         (1ULL << 63)
-#define RMID_VAL_UNAVAIL       (1ULL << 62)
-
-#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
-
-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
-
-/*
- * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
- *
- * This rmid is always free and is guaranteed to have an associated
- * near-zero occupancy value, i.e. no cachelines are tagged with this
- * RMID, once __intel_cqm_rmid_rotate() returns.
- */
-static u32 intel_cqm_rotation_rmid;
-
-#define INVALID_RMID           (-1)
-
-/*
- * Is @rmid valid for programming the hardware?
- *
- * rmid 0 is reserved by the hardware for all non-monitored tasks, which
- * means that we should never come across an rmid with that value.
- * Likewise, an rmid value of -1 is used to indicate "no rmid currently
- * assigned" and is used as part of the rotation code.
- */
-static inline bool __rmid_valid(u32 rmid)
-{
-       if (!rmid || rmid == INVALID_RMID)
-               return false;
-
-       return true;
-}
-
-static u64 __rmid_read(u32 rmid)
-{
-       u64 val;
-
-       /*
-        * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
-        * it just says that to increase confusion.
-        */
-       wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
-       rdmsrl(MSR_IA32_QM_CTR, val);
-
-       /*
-        * Aside from the ERROR and UNAVAIL bits, assume this thing returns
-        * the number of cachelines tagged with @rmid.
-        */
-       return val;
-}
-
-enum rmid_recycle_state {
-       RMID_YOUNG = 0,
-       RMID_AVAILABLE,
-       RMID_DIRTY,
-};
-
-struct cqm_rmid_entry {
-       u32 rmid;
-       enum rmid_recycle_state state;
-       struct list_head list;
-       unsigned long queue_time;
-};
-
-/*
- * cqm_rmid_free_lru - A least recently used list of RMIDs.
- *
- * Oldest entry at the head, newest (most recently used) entry at the
- * tail. This list is never traversed, it's only used to keep track of
- * the lru order. That is, we only pick entries off the head or insert
- * them on the tail.
- *
- * All entries on the list are 'free', and their RMIDs are not currently
- * in use. To mark an RMID as in use, remove its entry from the lru
- * list.
- *
- *
- * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
- *
- * This list contains RMIDs that no one is currently using but that
- * may have a non-zero occupancy value associated with them. The
- * rotation worker moves RMIDs from the limbo list to the free list once
- * the occupancy value drops below __intel_cqm_threshold.
- *
- * Both lists are protected by cache_mutex.
- */
-static LIST_HEAD(cqm_rmid_free_lru);
-static LIST_HEAD(cqm_rmid_limbo_lru);
-
-/*
- * We use a simple array of pointers so that we can look up a struct
- * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
- * and __put_rmid() from having to worry about dealing with struct
- * cqm_rmid_entry - they just deal with rmids, i.e. integers.
- *
- * Once this array is initialized it is read-only. No locks are required
- * to access it.
- *
- * All entries for all RMIDs can be looked up in this array at all
- * times.
- */
-static struct cqm_rmid_entry **cqm_rmid_ptrs;
-
-static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
-{
-       struct cqm_rmid_entry *entry;
-
-       entry = cqm_rmid_ptrs[rmid];
-       WARN_ON(entry->rmid != rmid);
-
-       return entry;
-}
-
-/*
- * Returns INVALID_RMID on failure.
- *
- * We expect to be called with cache_mutex held.
- */
-static u32 __get_rmid(void)
-{
-       struct cqm_rmid_entry *entry;
-
-       lockdep_assert_held(&cache_mutex);
-
-       if (list_empty(&cqm_rmid_free_lru))
-               return INVALID_RMID;
-
-       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
-       list_del(&entry->list);
-
-       return entry->rmid;
-}
-
-static void __put_rmid(u32 rmid)
-{
-       struct cqm_rmid_entry *entry;
-
-       lockdep_assert_held(&cache_mutex);
-
-       WARN_ON(!__rmid_valid(rmid));
-       entry = __rmid_entry(rmid);
-
-       entry->queue_time = jiffies;
-       entry->state = RMID_YOUNG;
-
-       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
-}
-
-static int intel_cqm_setup_rmid_cache(void)
-{
-       struct cqm_rmid_entry *entry;
-       unsigned int nr_rmids;
-       int r = 0;
-
-       nr_rmids = cqm_max_rmid + 1;
-       cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
-                               nr_rmids, GFP_KERNEL);
-       if (!cqm_rmid_ptrs)
-               return -ENOMEM;
-
-       for (; r <= cqm_max_rmid; r++) {
-               struct cqm_rmid_entry *entry;
-
-               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-               if (!entry)
-                       goto fail;
-
-               INIT_LIST_HEAD(&entry->list);
-               entry->rmid = r;
-               cqm_rmid_ptrs[r] = entry;
-
-               list_add_tail(&entry->list, &cqm_rmid_free_lru);
-       }
-
-       /*
-        * RMID 0 is special and is always allocated. It's used for all
-        * tasks that are not monitored.
-        */
-       entry = __rmid_entry(0);
-       list_del(&entry->list);
-
-       mutex_lock(&cache_mutex);
-       intel_cqm_rotation_rmid = __get_rmid();
-       mutex_unlock(&cache_mutex);
-
-       return 0;
-fail:
-       while (r--)
-               kfree(cqm_rmid_ptrs[r]);
-
-       kfree(cqm_rmid_ptrs);
-       return -ENOMEM;
-}
-
-/*
- * Determine if @a and @b measure the same set of tasks.
- *
- * If @a and @b measure the same set of tasks then we want to share a
- * single RMID.
- */
-static bool __match_event(struct perf_event *a, struct perf_event *b)
-{
-       /* Per-cpu and task events don't mix */
-       if ((a->attach_state & PERF_ATTACH_TASK) !=
-           (b->attach_state & PERF_ATTACH_TASK))
-               return false;
-
-#ifdef CONFIG_CGROUP_PERF
-       if (a->cgrp != b->cgrp)
-               return false;
-#endif
-
-       /* If not task event, we're machine wide */
-       if (!(b->attach_state & PERF_ATTACH_TASK))
-               return true;
-
-       /*
-        * Events that target same task are placed into the same cache group.
-        */
-       if (a->hw.target == b->hw.target)
-               return true;
-
-       /*
-        * Are we an inherited event?
-        */
-       if (b->parent == a)
-               return true;
-
-       return false;
-}
-
-#ifdef CONFIG_CGROUP_PERF
-static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
-{
-       if (event->attach_state & PERF_ATTACH_TASK)
-               return perf_cgroup_from_task(event->hw.target, event->ctx);
-
-       return event->cgrp;
-}
-#endif
-
-/*
- * Determine if @a's tasks intersect with @b's tasks
- *
- * There are combinations of events that we explicitly prohibit,
- *
- *                  PROHIBITS
- *     system-wide  ->  cgroup and task
- *     cgroup       ->  system-wide
- *                  ->  task in cgroup
- *     task         ->  system-wide
- *                  ->  task in cgroup
- *
- * Call this function before allocating an RMID.
- */
-static bool __conflict_event(struct perf_event *a, struct perf_event *b)
-{
-#ifdef CONFIG_CGROUP_PERF
-       /*
-        * We can have any number of cgroups but only one system-wide
-        * event at a time.
-        */
-       if (a->cgrp && b->cgrp) {
-               struct perf_cgroup *ac = a->cgrp;
-               struct perf_cgroup *bc = b->cgrp;
-
-               /*
-                * This condition should have been caught in
-                * __match_event() and we should be sharing an RMID.
-                */
-               WARN_ON_ONCE(ac == bc);
-
-               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-                       return true;
-
-               return false;
-       }
-
-       if (a->cgrp || b->cgrp) {
-               struct perf_cgroup *ac, *bc;
-
-               /*
-                * cgroup and system-wide events are mutually exclusive
-                */
-               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
-                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
-                       return true;
-
-               /*
-                * Ensure neither event is part of the other's cgroup
-                */
-               ac = event_to_cgroup(a);
-               bc = event_to_cgroup(b);
-               if (ac == bc)
-                       return true;
-
-               /*
-                * Must have cgroup and non-intersecting task events.
-                */
-               if (!ac || !bc)
-                       return false;
-
-               /*
-                * We have cgroup and task events, and the task belongs
-                * to a cgroup. Check for overlap.
-                */
-               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-                       return true;
-
-               return false;
-       }
-#endif
-       /*
-        * If one of them is not a task, same story as above with cgroups.
-        */
-       if (!(a->attach_state & PERF_ATTACH_TASK) ||
-           !(b->attach_state & PERF_ATTACH_TASK))
-               return true;
-
-       /*
-        * Must be non-overlapping.
-        */
-       return false;
-}
-
-struct rmid_read {
-       u32 rmid;
-       atomic64_t value;
-};
-
-static void __intel_cqm_event_count(void *info);
-
-/*
- * Exchange the RMID of a group of events.
- */
-static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
-{
-       struct perf_event *event;
-       struct list_head *head = &group->hw.cqm_group_entry;
-       u32 old_rmid = group->hw.cqm_rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       /*
-        * If our RMID is being deallocated, perform a read now.
-        */
-       if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
-               struct rmid_read rr = {
-                       .value = ATOMIC64_INIT(0),
-                       .rmid = old_rmid,
-               };
-
-               on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
-                                &rr, 1);
-               local64_set(&group->count, atomic64_read(&rr.value));
-       }
-
-       raw_spin_lock_irq(&cache_lock);
-
-       group->hw.cqm_rmid = rmid;
-       list_for_each_entry(event, head, hw.cqm_group_entry)
-               event->hw.cqm_rmid = rmid;
-
-       raw_spin_unlock_irq(&cache_lock);
-
-       return old_rmid;
-}
-
-/*
- * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
- * cachelines are still tagged with RMIDs in limbo, we progressively
- * increment the threshold until we find an RMID in limbo with <=
- * __intel_cqm_threshold lines tagged. This is designed to mitigate the
- * problem where cachelines tagged with an RMID are not steadily being
- * evicted.
- *
- * On successful rotations we decrease the threshold back towards zero.
- *
- * __intel_cqm_max_threshold provides an upper bound on the threshold,
- * and is measured in bytes because it's exposed to userland.
- */
-static unsigned int __intel_cqm_threshold;
-static unsigned int __intel_cqm_max_threshold;
-
-/*
- * Test whether the limbo RMIDs have a (near) zero occupancy value on this
- * cpu, i.e. below __intel_cqm_threshold.
- */
-static void intel_cqm_stable(void *arg)
-{
-       struct cqm_rmid_entry *entry;
-
-       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-               if (entry->state != RMID_AVAILABLE)
-                       break;
-
-               if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
-                       entry->state = RMID_DIRTY;
-       }
-}
-
-/*
- * If we have group events waiting for an RMID that don't conflict with
- * events already running, assign @rmid.
- */
-static bool intel_cqm_sched_in_event(u32 rmid)
-{
-       struct perf_event *leader, *event;
-
-       lockdep_assert_held(&cache_mutex);
-
-       leader = list_first_entry(&cache_groups, struct perf_event,
-                                 hw.cqm_groups_entry);
-       event = leader;
-
-       list_for_each_entry_continue(event, &cache_groups,
-                                    hw.cqm_groups_entry) {
-               if (__rmid_valid(event->hw.cqm_rmid))
-                       continue;
-
-               if (__conflict_event(event, leader))
-                       continue;
-
-               intel_cqm_xchg_rmid(event, rmid);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Initially use this constant for both the limbo queue time and the
- * rotation timer interval, pmu::hrtimer_interval_ms.
- *
- * They don't need to be the same, but the two are related since if you
- * rotate faster than you recycle RMIDs, you may run out of available
- * RMIDs.
- */
-#define RMID_DEFAULT_QUEUE_TIME 250    /* ms */
-
-static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
-
-/*
- * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
- * @available: number of freeable RMIDs on the limbo list
- *
- * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
- * cachelines are tagged with those RMIDs. After this we can reuse them
- * and know that the current set of active RMIDs is stable.
- *
- * Return %true or %false depending on whether stabilization needs to be
- * reattempted.
- *
- * If we return %true then @available is updated to indicate the
- * number of RMIDs on the limbo list that have been queued for the
- * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
- * are above __intel_cqm_threshold.
- */
-static bool intel_cqm_rmid_stabilize(unsigned int *available)
-{
-       struct cqm_rmid_entry *entry, *tmp;
-
-       lockdep_assert_held(&cache_mutex);
-
-       *available = 0;
-       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-               unsigned long min_queue_time;
-               unsigned long now = jiffies;
-
-               /*
-                * We hold RMIDs placed into limbo for a minimum queue
-                * time. Before the minimum queue time has elapsed we do
-                * not recycle RMIDs.
-                *
-                * The reasoning is that until a sufficient time has
-                * passed since we stopped using an RMID, any RMID
-                * placed onto the limbo list will likely still have
-                * data tagged in the cache, which means we'll probably
-                * fail to recycle it anyway.
-                *
-                * We can save ourselves an expensive IPI by skipping
-                * any RMIDs that have not been queued for the minimum
-                * time.
-                */
-               min_queue_time = entry->queue_time +
-                       msecs_to_jiffies(__rmid_queue_time_ms);
-
-               if (time_after(min_queue_time, now))
-                       break;
-
-               entry->state = RMID_AVAILABLE;
-               (*available)++;
-       }
-
-       /*
-        * Fast return if none of the RMIDs on the limbo list have been
-        * sitting on the queue for the minimum queue time.
-        */
-       if (!*available)
-               return false;
-
-       /*
-        * Test whether an RMID is free for each package.
-        */
-       on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
-
-       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
-               /*
-                * Exhausted all RMIDs that have waited min queue time.
-                */
-               if (entry->state == RMID_YOUNG)
-                       break;
-
-               if (entry->state == RMID_DIRTY)
-                       continue;
-
-               list_del(&entry->list); /* remove from limbo */
-
-               /*
-                * The rotation RMID gets priority if it's
-                * currently invalid, in which case we skip adding
-                * the RMID to the free lru.
-                */
-               if (!__rmid_valid(intel_cqm_rotation_rmid)) {
-                       intel_cqm_rotation_rmid = entry->rmid;
-                       continue;
-               }
-
-               /*
-                * If we have groups waiting for RMIDs, hand
-                * them one now provided they don't conflict.
-                */
-               if (intel_cqm_sched_in_event(entry->rmid))
-                       continue;
-
-               /*
-                * Otherwise place it onto the free list.
-                */
-               list_add_tail(&entry->list, &cqm_rmid_free_lru);
-       }
-
-
-       return __rmid_valid(intel_cqm_rotation_rmid);
-}
-
-/*
- * Pick a victim group and move it to the tail of the group list.
- * @next: The first group without an RMID
- */
-static void __intel_cqm_pick_and_rotate(struct perf_event *next)
-{
-       struct perf_event *rotor;
-       u32 rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       rotor = list_first_entry(&cache_groups, struct perf_event,
-                                hw.cqm_groups_entry);
-
-       /*
-        * The group at the front of the list should always have a valid
-        * RMID. If it doesn't then no groups have RMIDs assigned and we
-        * don't need to rotate the list.
-        */
-       if (next == rotor)
-               return;
-
-       rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
-       __put_rmid(rmid);
-
-       list_rotate_left(&cache_groups);
-}
-
-/*
- * Deallocate the RMIDs from any events that conflict with @event, and
- * place them on the back of the group list.
- */
-static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
-{
-       struct perf_event *group, *g;
-       u32 rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
-               if (group == event)
-                       continue;
-
-               rmid = group->hw.cqm_rmid;
-
-               /*
-                * Skip events that don't have a valid RMID.
-                */
-               if (!__rmid_valid(rmid))
-                       continue;
-
-               /*
-                * No conflict? No problem! Leave the event alone.
-                */
-               if (!__conflict_event(group, event))
-                       continue;
-
-               intel_cqm_xchg_rmid(group, INVALID_RMID);
-               __put_rmid(rmid);
-       }
-}
-
-/*
- * Attempt to rotate the groups and assign new RMIDs.
- *
- * We rotate for two reasons,
- *   1. To handle the scheduling of conflicting events
- *   2. To recycle RMIDs
- *
- * Rotating RMIDs is complicated because the hardware doesn't give us
- * any clues.
- *
- * There are problems with the hardware interface; when you change the
- * task:RMID map cachelines retain their 'old' tags, giving a skewed
- * picture. In order to work around this, we must always keep one free
- * RMID - intel_cqm_rotation_rmid.
- *
- * Rotation works by taking away an RMID from a group (the old RMID),
- * and assigning the free RMID to another group (the new RMID). We must
- * then wait for the old RMID to not be used (no cachelines tagged).
- * This ensures that all cachelines are tagged with 'active' RMIDs. At
- * this point we can start reading values for the new RMID and treat the
- * old RMID as the free RMID for the next rotation.
- *
- * Return %true or %false depending on whether we did any rotating.
- */
-static bool __intel_cqm_rmid_rotate(void)
-{
-       struct perf_event *group, *start = NULL;
-       unsigned int threshold_limit;
-       unsigned int nr_needed = 0;
-       unsigned int nr_available;
-       bool rotated = false;
-
-       mutex_lock(&cache_mutex);
-
-again:
-       /*
-        * Fast path through this function if there are no groups and no
-        * RMIDs that need cleaning.
-        */
-       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
-               goto out;
-
-       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
-               if (!__rmid_valid(group->hw.cqm_rmid)) {
-                       if (!start)
-                               start = group;
-                       nr_needed++;
-               }
-       }
-
-       /*
-        * We have some event groups, but they all have RMIDs assigned
-        * and no RMIDs need cleaning.
-        */
-       if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
-               goto out;
-
-       if (!nr_needed)
-               goto stabilize;
-
-       /*
-        * We have more event groups without RMIDs than available RMIDs,
-        * or we have event groups that conflict with the ones currently
-        * scheduled.
-        *
-        * We force deallocate the rmid of the group at the head of
-        * cache_groups. The first event group without an RMID then gets
-        * assigned intel_cqm_rotation_rmid. This ensures we always make
-        * forward progress.
-        *
-        * Rotate the cache_groups list so the previous head is now the
-        * tail.
-        */
-       __intel_cqm_pick_and_rotate(start);
-
-       /*
-        * If the rotation is going to succeed, reduce the threshold so
-        * that we don't needlessly reuse dirty RMIDs.
-        */
-       if (__rmid_valid(intel_cqm_rotation_rmid)) {
-               intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
-               intel_cqm_rotation_rmid = __get_rmid();
-
-               intel_cqm_sched_out_conflicting_events(start);
-
-               if (__intel_cqm_threshold)
-                       __intel_cqm_threshold--;
-       }
-
-       rotated = true;
-
-stabilize:
-       /*
-        * We now need to stabilize the RMID we freed above (if any) to
-        * ensure that the next time we rotate we have an RMID with zero
-        * occupancy value.
-        *
-        * Alternatively, if we didn't need to perform any rotation,
-        * we'll have a bunch of RMIDs in limbo that need stabilizing.
-        */
-       threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
-
-       while (intel_cqm_rmid_stabilize(&nr_available) &&
-              __intel_cqm_threshold < threshold_limit) {
-               unsigned int steal_limit;
-
-               /*
-                * Don't spin if nobody is actively waiting for an RMID,
-                * the rotation worker will be kicked as soon as an
-                * event needs an RMID anyway.
-                */
-               if (!nr_needed)
-                       break;
-
-               /* Allow max 25% of RMIDs to be in limbo. */
-               steal_limit = (cqm_max_rmid + 1) / 4;
-
-               /*
-                * We failed to stabilize any RMIDs so our rotation
-                * logic is now stuck. In order to make forward progress
-                * we have a few options:
-                *
-                *   1. rotate ("steal") another RMID
-                *   2. increase the threshold
-                *   3. do nothing
-                *
-                * We do both of 1. and 2. until we hit the steal limit.
-                *
-                * The steal limit prevents all RMIDs ending up on the
-                * limbo list. This can happen if every RMID has a
-                * non-zero occupancy above threshold_limit, and the
-                * occupancy values aren't dropping fast enough.
-                *
-                * Note that there is prioritisation at work here - we'd
-                * rather increase the number of RMIDs on the limbo list
-                * than increase the threshold, because increasing the
-                * threshold skews the event data (because we reuse
-                * dirty RMIDs) - threshold bumps are a last resort.
-                */
-               if (nr_available < steal_limit)
-                       goto again;
-
-               __intel_cqm_threshold++;
-       }
-
-out:
-       mutex_unlock(&cache_mutex);
-       return rotated;
-}
-
-static void intel_cqm_rmid_rotate(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
-
-static struct pmu intel_cqm_pmu;
-
-static void intel_cqm_rmid_rotate(struct work_struct *work)
-{
-       unsigned long delay;
-
-       __intel_cqm_rmid_rotate();
-
-       delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
-       schedule_delayed_work(&intel_cqm_rmid_work, delay);
-}
-
-/*
- * Find a group and set up the RMID.
- *
- * If we're part of a group, we use the group's RMID.
- */
-static void intel_cqm_setup_event(struct perf_event *event,
-                                 struct perf_event **group)
-{
-       struct perf_event *iter;
-       bool conflict = false;
-       u32 rmid;
-
-       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
-               rmid = iter->hw.cqm_rmid;
-
-               if (__match_event(iter, event)) {
-                       /* All tasks in a group share an RMID */
-                       event->hw.cqm_rmid = rmid;
-                       *group = iter;
-                       return;
-               }
-
-               /*
-                * We only care about conflicts for events that are
-                * actually scheduled in (and hence have a valid RMID).
-                */
-               if (__conflict_event(iter, event) && __rmid_valid(rmid))
-                       conflict = true;
-       }
-
-       if (conflict)
-               rmid = INVALID_RMID;
-       else
-               rmid = __get_rmid();
-
-       event->hw.cqm_rmid = rmid;
-}
-
-static void intel_cqm_event_read(struct perf_event *event)
-{
-       unsigned long flags;
-       u32 rmid;
-       u64 val;
-
-       /*
-        * Task events are handled by intel_cqm_event_count().
-        */
-       if (event->cpu == -1)
-               return;
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-       rmid = event->hw.cqm_rmid;
-
-       if (!__rmid_valid(rmid))
-               goto out;
-
-       val = __rmid_read(rmid);
-
-       /*
-        * Ignore this reading on error states and do not update the value.
-        */
-       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-               goto out;
-
-       local64_set(&event->count, val);
-out:
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-}
-
-static void __intel_cqm_event_count(void *info)
-{
-       struct rmid_read *rr = info;
-       u64 val;
-
-       val = __rmid_read(rr->rmid);
-
-       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-               return;
-
-       atomic64_add(val, &rr->value);
-}
-
-static inline bool cqm_group_leader(struct perf_event *event)
-{
-       return !list_empty(&event->hw.cqm_groups_entry);
-}
-
-static u64 intel_cqm_event_count(struct perf_event *event)
-{
-       unsigned long flags;
-       struct rmid_read rr = {
-               .value = ATOMIC64_INIT(0),
-       };
-
-       /*
-        * We only need to worry about task events. System-wide events
-        * are handled as usual, i.e. entirely with
-        * intel_cqm_event_read().
-        */
-       if (event->cpu != -1)
-               return __perf_event_count(event);
-
-       /*
-        * Only the group leader gets to report values. This stops us
-        * reporting duplicate values to userspace, and gives us a clear
-        * rule for which task gets to report the values.
-        *
-        * Note that it is impossible to attribute these values to
-        * specific packages - we forfeit that ability when we create
-        * task events.
-        */
-       if (!cqm_group_leader(event))
-               return 0;
-
-       /*
-        * Getting up-to-date values requires an SMP IPI which is not
-        * possible if we're being called in interrupt context. Return
-        * the cached values instead.
-        */
-       if (unlikely(in_interrupt()))
-               goto out;
-
-       /*
-        * Notice that we don't perform the reading of an RMID
-        * atomically, because we can't hold a spin lock across the
-        * IPIs.
-        *
-        * Speculatively perform the read, since @event might be
-        * assigned a different (possibly invalid) RMID while we're
-        * busy performing the IPI calls. It's therefore necessary to
-        * check @event's RMID afterwards, and if it has changed,
-        * discard the result of the read.
-        */
-       rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
-
-       if (!__rmid_valid(rr.rmid))
-               goto out;
-
-       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-       if (event->hw.cqm_rmid == rr.rmid)
-               local64_set(&event->count, atomic64_read(&rr.value));
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-out:
-       return __perf_event_count(event);
-}
-
-static void intel_cqm_event_start(struct perf_event *event, int mode)
-{
-       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-       u32 rmid = event->hw.cqm_rmid;
-
-       if (!(event->hw.cqm_state & PERF_HES_STOPPED))
-               return;
-
-       event->hw.cqm_state &= ~PERF_HES_STOPPED;
-
-       if (state->rmid_usecnt++) {
-               if (!WARN_ON_ONCE(state->rmid != rmid))
-                       return;
-       } else {
-               WARN_ON_ONCE(state->rmid);
-       }
-
-       state->rmid = rmid;
-       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
-}
-
-static void intel_cqm_event_stop(struct perf_event *event, int mode)
-{
-       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-
-       if (event->hw.cqm_state & PERF_HES_STOPPED)
-               return;
-
-       event->hw.cqm_state |= PERF_HES_STOPPED;
-
-       intel_cqm_event_read(event);
-
-       if (!--state->rmid_usecnt) {
-               state->rmid = 0;
-               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
-       } else {
-               WARN_ON_ONCE(!state->rmid);
-       }
-}
-
-static int intel_cqm_event_add(struct perf_event *event, int mode)
-{
-       unsigned long flags;
-       u32 rmid;
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-
-       event->hw.cqm_state = PERF_HES_STOPPED;
-       rmid = event->hw.cqm_rmid;
-
-       if (__rmid_valid(rmid) && (mode & PERF_EF_START))
-               intel_cqm_event_start(event, mode);
-
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-
-       return 0;
-}
-
-static void intel_cqm_event_destroy(struct perf_event *event)
-{
-       struct perf_event *group_other = NULL;
-
-       mutex_lock(&cache_mutex);
-
-       /*
-        * If there's another event in this group...
-        */
-       if (!list_empty(&event->hw.cqm_group_entry)) {
-               group_other = list_first_entry(&event->hw.cqm_group_entry,
-                                              struct perf_event,
-                                              hw.cqm_group_entry);
-               list_del(&event->hw.cqm_group_entry);
-       }
-
-       /*
-        * And we're the group leader...
-        */
-       if (cqm_group_leader(event)) {
-               /*
-                * If there was a group_other, make that leader, otherwise
-                * destroy the group and return the RMID.
-                */
-               if (group_other) {
-                       list_replace(&event->hw.cqm_groups_entry,
-                                    &group_other->hw.cqm_groups_entry);
-               } else {
-                       u32 rmid = event->hw.cqm_rmid;
-
-                       if (__rmid_valid(rmid))
-                               __put_rmid(rmid);
-                       list_del(&event->hw.cqm_groups_entry);
-               }
-       }
-
-       mutex_unlock(&cache_mutex);
-}
-
-static int intel_cqm_event_init(struct perf_event *event)
-{
-       struct perf_event *group = NULL;
-       bool rotate = false;
-
-       if (event->attr.type != intel_cqm_pmu.type)
-               return -ENOENT;
-
-       if (event->attr.config & ~QOS_EVENT_MASK)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       INIT_LIST_HEAD(&event->hw.cqm_group_entry);
-       INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
-
-       event->destroy = intel_cqm_event_destroy;
-
-       mutex_lock(&cache_mutex);
-
-       /* Will also set rmid */
-       intel_cqm_setup_event(event, &group);
-
-       if (group) {
-               list_add_tail(&event->hw.cqm_group_entry,
-                             &group->hw.cqm_group_entry);
-       } else {
-               list_add_tail(&event->hw.cqm_groups_entry,
-                             &cache_groups);
-
-               /*
-                * All RMIDs are either in use or have recently been
-                * used. Kick the rotation worker to clean/free some.
-                *
-                * We only do this for the group leader, rather than for
-                * every event in a group to save on needless work.
-                */
-               if (!__rmid_valid(event->hw.cqm_rmid))
-                       rotate = true;
-       }
-
-       mutex_unlock(&cache_mutex);
-
-       if (rotate)
-               schedule_delayed_work(&intel_cqm_rmid_work, 0);
-
-       return 0;
-}
-
-EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
-EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
-EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
-EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
-
-static struct attribute *intel_cqm_events_attr[] = {
-       EVENT_PTR(intel_cqm_llc),
-       EVENT_PTR(intel_cqm_llc_pkg),
-       EVENT_PTR(intel_cqm_llc_unit),
-       EVENT_PTR(intel_cqm_llc_scale),
-       EVENT_PTR(intel_cqm_llc_snapshot),
-       NULL,
-};
-
-static struct attribute_group intel_cqm_events_group = {
-       .name = "events",
-       .attrs = intel_cqm_events_attr,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7");
-static struct attribute *intel_cqm_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group intel_cqm_format_group = {
-       .name = "format",
-       .attrs = intel_cqm_formats_attr,
-};
-
-static ssize_t
-max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
-                          char *page)
-{
-       ssize_t rv;
-
-       mutex_lock(&cache_mutex);
-       rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
-       mutex_unlock(&cache_mutex);
-
-       return rv;
-}
-
-static ssize_t
-max_recycle_threshold_store(struct device *dev,
-                           struct device_attribute *attr,
-                           const char *buf, size_t count)
-{
-       unsigned int bytes, cachelines;
-       int ret;
-
-       ret = kstrtouint(buf, 0, &bytes);
-       if (ret)
-               return ret;
-
-       mutex_lock(&cache_mutex);
-
-       __intel_cqm_max_threshold = bytes;
-       cachelines = bytes / cqm_l3_scale;
-
-       /*
-        * The new maximum takes effect immediately.
-        */
-       if (__intel_cqm_threshold > cachelines)
-               __intel_cqm_threshold = cachelines;
-
-       mutex_unlock(&cache_mutex);
-
-       return count;
-}
-
-static DEVICE_ATTR_RW(max_recycle_threshold);
-
-static struct attribute *intel_cqm_attrs[] = {
-       &dev_attr_max_recycle_threshold.attr,
-       NULL,
-};
-
-static const struct attribute_group intel_cqm_group = {
-       .attrs = intel_cqm_attrs,
-};
-
-static const struct attribute_group *intel_cqm_attr_groups[] = {
-       &intel_cqm_events_group,
-       &intel_cqm_format_group,
-       &intel_cqm_group,
-       NULL,
-};
-
-static struct pmu intel_cqm_pmu = {
-       .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
-       .attr_groups         = intel_cqm_attr_groups,
-       .task_ctx_nr         = perf_sw_context,
-       .event_init          = intel_cqm_event_init,
-       .add                 = intel_cqm_event_add,
-       .del                 = intel_cqm_event_stop,
-       .start               = intel_cqm_event_start,
-       .stop                = intel_cqm_event_stop,
-       .read                = intel_cqm_event_read,
-       .count               = intel_cqm_event_count,
-};
-
-static inline void cqm_pick_event_reader(int cpu)
-{
-       int phys_id = topology_physical_package_id(cpu);
-       int i;
-
-       for_each_cpu(i, &cqm_cpumask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return; /* already got reader for this socket */
-       }
-
-       cpumask_set_cpu(cpu, &cqm_cpumask);
-}
-
-static void intel_cqm_cpu_starting(unsigned int cpu)
-{
-       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-       state->rmid = 0;
-       state->closid = 0;
-       state->rmid_usecnt = 0;
-
-       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
-       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
-}
-
-static void intel_cqm_cpu_exit(unsigned int cpu)
-{
-       int phys_id = topology_physical_package_id(cpu);
-       int i;
-
-       /*
-        * Is @cpu a designated cqm reader?
-        */
-       if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
-               return;
-
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-
-               if (phys_id == topology_physical_package_id(i)) {
-                       cpumask_set_cpu(i, &cqm_cpumask);
-                       break;
-               }
-       }
-}
-
-static int intel_cqm_cpu_notifier(struct notifier_block *nb,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu  = (unsigned long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               intel_cqm_cpu_exit(cpu);
-               break;
-       case CPU_STARTING:
-               intel_cqm_cpu_starting(cpu);
-               cqm_pick_event_reader(cpu);
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static const struct x86_cpu_id intel_cqm_match[] = {
-       { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
-       {}
-};
-
-static int __init intel_cqm_init(void)
-{
-       char *str, scale[20];
-       int i, cpu, ret;
-
-       if (!x86_match_cpu(intel_cqm_match))
-               return -ENODEV;
-
-       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
-
-       /*
-        * It's possible that not all resources support the same number
-        * of RMIDs. Instead of making scheduling much more complicated
-        * (where we have to match a task's RMID to a cpu that supports
-        * that many RMIDs), just find the minimum number of RMIDs
-        * supported across all cpus.
-        *
-        * Also, check that the scales match on all cpus.
-        */
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-               if (c->x86_cache_max_rmid < cqm_max_rmid)
-                       cqm_max_rmid = c->x86_cache_max_rmid;
-
-               if (c->x86_cache_occ_scale != cqm_l3_scale) {
-                       pr_err("Multiple LLC scale values, disabling\n");
-                       ret = -EINVAL;
-                       goto out;
-               }
-       }
-
-       /*
-        * A reasonable upper limit on the max threshold is the number
-        * of lines tagged per RMID if all RMIDs have the same number of
-        * lines tagged in the LLC.
-        *
-        * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
-        */
-       __intel_cqm_max_threshold =
-               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
-
-       snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
-       str = kstrdup(scale, GFP_KERNEL);
-       if (!str) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       event_attr_intel_cqm_llc_scale.event_str = str;
-
-       ret = intel_cqm_setup_rmid_cache();
-       if (ret)
-               goto out;
-
-       for_each_online_cpu(i) {
-               intel_cqm_cpu_starting(i);
-               cqm_pick_event_reader(i);
-       }
-
-       __perf_cpu_notifier(intel_cqm_cpu_notifier);
-
-       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-       if (ret)
-               pr_err("Intel CQM perf registration failed: %d\n", ret);
-       else
-               pr_info("Intel CQM monitoring enabled\n");
-
-out:
-       cpu_notifier_register_done();
-
-       return ret;
-}
-device_initcall(intel_cqm_init);
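For reference, the llc_occupancy event removed above is consumed from userspace through
the standard dynamic-PMU interface, e.g. perf stat -e intel_cqm/llc_occupancy/ -p <pid>,
or directly with perf_event_open(). Below is a minimal sketch of the latter, assuming the
usual sysfs layout under /sys/bus/event_source/devices/intel_cqm and the event=0x01
encoding defined by the attributes above; the program is illustrative only and not part
of this patch.

/* Sketch: read LLC occupancy for the current task via the intel_cqm PMU. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr = { 0 };
	unsigned int type;
	uint64_t count;
	FILE *f;
	int fd;

	/* Dynamic PMU type, exported once intel_cqm_init() has registered the PMU. */
	f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1)
		return 1;
	fclose(f);

	attr.type   = type;
	attr.size   = sizeof(attr);
	attr.config = 0x01;	/* llc_occupancy, per the event attribute above */

	/* Task event: pid = 0 (self), cpu = -1, i.e. the event->cpu == -1 path. */
	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);		/* let the task run and touch memory */

	if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count)) {
		/* Raw units; multiply by events/llc_occupancy.scale for bytes. */
		printf("llc_occupancy: %llu\n", (unsigned long long)count);
	}

	close(fd);
	return 0;
}

On the kernel side, a read of this file descriptor is serviced by intel_cqm_event_count()
for task events and by intel_cqm_event_read() for system-wide ones, as in the removed
code above.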
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c
deleted file mode 100644 (file)
index 75a38b5..0000000
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * perf_event_intel_cstate.c: support cstate residency counters
- *
- * Copyright (C) 2015, Intel Corp.
- * Author: Kan Liang (kan.liang@intel.com)
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- */
-
-/*
- * This file exports cstate-related free-running (read-only) counters
- * for perf. These counters may be used simultaneously by other tools,
- * such as turbostat. However, it still makes sense to implement them
- * in perf, because we can conveniently collect them together with
- * other events, and tools can use them without special MSR access
- * code.
- *
- * The events only support system-wide mode counting. There is no
- * sampling support because it is not supported by the hardware.
- *
- * According to counters' scope and category, two PMUs are registered
- * with the perf_event core subsystem.
- *  - 'cstate_core': The counter is available for each physical core.
- *    The counters include CORE_C*_RESIDENCY.
- *  - 'cstate_pkg': The counter is available for each physical package.
- *    The counters include PKG_C*_RESIDENCY.
- *
- * All of these counters are specified in the Intel® 64 and IA-32
- * Architectures Software Developer's Manual, Vol. 3B.
- *
- * Model specific counters:
- *     MSR_CORE_C1_RES: CORE C1 Residency Counter
- *                      perf code: 0x00
- *                      Available model: SLM,AMT
- *                      Scope: Core (each processor core has a MSR)
- *     MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
- *                            perf code: 0x01
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
- *                            perf code: 0x02
- *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
- *                            perf code: 0x03
- *                            Available model: SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
- *                            perf code: 0x00
- *                            Available model: SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
- *                            perf code: 0x01
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
- *                            perf code: 0x02
- *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
- *                            perf code: 0x03
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
- *                            perf code: 0x04
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *     MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
- *                            perf code: 0x05
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *     MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
- *                            perf code: 0x06
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format)                \
-static ssize_t __cstate_##_var##_show(struct kobject *kobj,    \
-                               struct kobj_attribute *attr,    \
-                               char *page)                     \
-{                                                              \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
-       return sprintf(page, _format "\n");                     \
-}                                                              \
-static struct kobj_attribute format_attr_##_var =              \
-       __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
-                                      struct device_attribute *attr,
-                                      char *buf);
-
-struct perf_cstate_msr {
-       u64     msr;
-       struct  perf_pmu_events_attr *attr;
-       bool    (*test)(int idx);
-};
-
-
-/* cstate_core PMU */
-
-static struct pmu cstate_core_pmu;
-static bool has_cstate_core;
-
-enum perf_cstate_core_id {
-       /*
-        * cstate_core events
-        */
-       PERF_CSTATE_CORE_C1_RES = 0,
-       PERF_CSTATE_CORE_C3_RES,
-       PERF_CSTATE_CORE_C6_RES,
-       PERF_CSTATE_CORE_C7_RES,
-
-       PERF_CSTATE_CORE_EVENT_MAX,
-};
-
-bool test_core(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES ||
-                   idx == PERF_CSTATE_CORE_C7_RES)
-                       return true;
-               break;
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_CSTATE_CORE_C1_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
-PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
-PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
-PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
-
-static struct perf_cstate_msr core_msr[] = {
-       [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES,          &evattr_cstate_core_c1, test_core, },
-       [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY,    &evattr_cstate_core_c3, test_core, },
-       [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY,    &evattr_cstate_core_c6, test_core, },
-       [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY,    &evattr_cstate_core_c7, test_core, },
-};
-
-static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group core_events_attr_group = {
-       .name = "events",
-       .attrs = core_events_attrs,
-};
-
-DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
-static struct attribute *core_format_attrs[] = {
-       &format_attr_core_event.attr,
-       NULL,
-};
-
-static struct attribute_group core_format_attr_group = {
-       .name = "format",
-       .attrs = core_format_attrs,
-};
-
-static cpumask_t cstate_core_cpu_mask;
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
-
-static struct attribute *cstate_cpumask_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
-       .attrs = cstate_cpumask_attrs,
-};
-
-static const struct attribute_group *core_attr_groups[] = {
-       &core_events_attr_group,
-       &core_format_attr_group,
-       &cpumask_attr_group,
-       NULL,
-};
-
-/* cstate_core PMU end */
-
-
-/* cstate_pkg PMU */
-
-static struct pmu cstate_pkg_pmu;
-static bool has_cstate_pkg;
-
-enum perf_cstate_pkg_id {
-       /*
-        * cstate_pkg events
-        */
-       PERF_CSTATE_PKG_C2_RES = 0,
-       PERF_CSTATE_PKG_C3_RES,
-       PERF_CSTATE_PKG_C6_RES,
-       PERF_CSTATE_PKG_C7_RES,
-       PERF_CSTATE_PKG_C8_RES,
-       PERF_CSTATE_PKG_C9_RES,
-       PERF_CSTATE_PKG_C10_RES,
-
-       PERF_CSTATE_PKG_EVENT_MAX,
-};
-
-bool test_pkg(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               if (idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES)
-                       return true;
-               break;
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_CSTATE_PKG_C2_RES ||
-                   idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES)
-                       return true;
-               break;
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_CSTATE_PKG_C6_RES)
-                       return true;
-               break;
-       case 69: /* 22nm Haswell ULT */
-               if (idx == PERF_CSTATE_PKG_C2_RES ||
-                   idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES ||
-                   idx == PERF_CSTATE_PKG_C8_RES ||
-                   idx == PERF_CSTATE_PKG_C9_RES ||
-                   idx == PERF_CSTATE_PKG_C10_RES)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
-PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
-PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
-PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
-PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
-PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
-PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
-
-static struct perf_cstate_msr pkg_msr[] = {
-       [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY,      &evattr_cstate_pkg_c2,  test_pkg, },
-       [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY,      &evattr_cstate_pkg_c3,  test_pkg, },
-       [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY,      &evattr_cstate_pkg_c6,  test_pkg, },
-       [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY,      &evattr_cstate_pkg_c7,  test_pkg, },
-       [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY,      &evattr_cstate_pkg_c8,  test_pkg, },
-       [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY,      &evattr_cstate_pkg_c9,  test_pkg, },
-       [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY,    &evattr_cstate_pkg_c10, test_pkg, },
-};
-
-static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group pkg_events_attr_group = {
-       .name = "events",
-       .attrs = pkg_events_attrs,
-};
-
-DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
-static struct attribute *pkg_format_attrs[] = {
-       &format_attr_pkg_event.attr,
-       NULL,
-};
-static struct attribute_group pkg_format_attr_group = {
-       .name = "format",
-       .attrs = pkg_format_attrs,
-};
-
-static cpumask_t cstate_pkg_cpu_mask;
-
-static const struct attribute_group *pkg_attr_groups[] = {
-       &pkg_events_attr_group,
-       &pkg_format_attr_group,
-       &cpumask_attr_group,
-       NULL,
-};
-
-/* cstate_pkg PMU end */
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
-                                      struct device_attribute *attr,
-                                      char *buf)
-{
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       if (pmu == &cstate_core_pmu)
-               return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
-       else if (pmu == &cstate_pkg_pmu)
-               return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
-       else
-               return 0;
-}
-
-static int cstate_pmu_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config;
-       int ret = 0;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       if (event->pmu == &cstate_core_pmu) {
-               if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
-                       return -EINVAL;
-               if (!core_msr[cfg].attr)
-                       return -EINVAL;
-               event->hw.event_base = core_msr[cfg].msr;
-       } else if (event->pmu == &cstate_pkg_pmu) {
-               if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
-                       return -EINVAL;
-               if (!pkg_msr[cfg].attr)
-                       return -EINVAL;
-               event->hw.event_base = pkg_msr[cfg].msr;
-       } else
-               return -ENOENT;
-
-       /* must be done before validate_group */
-       event->hw.config = cfg;
-       event->hw.idx = -1;
-
-       return ret;
-}
-
-static inline u64 cstate_pmu_read_counter(struct perf_event *event)
-{
-       u64 val;
-
-       rdmsrl(event->hw.event_base, val);
-       return val;
-}
-
-static void cstate_pmu_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev_raw_count, new_raw_count;
-
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       new_raw_count = cstate_pmu_read_counter(event);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                           new_raw_count) != prev_raw_count)
-               goto again;
-
-       local64_add(new_raw_count - prev_raw_count, &event->count);
-}
-
-static void cstate_pmu_event_start(struct perf_event *event, int mode)
-{
-       local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
-}
-
-static void cstate_pmu_event_stop(struct perf_event *event, int mode)
-{
-       cstate_pmu_event_update(event);
-}
-
-static void cstate_pmu_event_del(struct perf_event *event, int mode)
-{
-       cstate_pmu_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int cstate_pmu_event_add(struct perf_event *event, int mode)
-{
-       if (mode & PERF_EF_START)
-               cstate_pmu_event_start(event, mode);
-
-       return 0;
-}
-
-static void cstate_cpu_exit(int cpu)
-{
-       int i, id, target;
-
-       /* cpu exit for cstate core */
-       if (has_cstate_core) {
-               id = topology_core_id(cpu);
-               target = -1;
-
-               for_each_online_cpu(i) {
-                       if (i == cpu)
-                               continue;
-                       if (id == topology_core_id(i)) {
-                               target = i;
-                               break;
-                       }
-               }
-               if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
-                       cpumask_set_cpu(target, &cstate_core_cpu_mask);
-               WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
-               if (target >= 0)
-                       perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
-       }
-
-       /* cpu exit for cstate pkg */
-       if (has_cstate_pkg) {
-               id = topology_physical_package_id(cpu);
-               target = -1;
-
-               for_each_online_cpu(i) {
-                       if (i == cpu)
-                               continue;
-                       if (id == topology_physical_package_id(i)) {
-                               target = i;
-                               break;
-                       }
-               }
-               if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
-                       cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
-               WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
-               if (target >= 0)
-                       perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
-       }
-}
-
-static void cstate_cpu_init(int cpu)
-{
-       int i, id;
-
-       /* cpu init for cstate core */
-       if (has_cstate_core) {
-               id = topology_core_id(cpu);
-               for_each_cpu(i, &cstate_core_cpu_mask) {
-                       if (id == topology_core_id(i))
-                               break;
-               }
-               if (i >= nr_cpu_ids)
-                       cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
-       }
-
-       /* cpu init for cstate pkg */
-       if (has_cstate_pkg) {
-               id = topology_physical_package_id(cpu);
-               for_each_cpu(i, &cstate_pkg_cpu_mask) {
-                       if (id == topology_physical_package_id(i))
-                               break;
-               }
-               if (i >= nr_cpu_ids)
-                       cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
-       }
-}
-
-static int cstate_cpu_notifier(struct notifier_block *self,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               break;
-       case CPU_STARTING:
-               cstate_cpu_init(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               break;
-       case CPU_DOWN_PREPARE:
-               cstate_cpu_exit(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-/*
- * Probe the cstate events and insert the available ones into the sysfs attrs.
- * Return false if there are no available events.
- */
-static bool cstate_probe_msr(struct perf_cstate_msr *msr,
-                            struct attribute   **events_attrs,
-                            int max_event_nr)
-{
-       int i, j = 0;
-       u64 val;
-
-       /* Probe the cstate events. */
-       for (i = 0; i < max_event_nr; i++) {
-               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
-                       msr[i].attr = NULL;
-       }
-
-       /* List remaining events in the sysfs attrs. */
-       for (i = 0; i < max_event_nr; i++) {
-               if (msr[i].attr)
-                       events_attrs[j++] = &msr[i].attr->attr.attr;
-       }
-       events_attrs[j] = NULL;
-
-       return j > 0;
-}
-
-static int __init cstate_init(void)
-{
-       /* SLM has different MSR for PKG C6 */
-       switch (boot_cpu_data.x86_model) {
-       case 55:
-       case 76:
-       case 77:
-               pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
-       }
-
-       if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
-               has_cstate_core = true;
-
-       if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
-               has_cstate_pkg = true;
-
-       return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
-}
-
-static void __init cstate_cpumask_init(void)
-{
-       int cpu;
-
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu)
-               cstate_cpu_init(cpu);
-
-       __perf_cpu_notifier(cstate_cpu_notifier);
-
-       cpu_notifier_register_done();
-}
-
-static struct pmu cstate_core_pmu = {
-       .attr_groups    = core_attr_groups,
-       .name           = "cstate_core",
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = cstate_pmu_event_init,
-       .add            = cstate_pmu_event_add, /* must have */
-       .del            = cstate_pmu_event_del, /* must have */
-       .start          = cstate_pmu_event_start,
-       .stop           = cstate_pmu_event_stop,
-       .read           = cstate_pmu_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static struct pmu cstate_pkg_pmu = {
-       .attr_groups    = pkg_attr_groups,
-       .name           = "cstate_pkg",
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = cstate_pmu_event_init,
-       .add            = cstate_pmu_event_add, /* must have */
-       .del            = cstate_pmu_event_del, /* must have */
-       .start          = cstate_pmu_event_start,
-       .stop           = cstate_pmu_event_stop,
-       .read           = cstate_pmu_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static void __init cstate_pmus_register(void)
-{
-       int err;
-
-       if (has_cstate_core) {
-               err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
-               if (WARN_ON(err))
-                       pr_info("Failed to register PMU %s error %d\n",
-                               cstate_core_pmu.name, err);
-       }
-
-       if (has_cstate_pkg) {
-               err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
-               if (WARN_ON(err))
-                       pr_info("Failed to register PMU %s error %d\n",
-                               cstate_pkg_pmu.name, err);
-       }
-}
-
-static int __init cstate_pmu_init(void)
-{
-       int err;
-
-       if (cpu_has_hypervisor)
-               return -ENODEV;
-
-       err = cstate_init();
-       if (err)
-               return err;
-
-       cstate_cpumask_init();
-
-       cstate_pmus_register();
-
-       return 0;
-}
-
-device_initcall(cstate_pmu_init);
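The header comment above motivates putting these free-running counters behind perf so
that tools do not need special MSR access code. For contrast, this is roughly what the
raw route looks like through the msr driver (/dev/cpu/N/msr), which is how a tool such
as turbostat reads the same counters; the 0x3fd encoding of MSR_CORE_C6_RESIDENCY is
assumed here (see asm/msr-index.h), and the program is illustrative, not part of this
patch.

/* Sketch: core C6 residency delta for CPU 0, read directly via the msr driver. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_CORE_C6_RESIDENCY	0x3fd	/* assumed value, see asm/msr-index.h */

int main(void)
{
	uint64_t before, after;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* needs msr.ko and root */

	if (fd < 0)
		return 1;

	/*
	 * Free-running counter: residency is the delta between two reads,
	 * the same prev_count/new_count scheme cstate_pmu_event_update() uses.
	 */
	if (pread(fd, &before, sizeof(before), MSR_CORE_C6_RESIDENCY) != (ssize_t)sizeof(before))
		return 1;
	sleep(1);
	if (pread(fd, &after, sizeof(after), MSR_CORE_C6_RESIDENCY) != (ssize_t)sizeof(after))
		return 1;

	printf("core C6 residency delta: %llu\n", (unsigned long long)(after - before));
	close(fd);
	return 0;
}

With the removed driver in place, roughly the same number is available without any
privileged MSR access as: perf stat -e cstate_core/c6-residency/ -a sleep 1
(system-wide counting only, matching the comment above).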
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
deleted file mode 100644 (file)
index 10602f0..0000000
+++ /dev/null
@@ -1,1368 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE                24
-
-#define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
-#define PEBS_FIXUP_SIZE                PAGE_SIZE
-
-/*
- * pebs_record_32 for p4 and core not supported
-
-struct pebs_record_32 {
-       u32 flags, ip;
-       u32 ax, bc, cx, dx;
-       u32 si, di, bp, sp;
-};
-
- */
-
-union intel_x86_pebs_dse {
-       u64 val;
-       struct {
-               unsigned int ld_dse:4;
-               unsigned int ld_stlb_miss:1;
-               unsigned int ld_locked:1;
-               unsigned int ld_reserved:26;
-       };
-       struct {
-               unsigned int st_l1d_hit:1;
-               unsigned int st_reserved1:3;
-               unsigned int st_stlb_miss:1;
-               unsigned int st_locked:1;
-               unsigned int st_reserved2:26;
-       };
-};
-
-
-/*
- * Map PEBS Load Latency Data Source encodings to generic
- * memory data source information
- */
-#define P(a, b) PERF_MEM_S(a, b)
-#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
-#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-
-static const u64 pebs_data_source[] = {
-       P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
-       OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
-       OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
-       OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
-       OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
-       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
-       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
-       OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
-       OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
-       OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
-       OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
-       OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
-       OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
-};
-
-static u64 precise_store_data(u64 status)
-{
-       union intel_x86_pebs_dse dse;
-       u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
-
-       dse.val = status;
-
-       /*
-        * bit 4: TLB access
-        * 1 = the store missed the 2nd level TLB,
-        *     so it either hit the walker or the OS;
-        * otherwise it hit the 2nd level TLB
-        */
-       if (dse.st_stlb_miss)
-               val |= P(TLB, MISS);
-       else
-               val |= P(TLB, HIT);
-
-       /*
-        * bit 0: hit L1 data cache
-        * if not set, then all we know is that
-        * it missed L1D
-        */
-       if (dse.st_l1d_hit)
-               val |= P(LVL, HIT);
-       else
-               val |= P(LVL, MISS);
-
-       /*
-        * bit 5: Locked prefix
-        */
-       if (dse.st_locked)
-               val |= P(LOCK, LOCKED);
-
-       return val;
-}
-
-static u64 precise_datala_hsw(struct perf_event *event, u64 status)
-{
-       union perf_mem_data_src dse;
-
-       dse.val = PERF_MEM_NA;
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
-               dse.mem_op = PERF_MEM_OP_STORE;
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
-               dse.mem_op = PERF_MEM_OP_LOAD;
-
-       /*
-        * L1 info only valid for following events:
-        *
-        * MEM_UOPS_RETIRED.STLB_MISS_STORES
-        * MEM_UOPS_RETIRED.LOCK_STORES
-        * MEM_UOPS_RETIRED.SPLIT_STORES
-        * MEM_UOPS_RETIRED.ALL_STORES
-        */
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
-               if (status & 1)
-                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
-               else
-                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
-       }
-       return dse.val;
-}
-
-static u64 load_latency_data(u64 status)
-{
-       union intel_x86_pebs_dse dse;
-       u64 val;
-       int model = boot_cpu_data.x86_model;
-       int fam = boot_cpu_data.x86;
-
-       dse.val = status;
-
-       /*
-        * use the mapping table for bits 0-3
-        */
-       val = pebs_data_source[dse.ld_dse];
-
-       /*
-        * Nehalem models do not provide TLB or lock info
-        */
-       if (fam == 0x6 && (model == 26 || model == 30
-           || model == 31 || model == 46)) {
-               val |= P(TLB, NA) | P(LOCK, NA);
-               return val;
-       }
-       /*
-        * bit 4: TLB access
-        * 0 = did not miss 2nd level TLB
-        * 1 = missed 2nd level TLB
-        */
-       if (dse.ld_stlb_miss)
-               val |= P(TLB, MISS) | P(TLB, L2);
-       else
-               val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
-
-       /*
-        * bit 5: locked prefix
-        */
-       if (dse.ld_locked)
-               val |= P(LOCK, LOCKED);
-
-       return val;
-}
-
-struct pebs_record_core {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-};
-
-struct pebs_record_nhm {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-};
-
-/*
- * Same as pebs_record_nhm, with two additional fields.
- */
-struct pebs_record_hsw {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-       u64 real_ip, tsx_tuning;
-};
-
-union hsw_tsx_tuning {
-       struct {
-               u32 cycles_last_block     : 32,
-                   hle_abort             : 1,
-                   rtm_abort             : 1,
-                   instruction_abort     : 1,
-                   non_instruction_abort : 1,
-                   retry                 : 1,
-                   data_conflict         : 1,
-                   capacity_writes       : 1,
-                   capacity_reads        : 1;
-       };
-       u64         value;
-};
-
-#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
-
-/* Same as HSW, plus TSC */
-
-struct pebs_record_skl {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-       u64 real_ip, tsx_tuning;
-       u64 tsc;
-};
-
-void init_debug_store_on_cpu(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-                    (u32)((u64)(unsigned long)ds),
-                    (u32)((u64)(unsigned long)ds >> 32));
-}
-
-void fini_debug_store_on_cpu(int cpu)
-{
-       if (!per_cpu(cpu_hw_events, cpu).ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static DEFINE_PER_CPU(void *, insn_buffer);
-
-static int alloc_pebs_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       int node = cpu_to_node(cpu);
-       int max;
-       void *buffer, *ibuffer;
-
-       if (!x86_pmu.pebs)
-               return 0;
-
-       buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
-       if (unlikely(!buffer))
-               return -ENOMEM;
-
-       /*
-        * HSW+ already provides us the eventing ip; no need to allocate this
-        * buffer then.
-        */
-       if (x86_pmu.intel_cap.pebs_format < 2) {
-               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
-               if (!ibuffer) {
-                       kfree(buffer);
-                       return -ENOMEM;
-               }
-               per_cpu(insn_buffer, cpu) = ibuffer;
-       }
-
-       max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
-       ds->pebs_buffer_base = (u64)(unsigned long)buffer;
-       ds->pebs_index = ds->pebs_buffer_base;
-       ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-               max * x86_pmu.pebs_record_size;
-
-       return 0;
-}
-
-static void release_pebs_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds || !x86_pmu.pebs)
-               return;
-
-       kfree(per_cpu(insn_buffer, cpu));
-       per_cpu(insn_buffer, cpu) = NULL;
-
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
-       ds->pebs_buffer_base = 0;
-}
-
-static int alloc_bts_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       int node = cpu_to_node(cpu);
-       int max, thresh;
-       void *buffer;
-
-       if (!x86_pmu.bts)
-               return 0;
-
-       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
-       if (unlikely(!buffer)) {
-               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
-               return -ENOMEM;
-       }
-
-       max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-       thresh = max / 16;
-
-       ds->bts_buffer_base = (u64)(unsigned long)buffer;
-       ds->bts_index = ds->bts_buffer_base;
-       ds->bts_absolute_maximum = ds->bts_buffer_base +
-               max * BTS_RECORD_SIZE;
-       ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-               thresh * BTS_RECORD_SIZE;
-
-       return 0;
-}
-
-static void release_bts_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds || !x86_pmu.bts)
-               return;
-
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
-       ds->bts_buffer_base = 0;
-}
-
-static int alloc_ds_buffer(int cpu)
-{
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
-
-       per_cpu(cpu_hw_events, cpu).ds = ds;
-
-       return 0;
-}
-
-static void release_ds_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds)
-               return;
-
-       per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
-}
-
-void release_ds_buffers(void)
-{
-       int cpu;
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu)
-               fini_debug_store_on_cpu(cpu);
-
-       for_each_possible_cpu(cpu) {
-               release_pebs_buffer(cpu);
-               release_bts_buffer(cpu);
-               release_ds_buffer(cpu);
-       }
-       put_online_cpus();
-}
-
-void reserve_ds_buffers(void)
-{
-       int bts_err = 0, pebs_err = 0;
-       int cpu;
-
-       x86_pmu.bts_active = 0;
-       x86_pmu.pebs_active = 0;
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       if (!x86_pmu.bts)
-               bts_err = 1;
-
-       if (!x86_pmu.pebs)
-               pebs_err = 1;
-
-       get_online_cpus();
-
-       for_each_possible_cpu(cpu) {
-               if (alloc_ds_buffer(cpu)) {
-                       bts_err = 1;
-                       pebs_err = 1;
-               }
-
-               if (!bts_err && alloc_bts_buffer(cpu))
-                       bts_err = 1;
-
-               if (!pebs_err && alloc_pebs_buffer(cpu))
-                       pebs_err = 1;
-
-               if (bts_err && pebs_err)
-                       break;
-       }
-
-       if (bts_err) {
-               for_each_possible_cpu(cpu)
-                       release_bts_buffer(cpu);
-       }
-
-       if (pebs_err) {
-               for_each_possible_cpu(cpu)
-                       release_pebs_buffer(cpu);
-       }
-
-       if (bts_err && pebs_err) {
-               for_each_possible_cpu(cpu)
-                       release_ds_buffer(cpu);
-       } else {
-               if (x86_pmu.bts && !bts_err)
-                       x86_pmu.bts_active = 1;
-
-               if (x86_pmu.pebs && !pebs_err)
-                       x86_pmu.pebs_active = 1;
-
-               for_each_online_cpu(cpu)
-                       init_debug_store_on_cpu(cpu);
-       }
-
-       put_online_cpus();
-}
-
-/*
- * BTS
- */
-
-struct event_constraint bts_constraint =
-       EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
-
-void intel_pmu_enable_bts(u64 config)
-{
-       unsigned long debugctlmsr;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr |= DEBUGCTLMSR_TR;
-       debugctlmsr |= DEBUGCTLMSR_BTS;
-       if (config & ARCH_PERFMON_EVENTSEL_INT)
-               debugctlmsr |= DEBUGCTLMSR_BTINT;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-void intel_pmu_disable_bts(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       unsigned long debugctlmsr;
-
-       if (!cpuc->ds)
-               return;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr &=
-               ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
-                 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-int intel_pmu_drain_bts_buffer(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct bts_record {
-               u64     from;
-               u64     to;
-               u64     flags;
-       };
-       struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-       struct bts_record *at, *base, *top;
-       struct perf_output_handle handle;
-       struct perf_event_header header;
-       struct perf_sample_data data;
-       unsigned long skip = 0;
-       struct pt_regs regs;
-
-       if (!event)
-               return 0;
-
-       if (!x86_pmu.bts_active)
-               return 0;
-
-       base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-       top  = (struct bts_record *)(unsigned long)ds->bts_index;
-
-       if (top <= base)
-               return 0;
-
-       memset(&regs, 0, sizeof(regs));
-
-       ds->bts_index = ds->bts_buffer_base;
-
-       perf_sample_data_init(&data, 0, event->hw.last_period);
-
-       /*
-        * BTS leaks kernel addresses in branches across the cpl boundary,
-        * such as traps or system calls, so unless the user is asking for
-        * kernel tracing (and right now it's not possible), we'd need to
-        * filter them out. But first we need to count how many of those we
-        * have in the current batch. This is an extra O(n) pass; however,
-        * it's much faster than the other one, especially considering that
-        * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see
-        * alloc_bts_buffer()).
-        */
-       for (at = base; at < top; at++) {
-               /*
-                * Note that right now *this* BTS code only works if
-                * attr::exclude_kernel is set, but let's keep this extra
-                * check here in case that changes.
-                */
-               if (event->attr.exclude_kernel &&
-                   (kernel_ip(at->from) || kernel_ip(at->to)))
-                       skip++;
-       }
-
-       /*
-        * Prepare a generic sample, i.e. fill in the invariant fields.
-        * We will overwrite the from and to address before we output
-        * the sample.
-        */
-       perf_prepare_sample(&header, &data, event, &regs);
-
-       if (perf_output_begin(&handle, event, header.size *
-                             (top - base - skip)))
-               return 1;
-
-       for (at = base; at < top; at++) {
-               /* Filter out any records that contain kernel addresses. */
-               if (event->attr.exclude_kernel &&
-                   (kernel_ip(at->from) || kernel_ip(at->to)))
-                       continue;
-
-               data.ip         = at->from;
-               data.addr       = at->to;
-
-               perf_output_sample(&handle, &header, &data, event);
-       }
-
-       perf_output_end(&handle);
-
-       /* There's new data available. */
-       event->hw.interrupts++;
-       event->pending_kill = POLL_IN;
-       return 1;
-}
-
-static inline void intel_pmu_drain_pebs_buffer(void)
-{
-       struct pt_regs regs;
-
-       x86_pmu.drain_pebs(&regs);
-}
-
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       if (!sched_in)
-               intel_pmu_drain_pebs_buffer();
-}
-
-/*
- * PEBS
- */
-struct event_constraint intel_core2_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_atom_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_slm_pebs_event_constraints[] = {
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_nehalem_pebs_event_constraints[] = {
-       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
-       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_westmere_pebs_event_constraints[] = {
-       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
-       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_snb_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
-       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_ivb_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
-       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_hsw_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_skl_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       if (!event->attr.precise_ip)
-               return NULL;
-
-       if (x86_pmu.pebs_constraints) {
-               for_each_event_constraint(c, x86_pmu.pebs_constraints) {
-                       if ((event->hw.config & c->cmask) == c->code) {
-                               event->hw.flags |= c->flags;
-                               return c;
-                       }
-               }
-       }
-
-       return &emptyconstraint;
-}
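A note on the lookup above: an event is matched against a table entry by masking its raw config with c->cmask and comparing the result with c->code; the first hit wins and its flags are copied into the event. Below is a minimal, self-contained sketch of that matching idea. The toy_constraint type, the table contents and toy_lookup() are illustrative stand-ins, not the kernel's actual structures or values.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's constraint table entries. */
struct toy_constraint {
        uint64_t code;  /* event/umask value this entry applies to */
        uint64_t cmask; /* which config bits take part in the comparison */
};

static const struct toy_constraint toy_table[] = {
        { 0x00c0, 0xffff }, /* match one specific event+umask pair */
        { 0x00cb, 0x00ff }, /* match the event code only, any umask */
        { 0, 0 },           /* terminator */
};

static const struct toy_constraint *toy_lookup(uint64_t config)
{
        const struct toy_constraint *c;

        for (c = toy_table; c->cmask; c++)
                if ((config & c->cmask) == c->code)
                        return c;

        return NULL; /* no match: the kernel falls back to emptyconstraint */
}

int main(void)
{
        /* 0x01cb: event 0xcb, umask 0x01 - caught by the event-only entry. */
        printf("matched entry index: %td\n", toy_lookup(0x01cb) - toy_table);
        return 0;
}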
-
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
-{
-       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
-}
-
-void intel_pmu_pebs_enable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool first_pebs;
-       u64 threshold;
-
-       hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
-
-       first_pebs = !pebs_is_enabled(cpuc);
-       cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
-               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
-               cpuc->pebs_enabled |= 1ULL << 63;
-
-       /*
-        * When the event is constrained enough we can use a larger
-        * threshold and run the event with less frequent PMI.
-        */
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
-               threshold = ds->pebs_absolute_maximum -
-                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
-               if (first_pebs)
-                       perf_sched_cb_inc(event->ctx->pmu);
-       } else {
-               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-               /*
-                * If not all events can use the larger buffer,
-                * roll back to threshold = 1
-                */
-               if (!first_pebs &&
-                   (ds->pebs_interrupt_threshold > threshold))
-                       perf_sched_cb_dec(event->ctx->pmu);
-       }
-
-       /* Use auto-reload if possible to save a MSR write in the PMI */
-       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
-               ds->pebs_event_reset[hwc->idx] =
-                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
-       }
-
-       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
-               ds->pebs_interrupt_threshold = threshold;
-}
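On the threshold arithmetic in intel_pmu_pebs_enable() above: a free-running event sets the interrupt threshold near the top of the buffer, leaving one record of head-room per possible PEBS counter, while the normal case interrupts after the very first record. A small standalone sketch of the two computations; the buffer addresses and sizes below are made up purely for illustration.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Illustrative numbers only, not real DS-area contents. */
        uint64_t buffer_base      = 0x1000; /* ds->pebs_buffer_base */
        uint64_t absolute_maximum = 0x9000; /* ds->pebs_absolute_maximum */
        uint64_t record_size      = 0x100;  /* x86_pmu.pebs_record_size */
        uint64_t max_pebs_events  = 8;      /* x86_pmu.max_pebs_events */

        /* Normal case: raise the PMI as soon as one record has been written. */
        uint64_t single = buffer_base + record_size;

        /*
         * Free-running case: raise the PMI only when the buffer is nearly
         * full, keeping room for one in-flight record per PEBS counter.
         */
        uint64_t large = absolute_maximum - max_pebs_events * record_size;

        printf("single-record threshold: %#llx\n", (unsigned long long)single);
        printf("large-buffer threshold:  %#llx\n", (unsigned long long)large);
        return 0;
}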
-
-void intel_pmu_pebs_disable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool large_pebs = ds->pebs_interrupt_threshold >
-               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-       if (large_pebs)
-               intel_pmu_drain_pebs_buffer();
-
-       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
-               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
-               cpuc->pebs_enabled &= ~(1ULL << 63);
-
-       if (large_pebs && !pebs_is_enabled(cpuc))
-               perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-
-       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-}
-
-void intel_pmu_pebs_enable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->pebs_enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-}
-
-void intel_pmu_pebs_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->pebs_enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
-static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       unsigned long from = cpuc->lbr_entries[0].from;
-       unsigned long old_to, to = cpuc->lbr_entries[0].to;
-       unsigned long ip = regs->ip;
-       int is_64bit = 0;
-       void *kaddr;
-       int size;
-
-       /*
-        * We don't need to fix up the IP if the PEBS assist is fault-like
-        */
-       if (!x86_pmu.intel_cap.pebs_trap)
-               return 1;
-
-       /*
-        * No LBR entry, no basic block, no rewinding
-        */
-       if (!cpuc->lbr_stack.nr || !from || !to)
-               return 0;
-
-       /*
-        * Basic blocks should never cross user/kernel boundaries
-        */
-       if (kernel_ip(ip) != kernel_ip(to))
-               return 0;
-
-       /*
-        * unsigned math, either ip is before the start (impossible) or
-        * the basic block is larger than 1 page (sanity)
-        */
-       if ((ip - to) > PEBS_FIXUP_SIZE)
-               return 0;
-
-       /*
-        * We sampled a branch insn, rewind using the LBR stack
-        */
-       if (ip == to) {
-               set_linear_ip(regs, from);
-               return 1;
-       }
-
-       size = ip - to;
-       if (!kernel_ip(ip)) {
-               int bytes;
-               u8 *buf = this_cpu_read(insn_buffer);
-
-               /* 'size' must fit our buffer, see above */
-               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-               if (bytes != 0)
-                       return 0;
-
-               kaddr = buf;
-       } else {
-               kaddr = (void *)to;
-       }
-
-       do {
-               struct insn insn;
-
-               old_to = to;
-
-#ifdef CONFIG_X86_64
-               is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
-#endif
-               insn_init(&insn, kaddr, size, is_64bit);
-               insn_get_length(&insn);
-               /*
-                * Make sure there was not a problem decoding the
-                * instruction and getting the length.  This is
-                * doubly important because we would otherwise
-                * loop forever if insn.length == 0.
-                */
-               if (!insn.length)
-                       break;
-
-               to += insn.length;
-               kaddr += insn.length;
-               size -= insn.length;
-       } while (to < ip);
-
-       if (to == ip) {
-               set_linear_ip(regs, old_to);
-               return 1;
-       }
-
-       /*
-        * Even though we decoded the basic block, the instruction stream
-        * never matched the given IP, either the TO or the IP got corrupted.
-        */
-       return 0;
-}
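The fixup above decodes forward from the LBR 'to' address and stops once it reaches the sampled IP; the start of the last decoded instruction (old_to) is then reported as the instruction that actually triggered the event. A toy version of that walk, using a plain array of instruction lengths in place of the kernel's instruction decoder (the lengths and offsets are invented for the example):

#include <stdio.h>

/* Toy model: lengths of the instructions in a basic block starting at 'to'. */
static const int insn_len[] = { 3, 2, 5, 1, 4 };

/* Return the offset of the instruction preceding ip_off, or -1 on mismatch. */
static int rewind_to_prev_insn(int ip_off)
{
        unsigned int i;
        int off = 0, prev = 0;

        for (i = 0; i < sizeof(insn_len) / sizeof(insn_len[0]) && off < ip_off; i++) {
                prev = off;         /* like old_to in the kernel code */
                off += insn_len[i]; /* advance by the decoded length */
        }

        /* Only trust the result if we landed exactly on the sampled IP. */
        return off == ip_off ? prev : -1;
}

int main(void)
{
        /* The PEBS IP points at offset 10 (3 + 2 + 5); the real IP is offset 5. */
        printf("previous insn starts at offset %d\n", rewind_to_prev_insn(10));
        return 0;
}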
-
-static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
-{
-       if (pebs->tsx_tuning) {
-               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
-               return tsx.cycles_last_block;
-       }
-       return 0;
-}
-
-static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
-{
-       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
-
-       /* For RTM XABORTs also log the abort code from AX */
-       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
-               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
-       return txn;
-}
-
-static void setup_pebs_sample_data(struct perf_event *event,
-                                  struct pt_regs *iregs, void *__pebs,
-                                  struct perf_sample_data *data,
-                                  struct pt_regs *regs)
-{
-#define PERF_X86_EVENT_PEBS_HSW_PREC \
-               (PERF_X86_EVENT_PEBS_ST_HSW | \
-                PERF_X86_EVENT_PEBS_LD_HSW | \
-                PERF_X86_EVENT_PEBS_NA_HSW)
-       /*
-        * We cast to the biggest pebs_record but are careful not to
-        * unconditionally access the 'extra' entries.
-        */
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct pebs_record_skl *pebs = __pebs;
-       u64 sample_type;
-       int fll, fst, dsrc;
-       int fl = event->hw.flags;
-
-       if (pebs == NULL)
-               return;
-
-       sample_type = event->attr.sample_type;
-       dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
-
-       fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
-       fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
-
-       perf_sample_data_init(data, 0, event->hw.last_period);
-
-       data->period = event->hw.last_period;
-
-       /*
-        * Use latency for weight (only avail with PEBS-LL)
-        */
-       if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-               data->weight = pebs->lat;
-
-       /*
-        * data.data_src encodes the data source
-        */
-       if (dsrc) {
-               u64 val = PERF_MEM_NA;
-               if (fll)
-                       val = load_latency_data(pebs->dse);
-               else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
-                       val = precise_datala_hsw(event, pebs->dse);
-               else if (fst)
-                       val = precise_store_data(pebs->dse);
-               data->data_src.val = val;
-       }
-
-       /*
-        * We use the interrupt regs as a base because the PEBS record
-        * does not contain a full regs set, specifically it seems to
-        * lack segment descriptors, which get used by things like
-        * user_mode().
-        *
-        * In the simple case fix up only the IP and BP,SP regs, for
-        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-        */
-       *regs = *iregs;
-       regs->flags = pebs->flags;
-       set_linear_ip(regs, pebs->ip);
-       regs->bp = pebs->bp;
-       regs->sp = pebs->sp;
-
-       if (sample_type & PERF_SAMPLE_REGS_INTR) {
-               regs->ax = pebs->ax;
-               regs->bx = pebs->bx;
-               regs->cx = pebs->cx;
-               regs->dx = pebs->dx;
-               regs->si = pebs->si;
-               regs->di = pebs->di;
-               regs->bp = pebs->bp;
-               regs->sp = pebs->sp;
-
-               regs->flags = pebs->flags;
-#ifndef CONFIG_X86_32
-               regs->r8 = pebs->r8;
-               regs->r9 = pebs->r9;
-               regs->r10 = pebs->r10;
-               regs->r11 = pebs->r11;
-               regs->r12 = pebs->r12;
-               regs->r13 = pebs->r13;
-               regs->r14 = pebs->r14;
-               regs->r15 = pebs->r15;
-#endif
-       }
-
-       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-               regs->ip = pebs->real_ip;
-               regs->flags |= PERF_EFLAGS_EXACT;
-       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
-               regs->flags |= PERF_EFLAGS_EXACT;
-       else
-               regs->flags &= ~PERF_EFLAGS_EXACT;
-
-       if ((sample_type & PERF_SAMPLE_ADDR) &&
-           x86_pmu.intel_cap.pebs_format >= 1)
-               data->addr = pebs->dla;
-
-       if (x86_pmu.intel_cap.pebs_format >= 2) {
-               /* Only set the TSX weight when no memory weight. */
-               if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-                       data->weight = intel_hsw_weight(pebs);
-
-               if (sample_type & PERF_SAMPLE_TRANSACTION)
-                       data->txn = intel_hsw_transaction(pebs);
-       }
-
-       /*
-        * PEBS format v3 supplies an accurate time stamp, so we
-        * use it as the sample time.
-        *
-        * We can only do this for the default trace clock.
-        */
-       if (x86_pmu.intel_cap.pebs_format >= 3 &&
-               event->attr.use_clockid == 0)
-               data->time = native_sched_clock_from_tsc(pebs->tsc);
-
-       if (has_branch_stack(event))
-               data->br_stack = &cpuc->lbr_stack;
-}
-
-static inline void *
-get_next_pebs_record_by_bit(void *base, void *top, int bit)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       void *at;
-       u64 pebs_status;
-
-       /*
-        * fmt0 does not have a status bitfield (does not use
-        * pebs_record_nhm format)
-        */
-       if (x86_pmu.intel_cap.pebs_format < 1)
-               return base;
-
-       if (base == NULL)
-               return NULL;
-
-       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-               struct pebs_record_nhm *p = at;
-
-               if (test_bit(bit, (unsigned long *)&p->status)) {
-                       /* PEBS v3 has accurate status bits */
-                       if (x86_pmu.intel_cap.pebs_format >= 3)
-                               return at;
-
-                       if (p->status == (1 << bit))
-                               return at;
-
-                       /* clear non-PEBS bit and re-check */
-                       pebs_status = p->status & cpuc->pebs_enabled;
-                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
-                       if (pebs_status == (1 << bit))
-                               return at;
-               }
-       }
-       return NULL;
-}
-
-static void __intel_pmu_pebs_event(struct perf_event *event,
-                                  struct pt_regs *iregs,
-                                  void *base, void *top,
-                                  int bit, int count)
-{
-       struct perf_sample_data data;
-       struct pt_regs regs;
-       void *at = get_next_pebs_record_by_bit(base, top, bit);
-
-       if (!intel_pmu_save_and_restart(event) &&
-           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
-               return;
-
-       while (count > 1) {
-               setup_pebs_sample_data(event, iregs, at, &data, &regs);
-               perf_event_output(event, &data, &regs);
-               at += x86_pmu.pebs_record_size;
-               at = get_next_pebs_record_by_bit(at, top, bit);
-               count--;
-       }
-
-       setup_pebs_sample_data(event, iregs, at, &data, &regs);
-
-       /*
-        * All but the last record have been processed above.
-        * The last one is left so that the overflow handler can be called for it.
-        */
-       if (perf_event_overflow(event, &data, &regs)) {
-               x86_pmu_stop(event, 0);
-               return;
-       }
-
-}
-
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct perf_event *event = cpuc->events[0]; /* PMC0 only */
-       struct pebs_record_core *at, *top;
-       int n;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
-
-       /*
-        * Whatever else happens, drain the thing
-        */
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       if (!test_bit(0, cpuc->active_mask))
-               return;
-
-       WARN_ON_ONCE(!event);
-
-       if (!event->attr.precise_ip)
-               return;
-
-       n = top - at;
-       if (n <= 0)
-               return;
-
-       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
-}
-
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct perf_event *event;
-       void *base, *at, *top;
-       short counts[MAX_PEBS_EVENTS] = {};
-       short error[MAX_PEBS_EVENTS] = {};
-       int bit, i;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       if (unlikely(base >= top))
-               return;
-
-       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-               struct pebs_record_nhm *p = at;
-               u64 pebs_status;
-
-               /* PEBS v3 has accurate status bits */
-               if (x86_pmu.intel_cap.pebs_format >= 3) {
-                       for_each_set_bit(bit, (unsigned long *)&p->status,
-                                        MAX_PEBS_EVENTS)
-                               counts[bit]++;
-
-                       continue;
-               }
-
-               pebs_status = p->status & cpuc->pebs_enabled;
-               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
-
-               /*
-                * On some CPUs the PEBS status can be zero when PEBS is
-                * racing with clearing of GLOBAL_STATUS.
-                *
-                * Normally we would drop that record, but in the
-                * case when there is only a single active PEBS event
-                * we can assume it's for that event.
-                */
-               if (!pebs_status && cpuc->pebs_enabled &&
-                       !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
-                       pebs_status = cpuc->pebs_enabled;
-
-               bit = find_first_bit((unsigned long *)&pebs_status,
-                                       x86_pmu.max_pebs_events);
-               if (bit >= x86_pmu.max_pebs_events)
-                       continue;
-
-               /*
-                * The PEBS hardware does not deal well with events that
-                * happen close to each other and set multiple status
-                * bits. But this should happen rarely.
-                *
-                * If these events include one PEBS and multiple non-PEBS
-                * events, it doesn't impact the PEBS record. The record will
-                * be handled normally. (slow path)
-                *
-                * If these events include two or more PEBS events, the
-                * records for the events can be collapsed into a single
-                * one, and it's not possible to reconstruct all events
-                * that caused the PEBS record. This is called a collision.
-                * If a collision happens, the record is dropped.
-                */
-               if (p->status != (1ULL << bit)) {
-                       for_each_set_bit(i, (unsigned long *)&pebs_status,
-                                        x86_pmu.max_pebs_events)
-                               error[i]++;
-                       continue;
-               }
-
-               counts[bit]++;
-       }
-
-       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
-               if ((counts[bit] == 0) && (error[bit] == 0))
-                       continue;
-
-               event = cpuc->events[bit];
-               WARN_ON_ONCE(!event);
-               WARN_ON_ONCE(!event->attr.precise_ip);
-
-               /* log dropped samples number */
-               if (error[bit])
-                       perf_log_lost_samples(event, error[bit]);
-
-               if (counts[bit]) {
-                       __intel_pmu_pebs_event(event, iregs, base,
-                                              top, bit, counts[bit]);
-               }
-       }
-}
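The zero-status fallback in intel_pmu_drain_pebs_nhm() above leans on the usual single-bit test: cpuc->pebs_enabled & (cpuc->pebs_enabled - 1) is zero exactly when at most one bit is set, so an empty hardware status can safely be attributed to the only enabled PEBS event. A one-file demonstration of the test (the values are arbitrary examples):

#include <stdint.h>
#include <stdio.h>

static int at_most_one_bit(uint64_t x)
{
        return (x & (x - 1)) == 0; /* true for 0 and for any single set bit */
}

int main(void)
{
        printf("%d %d %d\n",
               at_most_one_bit(0x0),  /* 1: no PEBS events enabled */
               at_most_one_bit(0x8),  /* 1: a single event, on counter 3 */
               at_most_one_bit(0x9)); /* 0: two events - cannot attribute */
        return 0;
}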
-
-/*
- * BTS, PEBS probe and setup
- */
-
-void __init intel_ds_init(void)
-{
-       /*
-        * No support for 32-bit formats
-        */
-       if (!boot_cpu_has(X86_FEATURE_DTES64))
-               return;
-
-       x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
-       x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
-       if (x86_pmu.pebs) {
-               char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
-               int format = x86_pmu.intel_cap.pebs_format;
-
-               switch (format) {
-               case 0:
-                       printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
-                       break;
-
-               case 1:
-                       printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       break;
-
-               case 2:
-                       pr_cont("PEBS fmt2%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       break;
-
-               case 3:
-                       pr_cont("PEBS fmt3%c, ", pebs_type);
-                       x86_pmu.pebs_record_size =
-                                               sizeof(struct pebs_record_skl);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
-                       break;
-
-               default:
-                       printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
-                       x86_pmu.pebs = 0;
-               }
-       }
-}
-
-void perf_restore_debug_store(void)
-{
-       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
deleted file mode 100644 (file)
index 653f88d..0000000
+++ /dev/null
@@ -1,1062 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-enum {
-       LBR_FORMAT_32           = 0x00,
-       LBR_FORMAT_LIP          = 0x01,
-       LBR_FORMAT_EIP          = 0x02,
-       LBR_FORMAT_EIP_FLAGS    = 0x03,
-       LBR_FORMAT_EIP_FLAGS2   = 0x04,
-       LBR_FORMAT_INFO         = 0x05,
-       LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
-};
-
-static enum {
-       LBR_EIP_FLAGS           = 1,
-       LBR_TSX                 = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
-       [LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
-       [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
-/*
- * Intel LBR_SELECT bits
- * Intel Vol3a, April 2011, Section 16.7 Table 16-10
- *
- * Hardware branch filter (not available on all CPUs)
- */
-#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
-#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
-#define LBR_JCC_BIT            2 /* do not capture conditional branches */
-#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
-#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
-#define LBR_RETURN_BIT         5 /* do not capture near returns */
-#define LBR_IND_JMP_BIT                6 /* do not capture indirect jumps */
-#define LBR_REL_JMP_BIT                7 /* do not capture relative jumps */
-#define LBR_FAR_BIT            8 /* do not capture far branches */
-#define LBR_CALL_STACK_BIT     9 /* enable call stack */
-
-/*
- * The following bit only exists in Linux; we mask it out before writing
- * it to the actual MSR. But it helps the perf constraint code understand
- * that this is a separate configuration.
- */
-#define LBR_NO_INFO_BIT               63 /* don't read LBR_INFO. */
-
-#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
-#define LBR_USER       (1 << LBR_USER_BIT)
-#define LBR_JCC                (1 << LBR_JCC_BIT)
-#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
-#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
-#define LBR_RETURN     (1 << LBR_RETURN_BIT)
-#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
-#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
-#define LBR_FAR                (1 << LBR_FAR_BIT)
-#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
-#define LBR_NO_INFO    (1ULL << LBR_NO_INFO_BIT)
-
-#define LBR_PLM (LBR_KERNEL | LBR_USER)
-
-#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
-#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
-#define LBR_IGN                0       /* ignored */
-
-#define LBR_ANY                 \
-       (LBR_JCC        |\
-        LBR_REL_CALL   |\
-        LBR_IND_CALL   |\
-        LBR_RETURN     |\
-        LBR_REL_JMP    |\
-        LBR_IND_JMP    |\
-        LBR_FAR)
-
-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
-#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
-
-/*
- * x86 control flow change classification
- * x86 control flow changes include branches, interrupts, traps and faults
- */
-enum {
-       X86_BR_NONE             = 0,      /* unknown */
-
-       X86_BR_USER             = 1 << 0, /* branch target is user */
-       X86_BR_KERNEL           = 1 << 1, /* branch target is kernel */
-
-       X86_BR_CALL             = 1 << 2, /* call */
-       X86_BR_RET              = 1 << 3, /* return */
-       X86_BR_SYSCALL          = 1 << 4, /* syscall */
-       X86_BR_SYSRET           = 1 << 5, /* syscall return */
-       X86_BR_INT              = 1 << 6, /* sw interrupt */
-       X86_BR_IRET             = 1 << 7, /* return from interrupt */
-       X86_BR_JCC              = 1 << 8, /* conditional */
-       X86_BR_JMP              = 1 << 9, /* jump */
-       X86_BR_IRQ              = 1 << 10,/* hw interrupt or trap or fault */
-       X86_BR_IND_CALL         = 1 << 11,/* indirect calls */
-       X86_BR_ABORT            = 1 << 12,/* transaction abort */
-       X86_BR_IN_TX            = 1 << 13,/* in transaction */
-       X86_BR_NO_TX            = 1 << 14,/* not in transaction */
-       X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
-       X86_BR_CALL_STACK       = 1 << 16,/* call stack */
-       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
-};
-
-#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
-#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
-
-#define X86_BR_ANY       \
-       (X86_BR_CALL    |\
-        X86_BR_RET     |\
-        X86_BR_SYSCALL |\
-        X86_BR_SYSRET  |\
-        X86_BR_INT     |\
-        X86_BR_IRET    |\
-        X86_BR_JCC     |\
-        X86_BR_JMP      |\
-        X86_BR_IRQ      |\
-        X86_BR_ABORT    |\
-        X86_BR_IND_CALL |\
-        X86_BR_IND_JMP  |\
-        X86_BR_ZERO_CALL)
-
-#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
-
-#define X86_BR_ANY_CALL                 \
-       (X86_BR_CALL            |\
-        X86_BR_IND_CALL        |\
-        X86_BR_ZERO_CALL       |\
-        X86_BR_SYSCALL         |\
-        X86_BR_IRQ             |\
-        X86_BR_INT)
-
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
-
-/*
- * We only support LBR implementations that have FREEZE_LBRS_ON_PMI,
- * otherwise it becomes nearly impossible to get a reliable stack.
- */
-
-static void __intel_pmu_lbr_enable(bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       u64 debugctl, lbr_select = 0, orig_debugctl;
-
-       /*
-        * No need to unfreeze manually, as v4 can do that as part
-        * of the GLOBAL_STATUS ack.
-        */
-       if (pmi && x86_pmu.version >= 4)
-               return;
-
-       /*
-        * No need to reprogram LBR_SELECT in a PMI, as it
-        * did not change.
-        */
-       if (cpuc->lbr_sel)
-               lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
-       if (!pmi && cpuc->lbr_sel)
-               wrmsrl(MSR_LBR_SELECT, lbr_select);
-
-       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-       orig_debugctl = debugctl;
-       debugctl |= DEBUGCTLMSR_LBR;
-       /*
-        * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
-        * If FREEZE_LBRS_ON_PMI is set, a PMI near call/return instructions
-        * may cause a superfluous increase/decrease of LBR_TOS.
-        */
-       if (!(lbr_select & LBR_CALL_STACK))
-               debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
-       if (orig_debugctl != debugctl)
-               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void __intel_pmu_lbr_disable(void)
-{
-       u64 debugctl;
-
-       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-       debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
-       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void intel_pmu_lbr_reset_32(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++)
-               wrmsrl(x86_pmu.lbr_from + i, 0);
-}
-
-static void intel_pmu_lbr_reset_64(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               wrmsrl(x86_pmu.lbr_from + i, 0);
-               wrmsrl(x86_pmu.lbr_to   + i, 0);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       wrmsrl(MSR_LBR_INFO_0 + i, 0);
-       }
-}
-
-void intel_pmu_lbr_reset(void)
-{
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-               intel_pmu_lbr_reset_32();
-       else
-               intel_pmu_lbr_reset_64();
-}
-
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
-       u64 tos;
-
-       rdmsrl(x86_pmu.lbr_tos, tos);
-       return tos;
-}
-
-enum {
-       LBR_NONE,
-       LBR_VALID,
-};
-
-static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
-{
-       int i;
-       unsigned lbr_idx, mask;
-       u64 tos;
-
-       if (task_ctx->lbr_callstack_users == 0 ||
-           task_ctx->lbr_stack_state == LBR_NONE) {
-               intel_pmu_lbr_reset();
-               return;
-       }
-
-       mask = x86_pmu.lbr_nr - 1;
-       tos = task_ctx->tos;
-       for (i = 0; i < tos; i++) {
-               lbr_idx = (tos - i) & mask;
-               wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
-               wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-       }
-       wrmsrl(x86_pmu.lbr_tos, tos);
-       task_ctx->lbr_stack_state = LBR_NONE;
-}
-
-static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
-{
-       int i;
-       unsigned lbr_idx, mask;
-       u64 tos;
-
-       if (task_ctx->lbr_callstack_users == 0) {
-               task_ctx->lbr_stack_state = LBR_NONE;
-               return;
-       }
-
-       mask = x86_pmu.lbr_nr - 1;
-       tos = intel_pmu_lbr_tos();
-       for (i = 0; i < tos; i++) {
-               lbr_idx = (tos - i) & mask;
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
-               rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-       }
-       task_ctx->tos = tos;
-       task_ctx->lbr_stack_state = LBR_VALID;
-}
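Both __intel_pmu_lbr_restore() and __intel_pmu_lbr_save() above walk the LBR registers from the top-of-stack downwards, wrapping the index with (tos - i) & mask; this works because lbr_nr is a power of two, so mask = lbr_nr - 1 keeps the result inside the ring. A tiny sketch of that index arithmetic, assuming a 16-deep LBR and an arbitrary TOS value:

#include <stdio.h>

int main(void)
{
        /* Illustrative values: 16-deep LBR stack, TOS currently at entry 3. */
        unsigned int lbr_nr = 16;
        unsigned int mask = lbr_nr - 1;
        unsigned int tos = 3;
        unsigned int i;

        /* Most recent branch first; unsigned wrap-around lands on 15, 14, ... */
        for (i = 0; i < lbr_nr; i++)
                printf("logical entry %2u -> MSR slot %2u\n", i, (tos - i) & mask);

        return 0;
}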
-
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       /*
-        * If LBR callstack feature is enabled and the stack was saved when
-        * the task was scheduled out, restore the stack. Otherwise flush
-        * the LBR stack.
-        */
-       task_ctx = ctx ? ctx->task_ctx_data : NULL;
-       if (task_ctx) {
-               if (sched_in) {
-                       __intel_pmu_lbr_restore(task_ctx);
-                       cpuc->lbr_context = ctx;
-               } else {
-                       __intel_pmu_lbr_save(task_ctx);
-               }
-               return;
-       }
-
-       /*
-        * When sampling the branch stack in system-wide mode, it may be
-        * necessary to flush the stack on context switch. This happens
-        * when the branch stack does not tag its entries with the pid
-        * of the current task. Otherwise it becomes impossible to
-        * associate a branch entry with a task. This ambiguity is more
-        * likely to appear when the branch stack supports priv level
-        * filtering and the user sets it to monitor only at the user
-        * level (which could be a useful measurement in system-wide
-        * mode). In that case, the risk is high of ending up with a
-        * branch stack containing branches from multiple tasks.
-        */
-       if (sched_in) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = ctx;
-       }
-}
-
-static inline bool branch_user_callstack(unsigned br_sel)
-{
-       return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
-}
-
-void intel_pmu_lbr_enable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       /*
-        * Reset the LBR stack if we changed task context to
-        * avoid data leaks.
-        */
-       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = event->ctx;
-       }
-       cpuc->br_sel = event->hw.branch_reg.reg;
-
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
-               task_ctx = event->ctx->task_ctx_data;
-               task_ctx->lbr_callstack_users++;
-       }
-
-       cpuc->lbr_users++;
-       perf_sched_cb_inc(event->ctx->pmu);
-}
-
-void intel_pmu_lbr_disable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
-               task_ctx = event->ctx->task_ctx_data;
-               task_ctx->lbr_callstack_users--;
-       }
-
-       cpuc->lbr_users--;
-       WARN_ON_ONCE(cpuc->lbr_users < 0);
-       perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled && !cpuc->lbr_users) {
-               __intel_pmu_lbr_disable();
-               /* avoid stale pointer */
-               cpuc->lbr_context = NULL;
-       }
-}
-
-void intel_pmu_lbr_enable_all(bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->lbr_users)
-               __intel_pmu_lbr_enable(pmi);
-}
-
-void intel_pmu_lbr_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->lbr_users)
-               __intel_pmu_lbr_disable();
-}
-
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
-{
-       unsigned long mask = x86_pmu.lbr_nr - 1;
-       u64 tos = intel_pmu_lbr_tos();
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               unsigned long lbr_idx = (tos - i) & mask;
-               union {
-                       struct {
-                               u32 from;
-                               u32 to;
-                       };
-                       u64     lbr;
-               } msr_lastbranch;
-
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
-
-               cpuc->lbr_entries[i].from       = msr_lastbranch.from;
-               cpuc->lbr_entries[i].to         = msr_lastbranch.to;
-               cpuc->lbr_entries[i].mispred    = 0;
-               cpuc->lbr_entries[i].predicted  = 0;
-               cpuc->lbr_entries[i].reserved   = 0;
-       }
-       cpuc->lbr_stack.nr = i;
-}
-
-/*
- * Due to lack of segmentation in Linux the effective address (offset)
- * is the same as the linear address, allowing us to merge the LIP and EIP
- * LBR formats.
- */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
-{
-       bool need_info = false;
-       unsigned long mask = x86_pmu.lbr_nr - 1;
-       int lbr_format = x86_pmu.intel_cap.lbr_format;
-       u64 tos = intel_pmu_lbr_tos();
-       int i;
-       int out = 0;
-       int num = x86_pmu.lbr_nr;
-
-       if (cpuc->lbr_sel) {
-               need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
-               if (cpuc->lbr_sel->config & LBR_CALL_STACK)
-                       num = tos;
-       }
-
-       for (i = 0; i < num; i++) {
-               unsigned long lbr_idx = (tos - i) & mask;
-               u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
-               int skip = 0;
-               u16 cycles = 0;
-               int lbr_flags = lbr_desc[lbr_format];
-
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
-               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
-
-               if (lbr_format == LBR_FORMAT_INFO && need_info) {
-                       u64 info;
-
-                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
-                       mis = !!(info & LBR_INFO_MISPRED);
-                       pred = !mis;
-                       in_tx = !!(info & LBR_INFO_IN_TX);
-                       abort = !!(info & LBR_INFO_ABORT);
-                       cycles = (info & LBR_INFO_CYCLES);
-               }
-               if (lbr_flags & LBR_EIP_FLAGS) {
-                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
-                       pred = !mis;
-                       skip = 1;
-               }
-               if (lbr_flags & LBR_TSX) {
-                       in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
-                       abort = !!(from & LBR_FROM_FLAG_ABORT);
-                       skip = 3;
-               }
-               from = (u64)((((s64)from) << skip) >> skip);
-
-               /*
-                * Some CPUs report duplicated abort records,
-                * with the second entry not having an abort bit set.
-                * Skip them here. This loop runs backwards,
-                * so we need to undo the previous record.
-                * If the abort just happened outside the window,
-                * the extra entry cannot be removed.
-                */
-               if (abort && x86_pmu.lbr_double_abort && out > 0)
-                       out--;
-
-               cpuc->lbr_entries[out].from      = from;
-               cpuc->lbr_entries[out].to        = to;
-               cpuc->lbr_entries[out].mispred   = mis;
-               cpuc->lbr_entries[out].predicted = pred;
-               cpuc->lbr_entries[out].in_tx     = in_tx;
-               cpuc->lbr_entries[out].abort     = abort;
-               cpuc->lbr_entries[out].cycles    = cycles;
-               cpuc->lbr_entries[out].reserved  = 0;
-               out++;
-       }
-       cpuc->lbr_stack.nr = out;
-}
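The ((s64)from << skip) >> skip expression in the loop above strips the MISPRED/IN_TX/ABORT flag bits out of the 'from' address and sign-extends what is left, so canonical kernel addresses come back with their high bits restored. A worked example with skip = 3 and an invented raw LBR value; like the kernel expression, it assumes the compiler implements right shift of a negative value as an arithmetic shift:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Invented raw value: flag bit 63 set, address bits in 60..0. */
        uint64_t from = (1ULL << 63) | 0x1fffffff81000000ULL;
        int skip = 3; /* LBR_TSX format: three flag bits to strip */

        uint64_t shifted = from << skip;                         /* flags fall off the top */
        uint64_t cleaned = (uint64_t)((int64_t)shifted >> skip); /* sign-extend bit 60 back up */

        printf("raw:     %#llx\n", (unsigned long long)from);
        printf("cleaned: %#llx\n", (unsigned long long)cleaned);
        return 0;
}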
-
-void intel_pmu_lbr_read(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (!cpuc->lbr_users)
-               return;
-
-       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-               intel_pmu_lbr_read_32(cpuc);
-       else
-               intel_pmu_lbr_read_64(cpuc);
-
-       intel_pmu_lbr_filter(cpuc);
-}
-
-/*
- * SW filter is used:
- * - in case there is no HW filter
- * - in case the HW filter has errata or limitations
- */
-static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
-{
-       u64 br_type = event->attr.branch_sample_type;
-       int mask = 0;
-
-       if (br_type & PERF_SAMPLE_BRANCH_USER)
-               mask |= X86_BR_USER;
-
-       if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
-               mask |= X86_BR_KERNEL;
-
-       /* we ignore BRANCH_HV here */
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY)
-               mask |= X86_BR_ANY;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
-               mask |= X86_BR_ANY_CALL;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
-               mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
-
-       if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
-               mask |= X86_BR_IND_CALL;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
-               mask |= X86_BR_ABORT;
-
-       if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
-               mask |= X86_BR_IN_TX;
-
-       if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
-               mask |= X86_BR_NO_TX;
-
-       if (br_type & PERF_SAMPLE_BRANCH_COND)
-               mask |= X86_BR_JCC;
-
-       if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
-               if (!x86_pmu_has_lbr_callstack())
-                       return -EOPNOTSUPP;
-               if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
-                       return -EINVAL;
-               mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
-                       X86_BR_CALL_STACK;
-       }
-
-       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
-               mask |= X86_BR_IND_JMP;
-
-       if (br_type & PERF_SAMPLE_BRANCH_CALL)
-               mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
-       /*
-        * Stash the actual user request into reg; it may
-        * be used by fixup code on some CPUs.
-        */
-       event->hw.branch_reg.reg = mask;
-       return 0;
-}
-
-/*
- * Setup the HW LBR filter.
- * Used only when available; it may not be enough to disambiguate
- * all branches and may need the help of the SW filter.
- */
-static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-       u64 br_type = event->attr.branch_sample_type;
-       u64 mask = 0, v;
-       int i;
-
-       for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
-               if (!(br_type & (1ULL << i)))
-                       continue;
-
-               v = x86_pmu.lbr_sel_map[i];
-               if (v == LBR_NOT_SUPP)
-                       return -EOPNOTSUPP;
-
-               if (v != LBR_IGN)
-                       mask |= v;
-       }
-
-       reg = &event->hw.branch_reg;
-       reg->idx = EXTRA_REG_LBR;
-
-       /*
-        * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
-        * in suppress mode. So LBR_SELECT should be set to
-        * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
-        */
-       reg->config = mask ^ x86_pmu.lbr_sel_mask;
-
-       if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
-           (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
-           (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
-               reg->config |= LBR_NO_INFO;
-
-       return 0;
-}
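About reg->config = mask ^ x86_pmu.lbr_sel_mask above: the low LBR_SELECT bits work in suppress mode, so XOR-ing with the suppress-mode mask clears exactly the requested branch classes (they are not suppressed) and leaves every other class suppressed, while bits outside the mask such as LBR_CALL_STACK pass through set. A worked example assuming lbr_sel_mask == 0x1ff and a request for conditional branches plus near returns:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t lbr_sel_mask = 0x1ff; /* LBR_SEL_MASK: suppress-mode bits */

        /* Requested classes: LBR_JCC (bit 2) and LBR_RETURN (bit 5). */
        uint64_t mask = (1ULL << 2) | (1ULL << 5);

        /* The XOR form used by the code above ... */
        uint64_t config = mask ^ lbr_sel_mask;

        /* ... and the long form from the comment; XOR makes them always equal. */
        uint64_t spelled_out = (~mask & lbr_sel_mask) | (mask & ~lbr_sel_mask);

        printf("LBR_SELECT config = %#llx\n", (unsigned long long)config);
        printf("spelled out       = %#llx\n", (unsigned long long)spelled_out);
        return 0;
}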
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event)
-{
-       int ret = 0;
-
-       /*
-        * no LBR on this PMU
-        */
-       if (!x86_pmu.lbr_nr)
-               return -EOPNOTSUPP;
-
-       /*
-        * setup SW LBR filter
-        */
-       ret = intel_pmu_setup_sw_lbr_filter(event);
-       if (ret)
-               return ret;
-
-       /*
-        * setup HW LBR filter, if any
-        */
-       if (x86_pmu.lbr_sel_map)
-               ret = intel_pmu_setup_hw_lbr_filter(event);
-
-       return ret;
-}
-
-/*
- * Return the type of control flow change at address "from".
- * The instruction is not necessarily a branch (e.g. in case of an interrupt).
- *
- * The branch type returned also includes the priv level of the
- * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
- *
- * If a branch type is unknown OR the instruction cannot be
- * decoded (e.g., text page not present), then X86_BR_NONE is
- * returned.
- */
-static int branch_type(unsigned long from, unsigned long to, int abort)
-{
-       struct insn insn;
-       void *addr;
-       int bytes_read, bytes_left;
-       int ret = X86_BR_NONE;
-       int ext, to_plm, from_plm;
-       u8 buf[MAX_INSN_SIZE];
-       int is64 = 0;
-
-       to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
-       from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
-
-       /*
-        * may be zero if the LBR did not fill up after a reset by the time
-        * we get a PMU interrupt
-        */
-       if (from == 0 || to == 0)
-               return X86_BR_NONE;
-
-       if (abort)
-               return X86_BR_ABORT | to_plm;
-
-       if (from_plm == X86_BR_USER) {
-               /*
-                * can happen if measuring at the user level only
-                * and we interrupt in a kernel thread, e.g., idle.
-                */
-               if (!current->mm)
-                       return X86_BR_NONE;
-
-               /* may fail if text not present */
-               bytes_left = copy_from_user_nmi(buf, (void __user *)from,
-                                               MAX_INSN_SIZE);
-               bytes_read = MAX_INSN_SIZE - bytes_left;
-               if (!bytes_read)
-                       return X86_BR_NONE;
-
-               addr = buf;
-       } else {
-               /*
-                * The LBR logs any address in the IP, even if the IP just
-                * faulted. This means userspace can control the from address.
-                * Ensure we don't blindly read any address by validating it is
-                * a known text address.
-                */
-               if (kernel_text_address(from)) {
-                       addr = (void *)from;
-                       /*
-                        * Assume we can get the maximum possible size
-                        * when grabbing kernel data.  This is not
-                        * _strictly_ true since we could possibly be
-                        * executing up next to a memory hole, but
-                        * it is very unlikely to be a problem.
-                        */
-                       bytes_read = MAX_INSN_SIZE;
-               } else {
-                       return X86_BR_NONE;
-               }
-       }
-
-       /*
-        * decoder needs to know the ABI especially
-        * on 64-bit systems running 32-bit apps
-        */
-#ifdef CONFIG_X86_64
-       is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
-#endif
-       insn_init(&insn, addr, bytes_read, is64);
-       insn_get_opcode(&insn);
-       if (!insn.opcode.got)
-               return X86_BR_ABORT;
-
-       switch (insn.opcode.bytes[0]) {
-       case 0xf:
-               switch (insn.opcode.bytes[1]) {
-               case 0x05: /* syscall */
-               case 0x34: /* sysenter */
-                       ret = X86_BR_SYSCALL;
-                       break;
-               case 0x07: /* sysret */
-               case 0x35: /* sysexit */
-                       ret = X86_BR_SYSRET;
-                       break;
-               case 0x80 ... 0x8f: /* conditional */
-                       ret = X86_BR_JCC;
-                       break;
-               default:
-                       ret = X86_BR_NONE;
-               }
-               break;
-       case 0x70 ... 0x7f: /* conditional */
-               ret = X86_BR_JCC;
-               break;
-       case 0xc2: /* near ret */
-       case 0xc3: /* near ret */
-       case 0xca: /* far ret */
-       case 0xcb: /* far ret */
-               ret = X86_BR_RET;
-               break;
-       case 0xcf: /* iret */
-               ret = X86_BR_IRET;
-               break;
-       case 0xcc ... 0xce: /* int */
-               ret = X86_BR_INT;
-               break;
-       case 0xe8: /* call near rel */
-               insn_get_immediate(&insn);
-               if (insn.immediate1.value == 0) {
-                       /* zero length call */
-                       ret = X86_BR_ZERO_CALL;
-                       break;
-               }
-       case 0x9a: /* call far absolute */
-               ret = X86_BR_CALL;
-               break;
-       case 0xe0 ... 0xe3: /* loop jmp */
-               ret = X86_BR_JCC;
-               break;
-       case 0xe9 ... 0xeb: /* jmp */
-               ret = X86_BR_JMP;
-               break;
-       case 0xff: /* call near absolute, call far absolute ind */
-               insn_get_modrm(&insn);
-               ext = (insn.modrm.bytes[0] >> 3) & 0x7;
-               switch (ext) {
-               case 2: /* near ind call */
-               case 3: /* far ind call */
-                       ret = X86_BR_IND_CALL;
-                       break;
-               case 4:
-               case 5:
-                       ret = X86_BR_IND_JMP;
-                       break;
-               }
-               break;
-       default:
-               ret = X86_BR_NONE;
-       }
-       /*
-        * Interrupts, traps and faults (and thus ring transitions) may
-        * occur on any instruction. Thus, to classify them correctly,
-        * we need to first look at the from and to priv levels. If they
-        * are different and to is in the kernel, then it indicates
-        * a ring transition. If the from instruction is not a ring
-        * transition instruction (syscall, sysenter, int), then it means
-        * it was an irq, trap or fault.
-        *
-        * We have no way of detecting kernel-to-kernel faults.
-        */
-       if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
-           && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
-               ret = X86_BR_IRQ;
-
-       /*
-        * branch priv level determined by target as
-        * is done by HW when LBR_SELECT is implemented
-        */
-       if (ret != X86_BR_NONE)
-               ret |= to_plm;
-
-       return ret;
-}
-
-/*
- * Implement the actual branch filter based on user demand.
- * Hardware may not exactly satisfy that request, thus
- * we need to inspect opcodes. Mismatched branches are
- * discarded. Therefore, the number of branches returned
- * in a PERF_SAMPLE_BRANCH_STACK sample may vary.
- */
-static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
-{
-       u64 from, to;
-       int br_sel = cpuc->br_sel;
-       int i, j, type;
-       bool compress = false;
-
-       /* if sampling all branches, then nothing to filter */
-       if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
-               return;
-
-       for (i = 0; i < cpuc->lbr_stack.nr; i++) {
-
-               from = cpuc->lbr_entries[i].from;
-               to = cpuc->lbr_entries[i].to;
-
-               type = branch_type(from, to, cpuc->lbr_entries[i].abort);
-               if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
-                       if (cpuc->lbr_entries[i].in_tx)
-                               type |= X86_BR_IN_TX;
-                       else
-                               type |= X86_BR_NO_TX;
-               }
-
-               /* if type does not correspond, then discard */
-               if (type == X86_BR_NONE || (br_sel & type) != type) {
-                       cpuc->lbr_entries[i].from = 0;
-                       compress = true;
-               }
-       }
-
-       if (!compress)
-               return;
-
-       /* remove all entries with from=0 */
-       for (i = 0; i < cpuc->lbr_stack.nr; ) {
-               if (!cpuc->lbr_entries[i].from) {
-                       j = i;
-                       while (++j < cpuc->lbr_stack.nr)
-                               cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
-                       cpuc->lbr_stack.nr--;
-                       if (!cpuc->lbr_entries[i].from)
-                               continue;
-               }
-               i++;
-       }
-}
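The compress pass at the bottom of intel_pmu_lbr_filter() removes discarded entries (those marked with from == 0) in place by shifting the tail of the array down one slot at a time, keeping the surviving entries in order. A compact standalone version of the same removal, operating on a plain array of addresses instead of lbr_entries:

#include <stdio.h>

int main(void)
{
        /* Toy 'from' addresses; zeroes mark entries the filter discarded. */
        unsigned long from[] = { 0x100, 0, 0x200, 0, 0x300 };
        int nr = 5, i = 0, j;

        while (i < nr) {
                if (!from[i]) {
                        /* Shift everything after i down by one, then retry slot i. */
                        for (j = i + 1; j < nr; j++)
                                from[j - 1] = from[j];
                        nr--;
                        continue;
                }
                i++;
        }

        for (i = 0; i < nr; i++)
                printf("%#lx\n", from[i]);

        return 0;
}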
-
-/*
- * Map interface branch filters onto LBR filters
- */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_REL_JMP
-                                               | LBR_IND_JMP | LBR_FAR,
-       /*
-        * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
-        */
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
-        LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
-       /*
-        * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
-        */
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
-};
-
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
-};
-
-static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_RETURN | LBR_CALL_STACK,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
-};
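
One plausible way such a map gets consumed, shown only as a sketch (the kernel's actual translation routine is not part of this hunk, and the constants below are hypothetical stand-ins for the LBR_* values): for every requested PERF_SAMPLE_BRANCH_* bit, OR the corresponding map entry into the value later written to MSR_LBR_SELECT.

#include <stdint.h>

/* Hypothetical constants standing in for the kernel's shift count and LBR_IGN. */
#define BRANCH_MAX_SHIFT 16
#define SEL_IGNORE 0

/* Fold the per-bit map into one LBR_SELECT-style mask for a user request. */
static uint64_t build_lbr_select(uint64_t br_type,
                                 const int map[BRANCH_MAX_SHIFT])
{
        uint64_t mask = 0;
        int i;

        for (i = 0; i < BRANCH_MAX_SHIFT; i++) {
                if (!(br_type & (1ULL << i)))
                        continue;       /* this filter was not requested */
                if (map[i] != SEL_IGNORE)
                        mask |= map[i]; /* OR in the HW bits for this filter */
        }
        return mask;
}
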
-
-/* core */
-void __init intel_pmu_lbr_init_core(void)
-{
-       x86_pmu.lbr_nr     = 4;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
-
-       /*
-        * SW branch filter usage:
-        * - compensate for lack of HW filter
-        */
-       pr_cont("4-deep LBR, ");
-}
-
-/* nehalem/westmere */
-void __init intel_pmu_lbr_init_nhm(void)
-{
-       x86_pmu.lbr_nr     = 16;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - work around the LBR_SEL errata (see above)
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR, but that means far
-        *   jumps need to be filtered out.
-        */
-       pr_cont("16-deep LBR, ");
-}
-
-/* sandy bridge */
-void __init intel_pmu_lbr_init_snb(void)
-{
-       x86_pmu.lbr_nr   = 16;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR, but that means far
-        *   jumps need to be filtered out.
-        */
-       pr_cont("16-deep LBR, ");
-}
-
-/* haswell */
-void intel_pmu_lbr_init_hsw(void)
-{
-       x86_pmu.lbr_nr   = 16;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
-
-       pr_cont("16-deep LBR, ");
-}
-
-/* skylake */
-__init void intel_pmu_lbr_init_skl(void)
-{
-       x86_pmu.lbr_nr   = 32;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR, but that means far
-        *   jumps need to be filtered out.
-        */
-       pr_cont("32-deep LBR, ");
-}
-
-/* atom */
-void __init intel_pmu_lbr_init_atom(void)
-{
-       /*
-        * Only models starting at stepping 10 seem
-        * to have an operational LBR which can freeze
-        * on PMU interrupt.
-        */
-       if (boot_cpu_data.x86_model == 28
-           && boot_cpu_data.x86_mask < 10) {
-               pr_cont("LBR disabled due to erratum");
-               return;
-       }
-
-       x86_pmu.lbr_nr     = 8;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
-
-       /*
-        * SW branch filter usage:
-        * - compensate for lack of HW filter
-        */
-       pr_cont("8-deep LBR, ");
-}
-
-/* Knights Landing */
-void intel_pmu_lbr_init_knl(void)
-{
-       x86_pmu.lbr_nr     = 8;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
-
-       pr_cont("8-deep LBR, ");
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
deleted file mode 100644 (file)
index c0bbd10..0000000
+++ /dev/null
@@ -1,1188 +0,0 @@
-/*
- * Intel(R) Processor Trace PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Intel PT is specified in the Intel Architecture Instruction Set Extensions
- * Programming Reference:
- * http://software.intel.com/en-us/intel-isa-extensions
- */
-
-#undef DEBUG
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/intel_pt.h>
-
-#include "perf_event.h"
-#include "intel_pt.h"
-
-static DEFINE_PER_CPU(struct pt, pt_ctx);
-
-static struct pt_pmu pt_pmu;
-
-enum cpuid_regs {
-       CR_EAX = 0,
-       CR_ECX,
-       CR_EDX,
-       CR_EBX
-};
-
-/*
- * Capabilities of Intel PT hardware, such as the number of address bits or
- * supported output schemes, are cached and exported to userspace as the "caps"
- * attribute group of the pt pmu device
- * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
- * the relevant bits together with intel_pt traces.
- *
- * These are necessary both for trace decoding (payloads_lip contains the
- * address width encoded in IP-related packets) and for event configuration
- * (bitmasks with permitted values for certain bit fields).
- */
-#define PT_CAP(_n, _l, _r, _m)                                         \
-       [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
-                           .reg = _r, .mask = _m }
-
-static struct pt_cap_desc {
-       const char      *name;
-       u32             leaf;
-       u8              reg;
-       u32             mask;
-} pt_caps[] = {
-       PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
-       PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
-       PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
-       PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
-       PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
-       PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
-       PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
-       PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
-       PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
-       PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
-       PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
-};
-
-static u32 pt_cap_get(enum pt_capabilities cap)
-{
-       struct pt_cap_desc *cd = &pt_caps[cap];
-       u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
-       unsigned int shift = __ffs(cd->mask);
-
-       return (c & cd->mask) >> shift;
-}
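
pt_cap_get() extracts a capability field by masking the cached CPUID register and shifting right by the position of the mask's lowest set bit. A small user-space sketch of the same extraction, using __builtin_ctz() in place of the kernel's __ffs() and a made-up register value:

#include <stdint.h>
#include <stdio.h>

/* Extract the bit field described by 'mask' from a raw register value. */
static uint32_t extract_field(uint32_t reg, uint32_t mask)
{
        unsigned int shift = __builtin_ctz(mask);   /* like __ffs() for mask != 0 */

        return (reg & mask) >> shift;
}

int main(void)
{
        /* e.g. mtc_periods lives in CPUID.(EAX=14H,ECX=1):EAX[31:16] */
        uint32_t eax = 0x0249000f;                  /* illustrative contents */

        printf("mtc_periods bitmap: 0x%x\n", extract_field(eax, 0xffff0000));
        return 0;
}
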
-
-static ssize_t pt_cap_show(struct device *cdev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct dev_ext_attribute *ea =
-               container_of(attr, struct dev_ext_attribute, attr);
-       enum pt_capabilities cap = (long)ea->var;
-
-       return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
-}
-
-static struct attribute_group pt_cap_group = {
-       .name   = "caps",
-};
-
-PMU_FORMAT_ATTR(cyc,           "config:1"      );
-PMU_FORMAT_ATTR(mtc,           "config:9"      );
-PMU_FORMAT_ATTR(tsc,           "config:10"     );
-PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
-PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
-PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
-PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
-
-static struct attribute *pt_formats_attr[] = {
-       &format_attr_cyc.attr,
-       &format_attr_mtc.attr,
-       &format_attr_tsc.attr,
-       &format_attr_noretcomp.attr,
-       &format_attr_mtc_period.attr,
-       &format_attr_cyc_thresh.attr,
-       &format_attr_psb_period.attr,
-       NULL,
-};
-
-static struct attribute_group pt_format_group = {
-       .name   = "format",
-       .attrs  = pt_formats_attr,
-};
-
-static const struct attribute_group *pt_attr_groups[] = {
-       &pt_cap_group,
-       &pt_format_group,
-       NULL,
-};
-
-static int __init pt_pmu_hw_init(void)
-{
-       struct dev_ext_attribute *de_attrs;
-       struct attribute **attrs;
-       size_t size;
-       int ret;
-       long i;
-
-       attrs = NULL;
-
-       for (i = 0; i < PT_CPUID_LEAVES; i++) {
-               cpuid_count(20, i,
-                           &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
-       }
-
-       ret = -ENOMEM;
-       size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
-       attrs = kzalloc(size, GFP_KERNEL);
-       if (!attrs)
-               goto fail;
-
-       size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
-       de_attrs = kzalloc(size, GFP_KERNEL);
-       if (!de_attrs)
-               goto fail;
-
-       for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
-               struct dev_ext_attribute *de_attr = de_attrs + i;
-
-               de_attr->attr.attr.name = pt_caps[i].name;
-
-               sysfs_attr_init(&de_attr->attr.attr);
-
-               de_attr->attr.attr.mode         = S_IRUGO;
-               de_attr->attr.show              = pt_cap_show;
-               de_attr->var                    = (void *)i;
-
-               attrs[i] = &de_attr->attr.attr;
-       }
-
-       pt_cap_group.attrs = attrs;
-
-       return 0;
-
-fail:
-       kfree(attrs);
-
-       return ret;
-}
-
-#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC    | \
-                         RTIT_CTL_CYC_THRESH   | \
-                         RTIT_CTL_PSB_FREQ)
-
-#define RTIT_CTL_MTC   (RTIT_CTL_MTC_EN        | \
-                        RTIT_CTL_MTC_RANGE)
-
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
-                       RTIT_CTL_DISRETC        | \
-                       RTIT_CTL_CYC_PSB        | \
-                       RTIT_CTL_MTC)
-
-static bool pt_event_valid(struct perf_event *event)
-{
-       u64 config = event->attr.config;
-       u64 allowed, requested;
-
-       if ((config & PT_CONFIG_MASK) != config)
-               return false;
-
-       if (config & RTIT_CTL_CYC_PSB) {
-               if (!pt_cap_get(PT_CAP_psb_cyc))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_psb_periods);
-               requested = (config & RTIT_CTL_PSB_FREQ) >>
-                       RTIT_CTL_PSB_FREQ_OFFSET;
-               if (requested && (!(allowed & BIT(requested))))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_cycle_thresholds);
-               requested = (config & RTIT_CTL_CYC_THRESH) >>
-                       RTIT_CTL_CYC_THRESH_OFFSET;
-               if (requested && (!(allowed & BIT(requested))))
-                       return false;
-       }
-
-       if (config & RTIT_CTL_MTC) {
-               /*
-                * In the unlikely case that CPUID lists valid mtc periods,
-                * but not the mtc capability, drop out here.
-                *
-                * Spec says that setting mtc period bits while the mtc bit in
-                * CPUID is 0 will #GP, so better safe than sorry.
-                */
-               if (!pt_cap_get(PT_CAP_mtc))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_mtc_periods);
-               if (!allowed)
-                       return false;
-
-               requested = (config & RTIT_CTL_MTC_RANGE) >>
-                       RTIT_CTL_MTC_RANGE_OFFSET;
-
-               if (!(allowed & BIT(requested)))
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * PT configuration helpers
- * These all are cpu affine and operate on a local PT
- */
-
-static void pt_config(struct perf_event *event)
-{
-       u64 reg;
-
-       if (!event->hw.itrace_started) {
-               event->hw.itrace_started = 1;
-               wrmsrl(MSR_IA32_RTIT_STATUS, 0);
-       }
-
-       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
-
-       if (!event->attr.exclude_kernel)
-               reg |= RTIT_CTL_OS;
-       if (!event->attr.exclude_user)
-               reg |= RTIT_CTL_USR;
-
-       reg |= (event->attr.config & PT_CONFIG_MASK);
-
-       wrmsrl(MSR_IA32_RTIT_CTL, reg);
-}
-
-static void pt_config_start(bool start)
-{
-       u64 ctl;
-
-       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-       if (start)
-               ctl |= RTIT_CTL_TRACEEN;
-       else
-               ctl &= ~RTIT_CTL_TRACEEN;
-       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
-
-       /*
-        * A wrmsr that disables trace generation serializes other PT
-        * registers and causes all data packets to be written to memory,
-        * but a fence is required for the data to become globally visible.
-        *
-        * The below WMB, separating data store and aux_head store matches
-        * the consumer's RMB that separates aux_head load and data load.
-        */
-       if (!start)
-               wmb();
-}
-
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
-                            unsigned int output_off)
-{
-       u64 reg;
-
-       wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
-       reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
-       wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
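
pt_config_buffer() packs the output pointer into MSR_IA32_RTIT_OUTPUT_MASK: bits 6:0 are set to 0x7f, bits 31:7 hold the ToPA entry index, and bits 63:32 hold the offset within the current output region; pt_read_offset() further down reverses the same packing. A quick sketch of the encode/decode arithmetic only (not a drop-in for the driver code):

#include <stdint.h>

static uint64_t pack_output_mask(unsigned int topa_idx, unsigned int off)
{
        return 0x7f | ((uint64_t)topa_idx << 7) | ((uint64_t)off << 32);
}

static void unpack_output_mask(uint64_t reg,
                               unsigned int *topa_idx, unsigned int *off)
{
        *off      = reg >> 32;                  /* offset within current region */
        *topa_idx = (reg & 0xffffff80) >> 7;    /* entry index within the table */
}
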
-
-/*
- * Keep ToPA table-related metadata on the same page as the actual table,
- * taking up a few words from the top
- */
-
-#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
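
As a worked example of that macro, assuming 4 KiB pages and 8-byte ToPA entries (the 64-bit entry layout on x86_64): (4096 - 40) / 8 = 507 raw slots, and the extra -1 leaves 506, so the table[] array plus the roughly 48 bytes of metadata fields at the end of struct topa still fit in one page, which the BUILD_BUG_ON in pt_init() checks. The same arithmetic as compile-time checks:

#include <assert.h>

/* Assumed sizes: 4 KiB pages, 8-byte ToPA entries (x86_64). */
#define EX_PAGE_SIZE  4096
#define EX_ENTRY_SIZE 8
#define EX_TENTS_PER_PAGE (((EX_PAGE_SIZE - 40) / EX_ENTRY_SIZE) - 1)

static_assert(EX_TENTS_PER_PAGE == 506,
              "507 raw slots minus one of slack");
static_assert(EX_TENTS_PER_PAGE * EX_ENTRY_SIZE <= EX_PAGE_SIZE - 48,
              "table plus ~48 bytes of metadata fits in one page");
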
-
-/**
- * struct topa - page-sized ToPA table with metadata at the top
- * @table:     actual ToPA table entries, as understood by PT hardware
- * @list:      linkage to struct pt_buffer's list of tables
- * @phys:      physical address of this page
- * @offset:    offset of the first entry in this table in the buffer
- * @size:      total size of all entries in this table
- * @last:      index of the last initialized entry in this table
- */
-struct topa {
-       struct topa_entry       table[TENTS_PER_PAGE];
-       struct list_head        list;
-       u64                     phys;
-       u64                     offset;
-       size_t                  size;
-       int                     last;
-};
-
-/* make -1 stand for the last table entry */
-#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
-
-/**
- * topa_alloc() - allocate page-sized ToPA table
- * @cpu:       CPU on which to allocate.
- * @gfp:       Allocation flags.
- *
- * Return:     On success, return the pointer to ToPA table page.
- */
-static struct topa *topa_alloc(int cpu, gfp_t gfp)
-{
-       int node = cpu_to_node(cpu);
-       struct topa *topa;
-       struct page *p;
-
-       p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
-       if (!p)
-               return NULL;
-
-       topa = page_address(p);
-       topa->last = 0;
-       topa->phys = page_to_phys(p);
-
-       /*
-        * In case of single-entry ToPA, always put the self-referencing END
-        * link as the 2nd entry in the table.
-        */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
-               TOPA_ENTRY(topa, 1)->end = 1;
-       }
-
-       return topa;
-}
-
-/**
- * topa_free() - free a page-sized ToPA table
- * @topa:      Table to deallocate.
- */
-static void topa_free(struct topa *topa)
-{
-       free_page((unsigned long)topa);
-}
-
-/**
- * topa_insert_table() - insert a ToPA table into a buffer
- * @buf:        PT buffer that's being extended.
- * @topa:       New topa table to be inserted.
- *
- * If it's the first table in this buffer, set up buffer's pointers
- * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
- * current "last" table and adjust the last table pointer to @topa.
- */
-static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
-{
-       struct topa *last = buf->last;
-
-       list_add_tail(&topa->list, &buf->tables);
-
-       if (!buf->first) {
-               buf->first = buf->last = buf->cur = topa;
-               return;
-       }
-
-       topa->offset = last->offset + last->size;
-       buf->last = topa;
-
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return;
-
-       BUG_ON(last->last != TENTS_PER_PAGE - 1);
-
-       TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
-       TOPA_ENTRY(last, -1)->end = 1;
-}
-
-/**
- * topa_table_full() - check if a ToPA table is filled up
- * @topa:      ToPA table.
- */
-static bool topa_table_full(struct topa *topa)
-{
-       /* single-entry ToPA is a special case */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return !!topa->last;
-
-       return topa->last == TENTS_PER_PAGE - 1;
-}
-
-/**
- * topa_insert_pages() - create a list of ToPA tables
- * @buf:       PT buffer being initialized.
- * @gfp:       Allocation flags.
- *
- * This initializes a list of ToPA tables with entries from
- * the data_pages provided by rb_alloc_aux().
- *
- * Return:     0 on success or error code.
- */
-static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
-{
-       struct topa *topa = buf->last;
-       int order = 0;
-       struct page *p;
-
-       p = virt_to_page(buf->data_pages[buf->nr_pages]);
-       if (PagePrivate(p))
-               order = page_private(p);
-
-       if (topa_table_full(topa)) {
-               topa = topa_alloc(buf->cpu, gfp);
-               if (!topa)
-                       return -ENOMEM;
-
-               topa_insert_table(buf, topa);
-       }
-
-       TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
-       TOPA_ENTRY(topa, -1)->size = order;
-       if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(topa, -1)->intr = 1;
-               TOPA_ENTRY(topa, -1)->stop = 1;
-       }
-
-       topa->last++;
-       topa->size += sizes(order);
-
-       buf->nr_pages += 1ul << order;
-
-       return 0;
-}
-
-/**
- * pt_topa_dump() - print ToPA tables and their entries
- * @buf:       PT buffer.
- */
-static void pt_topa_dump(struct pt_buffer *buf)
-{
-       struct topa *topa;
-
-       list_for_each_entry(topa, &buf->tables, list) {
-               int i;
-
-               pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
-                        topa->phys, topa->offset, topa->size);
-               for (i = 0; i < TENTS_PER_PAGE; i++) {
-                       pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
-                                &topa->table[i],
-                                (unsigned long)topa->table[i].base << TOPA_SHIFT,
-                                sizes(topa->table[i].size),
-                                topa->table[i].end ?  'E' : ' ',
-                                topa->table[i].intr ? 'I' : ' ',
-                                topa->table[i].stop ? 'S' : ' ',
-                                *(u64 *)&topa->table[i]);
-                       if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
-                            topa->table[i].stop) ||
-                           topa->table[i].end)
-                               break;
-               }
-       }
-}
-
-/**
- * pt_buffer_advance() - advance to the next output region
- * @buf:       PT buffer.
- *
- * Advance the current pointers in the buffer to the next ToPA entry.
- */
-static void pt_buffer_advance(struct pt_buffer *buf)
-{
-       buf->output_off = 0;
-       buf->cur_idx++;
-
-       if (buf->cur_idx == buf->cur->last) {
-               if (buf->cur == buf->last)
-                       buf->cur = buf->first;
-               else
-                       buf->cur = list_entry(buf->cur->list.next, struct topa,
-                                             list);
-               buf->cur_idx = 0;
-       }
-}
-
-/**
- * pt_update_head() - calculate current offsets and sizes
- * @pt:                Per-cpu pt context.
- *
- * Update buffer's current write pointer position and data size.
- */
-static void pt_update_head(struct pt *pt)
-{
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-       u64 topa_idx, base, old;
-
-       /* offset of the first region in this table from the beginning of buf */
-       base = buf->cur->offset + buf->output_off;
-
-       /* offset of the current output region within this table */
-       for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
-               base += sizes(buf->cur->table[topa_idx].size);
-
-       if (buf->snapshot) {
-               local_set(&buf->data_size, base);
-       } else {
-               old = (local64_xchg(&buf->head, base) &
-                      ((buf->nr_pages << PAGE_SHIFT) - 1));
-               if (base < old)
-                       base += buf->nr_pages << PAGE_SHIFT;
-
-               local_add(base - old, &buf->data_size);
-       }
-}
-
-/**
- * pt_buffer_region() - obtain current output region's address
- * @buf:       PT buffer.
- */
-static void *pt_buffer_region(struct pt_buffer *buf)
-{
-       return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
-}
-
-/**
- * pt_buffer_region_size() - obtain current output region's size
- * @buf:       PT buffer.
- */
-static size_t pt_buffer_region_size(struct pt_buffer *buf)
-{
-       return sizes(buf->cur->table[buf->cur_idx].size);
-}
-
-/**
- * pt_handle_status() - take care of possible status conditions
- * @pt:                Per-cpu pt context.
- */
-static void pt_handle_status(struct pt *pt)
-{
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-       int advance = 0;
-       u64 status;
-
-       rdmsrl(MSR_IA32_RTIT_STATUS, status);
-
-       if (status & RTIT_STATUS_ERROR) {
-               pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
-               pt_topa_dump(buf);
-               status &= ~RTIT_STATUS_ERROR;
-       }
-
-       if (status & RTIT_STATUS_STOPPED) {
-               status &= ~RTIT_STATUS_STOPPED;
-
-               /*
-                * On systems that only do single-entry ToPA, hitting STOP
-                * means we are already losing data; need to let the decoder
-                * know.
-                */
-               if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
-                   buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
-                       local_inc(&buf->lost);
-                       advance++;
-               }
-       }
-
-       /*
-        * Also, on single-entry ToPA implementations the interrupt will
-        * arrive before the output reaches its output region's boundary.
-        */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
-           pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
-               void *head = pt_buffer_region(buf);
-
-               /* everything within this margin needs to be zeroed out */
-               memset(head + buf->output_off, 0,
-                      pt_buffer_region_size(buf) -
-                      buf->output_off);
-               advance++;
-       }
-
-       if (advance)
-               pt_buffer_advance(buf);
-
-       wrmsrl(MSR_IA32_RTIT_STATUS, status);
-}
-
-/**
- * pt_read_offset() - translate registers into buffer pointers
- * @buf:       PT buffer.
- *
- * Set buffer's output pointers from MSR values.
- */
-static void pt_read_offset(struct pt_buffer *buf)
-{
-       u64 offset, base_topa;
-
-       rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
-       buf->cur = phys_to_virt(base_topa);
-
-       rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
-       /* offset within current output region */
-       buf->output_off = offset >> 32;
-       /* index of current output region within this table */
-       buf->cur_idx = (offset & 0xffffff80) >> 7;
-}
-
-/**
- * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
- * @buf:       PT buffer.
- * @pg:                Page offset in the buffer.
- *
- * When advancing to the next output region (ToPA entry), given a page offset
- * into the buffer, we need to find the offset of the first page in the next
- * region.
- */
-static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
-{
-       struct topa_entry *te = buf->topa_index[pg];
-
-       /* one region */
-       if (buf->first == buf->last && buf->first->last == 1)
-               return pg;
-
-       do {
-               pg++;
-               pg &= buf->nr_pages - 1;
-       } while (buf->topa_index[pg] == te);
-
-       return pg;
-}
-
-/**
- * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
- * @buf:       PT buffer.
- * @handle:    Current output handle.
- *
- * Place INT and STOP marks to prevent the hardware from overwriting old data
- * that the consumer hasn't yet collected, and to wake up the consumer after a
- * certain fraction of the buffer has filled up. Only needed and sensible for
- * non-snapshot counters.
- *
- * This obviously relies on buf::head to figure out buffer markers, so it has
- * to be called after pt_buffer_reset_offsets() and before the hardware tracing
- * is enabled.
- */
-static int pt_buffer_reset_markers(struct pt_buffer *buf,
-                                  struct perf_output_handle *handle)
-
-{
-       unsigned long head = local64_read(&buf->head);
-       unsigned long idx, npages, wakeup;
-
-       /* can't stop in the middle of an output region */
-       if (buf->output_off + handle->size + 1 <
-           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
-               return -EINVAL;
-
-
-       /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return 0;
-
-       /* clear STOP and INT from current entry */
-       buf->topa_index[buf->stop_pos]->stop = 0;
-       buf->topa_index[buf->intr_pos]->intr = 0;
-
-       /* how many pages till the STOP marker */
-       npages = handle->size >> PAGE_SHIFT;
-
-       /* if it's on a page boundary, fill up one more page */
-       if (!offset_in_page(head + handle->size + 1))
-               npages++;
-
-       idx = (head >> PAGE_SHIFT) + npages;
-       idx &= buf->nr_pages - 1;
-       buf->stop_pos = idx;
-
-       wakeup = handle->wakeup >> PAGE_SHIFT;
-
-       /* in the worst case, wake up the consumer one page before hard stop */
-       idx = (head >> PAGE_SHIFT) + npages - 1;
-       if (idx > wakeup)
-               idx = wakeup;
-
-       idx &= buf->nr_pages - 1;
-       buf->intr_pos = idx;
-
-       buf->topa_index[buf->stop_pos]->stop = 1;
-       buf->topa_index[buf->intr_pos]->intr = 1;
-
-       return 0;
-}
-
-/**
- * pt_buffer_setup_topa_index() - build topa_index[] table of regions
- * @buf:       PT buffer.
- *
- * topa_index[] references output regions indexed by offset into the
- * buffer for purposes of quick reverse lookup.
- */
-static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
-{
-       struct topa *cur = buf->first, *prev = buf->last;
-       struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
-               *te_prev = TOPA_ENTRY(prev, prev->last - 1);
-       int pg = 0, idx = 0;
-
-       while (pg < buf->nr_pages) {
-               int tidx;
-
-               /* pages within one topa entry */
-               for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
-                       buf->topa_index[pg] = te_prev;
-
-               te_prev = te_cur;
-
-               if (idx == cur->last - 1) {
-                       /* advance to next topa table */
-                       idx = 0;
-                       cur = list_entry(cur->list.next, struct topa, list);
-               } else {
-                       idx++;
-               }
-               te_cur = TOPA_ENTRY(cur, idx);
-       }
-
-}
-
-/**
- * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
- * @buf:       PT buffer.
- * @head:      Write pointer (aux_head) from AUX buffer.
- *
- * Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly. This is done after we have obtained the
- * current aux_head position from a successful call to perf_aux_output_begin()
- * to make sure the hardware is writing to the right place.
- *
- * This function modifies buf::{cur,cur_idx,output_off}, which will be
- * programmed into the PT MSRs when tracing is enabled, as well as buf::head
- * and buf::data_size, which are used to determine the INT and STOP markers'
- * locations by a subsequent call to pt_buffer_reset_markers().
- */
-static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
-{
-       int pg;
-
-       if (buf->snapshot)
-               head &= (buf->nr_pages << PAGE_SHIFT) - 1;
-
-       pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
-       pg = pt_topa_next_entry(buf, pg);
-
-       buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
-       buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
-                       (unsigned long)buf->cur) / sizeof(struct topa_entry);
-       buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
-
-       local64_set(&buf->head, head);
-       local_set(&buf->data_size, 0);
-}
-
-/**
- * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
- * @buf:       PT buffer.
- */
-static void pt_buffer_fini_topa(struct pt_buffer *buf)
-{
-       struct topa *topa, *iter;
-
-       list_for_each_entry_safe(topa, iter, &buf->tables, list) {
-               /*
-                * right now, this is in free_aux() path only, so
-                * no need to unlink this table from the list
-                */
-               topa_free(topa);
-       }
-}
-
-/**
- * pt_buffer_init_topa() - initialize ToPA table for pt buffer
- * @buf:       PT buffer.
- * @size:      Total size of all regions within this ToPA.
- * @gfp:       Allocation flags.
- */
-static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
-                              gfp_t gfp)
-{
-       struct topa *topa;
-       int err;
-
-       topa = topa_alloc(buf->cpu, gfp);
-       if (!topa)
-               return -ENOMEM;
-
-       topa_insert_table(buf, topa);
-
-       while (buf->nr_pages < nr_pages) {
-               err = topa_insert_pages(buf, gfp);
-               if (err) {
-                       pt_buffer_fini_topa(buf);
-                       return -ENOMEM;
-               }
-       }
-
-       pt_buffer_setup_topa_index(buf);
-
-       /* link last table to the first one, unless we're double buffering */
-       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
-               TOPA_ENTRY(buf->last, -1)->end = 1;
-       }
-
-       pt_topa_dump(buf);
-       return 0;
-}
-
-/**
- * pt_buffer_setup_aux() - set up topa tables for a PT buffer
- * @cpu:       Cpu on which to allocate, -1 means current.
- * @pages:     Array of pointers to buffer pages passed from perf core.
- * @nr_pages:  Number of pages in the buffer.
- * @snapshot:  If this is a snapshot/overwrite counter.
- *
- * This is a pmu::setup_aux callback that sets up ToPA tables and all the
- * bookkeeping for an AUX buffer.
- *
- * Return:     Our private PT buffer structure.
- */
-static void *
-pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
-{
-       struct pt_buffer *buf;
-       int node, ret;
-
-       if (!nr_pages)
-               return NULL;
-
-       if (cpu == -1)
-               cpu = raw_smp_processor_id();
-       node = cpu_to_node(cpu);
-
-       buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
-                          GFP_KERNEL, node);
-       if (!buf)
-               return NULL;
-
-       buf->cpu = cpu;
-       buf->snapshot = snapshot;
-       buf->data_pages = pages;
-
-       INIT_LIST_HEAD(&buf->tables);
-
-       ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
-       if (ret) {
-               kfree(buf);
-               return NULL;
-       }
-
-       return buf;
-}
-
-/**
- * pt_buffer_free_aux() - perf AUX deallocation path callback
- * @data:      PT buffer.
- */
-static void pt_buffer_free_aux(void *data)
-{
-       struct pt_buffer *buf = data;
-
-       pt_buffer_fini_topa(buf);
-       kfree(buf);
-}
-
-/**
- * pt_buffer_is_full() - check if the buffer is full
- * @buf:       PT buffer.
- * @pt:                Per-cpu pt handle.
- *
- * If the user hasn't read data from the output region that aux_head
- * points to, the buffer is considered full: the user needs to read at
- * least this region and update aux_tail to point past it.
- */
-static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
-{
-       if (buf->snapshot)
-               return false;
-
-       if (local_read(&buf->data_size) >= pt->handle.size)
-               return true;
-
-       return false;
-}
-
-/**
- * intel_pt_interrupt() - PT PMI handler
- */
-void intel_pt_interrupt(void)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf;
-       struct perf_event *event = pt->handle.event;
-
-       /*
-        * There may be a dangling PT bit in the interrupt status register
-        * after PT has been disabled by pt_event_stop(). Make sure we don't
-        * do anything (particularly, re-enable) for this event here.
-        */
-       if (!ACCESS_ONCE(pt->handle_nmi))
-               return;
-
-       pt_config_start(false);
-
-       if (!event)
-               return;
-
-       buf = perf_get_aux(&pt->handle);
-       if (!buf)
-               return;
-
-       pt_read_offset(buf);
-
-       pt_handle_status(pt);
-
-       pt_update_head(pt);
-
-       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                           local_xchg(&buf->lost, 0));
-
-       if (!event->hw.state) {
-               int ret;
-
-               buf = perf_aux_output_begin(&pt->handle, event);
-               if (!buf) {
-                       event->hw.state = PERF_HES_STOPPED;
-                       return;
-               }
-
-               pt_buffer_reset_offsets(buf, pt->handle.head);
-               /* snapshot counters don't use PMI, so it's safe */
-               ret = pt_buffer_reset_markers(buf, &pt->handle);
-               if (ret) {
-                       perf_aux_output_end(&pt->handle, 0, true);
-                       return;
-               }
-
-               pt_config_buffer(buf->cur->table, buf->cur_idx,
-                                buf->output_off);
-               pt_config(event);
-       }
-}
-
-/*
- * PMU callbacks
- */
-
-static void pt_event_start(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-
-       if (!buf || pt_buffer_is_full(buf, pt)) {
-               event->hw.state = PERF_HES_STOPPED;
-               return;
-       }
-
-       ACCESS_ONCE(pt->handle_nmi) = 1;
-       event->hw.state = 0;
-
-       pt_config_buffer(buf->cur->table, buf->cur_idx,
-                        buf->output_off);
-       pt_config(event);
-}
-
-static void pt_event_stop(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-
-       /*
-        * Protect against the PMI racing with disabling wrmsr,
-        * see comment in intel_pt_interrupt().
-        */
-       ACCESS_ONCE(pt->handle_nmi) = 0;
-       pt_config_start(false);
-
-       if (event->hw.state == PERF_HES_STOPPED)
-               return;
-
-       event->hw.state = PERF_HES_STOPPED;
-
-       if (mode & PERF_EF_UPDATE) {
-               struct pt_buffer *buf = perf_get_aux(&pt->handle);
-
-               if (!buf)
-                       return;
-
-               if (WARN_ON_ONCE(pt->handle.event != event))
-                       return;
-
-               pt_read_offset(buf);
-
-               pt_handle_status(pt);
-
-               pt_update_head(pt);
-       }
-}
-
-static void pt_event_del(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf;
-
-       pt_event_stop(event, PERF_EF_UPDATE);
-
-       buf = perf_get_aux(&pt->handle);
-
-       if (buf) {
-               if (buf->snapshot)
-                       pt->handle.head =
-                               local_xchg(&buf->data_size,
-                                          buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                                   local_xchg(&buf->lost, 0));
-       }
-}
-
-static int pt_event_add(struct perf_event *event, int mode)
-{
-       struct pt_buffer *buf;
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct hw_perf_event *hwc = &event->hw;
-       int ret = -EBUSY;
-
-       if (pt->handle.event)
-               goto fail;
-
-       buf = perf_aux_output_begin(&pt->handle, event);
-       ret = -EINVAL;
-       if (!buf)
-               goto fail_stop;
-
-       pt_buffer_reset_offsets(buf, pt->handle.head);
-       if (!buf->snapshot) {
-               ret = pt_buffer_reset_markers(buf, &pt->handle);
-               if (ret)
-                       goto fail_end_stop;
-       }
-
-       if (mode & PERF_EF_START) {
-               pt_event_start(event, 0);
-               ret = -EBUSY;
-               if (hwc->state == PERF_HES_STOPPED)
-                       goto fail_end_stop;
-       } else {
-               hwc->state = PERF_HES_STOPPED;
-       }
-
-       return 0;
-
-fail_end_stop:
-       perf_aux_output_end(&pt->handle, 0, true);
-fail_stop:
-       hwc->state = PERF_HES_STOPPED;
-fail:
-       return ret;
-}
-
-static void pt_event_read(struct perf_event *event)
-{
-}
-
-static void pt_event_destroy(struct perf_event *event)
-{
-       x86_del_exclusive(x86_lbr_exclusive_pt);
-}
-
-static int pt_event_init(struct perf_event *event)
-{
-       if (event->attr.type != pt_pmu.pmu.type)
-               return -ENOENT;
-
-       if (!pt_event_valid(event))
-               return -EINVAL;
-
-       if (x86_add_exclusive(x86_lbr_exclusive_pt))
-               return -EBUSY;
-
-       event->destroy = pt_event_destroy;
-
-       return 0;
-}
-
-void cpu_emergency_stop_pt(void)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-
-       if (pt->handle.event)
-               pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
-}
-
-static __init int pt_init(void)
-{
-       int ret, cpu, prior_warn = 0;
-
-       BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
-
-       if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
-               return -ENODEV;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu) {
-               u64 ctl;
-
-               ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
-               if (!ret && (ctl & RTIT_CTL_TRACEEN))
-                       prior_warn++;
-       }
-       put_online_cpus();
-
-       if (prior_warn) {
-               x86_add_exclusive(x86_lbr_exclusive_pt);
-               pr_warn("PT is enabled at boot time, doing nothing\n");
-
-               return -EBUSY;
-       }
-
-       ret = pt_pmu_hw_init();
-       if (ret)
-               return ret;
-
-       if (!pt_cap_get(PT_CAP_topa_output)) {
-               pr_warn("ToPA output is not supported on this CPU\n");
-               return -ENODEV;
-       }
-
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               pt_pmu.pmu.capabilities =
-                       PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
-
-       pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
-       pt_pmu.pmu.attr_groups  = pt_attr_groups;
-       pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
-       pt_pmu.pmu.event_init   = pt_event_init;
-       pt_pmu.pmu.add          = pt_event_add;
-       pt_pmu.pmu.del          = pt_event_del;
-       pt_pmu.pmu.start        = pt_event_start;
-       pt_pmu.pmu.stop         = pt_event_stop;
-       pt_pmu.pmu.read         = pt_event_read;
-       pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
-       pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
-       ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
-
-       return ret;
-}
-arch_initcall(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
deleted file mode 100644 (file)
index 24a351a..0000000
+++ /dev/null
@@ -1,783 +0,0 @@
-/*
- * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
- * Copyright (C) 2013 Google, Inc., Stephane Eranian
- *
- * Intel RAPL interface is specified in the IA-32 Manual Vol3b
- * section 14.7.1 (September 2013)
- *
- * RAPL provides more controls than just reporting energy consumption;
- * here, however, we only expose the free-running energy consumption
- * counters (pp0, pkg, dram, gpu).
- *
- * Each of those counters increments in an energy unit defined by the
- * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
- * but it can vary.
- *
- * Counter to rapl events mappings:
- *
- *  pp0 counter: consumption of all physical cores (power plane 0)
- *       event: rapl_energy_cores
- *    perf code: 0x1
- *
- *  pkg counter: consumption of the whole processor package
- *       event: rapl_energy_pkg
- *    perf code: 0x2
- *
- * dram counter: consumption of the dram domain (servers only)
- *       event: rapl_energy_dram
- *    perf code: 0x3
- *
- *  gpu counter: consumption of the builtin-gpu domain (client only)
- *       event: rapl_energy_gpu
- *    perf code: 0x4
- *
- * We manage those counters as free running (read-only). They may be
- * used simultaneously by other tools, such as turbostat.
- *
- * The events only support system-wide mode counting. There is no
- * sampling support because it does not make sense and is not
- * supported by the RAPL hardware.
- *
- * Because we want to avoid floating-point operations in the kernel,
- * the events are all reported in fixed-point arithmetic (32.32),
- * i.e. in units of 2^-32 Joules. Tools can convert a count to Joules
- * with a function such as ldexp(raw_count, -32), and to Watts by
- * dividing by the duration of the measurement.
- */
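
A minimal user-space sketch of the 32.32 fixed-point convention described above: counts are in units of 2^-32 Joules, so a tool converts a count to Joules with ldexp() and to average Watts by dividing by the measurement interval. The function below is illustrative only, not part of this driver:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Convert a 32.32 fixed-point energy count into Joules and average Watts. */
static void report_energy(uint64_t raw_count, double seconds)
{
        double joules = ldexp((double)raw_count, -32);  /* count * 2^-32 */

        printf("%.6f J, %.6f W over %.2f s\n",
               joules, joules / seconds, seconds);
}

int main(void)
{
        report_energy(42949672960ULL, 1.0);     /* 10 * 2^32 -> 10 J, 10 W */
        return 0;
}
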
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-/*
- * RAPL energy status counters
- */
-#define RAPL_IDX_PP0_NRG_STAT  0       /* all cores */
-#define INTEL_RAPL_PP0         0x1     /* pseudo-encoding */
-#define RAPL_IDX_PKG_NRG_STAT  1       /* entire package */
-#define INTEL_RAPL_PKG         0x2     /* pseudo-encoding */
-#define RAPL_IDX_RAM_NRG_STAT  2       /* DRAM */
-#define INTEL_RAPL_RAM         0x3     /* pseudo-encoding */
-#define RAPL_IDX_PP1_NRG_STAT  3       /* gpu */
-#define INTEL_RAPL_PP1         0x4     /* pseudo-encoding */
-
-#define NR_RAPL_DOMAINS         0x4
-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
-       "pp0-core",
-       "package",
-       "dram",
-       "pp1-gpu",
-};
-
-/* Clients have PP0, PKG */
-#define RAPL_IDX_CLN   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_PP1_NRG_STAT)
-
-/* Servers have PP0, PKG, RAM */
-#define RAPL_IDX_SRV   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT)
-
-/* Servers have PP0, PKG, RAM, PP1 */
-#define RAPL_IDX_HSW   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT|\
-                        1<<RAPL_IDX_PP1_NRG_STAT)
-
-/* Knights Landing has PKG, RAM */
-#define RAPL_IDX_KNL   (1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT)
-
-/*
- * event code: LSB 8 bits, passed in attr->config;
- * any other bit is reserved
- */
-#define RAPL_EVENT_MASK        0xFFULL
-
-#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)          \
-static ssize_t __rapl_##_var##_show(struct kobject *kobj,      \
-                               struct kobj_attribute *attr,    \
-                               char *page)                     \
-{                                                              \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
-       return sprintf(page, _format "\n");                     \
-}                                                              \
-static struct kobj_attribute format_attr_##_var =              \
-       __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-
-#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
-
-#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
-static struct perf_pmu_events_attr event_attr_##v = {                          \
-       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
-       .id             = 0,                                                    \
-       .event_str      = str,                                                  \
-};
-
-struct rapl_pmu {
-       spinlock_t       lock;
-       int              n_active; /* number of active events */
-       struct list_head active_list;
-       struct pmu       *pmu; /* pointer to rapl_pmu_class */
-       ktime_t          timer_interval; /* in ktime_t unit */
-       struct hrtimer   hrtimer;
-};
-
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;  /* 1/2^hw_unit Joule */
-static struct pmu rapl_pmu_class;
-static cpumask_t rapl_cpu_mask;
-static int rapl_cntr_mask;
-
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
-
-static struct x86_pmu_quirk *rapl_quirks;
-static inline u64 rapl_read_counter(struct perf_event *event)
-{
-       u64 raw;
-       rdmsrl(event->hw.event_base, raw);
-       return raw;
-}
-
-#define rapl_add_quirk(func_)                                          \
-do {                                                                   \
-       static struct x86_pmu_quirk __quirk __initdata = {              \
-               .func = func_,                                          \
-       };                                                              \
-       __quirk.next = rapl_quirks;                                     \
-       rapl_quirks = &__quirk;                                         \
-} while (0)
-
-static inline u64 rapl_scale(u64 v, int cfg)
-{
-       if (cfg > NR_RAPL_DOMAINS) {
-               pr_warn("invalid domain %d, failed to scale data\n", cfg);
-               return v;
-       }
-       /*
-        * Scale the delta to the smallest common unit (1/2^32 Joule).
-        * Users must then scale back: count * 1/2^32 to get Joules,
-        * or use ldexp(count, -32).
-        * Watts = Joules / time delta
-        */
-       return v << (32 - rapl_hw_unit[cfg - 1]);
-}
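
For instance, if RAPL_POWER_UNIT reports an energy unit of 1/2^16 J (rapl_hw_unit = 16, the SandyBridge default mentioned in the header comment), rapl_scale() shifts the raw delta left by 32 - 16 = 16 bits, so each reported increment is worth exactly 2^-32 J and all domains end up in the same unit regardless of their hardware unit.
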
-
-static u64 rapl_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev_raw_count, new_raw_count;
-       s64 delta, sdelta;
-       int shift = RAPL_CNTR_WIDTH;
-
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       rdmsrl(event->hw.event_base, new_raw_count);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                           new_raw_count) != prev_raw_count) {
-               cpu_relax();
-               goto again;
-       }
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * count already. We can now calculate the delta and add it
-        * to the generic event count.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       sdelta = rapl_scale(delta, event->hw.config);
-
-       local64_add(sdelta, &event->count);
-
-       return new_raw_count;
-}
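
The shift-left-then-shift-right pair above is the usual trick for sign-extending a counter narrower than 64 bits so that a wraparound still yields the right delta. A stand-alone sketch with the same 32-bit counter width (illustrative values, mirroring the kernel's arithmetic):

#include <stdint.h>
#include <stdio.h>

/* Delta of a 32-bit free-running counter, robust against one wraparound. */
static int64_t counter_delta(uint64_t prev, uint64_t now)
{
        const int shift = 64 - 32;              /* RAPL_CNTR_WIDTH == 32 */
        int64_t delta = (int64_t)(now << shift) - (int64_t)(prev << shift);

        return delta >> shift;                  /* arithmetic shift back down */
}

int main(void)
{
        /* counter wrapped from 0xfffffff0 to 0x00000010: delta is 0x20 */
        printf("delta = %lld\n", (long long)counter_delta(0xfffffff0, 0x10));
        return 0;
}
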
-
-static void rapl_start_hrtimer(struct rapl_pmu *pmu)
-{
-       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
-                    HRTIMER_MODE_REL_PINNED);
-}
-
-static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
-{
-       hrtimer_cancel(&pmu->hrtimer);
-}
-
-static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct perf_event *event;
-       unsigned long flags;
-
-       if (!pmu->n_active)
-               return HRTIMER_NORESTART;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       list_for_each_entry(event, &pmu->active_list, active_entry) {
-               rapl_event_update(event);
-       }
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-
-       hrtimer_forward_now(hrtimer, pmu->timer_interval);
-
-       return HRTIMER_RESTART;
-}
-
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
-{
-       struct hrtimer *hr = &pmu->hrtimer;
-
-       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hr->function = rapl_hrtimer_handle;
-}
-
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
-                                  struct perf_event *event)
-{
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       event->hw.state = 0;
-
-       list_add_tail(&event->active_entry, &pmu->active_list);
-
-       local64_set(&event->hw.prev_count, rapl_read_counter(event));
-
-       pmu->n_active++;
-       if (pmu->n_active == 1)
-               rapl_start_hrtimer(pmu);
-}
-
-static void rapl_pmu_event_start(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-       __rapl_pmu_event_start(pmu, event);
-       spin_unlock_irqrestore(&pmu->lock, flags);
-}
-
-static void rapl_pmu_event_stop(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       /* mark event as deactivated and stopped */
-       if (!(hwc->state & PERF_HES_STOPPED)) {
-               WARN_ON_ONCE(pmu->n_active <= 0);
-               pmu->n_active--;
-               if (pmu->n_active == 0)
-                       rapl_stop_hrtimer(pmu);
-
-               list_del(&event->active_entry);
-
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       /* check if update of sw counter is necessary */
-       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of an event
-                * that we are disabling:
-                */
-               rapl_event_update(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-}
-
-static int rapl_pmu_event_add(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       if (mode & PERF_EF_START)
-               __rapl_pmu_event_start(pmu, event);
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-
-       return 0;
-}
-
-static void rapl_pmu_event_del(struct perf_event *event, int flags)
-{
-       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int rapl_pmu_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
-       int bit, msr, ret = 0;
-
-       /* only look at RAPL events */
-       if (event->attr.type != rapl_pmu_class.type)
-               return -ENOENT;
-
-       /* check only supported bits are set */
-       if (event->attr.config & ~RAPL_EVENT_MASK)
-               return -EINVAL;
-
-       /*
-        * check event is known (determines counter)
-        */
-       switch (cfg) {
-       case INTEL_RAPL_PP0:
-               bit = RAPL_IDX_PP0_NRG_STAT;
-               msr = MSR_PP0_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_PKG:
-               bit = RAPL_IDX_PKG_NRG_STAT;
-               msr = MSR_PKG_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_RAM:
-               bit = RAPL_IDX_RAM_NRG_STAT;
-               msr = MSR_DRAM_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_PP1:
-               bit = RAPL_IDX_PP1_NRG_STAT;
-               msr = MSR_PP1_ENERGY_STATUS;
-               break;
-       default:
-               return -EINVAL;
-       }
-       /* check event supported */
-       if (!(rapl_cntr_mask & (1 << bit)))
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       /* must be done before validate_group */
-       event->hw.event_base = msr;
-       event->hw.config = cfg;
-       event->hw.idx = bit;
-
-       return ret;
-}
-
-static void rapl_pmu_event_read(struct perf_event *event)
-{
-       rapl_event_update(event);
-}
-
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
-
-static struct attribute *rapl_pmu_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_attr_group = {
-       .attrs = rapl_pmu_attrs,
-};
-
-RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
-RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
-RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
-RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
-
-RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
-
-/*
- * We report energy in 2^-32 Joule (~0.23 nJ) increments regardless of
- * the hardware unit advertised by the RAPL MSR.
- */
-RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
-
-static struct attribute *rapl_events_srv_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_cln_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_gpu),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_gpu_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_gpu_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_hsw_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_gpu),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_gpu_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_gpu_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_knl_attr[] = {
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_events_group = {
-       .name = "events",
-       .attrs = NULL, /* patched at runtime */
-};
-
-DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
-static struct attribute *rapl_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_format_group = {
-       .name = "format",
-       .attrs = rapl_formats_attr,
-};
-
-const struct attribute_group *rapl_attr_groups[] = {
-       &rapl_pmu_attr_group,
-       &rapl_pmu_format_group,
-       &rapl_pmu_events_group,
-       NULL,
-};
-
-static struct pmu rapl_pmu_class = {
-       .attr_groups    = rapl_attr_groups,
-       .task_ctx_nr    = perf_invalid_context, /* system-wide only */
-       .event_init     = rapl_pmu_event_init,
-       .add            = rapl_pmu_event_add, /* must have */
-       .del            = rapl_pmu_event_del, /* must have */
-       .start          = rapl_pmu_event_start,
-       .stop           = rapl_pmu_event_stop,
-       .read           = rapl_pmu_event_read,
-};
-
-static void rapl_cpu_exit(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-       int i, phys_id = topology_physical_package_id(cpu);
-       int target = -1;
-
-       /* find a new cpu on same package */
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-               if (phys_id == topology_physical_package_id(i)) {
-                       target = i;
-                       break;
-               }
-       }
-       /*
-        * Clear the cpu from the cpumask; if it was set and another cpu
-        * remains on the same package, hand the duty over to that cpu.
-        */
-       if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
-               cpumask_set_cpu(target, &rapl_cpu_mask);
-
-       WARN_ON(cpumask_empty(&rapl_cpu_mask));
-       /*
-        * migrate events and context to new cpu
-        */
-       if (target >= 0)
-               perf_pmu_migrate_context(pmu->pmu, cpu, target);
-
-       /* cancel overflow polling timer for CPU */
-       rapl_stop_hrtimer(pmu);
-}
-
-static void rapl_cpu_init(int cpu)
-{
-       int i, phys_id = topology_physical_package_id(cpu);
-
-       /* check if phys_id is already covered */
-       for_each_cpu(i, &rapl_cpu_mask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return;
-       }
-       /* was not found, so add it */
-       cpumask_set_cpu(cpu, &rapl_cpu_mask);
-}
-
-static __init void rapl_hsw_server_quirk(void)
-{
-       /*
-        * The DRAM domain on HSW servers has a fixed energy unit which can
-        * differ from the unit reported by the power unit MSR. See
-        * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
-        * of 2. Datasheet, September 2014, Reference Number: 330784-001"
-        */
-       rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
-}
-
-static int rapl_cpu_prepare(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-       int phys_id = topology_physical_package_id(cpu);
-       u64 ms;
-
-       if (pmu)
-               return 0;
-
-       if (phys_id < 0)
-               return -1;
-
-       pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
-       if (!pmu)
-               return -1;
-       spin_lock_init(&pmu->lock);
-
-       INIT_LIST_HEAD(&pmu->active_list);
-
-       pmu->pmu = &rapl_pmu_class;
-
-       /*
-        * Use a reference power of 200W (200 Joules/sec) to scale the
-        * polling timeout so that counter overflows are not missed, and
-        * halve the interval to avoid running in lockstep with the
-        * overflow (2 * 100). If the hw unit is 32, this works out to
-        * 2 ms (1/200/2).
-        */
-       if (rapl_hw_unit[0] < 32)
-               ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
-       else
-               ms = 2;
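-       /*
-        * Worked example (assuming a typical energy unit exponent of 14,
-        * i.e. 2^-14 Joules per count): ms = 5 * 2^(32 - 14 - 1) = 655360,
-        * roughly half the ~22 minutes a 32-bit counter needs to wrap at
-        * a sustained 200W.
-        */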
-
-       pmu->timer_interval = ms_to_ktime(ms);
-
-       rapl_hrtimer_init(pmu);
-
-       /* set RAPL pmu for this cpu for now */
-       per_cpu(rapl_pmu, cpu) = pmu;
-       per_cpu(rapl_pmu_to_free, cpu) = NULL;
-
-       return 0;
-}
-
-static void rapl_cpu_kfree(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
-
-       kfree(pmu);
-
-       per_cpu(rapl_pmu_to_free, cpu) = NULL;
-}
-
-static int rapl_cpu_dying(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-
-       if (!pmu)
-               return 0;
-
-       per_cpu(rapl_pmu, cpu) = NULL;
-
-       per_cpu(rapl_pmu_to_free, cpu) = pmu;
-
-       return 0;
-}
-
-static int rapl_cpu_notifier(struct notifier_block *self,
-                            unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               rapl_cpu_prepare(cpu);
-               break;
-       case CPU_STARTING:
-               rapl_cpu_init(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               rapl_cpu_dying(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               rapl_cpu_kfree(cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               rapl_cpu_exit(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static int rapl_check_hw_unit(void)
-{
-       u64 msr_rapl_power_unit_bits;
-       int i;
-
-       /* protect rdmsrl() to handle virtualization */
-       if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
-               return -1;
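-
-       /*
-        * Bits 12:8 of MSR_RAPL_POWER_UNIT encode the energy status unit
-        * as an exponent: one counter increment corresponds to 2^-unit
-        * Joules (per the Intel SDM).
-        */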
-       for (i = 0; i < NR_RAPL_DOMAINS; i++)
-               rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
-
-       return 0;
-}
-
-static const struct x86_cpu_id rapl_cpu_match[] = {
-       [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
-       [1] = {},
-};
-
-static int __init rapl_pmu_init(void)
-{
-       struct rapl_pmu *pmu;
-       int cpu, ret;
-       struct x86_pmu_quirk *quirk;
-       int i;
-
-       /*
-        * check for Intel processor family 6
-        */
-       if (!x86_match_cpu(rapl_cpu_match))
-               return 0;
-
-       /* check supported CPU */
-       switch (boot_cpu_data.x86_model) {
-       case 42: /* Sandy Bridge */
-       case 58: /* Ivy Bridge */
-               rapl_cntr_mask = RAPL_IDX_CLN;
-               rapl_pmu_events_group.attrs = rapl_events_cln_attr;
-               break;
-       case 63: /* Haswell-Server */
-               rapl_add_quirk(rapl_hsw_server_quirk);
-               rapl_cntr_mask = RAPL_IDX_SRV;
-               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-               break;
-       case 60: /* Haswell */
-       case 69: /* Haswell-Celeron */
-       case 61: /* Broadwell */
-               rapl_cntr_mask = RAPL_IDX_HSW;
-               rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
-               break;
-       case 45: /* Sandy Bridge-EP */
-       case 62: /* IvyTown */
-               rapl_cntr_mask = RAPL_IDX_SRV;
-               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-               break;
-       case 87: /* Knights Landing */
-               rapl_add_quirk(rapl_hsw_server_quirk);
-               rapl_cntr_mask = RAPL_IDX_KNL;
-               rapl_pmu_events_group.attrs = rapl_events_knl_attr;
-               break;
-       default:
-               /* unsupported */
-               return 0;
-       }
-       ret = rapl_check_hw_unit();
-       if (ret)
-               return ret;
-
-       /* run cpu model quirks */
-       for (quirk = rapl_quirks; quirk; quirk = quirk->next)
-               quirk->func();
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               ret = rapl_cpu_prepare(cpu);
-               if (ret)
-                       goto out;
-               rapl_cpu_init(cpu);
-       }
-
-       __perf_cpu_notifier(rapl_cpu_notifier);
-
-       ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
-       if (WARN_ON(ret)) {
-               pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
-               cpu_notifier_register_done();
-               return -1;
-       }
-
-       pmu = __this_cpu_read(rapl_pmu);
-
-       pr_info("RAPL PMU detected,"
-               " API unit is 2^-32 Joules,"
-               " %d fixed counters,"
-               " %llu ms ovfl timer\n",
-               hweight32(rapl_cntr_mask),
-               ktime_to_ms(pmu->timer_interval));
-       for (i = 0; i < NR_RAPL_DOMAINS; i++) {
-               if (rapl_cntr_mask & (1 << i)) {
-                       pr_info("hw unit of domain %s 2^-%d Joules\n",
-                               rapl_domain_names[i], rapl_hw_unit[i]);
-               }
-       }
-out:
-       cpu_notifier_register_done();
-
-       return 0;
-}
-device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
deleted file mode 100644 (file)
index 3bf41d4..0000000
+++ /dev/null
@@ -1,1401 +0,0 @@
-#include "perf_event_intel_uncore.h"
-
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
-struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
-struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
-
-static bool pcidrv_registered;
-struct pci_driver *uncore_pci_driver;
-/* pci bus to socket mapping */
-DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
-struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
-struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
-
-static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-/* mask of cpus that collect uncore events */
-static cpumask_t uncore_cpu_mask;
-
-/* constraint for the fixed counter */
-static struct event_constraint uncore_constraint_fixed =
-       EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-struct event_constraint uncore_constraint_empty =
-       EVENT_CONSTRAINT(0, 0, 0);
-
-int uncore_pcibus_to_physid(struct pci_bus *bus)
-{
-       struct pci2phy_map *map;
-       int phys_id = -1;
-
-       raw_spin_lock(&pci2phy_map_lock);
-       list_for_each_entry(map, &pci2phy_map_head, list) {
-               if (map->segment == pci_domain_nr(bus)) {
-                       phys_id = map->pbus_to_physid[bus->number];
-                       break;
-               }
-       }
-       raw_spin_unlock(&pci2phy_map_lock);
-
-       return phys_id;
-}
-
-struct pci2phy_map *__find_pci2phy_map(int segment)
-{
-       struct pci2phy_map *map, *alloc = NULL;
-       int i;
-
-       lockdep_assert_held(&pci2phy_map_lock);
-
-lookup:
-       list_for_each_entry(map, &pci2phy_map_head, list) {
-               if (map->segment == segment)
-                       goto end;
-       }
-
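-       /*
-        * Not found: drop the lock to allocate a new map, retake it and
-        * redo the lookup, since another CPU may have inserted the entry
-        * in the meantime (any unused allocation is freed at "end").
-        */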
-       if (!alloc) {
-               raw_spin_unlock(&pci2phy_map_lock);
-               alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
-               raw_spin_lock(&pci2phy_map_lock);
-
-               if (!alloc)
-                       return NULL;
-
-               goto lookup;
-       }
-
-       map = alloc;
-       alloc = NULL;
-       map->segment = segment;
-       for (i = 0; i < 256; i++)
-               map->pbus_to_physid[i] = -1;
-       list_add_tail(&map->list, &pci2phy_map_head);
-
-end:
-       kfree(alloc);
-       return map;
-}
-
-ssize_t uncore_event_show(struct kobject *kobj,
-                         struct kobj_attribute *attr, char *buf)
-{
-       struct uncore_event_desc *event =
-               container_of(attr, struct uncore_event_desc, attr);
-       return sprintf(buf, "%s", event->config);
-}
-
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
-       return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
-struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
-       struct intel_uncore_box *box;
-
-       box = *per_cpu_ptr(pmu->box, cpu);
-       if (box)
-               return box;
-
-       raw_spin_lock(&uncore_box_lock);
-       /* Recheck in lock to handle races. */
-       if (*per_cpu_ptr(pmu->box, cpu))
-               goto out;
-       list_for_each_entry(box, &pmu->box_list, list) {
-               if (box->phys_id == topology_physical_package_id(cpu)) {
-                       atomic_inc(&box->refcnt);
-                       *per_cpu_ptr(pmu->box, cpu) = box;
-                       break;
-               }
-       }
-out:
-       raw_spin_unlock(&uncore_box_lock);
-
-       return *per_cpu_ptr(pmu->box, cpu);
-}
-
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
-       /*
-        * The perf core schedules events on the basis of cpu; uncore events
-        * are collected by one of the cpus inside a physical package.
-        */
-       return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
-}
-
-u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       u64 count;
-
-       rdmsrl(event->hw.event_base, count);
-
-       return count;
-}
-
-/*
- * generic get constraint function for shared match/mask registers.
- */
-struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       unsigned long flags;
-       bool ok = false;
-
-       /*
-        * reg->alloc can be set due to existing state, so for fake box we
-        * need to ignore this, otherwise we might fail to allocate proper
-        * fake state for this extra reg constraint.
-        */
-       if (reg1->idx == EXTRA_REG_NONE ||
-           (!uncore_box_is_fake(box) && reg1->alloc))
-               return NULL;
-
-       er = &box->shared_regs[reg1->idx];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (!atomic_read(&er->ref) ||
-           (er->config1 == reg1->config && er->config2 == reg2->config)) {
-               atomic_inc(&er->ref);
-               er->config1 = reg1->config;
-               er->config2 = reg2->config;
-               ok = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
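-       /*
-        * ok means the shared register was free or already programmed with
-        * a compatible config; returning NULL then signals "no extra
-        * constraint", while uncore_constraint_empty below prevents the
-        * event from being scheduled at all.
-        */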
-       if (ok) {
-               if (!uncore_box_is_fake(box))
-                       reg1->alloc = 1;
-               return NULL;
-       }
-
-       return &uncore_constraint_empty;
-}
-
-void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-
-       /*
-        * Only put the constraint if the extra reg was actually allocated.
-        * This also takes care of events which do not use an extra shared reg.
-        *
-        * Also, if this is a fake box we shouldn't touch any event state
-        * (reg->alloc) and we don't care about leaving inconsistent box
-        * state either since it will be thrown out.
-        */
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       er = &box->shared_regs[reg1->idx];
-       atomic_dec(&er->ref);
-       reg1->alloc = 0;
-}
-
-u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       u64 config;
-
-       er = &box->shared_regs[idx];
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       config = er->config;
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       return config;
-}
-
-static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hwc->idx = idx;
-       hwc->last_tag = ++box->tags[idx];
-
-       if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
-               hwc->event_base = uncore_fixed_ctr(box);
-               hwc->config_base = uncore_fixed_ctl(box);
-               return;
-       }
-
-       hwc->config_base = uncore_event_ctl(box, hwc->idx);
-       hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
-}
-
-void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
-{
-       u64 prev_count, new_count, delta;
-       int shift;
-
-       if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
-               shift = 64 - uncore_fixed_ctr_bits(box);
-       else
-               shift = 64 - uncore_perf_ctr_bits(box);
-
-       /* the hrtimer might modify the previous event value */
-again:
-       prev_count = local64_read(&event->hw.prev_count);
-       new_count = uncore_read_counter(box, event);
-       if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
-               goto again;
-
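-       /*
-        * Shift both values up so the counter's MSB ends at bit 63; the
-        * subtraction then wraps correctly at the actual counter width and
-        * the final shift right restores the magnitude of the delta.
-        */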
-       delta = (new_count << shift) - (prev_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-}
-
-/*
- * The overflow interrupt is unavailable for SandyBridge-EP and broken
- * for SandyBridge, so we use an hrtimer to periodically poll the counters
- * to avoid missing overflows.
- */
-static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
-{
-       struct intel_uncore_box *box;
-       struct perf_event *event;
-       unsigned long flags;
-       int bit;
-
-       box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
-       if (!box->n_active || box->cpu != smp_processor_id())
-               return HRTIMER_NORESTART;
-       /*
-        * disable local interrupts to prevent uncore_pmu_event_start/stop
-        * from interrupting the update process
-        */
-       local_irq_save(flags);
-
-       /*
-        * handle boxes with an active event list as opposed to active
-        * counters
-        */
-       list_for_each_entry(event, &box->active_list, active_entry) {
-               uncore_perf_event_update(box, event);
-       }
-
-       for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
-               uncore_perf_event_update(box, box->events[bit]);
-
-       local_irq_restore(flags);
-
-       hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
-       return HRTIMER_RESTART;
-}
-
-void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
-                     HRTIMER_MODE_REL_PINNED);
-}
-
-void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_cancel(&box->hrtimer);
-}
-
-static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       box->hrtimer.function = uncore_pmu_hrtimer;
-}
-
-static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
-{
-       struct intel_uncore_box *box;
-       int i, size;
-
-       size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
-
-       box = kzalloc_node(size, GFP_KERNEL, node);
-       if (!box)
-               return NULL;
-
-       for (i = 0; i < type->num_shared_regs; i++)
-               raw_spin_lock_init(&box->shared_regs[i].lock);
-
-       uncore_pmu_init_hrtimer(box);
-       atomic_set(&box->refcnt, 1);
-       box->cpu = -1;
-       box->phys_id = -1;
-
-       /* set default hrtimer timeout */
-       box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
-
-       INIT_LIST_HEAD(&box->active_list);
-
-       return box;
-}
-
-/*
- * Use the pmu's event_init callback (uncore_pmu_event_init) as the
- * detection point for uncore events.
- */
-static int uncore_pmu_event_init(struct perf_event *event);
-
-static bool is_uncore_event(struct perf_event *event)
-{
-       return event->pmu->event_init == uncore_pmu_event_init;
-}
-
-static int
-uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
-{
-       struct perf_event *event;
-       int n, max_count;
-
-       max_count = box->pmu->type->num_counters;
-       if (box->pmu->type->fixed_ctl)
-               max_count++;
-
-       if (box->n_events >= max_count)
-               return -EINVAL;
-
-       n = box->n_events;
-
-       if (is_uncore_event(leader)) {
-               box->event_list[n] = leader;
-               n++;
-       }
-
-       if (!dogrp)
-               return n;
-
-       list_for_each_entry(event, &leader->sibling_list, group_entry) {
-               if (!is_uncore_event(event) ||
-                   event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-
-               if (n >= max_count)
-                       return -EINVAL;
-
-               box->event_list[n] = event;
-               n++;
-       }
-       return n;
-}
-
-static struct event_constraint *
-uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_type *type = box->pmu->type;
-       struct event_constraint *c;
-
-       if (type->ops->get_constraint) {
-               c = type->ops->get_constraint(box, event);
-               if (c)
-                       return c;
-       }
-
-       if (event->attr.config == UNCORE_FIXED_EVENT)
-               return &uncore_constraint_fixed;
-
-       if (type->constraints) {
-               for_each_event_constraint(c, type->constraints) {
-                       if ((event->hw.config & c->cmask) == c->code)
-                               return c;
-               }
-       }
-
-       return &type->unconstrainted;
-}
-
-static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       if (box->pmu->type->ops->put_constraint)
-               box->pmu->type->ops->put_constraint(box, event);
-}
-
-static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
-{
-       unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-       struct event_constraint *c;
-       int i, wmin, wmax, ret = 0;
-       struct hw_perf_event *hwc;
-
-       bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
-
-       for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               c = uncore_get_event_constraint(box, box->event_list[i]);
-               box->event_constraint[i] = c;
-               wmin = min(wmin, c->weight);
-               wmax = max(wmax, c->weight);
-       }
-
-       /* fastpath, try to reuse previous register */
-       for (i = 0; i < n; i++) {
-               hwc = &box->event_list[i]->hw;
-               c = box->event_constraint[i];
-
-               /* never assigned */
-               if (hwc->idx == -1)
-                       break;
-
-               /* constraint still honored */
-               if (!test_bit(hwc->idx, c->idxmsk))
-                       break;
-
-               /* not already used */
-               if (test_bit(hwc->idx, used_mask))
-                       break;
-
-               __set_bit(hwc->idx, used_mask);
-               if (assign)
-                       assign[i] = hwc->idx;
-       }
-       /* slow path */
-       if (i != n)
-               ret = perf_assign_events(box->event_constraint, n,
-                                        wmin, wmax, n, assign);
-
-       if (!assign || ret) {
-               for (i = 0; i < n; i++)
-                       uncore_put_event_constraint(box, box->event_list[i]);
-       }
-       return ret ? -EINVAL : 0;
-}
-
-static void uncore_pmu_event_start(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int idx = event->hw.idx;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
-               return;
-
-       event->hw.state = 0;
-       box->events[idx] = event;
-       box->n_active++;
-       __set_bit(idx, box->active_mask);
-
-       local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
-       uncore_enable_event(box, event);
-
-       if (box->n_active == 1) {
-               uncore_enable_box(box);
-               uncore_pmu_start_hrtimer(box);
-       }
-}
-
-static void uncore_pmu_event_stop(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
-               uncore_disable_event(box, event);
-               box->n_active--;
-               box->events[hwc->idx] = NULL;
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-
-               if (box->n_active == 0) {
-                       uncore_disable_box(box);
-                       uncore_pmu_cancel_hrtimer(box);
-               }
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of an event
-                * that we are disabling:
-                */
-               uncore_perf_event_update(box, event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int uncore_pmu_event_add(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-       int assign[UNCORE_PMC_IDX_MAX];
-       int i, n, ret;
-
-       if (!box)
-               return -ENODEV;
-
-       ret = n = uncore_collect_events(box, event, false);
-       if (ret < 0)
-               return ret;
-
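-       /*
-        * PERF_HES_ARCH marks an event that must stay stopped until an
-        * explicit ->start(); the reschedule loops below skip such events.
-        */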
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       ret = uncore_assign_events(box, assign, n);
-       if (ret)
-               return ret;
-
-       /* save events moving to new counters */
-       for (i = 0; i < box->n_events; i++) {
-               event = box->event_list[i];
-               hwc = &event->hw;
-
-               if (hwc->idx == assign[i] &&
-                       hwc->last_tag == box->tags[assign[i]])
-                       continue;
-               /*
-                * Ensure we don't accidentally enable a stopped
-                * counter simply because we rescheduled.
-                */
-               if (hwc->state & PERF_HES_STOPPED)
-                       hwc->state |= PERF_HES_ARCH;
-
-               uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-       }
-
-       /* reprogram moved events into new counters */
-       for (i = 0; i < n; i++) {
-               event = box->event_list[i];
-               hwc = &event->hw;
-
-               if (hwc->idx != assign[i] ||
-                       hwc->last_tag != box->tags[assign[i]])
-                       uncore_assign_hw_event(box, event, assign[i]);
-               else if (i < box->n_events)
-                       continue;
-
-               if (hwc->state & PERF_HES_ARCH)
-                       continue;
-
-               uncore_pmu_event_start(event, 0);
-       }
-       box->n_events = n;
-
-       return 0;
-}
-
-static void uncore_pmu_event_del(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int i;
-
-       uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < box->n_events; i++) {
-               if (event == box->event_list[i]) {
-                       uncore_put_event_constraint(box, event);
-
-                       while (++i < box->n_events)
-                               box->event_list[i - 1] = box->event_list[i];
-
-                       --box->n_events;
-                       break;
-               }
-       }
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-}
-
-void uncore_pmu_event_read(struct perf_event *event)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       uncore_perf_event_update(box, event);
-}
-
-/*
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int uncore_validate_group(struct intel_uncore_pmu *pmu,
-                               struct perf_event *event)
-{
-       struct perf_event *leader = event->group_leader;
-       struct intel_uncore_box *fake_box;
-       int ret = -EINVAL, n;
-
-       fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
-       if (!fake_box)
-               return -ENOMEM;
-
-       fake_box->pmu = pmu;
-       /*
-        * The event is not yet connected with its siblings, so we must
-        * first collect the existing siblings and then add the new event
-        * before we can simulate the scheduling.
-        */
-       n = uncore_collect_events(fake_box, leader, true);
-       if (n < 0)
-               goto out;
-
-       fake_box->n_events = n;
-       n = uncore_collect_events(fake_box, event, false);
-       if (n < 0)
-               goto out;
-
-       fake_box->n_events = n;
-
-       ret = uncore_assign_events(fake_box, NULL, n);
-out:
-       kfree(fake_box);
-       return ret;
-}
-
-static int uncore_pmu_event_init(struct perf_event *event)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct hw_perf_event *hwc = &event->hw;
-       int ret;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       pmu = uncore_event_to_pmu(event);
-       /* no device found for this pmu */
-       if (pmu->func_id < 0)
-               return -ENOENT;
-
-       /*
-        * The uncore PMU measures at all privilege levels all the time,
-        * so it doesn't make sense to specify any exclude bits.
-        */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-                       event->attr.exclude_hv || event->attr.exclude_idle)
-               return -EINVAL;
-
-       /* Sampling not supported yet */
-       if (hwc->sample_period)
-               return -EINVAL;
-
-       /*
-        * Place all uncore events for a particular physical package
-        * onto a single cpu
-        */
-       if (event->cpu < 0)
-               return -EINVAL;
-       box = uncore_pmu_to_box(pmu, event->cpu);
-       if (!box || box->cpu < 0)
-               return -EINVAL;
-       event->cpu = box->cpu;
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
-       if (event->attr.config == UNCORE_FIXED_EVENT) {
-               /* no fixed counter */
-               if (!pmu->type->fixed_ctl)
-                       return -EINVAL;
-               /*
-                * if there is only one fixed counter, only the first pmu
-                * can access the fixed counter
-                */
-               if (pmu->type->single_fixed && pmu->pmu_idx > 0)
-                       return -EINVAL;
-
-               /* fixed counters have event field hardcoded to zero */
-               hwc->config = 0ULL;
-       } else {
-               hwc->config = event->attr.config & pmu->type->event_mask;
-               if (pmu->type->ops->hw_config) {
-                       ret = pmu->type->ops->hw_config(box, event);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       if (event->group_leader != event)
-               ret = uncore_validate_group(pmu, event);
-       else
-               ret = 0;
-
-       return ret;
-}
-
-static ssize_t uncore_get_attr_cpumask(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
-
-static struct attribute *uncore_pmu_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group uncore_pmu_attr_group = {
-       .attrs = uncore_pmu_attrs,
-};
-
-static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
-{
-       int ret;
-
-       if (!pmu->type->pmu) {
-               pmu->pmu = (struct pmu) {
-                       .attr_groups    = pmu->type->attr_groups,
-                       .task_ctx_nr    = perf_invalid_context,
-                       .event_init     = uncore_pmu_event_init,
-                       .add            = uncore_pmu_event_add,
-                       .del            = uncore_pmu_event_del,
-                       .start          = uncore_pmu_event_start,
-                       .stop           = uncore_pmu_event_stop,
-                       .read           = uncore_pmu_event_read,
-               };
-       } else {
-               pmu->pmu = *pmu->type->pmu;
-               pmu->pmu.attr_groups = pmu->type->attr_groups;
-       }
-
-       if (pmu->type->num_boxes == 1) {
-               if (strlen(pmu->type->name) > 0)
-                       sprintf(pmu->name, "uncore_%s", pmu->type->name);
-               else
-                       sprintf(pmu->name, "uncore");
-       } else {
-               sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
-                       pmu->pmu_idx);
-       }
-
-       ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
-       return ret;
-}
-
-static void __init uncore_type_exit(struct intel_uncore_type *type)
-{
-       int i;
-
-       for (i = 0; i < type->num_boxes; i++)
-               free_percpu(type->pmus[i].box);
-       kfree(type->pmus);
-       type->pmus = NULL;
-       kfree(type->events_group);
-       type->events_group = NULL;
-}
-
-static void __init uncore_types_exit(struct intel_uncore_type **types)
-{
-       int i;
-       for (i = 0; types[i]; i++)
-               uncore_type_exit(types[i]);
-}
-
-static int __init uncore_type_init(struct intel_uncore_type *type)
-{
-       struct intel_uncore_pmu *pmus;
-       struct attribute_group *attr_group;
-       struct attribute **attrs;
-       int i, j;
-
-       pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
-       if (!pmus)
-               return -ENOMEM;
-
-       type->pmus = pmus;
-
-       type->unconstrainted = (struct event_constraint)
-               __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
-                               0, type->num_counters, 0, 0);
-
-       for (i = 0; i < type->num_boxes; i++) {
-               pmus[i].func_id = -1;
-               pmus[i].pmu_idx = i;
-               pmus[i].type = type;
-               INIT_LIST_HEAD(&pmus[i].box_list);
-               pmus[i].box = alloc_percpu(struct intel_uncore_box *);
-               if (!pmus[i].box)
-                       goto fail;
-       }
-
-       if (type->event_descs) {
-               i = 0;
-               while (type->event_descs[i].attr.attr.name)
-                       i++;
-
-               attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
-                                       sizeof(*attr_group), GFP_KERNEL);
-               if (!attr_group)
-                       goto fail;
-
-               attrs = (struct attribute **)(attr_group + 1);
-               attr_group->name = "events";
-               attr_group->attrs = attrs;
-
-               for (j = 0; j < i; j++)
-                       attrs[j] = &type->event_descs[j].attr.attr;
-
-               type->events_group = attr_group;
-       }
-
-       type->pmu_group = &uncore_pmu_attr_group;
-       return 0;
-fail:
-       uncore_type_exit(type);
-       return -ENOMEM;
-}
-
-static int __init uncore_types_init(struct intel_uncore_type **types)
-{
-       int i, ret;
-
-       for (i = 0; types[i]; i++) {
-               ret = uncore_type_init(types[i]);
-               if (ret)
-                       goto fail;
-       }
-       return 0;
-fail:
-       while (--i >= 0)
-               uncore_type_exit(types[i]);
-       return ret;
-}
-
-/*
- * add a pci uncore device
- */
-static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct intel_uncore_type *type;
-       int phys_id;
-       bool first_box = false;
-
-       phys_id = uncore_pcibus_to_physid(pdev->bus);
-       if (phys_id < 0)
-               return -ENODEV;
-
-       if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
-               int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
-               uncore_extra_pci_dev[phys_id][idx] = pdev;
-               pci_set_drvdata(pdev, NULL);
-               return 0;
-       }
-
-       type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
-       box = uncore_alloc_box(type, NUMA_NO_NODE);
-       if (!box)
-               return -ENOMEM;
-
-       /*
-        * For a performance monitoring unit with multiple boxes,
-        * each box has a different function id.
-        */
-       pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
-       /*
-        * Knights Landing uses a common PCI device ID for multiple instances
-        * of an uncore PMU device type. There is only one entry per device
-        * type in the knl_uncore_pci_ids table in spite of multiple devices
-        * being present for some device types. Hence the PCI device idx is 0
-        * for all devices, so increment the pmu pointer to point to an unused
-        * array element.
-        */
-       if (boot_cpu_data.x86_model == 87)
-               while (pmu->func_id >= 0)
-                       pmu++;
-       if (pmu->func_id < 0)
-               pmu->func_id = pdev->devfn;
-       else
-               WARN_ON_ONCE(pmu->func_id != pdev->devfn);
-
-       box->phys_id = phys_id;
-       box->pci_dev = pdev;
-       box->pmu = pmu;
-       uncore_box_init(box);
-       pci_set_drvdata(pdev, box);
-
-       raw_spin_lock(&uncore_box_lock);
-       if (list_empty(&pmu->box_list))
-               first_box = true;
-       list_add_tail(&box->list, &pmu->box_list);
-       raw_spin_unlock(&uncore_box_lock);
-
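-       /* lazily register the PMU when the first box of its type shows up */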
-       if (first_box)
-               uncore_pmu_register(pmu);
-       return 0;
-}
-
-static void uncore_pci_remove(struct pci_dev *pdev)
-{
-       struct intel_uncore_box *box = pci_get_drvdata(pdev);
-       struct intel_uncore_pmu *pmu;
-       int i, cpu, phys_id;
-       bool last_box = false;
-
-       phys_id = uncore_pcibus_to_physid(pdev->bus);
-       box = pci_get_drvdata(pdev);
-       if (!box) {
-               for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
-                       if (uncore_extra_pci_dev[phys_id][i] == pdev) {
-                               uncore_extra_pci_dev[phys_id][i] = NULL;
-                               break;
-                       }
-               }
-               WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
-               return;
-       }
-
-       pmu = box->pmu;
-       if (WARN_ON_ONCE(phys_id != box->phys_id))
-               return;
-
-       pci_set_drvdata(pdev, NULL);
-
-       raw_spin_lock(&uncore_box_lock);
-       list_del(&box->list);
-       if (list_empty(&pmu->box_list))
-               last_box = true;
-       raw_spin_unlock(&uncore_box_lock);
-
-       for_each_possible_cpu(cpu) {
-               if (*per_cpu_ptr(pmu->box, cpu) == box) {
-                       *per_cpu_ptr(pmu->box, cpu) = NULL;
-                       atomic_dec(&box->refcnt);
-               }
-       }
-
-       WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
-       kfree(box);
-
-       if (last_box)
-               perf_pmu_unregister(&pmu->pmu);
-}
-
-static int __init uncore_pci_init(void)
-{
-       int ret;
-
-       switch (boot_cpu_data.x86_model) {
-       case 45: /* Sandy Bridge-EP */
-               ret = snbep_uncore_pci_init();
-               break;
-       case 62: /* Ivy Bridge-EP */
-               ret = ivbep_uncore_pci_init();
-               break;
-       case 63: /* Haswell-EP */
-               ret = hswep_uncore_pci_init();
-               break;
-       case 79: /* BDX-EP */
-       case 86: /* BDX-DE */
-               ret = bdx_uncore_pci_init();
-               break;
-       case 42: /* Sandy Bridge */
-               ret = snb_uncore_pci_init();
-               break;
-       case 58: /* Ivy Bridge */
-               ret = ivb_uncore_pci_init();
-               break;
-       case 60: /* Haswell */
-       case 69: /* Haswell Celeron */
-               ret = hsw_uncore_pci_init();
-               break;
-       case 61: /* Broadwell */
-               ret = bdw_uncore_pci_init();
-               break;
-       case 87: /* Knights Landing */
-               ret = knl_uncore_pci_init();
-               break;
-       case 94: /* SkyLake */
-               ret = skl_uncore_pci_init();
-               break;
-       default:
-               return 0;
-       }
-
-       if (ret)
-               return ret;
-
-       ret = uncore_types_init(uncore_pci_uncores);
-       if (ret)
-               return ret;
-
-       uncore_pci_driver->probe = uncore_pci_probe;
-       uncore_pci_driver->remove = uncore_pci_remove;
-
-       ret = pci_register_driver(uncore_pci_driver);
-       if (ret == 0)
-               pcidrv_registered = true;
-       else
-               uncore_types_exit(uncore_pci_uncores);
-
-       return ret;
-}
-
-static void __init uncore_pci_exit(void)
-{
-       if (pcidrv_registered) {
-               pcidrv_registered = false;
-               pci_unregister_driver(uncore_pci_driver);
-               uncore_types_exit(uncore_pci_uncores);
-       }
-}
-
-/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
-static LIST_HEAD(boxes_to_free);
-
-static void uncore_kfree_boxes(void)
-{
-       struct intel_uncore_box *box;
-
-       while (!list_empty(&boxes_to_free)) {
-               box = list_entry(boxes_to_free.next,
-                                struct intel_uncore_box, list);
-               list_del(&box->list);
-               kfree(box);
-       }
-}
-
-static void uncore_cpu_dying(int cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       box = *per_cpu_ptr(pmu->box, cpu);
-                       *per_cpu_ptr(pmu->box, cpu) = NULL;
-                       if (box && atomic_dec_and_test(&box->refcnt))
-                               list_add(&box->list, &boxes_to_free);
-               }
-       }
-}
-
-static int uncore_cpu_starting(int cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box, *exist;
-       int i, j, k, phys_id;
-
-       phys_id = topology_physical_package_id(cpu);
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       box = *per_cpu_ptr(pmu->box, cpu);
-                       /* called by uncore_cpu_init? */
-                       if (box && box->phys_id >= 0) {
-                               uncore_box_init(box);
-                               continue;
-                       }
-
-                       for_each_online_cpu(k) {
-                               exist = *per_cpu_ptr(pmu->box, k);
-                               if (exist && exist->phys_id == phys_id) {
-                                       atomic_inc(&exist->refcnt);
-                                       *per_cpu_ptr(pmu->box, cpu) = exist;
-                                       if (box) {
-                                               list_add(&box->list,
-                                                        &boxes_to_free);
-                                               box = NULL;
-                                       }
-                                       break;
-                               }
-                       }
-
-                       if (box) {
-                               box->phys_id = phys_id;
-                               uncore_box_init(box);
-                       }
-               }
-       }
-       return 0;
-}
-
-static int uncore_cpu_prepare(int cpu, int phys_id)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       if (pmu->func_id < 0)
-                               pmu->func_id = j;
-
-                       box = uncore_alloc_box(type, cpu_to_node(cpu));
-                       if (!box)
-                               return -ENOMEM;
-
-                       box->pmu = pmu;
-                       box->phys_id = phys_id;
-                       *per_cpu_ptr(pmu->box, cpu) = box;
-               }
-       }
-       return 0;
-}
-
-static void
-uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
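-       /*
-        * old_cpu < 0: a collector cpu is being installed for the first
-        * time; new_cpu < 0: the old collector is going away without a
-        * replacement on this package.
-        */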
-       for (i = 0; uncores[i]; i++) {
-               type = uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       if (old_cpu < 0)
-                               box = uncore_pmu_to_box(pmu, new_cpu);
-                       else
-                               box = uncore_pmu_to_box(pmu, old_cpu);
-                       if (!box)
-                               continue;
-
-                       if (old_cpu < 0) {
-                               WARN_ON_ONCE(box->cpu != -1);
-                               box->cpu = new_cpu;
-                               continue;
-                       }
-
-                       WARN_ON_ONCE(box->cpu != old_cpu);
-                       if (new_cpu >= 0) {
-                               uncore_pmu_cancel_hrtimer(box);
-                               perf_pmu_migrate_context(&pmu->pmu,
-                                               old_cpu, new_cpu);
-                               box->cpu = new_cpu;
-                       } else {
-                               box->cpu = -1;
-                       }
-               }
-       }
-}
-
-static void uncore_event_exit_cpu(int cpu)
-{
-       int i, phys_id, target;
-
-       /* nothing to do if the exiting cpu was not collecting uncore events */
-       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
-               return;
-
-       /* find a new cpu to collect uncore events */
-       phys_id = topology_physical_package_id(cpu);
-       target = -1;
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-               if (phys_id == topology_physical_package_id(i)) {
-                       target = i;
-                       break;
-               }
-       }
-
-       /* migrate uncore events to the new cpu */
-       if (target >= 0)
-               cpumask_set_cpu(target, &uncore_cpu_mask);
-
-       uncore_change_context(uncore_msr_uncores, cpu, target);
-       uncore_change_context(uncore_pci_uncores, cpu, target);
-}
-
-static void uncore_event_init_cpu(int cpu)
-{
-       int i, phys_id;
-
-       phys_id = topology_physical_package_id(cpu);
-       for_each_cpu(i, &uncore_cpu_mask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return;
-       }
-
-       cpumask_set_cpu(cpu, &uncore_cpu_mask);
-
-       uncore_change_context(uncore_msr_uncores, -1, cpu);
-       uncore_change_context(uncore_pci_uncores, -1, cpu);
-}
-
-static int uncore_cpu_notifier(struct notifier_block *self,
-                              unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       /* allocate/free data structure for uncore box */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               uncore_cpu_prepare(cpu, -1);
-               break;
-       case CPU_STARTING:
-               uncore_cpu_starting(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               uncore_cpu_dying(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               uncore_kfree_boxes();
-               break;
-       default:
-               break;
-       }
-
-       /* select the cpu that collects uncore events */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_FAILED:
-       case CPU_STARTING:
-               uncore_event_init_cpu(cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               uncore_event_exit_cpu(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block uncore_cpu_nb = {
-       .notifier_call  = uncore_cpu_notifier,
-       /*
-        * to migrate uncore events, our notifier should be executed
-        * before perf core's notifier.
-        */
-       .priority       = CPU_PRI_PERF + 1,
-};
-
-static void __init uncore_cpu_setup(void *dummy)
-{
-       uncore_cpu_starting(smp_processor_id());
-}
-
-static int __init uncore_cpu_init(void)
-{
-       int ret;
-
-       switch (boot_cpu_data.x86_model) {
-       case 26: /* Nehalem */
-       case 30:
-       case 37: /* Westmere */
-       case 44:
-               nhm_uncore_cpu_init();
-               break;
-       case 42: /* Sandy Bridge */
-       case 58: /* Ivy Bridge */
-       case 60: /* Haswell */
-       case 69: /* Haswell */
-       case 70: /* Haswell */
-       case 61: /* Broadwell */
-       case 71: /* Broadwell */
-               snb_uncore_cpu_init();
-               break;
-       case 45: /* Sandy Bridge-EP */
-               snbep_uncore_cpu_init();
-               break;
-       case 46: /* Nehalem-EX */
-       case 47: /* Westmere-EX aka. Xeon E7 */
-               nhmex_uncore_cpu_init();
-               break;
-       case 62: /* Ivy Bridge-EP */
-               ivbep_uncore_cpu_init();
-               break;
-       case 63: /* Haswell-EP */
-               hswep_uncore_cpu_init();
-               break;
-       case 79: /* BDX-EP */
-       case 86: /* BDX-DE */
-               bdx_uncore_cpu_init();
-               break;
-       case 87: /* Knights Landing */
-               knl_uncore_cpu_init();
-               break;
-       default:
-               return 0;
-       }
-
-       ret = uncore_types_init(uncore_msr_uncores);
-       if (ret)
-               return ret;
-
-       return 0;
-}
-
-static int __init uncore_pmus_register(void)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_type *type;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       uncore_pmu_register(pmu);
-               }
-       }
-
-       return 0;
-}
-
-static void __init uncore_cpumask_init(void)
-{
-       int cpu;
-
-       /*
-        * only invoke once, from either the msr or the pci init code
-        */
-       if (!cpumask_empty(&uncore_cpu_mask))
-               return;
-
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               int i, phys_id = topology_physical_package_id(cpu);
-
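-               /* phys_id is reset to -1 if this package already has a collector cpu */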
-               for_each_cpu(i, &uncore_cpu_mask) {
-                       if (phys_id == topology_physical_package_id(i)) {
-                               phys_id = -1;
-                               break;
-                       }
-               }
-               if (phys_id < 0)
-                       continue;
-
-               uncore_cpu_prepare(cpu, phys_id);
-               uncore_event_init_cpu(cpu);
-       }
-       on_each_cpu(uncore_cpu_setup, NULL, 1);
-
-       __register_cpu_notifier(&uncore_cpu_nb);
-
-       cpu_notifier_register_done();
-}
-
-static int __init intel_uncore_init(void)
-{
-       int ret;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-               return -ENODEV;
-
-       if (cpu_has_hypervisor)
-               return -ENODEV;
-
-       ret = uncore_pci_init();
-       if (ret)
-               goto fail;
-       ret = uncore_cpu_init();
-       if (ret) {
-               uncore_pci_exit();
-               goto fail;
-       }
-       uncore_cpumask_init();
-
-       uncore_pmus_register();
-       return 0;
-fail:
-       return ret;
-}
-device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
deleted file mode 100644 (file)
index a7086b8..0000000
+++ /dev/null
@@ -1,357 +0,0 @@
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/perf_event.h>
-#include "perf_event.h"
-
-#define UNCORE_PMU_NAME_LEN            32
-#define UNCORE_PMU_HRTIMER_INTERVAL    (60LL * NSEC_PER_SEC)
-#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
-
-#define UNCORE_FIXED_EVENT             0xff
-#define UNCORE_PMC_IDX_MAX_GENERIC     8
-#define UNCORE_PMC_IDX_FIXED           UNCORE_PMC_IDX_MAX_GENERIC
-#define UNCORE_PMC_IDX_MAX             (UNCORE_PMC_IDX_FIXED + 1)
-
-#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
-#define UNCORE_PCI_DEV_TYPE(data)      ((data >> 8) & 0xff)
-#define UNCORE_PCI_DEV_IDX(data)       (data & 0xff)
-#define UNCORE_EXTRA_PCI_DEV           0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX       3
-
-/* support up to 8 sockets */
-#define UNCORE_SOCKET_MAX              8
-
-#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
-
-struct intel_uncore_ops;
-struct intel_uncore_pmu;
-struct intel_uncore_box;
-struct uncore_event_desc;
-
-struct intel_uncore_type {
-       const char *name;
-       int num_counters;
-       int num_boxes;
-       int perf_ctr_bits;
-       int fixed_ctr_bits;
-       unsigned perf_ctr;
-       unsigned event_ctl;
-       unsigned event_mask;
-       unsigned fixed_ctr;
-       unsigned fixed_ctl;
-       unsigned box_ctl;
-       unsigned msr_offset;
-       unsigned num_shared_regs:8;
-       unsigned single_fixed:1;
-       unsigned pair_ctr_ctl:1;
-       unsigned *msr_offsets;
-       struct event_constraint unconstrainted;
-       struct event_constraint *constraints;
-       struct intel_uncore_pmu *pmus;
-       struct intel_uncore_ops *ops;
-       struct uncore_event_desc *event_descs;
-       const struct attribute_group *attr_groups[4];
-       struct pmu *pmu; /* for custom pmu ops */
-};
-
-#define pmu_group attr_groups[0]
-#define format_group attr_groups[1]
-#define events_group attr_groups[2]
-
-struct intel_uncore_ops {
-       void (*init_box)(struct intel_uncore_box *);
-       void (*disable_box)(struct intel_uncore_box *);
-       void (*enable_box)(struct intel_uncore_box *);
-       void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
-       void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
-       u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
-       int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
-       struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
-                                                  struct perf_event *);
-       void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
-};
-
-struct intel_uncore_pmu {
-       struct pmu pmu;
-       char name[UNCORE_PMU_NAME_LEN];
-       int pmu_idx;
-       int func_id;
-       struct intel_uncore_type *type;
-       struct intel_uncore_box ** __percpu box;
-       struct list_head box_list;
-};
-
-struct intel_uncore_extra_reg {
-       raw_spinlock_t lock;
-       u64 config, config1, config2;
-       atomic_t ref;
-};
-
-struct intel_uncore_box {
-       int phys_id;
-       int n_active;   /* number of active events */
-       int n_events;
-       int cpu;        /* cpu to collect events */
-       unsigned long flags;
-       atomic_t refcnt;
-       struct perf_event *events[UNCORE_PMC_IDX_MAX];
-       struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
-       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
-       unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-       u64 tags[UNCORE_PMC_IDX_MAX];
-       struct pci_dev *pci_dev;
-       struct intel_uncore_pmu *pmu;
-       u64 hrtimer_duration; /* hrtimer timeout for this box */
-       struct hrtimer hrtimer;
-       struct list_head list;
-       struct list_head active_list;
-       void *io_addr;
-       struct intel_uncore_extra_reg shared_regs[0];
-};
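/*
 * Illustrative sketch (not part of the deleted file): shared_regs[0] above is
 * a zero-length trailing array, so a box is allocated with space for
 * type->num_shared_regs extra registers appended to the struct.  A hedged
 * userspace analogue of that allocation pattern; all demo_* names are made up:
 */
#include <stdlib.h>
#include <string.h>

struct demo_extra_reg { unsigned long long config; };

struct demo_box {
	int n_active;
	struct demo_extra_reg shared_regs[];	/* flexible array member */
};

static struct demo_box *demo_alloc_box(int num_shared_regs)
{
	size_t size = sizeof(struct demo_box) +
		      num_shared_regs * sizeof(struct demo_extra_reg);
	struct demo_box *box = malloc(size);

	if (box)
		memset(box, 0, size);
	return box;
}

int main(void)
{
	struct demo_box *box = demo_alloc_box(8);	/* e.g. 8 shared regs */

	if (box)
		box->shared_regs[7].config = 0;		/* last of the 8 */
	free(box);
	return 0;
}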
-
-#define UNCORE_BOX_FLAG_INITIATED      0
-
-struct uncore_event_desc {
-       struct kobj_attribute attr;
-       const char *config;
-};
-
-struct pci2phy_map {
-       struct list_head list;
-       int segment;
-       int pbus_to_physid[256];
-};
-
-int uncore_pcibus_to_physid(struct pci_bus *bus);
-struct pci2phy_map *__find_pci2phy_map(int segment);
-
-ssize_t uncore_event_show(struct kobject *kobj,
-                         struct kobj_attribute *attr, char *buf);
-
-#define INTEL_UNCORE_EVENT_DESC(_name, _config)                        \
-{                                                              \
-       .attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
-       .config = _config,                                      \
-}
-
-#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)                        \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj,            \
-                               struct kobj_attribute *attr,            \
-                               char *page)                             \
-{                                                                      \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
-       return sprintf(page, _format "\n");                             \
-}                                                                      \
-static struct kobj_attribute format_attr_##_var =                      \
-       __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
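/*
 * Illustrative sketch (not part of the deleted file): DEFINE_UNCORE_FORMAT_ATTR()
 * above stamps out a sysfs "show" callback that prints a fixed format string
 * such as "config:0-7", plus the attribute object that points at it.  A
 * simplified userspace analogue of that code-generation trick; struct
 * demo_attr and the DEMO_* names are stand-ins, not kernel APIs:
 */
#include <stdio.h>

struct demo_attr {
	const char *name;
	int (*show)(char *page);
};

#define DEFINE_DEMO_FORMAT_ATTR(_var, _name, _format)			\
static int __demo_##_var##_show(char *page)				\
{									\
	return sprintf(page, _format "\n");				\
}									\
static struct demo_attr format_attr_##_var =				\
	{ .name = #_name, .show = __demo_##_var##_show }

DEFINE_DEMO_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_DEMO_FORMAT_ATTR(umask, umask, "config:8-15");

int main(void)
{
	char page[64];

	format_attr_event.show(page);
	printf("%s -> %s", format_attr_event.name, page);
	format_attr_umask.show(page);
	printf("%s -> %s", format_attr_umask.name, page);
	return 0;
}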
-
-static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
-{
-       return box->pmu->type->box_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr;
-}
-
-static inline
-unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       return idx * 4 + box->pmu->type->event_ctl;
-}
-
-static inline
-unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       return idx * 8 + box->pmu->type->perf_ctr;
-}
-
-static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
-{
-       struct intel_uncore_pmu *pmu = box->pmu;
-       return pmu->type->msr_offsets ?
-               pmu->type->msr_offsets[pmu->pmu_idx] :
-               pmu->type->msr_offset * pmu->pmu_idx;
-}
-
-static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
-{
-       if (!box->pmu->type->box_ctl)
-               return 0;
-       return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
-{
-       if (!box->pmu->type->fixed_ctl)
-               return 0;
-       return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       return box->pmu->type->event_ctl +
-               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
-               uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       return box->pmu->type->perf_ctr +
-               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
-               uncore_msr_box_offset(box);
-}
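/*
 * Illustrative sketch (not part of the deleted file): the helpers above build
 * a counter's MSR address from the type's base address, a per-box offset
 * (either a per-box table or a fixed stride times the box index) and, when
 * pair_ctr_ctl is set, a stride of two because control and counter registers
 * appear to be interleaved.  A standalone recomputation with made-up example
 * values:
 */
#include <stdio.h>

static unsigned demo_event_ctl(unsigned base, unsigned box_offset,
			       int pair_ctr_ctl, int idx)
{
	return base + (pair_ctr_ctl ? 2 * idx : idx) + box_offset;
}

int main(void)
{
	/* example values only: control base 0xd10, one offset per box */
	static const unsigned offsets[] = { 0x0, 0x80, 0x40, 0xc0 };
	int box, idx;

	for (box = 0; box < 4; box++)
		for (idx = 0; idx < 2; idx++)
			printf("box %d, counter %d -> ctl MSR 0x%x\n", box, idx,
			       demo_event_ctl(0xd10, offsets[box], 1, idx));
	return 0;
}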
-
-static inline
-unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
-{
-       if (box->pci_dev)
-               return uncore_pci_fixed_ctl(box);
-       else
-               return uncore_msr_fixed_ctl(box);
-}
-
-static inline
-unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
-{
-       if (box->pci_dev)
-               return uncore_pci_fixed_ctr(box);
-       else
-               return uncore_msr_fixed_ctr(box);
-}
-
-static inline
-unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       if (box->pci_dev)
-               return uncore_pci_event_ctl(box, idx);
-       else
-               return uncore_msr_event_ctl(box, idx);
-}
-
-static inline
-unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       if (box->pci_dev)
-               return uncore_pci_perf_ctr(box, idx);
-       else
-               return uncore_msr_perf_ctr(box, idx);
-}
-
-static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
-{
-       return box->pmu->type->perf_ctr_bits;
-}
-
-static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr_bits;
-}
-
-static inline int uncore_num_counters(struct intel_uncore_box *box)
-{
-       return box->pmu->type->num_counters;
-}
-
-static inline void uncore_disable_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->type->ops->disable_box)
-               box->pmu->type->ops->disable_box(box);
-}
-
-static inline void uncore_enable_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->type->ops->enable_box)
-               box->pmu->type->ops->enable_box(box);
-}
-
-static inline void uncore_disable_event(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       box->pmu->type->ops->disable_event(box, event);
-}
-
-static inline void uncore_enable_event(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       box->pmu->type->ops->enable_event(box, event);
-}
-
-static inline u64 uncore_read_counter(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       return box->pmu->type->ops->read_counter(box, event);
-}
-
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
-       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
-               if (box->pmu->type->ops->init_box)
-                       box->pmu->type->ops->init_box(box);
-       }
-}
-
-static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
-{
-       return (box->phys_id < 0);
-}
-
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
-struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
-u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
-void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
-void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
-void uncore_pmu_event_read(struct perf_event *event);
-void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
-struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
-void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
-u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
-
-extern struct intel_uncore_type **uncore_msr_uncores;
-extern struct intel_uncore_type **uncore_pci_uncores;
-extern struct pci_driver *uncore_pci_driver;
-extern raw_spinlock_t pci2phy_map_lock;
-extern struct list_head pci2phy_map_head;
-extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
-extern struct event_constraint uncore_constraint_empty;
-
-/* perf_event_intel_uncore_snb.c */
-int snb_uncore_pci_init(void);
-int ivb_uncore_pci_init(void);
-int hsw_uncore_pci_init(void);
-int bdw_uncore_pci_init(void);
-int skl_uncore_pci_init(void);
-void snb_uncore_cpu_init(void);
-void nhm_uncore_cpu_init(void);
-int snb_pci2phy_map_init(int devid);
-
-/* perf_event_intel_uncore_snbep.c */
-int snbep_uncore_pci_init(void);
-void snbep_uncore_cpu_init(void);
-int ivbep_uncore_pci_init(void);
-void ivbep_uncore_cpu_init(void);
-int hswep_uncore_pci_init(void);
-void hswep_uncore_cpu_init(void);
-int bdx_uncore_pci_init(void);
-void bdx_uncore_cpu_init(void);
-int knl_uncore_pci_init(void);
-void knl_uncore_cpu_init(void);
-
-/* perf_event_intel_uncore_nhmex.c */
-void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
deleted file mode 100644 (file)
index 2749965..0000000
+++ /dev/null
@@ -1,1221 +0,0 @@
-/* Nehalem-EX/Westmere-EX uncore support */
-#include "perf_event_intel_uncore.h"
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK     0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK      0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0         (1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET                (1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN          (1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22                (1 << 22)
-#define NHMEX_PMON_CTL_INVERT          (1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK      0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK      (NHMEX_PMON_CTL_EV_SEL_MASK | \
-                                        NHMEX_PMON_CTL_UMASK_MASK | \
-                                        NHMEX_PMON_CTL_EDGE_DET | \
-                                        NHMEX_PMON_CTL_INVERT | \
-                                        NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL            0xc00
-#define NHMEX_U_MSR_PMON_CTR                   0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL                        0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN                 (1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL       0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL             (1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL            (1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK            \
-               (NHMEX_PMON_CTL_EV_SEL_MASK |   \
-                NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL           0xd00
-#define NHMEX_C0_MSR_PMON_CTR0                 0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0              0xd10
-#define NHMEX_C_MSR_OFFSET                     0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL           0xc20
-#define NHMEX_B0_MSR_PMON_CTR0                 0xc31
-#define NHMEX_B0_MSR_PMON_CTL0                 0xc30
-#define NHMEX_B_MSR_OFFSET                     0x40
-#define NHMEX_B0_MSR_MATCH                     0xe45
-#define NHMEX_B0_MSR_MASK                      0xe46
-#define NHMEX_B1_MSR_MATCH                     0xe4d
-#define NHMEX_B1_MSR_MASK                      0xe4e
-
-#define NHMEX_B_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT          1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK           \
-               (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT         6
-#define NHMEX_B_PMON_CTR_MASK          \
-               (0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK            \
-               (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
-                NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL           0xc40
-#define NHMEX_S0_MSR_PMON_CTR0                 0xc51
-#define NHMEX_S0_MSR_PMON_CTL0                 0xc50
-#define NHMEX_S_MSR_OFFSET                     0x80
-#define NHMEX_S0_MSR_MM_CFG                    0xe48
-#define NHMEX_S0_MSR_MATCH                     0xe49
-#define NHMEX_S0_MSR_MASK                      0xe4a
-#define NHMEX_S1_MSR_MM_CFG                    0xe58
-#define NHMEX_S1_MSR_MATCH                     0xe59
-#define NHMEX_S1_MSR_MASK                      0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN                 (0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV             0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL                        0xca0
-#define NHMEX_M0_MSR_PMU_DSP                   0xca5
-#define NHMEX_M0_MSR_PMU_ISS                   0xca6
-#define NHMEX_M0_MSR_PMU_MAP                   0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR               0xca8
-#define NHMEX_M0_MSR_PMU_PGT                   0xca9
-#define NHMEX_M0_MSR_PMU_PLD                   0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC           0xcab
-#define NHMEX_M0_MSR_PMU_CTL0                  0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0                  0xcb1
-#define NHMEX_M_MSR_OFFSET                     0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG                        0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG                        0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN                 (1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK           0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK            0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT           34
-
-#define NHMEX_M_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN                        (1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT      2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK       \
-       (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT    4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK     \
-       (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE             (1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE             (1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT         9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK          \
-       (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT    19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK     \
-       (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK                    \
-               (NHMEX_M_PMON_CTL_COUNT_MODE_MASK |     \
-                NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |   \
-                NHMEX_M_PMON_CTL_WRAP_MODE |           \
-                NHMEX_M_PMON_CTL_FLAG_MODE |           \
-                NHMEX_M_PMON_CTL_INC_SEL_MASK |        \
-                NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
-
-/*
- * Use bits 9~13 to select the event if the 7th bit is not set,
- * otherwise use bits 19~21 to select the event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
-                               NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
-                          NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
-                               NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
-               EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-                               MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
-               EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-                               MBOX_SET_FLAG_SEL_MASK, \
-                               (u64)-1, NHMEX_M_##r)
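/*
 * Illustrative sketch (not part of the deleted file): as the comment above
 * says, an Mbox event is selected either by inc_sel (config bits 9~13, flag
 * mode clear) or by set_flag_sel (config bits 19~21, flag mode set via bit 7).
 * A small standalone demo of how the MBOX_INC_SEL()/MBOX_SET_FLAG_SEL()
 * encodings differ; the DEMO_* names are stand-ins:
 */
#include <stdio.h>

#define DEMO_FLAG_MODE		(1u << 7)
#define DEMO_INC_SEL_SHIFT	9
#define DEMO_SET_FLAG_SEL_SHIFT	19

#define DEMO_INC_SEL(x)		((x) << DEMO_INC_SEL_SHIFT)
#define DEMO_SET_FLAG_SEL(x)	(((x) << DEMO_SET_FLAG_SEL_SHIFT) | DEMO_FLAG_MODE)

int main(void)
{
	unsigned inc  = DEMO_INC_SEL(0xd);	/* e.g. one of the ZDP_CTL_FVC events */
	unsigned flag = DEMO_SET_FLAG_SEL(0x1);

	printf("inc_sel=0xd      -> config 0x%05x, flag mode %s\n",
	       inc, (inc & DEMO_FLAG_MODE) ? "set" : "clear");
	printf("set_flag_sel=0x1 -> config 0x%05x, flag mode %s\n",
	       flag, (flag & DEMO_FLAG_MODE) ? "set" : "clear");
	return 0;
}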
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL                 0xe00
-#define NHMEX_R_MSR_PMON_CTL0                  0xe10
-#define NHMEX_R_MSR_PMON_CNT0                  0xe11
-#define NHMEX_R_MSR_OFFSET                     0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n)           \
-               ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)                (0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)                (0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)                \
-               (((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)   \
-               (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)    \
-               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)     \
-               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)   \
-               (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)    \
-               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)     \
-               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT          1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK           \
-               (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN                        (1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK            NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL                 0xc80
-#define NHMEX_W_MSR_PMON_CNT0                  0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0              0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR             0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL             0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN           (1ULL << 31)
-
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-                               ((1ULL << (n)) - 1)))
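/*
 * Illustrative sketch (not part of the deleted file): __BITS_VALUE(x, i, n)
 * above extracts the i-th n-bit field of x.  The Mbox code below packs two
 * 8-bit extra-register indices into reg1->idx and two 16-bit MSR numbers into
 * reg1->reg, then pulls them back out with this macro.  A standalone demo
 * (GNU C, for typeof); the field values are arbitrary examples:
 */
#include <stdio.h>

#define __BITS_VALUE(x, i, n)	((typeof(x))(((x) >> ((i) * (n))) & \
				((1ULL << (n)) - 1)))

int main(void)
{
	unsigned idx = (0x03 << 8) | 0x07;	/* two packed 8-bit fields */
	unsigned reg = (0xcaa << 16) | 0xca6;	/* two packed 16-bit MSR numbers */

	printf("idx fields: %u %u\n",
	       __BITS_VALUE(idx, 0, 8), __BITS_VALUE(idx, 1, 8));
	printf("reg fields: 0x%x 0x%x\n",
	       __BITS_VALUE(reg, 0, 16), __BITS_VALUE(reg, 1, 16));
	return 0;
}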
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       u64 config;
-
-       if (msr) {
-               rdmsrl(msr, config);
-               config &= ~((1ULL << uncore_num_counters(box)) - 1);
-               /* WBox has a fixed counter */
-               if (uncore_msr_fixed_ctl(box))
-                       config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
-               wrmsrl(msr, config);
-       }
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       u64 config;
-
-       if (msr) {
-               rdmsrl(msr, config);
-               config |= (1ULL << uncore_num_counters(box)) - 1;
-               /* WBox has a fixed counter */
-               if (uncore_msr_fixed_ctl(box))
-                       config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
-               wrmsrl(msr, config);
-       }
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
-       else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
-               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-       else
-               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT()                         \
-       .init_box       = nhmex_uncore_msr_init_box,            \
-       .disable_box    = nhmex_uncore_msr_disable_box,         \
-       .enable_box     = nhmex_uncore_msr_enable_box,          \
-       .disable_event  = nhmex_uncore_msr_disable_event,       \
-       .read_counter   = uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event   = nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_edge.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
-       .name           = "format",
-       .attrs          = nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 1,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .event_ctl      = NHMEX_U_MSR_PMON_EV_SEL,
-       .perf_ctr       = NHMEX_U_MSR_PMON_CTR,
-       .event_mask     = NHMEX_U_PMON_RAW_EVENT_MASK,
-       .box_ctl        = NHMEX_U_MSR_PMON_GLOBAL_CTL,
-       .ops            = &nhmex_uncore_ops,
-       .format_group   = &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
-       0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 6,
-       .num_boxes              = 10,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_C0_MSR_PMON_EV_SEL0,
-       .perf_ctr               = NHMEX_C0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
-       .msr_offsets            = nhmex_cbox_msr_offsets,
-       .pair_ctr_ctl           = 1,
-       .ops                    = &nhmex_uncore_ops,
-       .format_group           = &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
-       .name                   = "wbox",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_W_MSR_PMON_CNT0,
-       .perf_ctr               = NHMEX_W_MSR_PMON_EVT_SEL0,
-       .fixed_ctr              = NHMEX_W_MSR_PMON_FIXED_CTR,
-       .fixed_ctl              = NHMEX_W_MSR_PMON_FIXED_CTL,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_W_MSR_GLOBAL_CTL,
-       .pair_ctr_ctl           = 1,
-       .event_descs            = nhmex_uncore_wbox_events,
-       .ops                    = &nhmex_uncore_ops,
-       .format_group           = &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int ctr, ev_sel;
-
-       ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
-               NHMEX_B_PMON_CTR_SHIFT;
-       ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
-                 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
-       /* events that do not use the match/mask registers */
-       if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
-           (ctr == 2 && ev_sel != 0x4) || ctr == 3)
-               return 0;
-
-       if (box->pmu->pmu_idx == 0)
-               reg1->reg = NHMEX_B0_MSR_MATCH;
-       else
-               reg1->reg = NHMEX_B1_MSR_MATCH;
-       reg1->idx = 0;
-       reg1->config = event->attr.config1;
-       reg2->config = event->attr.config2;
-       return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg1->reg, reg1->config);
-               wrmsrl(reg1->reg + 1, reg2->config);
-       }
-       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-               (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select the counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
-       EVENT_CONSTRAINT(0, 1, 0xc0),
-       EVENT_CONSTRAINT(0x40, 2, 0xc0),
-       EVENT_CONSTRAINT(0x80, 4, 0xc0),
-       EVENT_CONSTRAINT(0xc0, 8, 0xc0),
-       EVENT_CONSTRAINT_END,
-};
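/*
 * Illustrative sketch (not part of the deleted file): per the comment above,
 * a Bbox event config carries the counter number in bits 6-7 and the event
 * select in bits 1-5, which is what nhmex_bbox_hw_config() above decodes.
 * A standalone decode of a few arbitrary example config values; the DEMO_*
 * names are stand-ins:
 */
#include <stdio.h>

#define DEMO_EV_SEL_SHIFT	1
#define DEMO_EV_SEL_MASK	(0x1f << DEMO_EV_SEL_SHIFT)
#define DEMO_CTR_SHIFT		6
#define DEMO_CTR_MASK		(0x3 << DEMO_CTR_SHIFT)

int main(void)
{
	unsigned configs[] = { 0x00, 0x40, 0x82, 0xc6 };	/* example values */
	unsigned i;

	for (i = 0; i < sizeof(configs) / sizeof(configs[0]); i++) {
		unsigned cfg = configs[i];
		unsigned ctr = (cfg & DEMO_CTR_MASK) >> DEMO_CTR_SHIFT;
		unsigned ev  = (cfg & DEMO_EV_SEL_MASK) >> DEMO_EV_SEL_SHIFT;

		printf("config 0x%02x -> counter %u, event select 0x%x\n",
		       cfg, ctr, ev);
	}
	return 0;
}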
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
-       &format_attr_event5.attr,
-       &format_attr_counter.attr,
-       &format_attr_match.attr,
-       &format_attr_mask.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_bbox_msr_enable_event,
-       .hw_config              = nhmex_bbox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
-       .name                   = "bbox",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_B0_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_B0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_B_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
-       .msr_offset             = NHMEX_B_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 1,
-       .constraints            = nhmex_uncore_bbox_constraints,
-       .ops                    = &nhmex_uncore_bbox_ops,
-       .format_group           = &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       /* only the TO_R_PROG_EV event uses the match/mask register */
-       if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
-           NHMEX_S_EVENT_TO_R_PROG_EV)
-               return 0;
-
-       if (box->pmu->pmu_idx == 0)
-               reg1->reg = NHMEX_S0_MSR_MM_CFG;
-       else
-               reg1->reg = NHMEX_S1_MSR_MM_CFG;
-       reg1->idx = 0;
-       reg1->config = event->attr.config1;
-       reg2->config = event->attr.config2;
-       return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg1->reg, 0);
-               wrmsrl(reg1->reg + 1, reg1->config);
-               wrmsrl(reg1->reg + 2, reg2->config);
-               wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
-       }
-       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match.attr,
-       &format_attr_mask.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
-       .name                   = "format",
-       .attrs                  = nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_sbox_msr_enable_event,
-       .hw_config              = nhmex_sbox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_S0_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_S0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
-       .msr_offset             = NHMEX_S_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 1,
-       .ops                    = &nhmex_uncore_sbox_ops,
-       .format_group           = &nhmex_uncore_sbox_format_group
-};
-
-enum {
-       EXTRA_REG_NHMEX_M_FILTER,
-       EXTRA_REG_NHMEX_M_DSP,
-       EXTRA_REG_NHMEX_M_ISS,
-       EXTRA_REG_NHMEX_M_MAP,
-       EXTRA_REG_NHMEX_M_MSC_THR,
-       EXTRA_REG_NHMEX_M_PGT,
-       EXTRA_REG_NHMEX_M_PLD,
-       EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
-       MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
-       MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
-       MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
-       MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
-       /* event 0xa uses two extra registers */
-       MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
-       MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
-       MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
-       /* events 0xd ~ 0x10 use the same extra register */
-       MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
-       EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       bool ret = false;
-       u64 mask;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               er = &box->shared_regs[idx];
-               raw_spin_lock_irqsave(&er->lock, flags);
-               if (!atomic_read(&er->ref) || er->config == config) {
-                       atomic_inc(&er->ref);
-                       er->config = config;
-                       ret = true;
-               }
-               raw_spin_unlock_irqrestore(&er->lock, flags);
-
-               return ret;
-       }
-       /*
-        * The ZDP_CTL_FVC MSR has 4 fields which are used to control
-        * events 0xd ~ 0x10. Besides these 4 fields, there are additional
-        * fields which are shared.
-        */
-       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       if (WARN_ON_ONCE(idx >= 4))
-               return false;
-
-       /* mask of the shared fields */
-       if (uncore_nhmex)
-               mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
-       else
-               mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       /* add mask of the non-shared field if it's in use */
-       if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
-               if (uncore_nhmex)
-                       mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               else
-                       mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       }
-
-       if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
-               atomic_add(1 << (idx * 8), &er->ref);
-               if (uncore_nhmex)
-                       mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
-                               NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               else
-                       mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
-                               WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               er->config &= ~mask;
-               er->config |= (config & mask);
-               ret = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       return ret;
-}
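/*
 * Illustrative sketch (not part of the deleted file): nhmex_mbox_get_shared_reg()
 * above keeps four independent 8-bit reference counts -- one per ZDP_CTL_FVC
 * event field -- packed into a single counter, adding 1 << (idx * 8) to take a
 * reference and reading a field back with __BITS_VALUE(ref, idx, 8).  A plain
 * userspace rendering of that bookkeeping (just the arithmetic, no atomics or
 * locking); DEMO_FIELD() is a made-up helper:
 */
#include <stdio.h>

#define DEMO_FIELD(x, i)	(((x) >> ((i) * 8)) & 0xff)

int main(void)
{
	unsigned ref = 0;

	ref += 1u << (1 * 8);	/* take a reference on field 1 */
	ref += 1u << (1 * 8);	/* and another one */
	ref += 1u << (3 * 8);	/* take a reference on field 3 */
	ref -= 1u << (3 * 8);	/* drop it again */

	printf("field refcounts: %u %u %u %u\n",
	       DEMO_FIELD(ref, 0), DEMO_FIELD(ref, 1),
	       DEMO_FIELD(ref, 2), DEMO_FIELD(ref, 3));	/* prints: 0 2 0 0 */
	return 0;
}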
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               er = &box->shared_regs[idx];
-               atomic_dec(&er->ref);
-               return;
-       }
-
-       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-       atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
-       u64 config = reg1->config;
-
-       /* get the non-shared control bits and shift them */
-       idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       if (uncore_nhmex)
-               config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       else
-               config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       if (new_idx > orig_idx) {
-               idx = new_idx - orig_idx;
-               config <<= 3 * idx;
-       } else {
-               idx = orig_idx - new_idx;
-               config >>= 3 * idx;
-       }
-
-       /* add the shared control bits back */
-       if (uncore_nhmex)
-               config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       else
-               config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       if (modify) {
-               /* adjust the main event selector */
-               if (new_idx > orig_idx)
-                       hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-               else
-                       hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-               reg1->config = config;
-               reg1->idx = ~0xff | new_idx;
-       }
-       return config;
-}
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       int i, idx[2], alloc = 0;
-       u64 config1 = reg1->config;
-
-       idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
-       idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
-       for (i = 0; i < 2; i++) {
-               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-                       idx[i] = 0xff;
-
-               if (idx[i] == 0xff)
-                       continue;
-
-               if (!nhmex_mbox_get_shared_reg(box, idx[i],
-                               __BITS_VALUE(config1, i, 32)))
-                       goto fail;
-               alloc |= (0x1 << i);
-       }
-
-       /* for the match/mask registers */
-       if (reg2->idx != EXTRA_REG_NONE &&
-           (uncore_box_is_fake(box) || !reg2->alloc) &&
-           !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
-               goto fail;
-
-       /*
-        * If it's a fake box -- as per validate_{group,event}() -- we
-        * shouldn't touch the event state, and we can avoid doing so
-        * since both will only call get_event_constraints() once
-        * on each event; this avoids the need for reg->alloc.
-        */
-       if (!uncore_box_is_fake(box)) {
-               if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
-                       nhmex_mbox_alter_er(event, idx[0], true);
-               reg1->alloc |= alloc;
-               if (reg2->idx != EXTRA_REG_NONE)
-                       reg2->alloc = 1;
-       }
-       return NULL;
-fail:
-       if (idx[0] != 0xff && !(alloc & 0x1) &&
-           idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               /*
-                * events 0xd ~ 0x10 are functionally identical, but are
-                * controlled by different fields in the ZDP_CTL_FVC
-                * register. If we failed to take one field, try the
-                * remaining 3 choices.
-                */
-               BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
-               idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-               idx[0] = (idx[0] + 1) % 4;
-               idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-               if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
-                       config1 = nhmex_mbox_alter_er(event, idx[0], false);
-                       goto again;
-               }
-       }
-
-       if (alloc & 0x1)
-               nhmex_mbox_put_shared_reg(box, idx[0]);
-       if (alloc & 0x2)
-               nhmex_mbox_put_shared_reg(box, idx[1]);
-       return &uncore_constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
-       if (uncore_box_is_fake(box))
-               return;
-
-       if (reg1->alloc & 0x1)
-               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
-       if (reg1->alloc & 0x2)
-               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
-       reg1->alloc = 0;
-
-       if (reg2->alloc) {
-               nhmex_mbox_put_shared_reg(box, reg2->idx);
-               reg2->alloc = 0;
-       }
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
-       if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-               return er->idx;
-       return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_type *type = box->pmu->type;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       struct extra_reg *er;
-       unsigned msr;
-       int reg_idx = 0;
-       /*
-        * The mbox events may require 2 extra MSRs at most. But only
-        * the lower 32 bits in these MSRs are significant, so we can use
-        * config1 to pass two MSRs' config.
-        */
-       for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               if (event->attr.config1 & ~er->valid_mask)
-                       return -EINVAL;
-
-               msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
-               if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
-                       return -EINVAL;
-
-               /* always use bits 32~63 to pass the PLD config */
-               if (er->idx == EXTRA_REG_NHMEX_M_PLD)
-                       reg_idx = 1;
-               else if (WARN_ON_ONCE(reg_idx > 0))
-                       return -EINVAL;
-
-               reg1->idx &= ~(0xff << (reg_idx * 8));
-               reg1->reg &= ~(0xffff << (reg_idx * 16));
-               reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
-               reg1->reg |= msr << (reg_idx * 16);
-               reg1->config = event->attr.config1;
-               reg_idx++;
-       }
-       /*
-        * The mbox only provides the ability to perform address matching
-        * for the PLD events.
-        */
-       if (reg_idx == 2) {
-               reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
-               if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
-                       reg2->config = event->attr.config2;
-               else
-                       reg2->config = ~0ULL;
-               if (box->pmu->pmu_idx == 0)
-                       reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
-               else
-                       reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
-       }
-       return 0;
-}
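/*
 * Illustrative sketch (not part of the deleted file): as the comment at the
 * top of nhmex_mbox_hw_config() notes, only the low 32 bits of each extra MSR
 * are significant, so the 64-bit attr.config1 can carry two MSR configs --
 * bits 0~31 for the first extra register and bits 32~63 for the PLD register.
 * A standalone split of an arbitrary example config1 value:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t config1 = 0x2820ULL | ((uint64_t)0x12345678 << 32);	/* example */
	uint32_t first = (uint32_t)(config1 & 0xffffffffULL);
	uint32_t pld   = (uint32_t)(config1 >> 32);

	printf("config1=0x%016llx first=0x%08x pld=0x%08x\n",
	       (unsigned long long)config1, first, pld);
	return 0;
}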
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       u64 config;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-               return box->shared_regs[idx].config;
-
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       config = er->config;
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-       return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int idx;
-
-       idx = __BITS_VALUE(reg1->idx, 0, 8);
-       if (idx != 0xff)
-               wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
-                       nhmex_mbox_shared_reg_config(box, idx));
-       idx = __BITS_VALUE(reg1->idx, 1, 8);
-       if (idx != 0xff)
-               wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
-                       nhmex_mbox_shared_reg_config(box, idx));
-
-       if (reg2->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg2->reg, 0);
-               if (reg2->config != ~0ULL) {
-                       wrmsrl(reg2->reg + 1,
-                               reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
-                       wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
-                               (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
-                       wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
-               }
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode,          count_mode,     "config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode,                storage_mode,   "config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,           wrap_mode,      "config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode,           flag_mode,      "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel,             inc_sel,        "config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,                set_flag_sel,   "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,       filter_cfg_en,  "config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match,                filter_match,   "config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask,         filter_mask,    "config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp,                 dsp,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr,                 thr,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc,                 fvc,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt,                 pgt,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map,                 map,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss,                 iss,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld,                 pld,            "config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
-       &format_attr_count_mode.attr,
-       &format_attr_storage_mode.attr,
-       &format_attr_wrap_mode.attr,
-       &format_attr_flag_mode.attr,
-       &format_attr_inc_sel.attr,
-       &format_attr_set_flag_sel.attr,
-       &format_attr_filter_cfg_en.attr,
-       &format_attr_filter_match.attr,
-       &format_attr_filter_mask.attr,
-       &format_attr_dsp.attr,
-       &format_attr_thr.attr,
-       &format_attr_fvc.attr,
-       &format_attr_pgt.attr,
-       &format_attr_map.attr,
-       &format_attr_iss.attr,
-       &format_attr_pld.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
-       .name           = "format",
-       .attrs          = nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
-       { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event   = nhmex_mbox_msr_enable_event,
-       .hw_config      = nhmex_mbox_hw_config,
-       .get_constraint = nhmex_mbox_get_constraint,
-       .put_constraint = nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
-       .name                   = "mbox",
-       .num_counters           = 6,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_M0_MSR_PMU_CTL0,
-       .perf_ctr               = NHMEX_M0_MSR_PMU_CNT0,
-       .event_mask             = NHMEX_M_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_M0_MSR_GLOBAL_CTL,
-       .msr_offset             = NHMEX_M_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 8,
-       .event_descs            = nhmex_uncore_mbox_events,
-       .ops                    = &nhmex_uncore_mbox_ops,
-       .format_group           = &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       /* adjust the main event selector and extra register index */
-       if (reg1->idx % 2) {
-               reg1->idx--;
-               hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       } else {
-               reg1->idx++;
-               hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       }
-
-       /* adjust extra register config */
-       switch (reg1->idx % 6) {
-       case 2:
-               /* shift bits 8~15 down to bits 0~7 */
-               reg1->config >>= 8;
-               break;
-       case 3:
-               /* shift bits 0~7 up to bits 8~15 */
-               reg1->config <<= 8;
-               break;
-       }
-}
-
-/*
- * Each rbox has 4 event sets, which monitor PQI ports 0~3 or 4~7.
- * An event set consists of 6 events; the 3rd and 4th events in
- * an event set use the same extra register, so an event set uses
- * 5 extra registers.
- */
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       int idx, er_idx;
-       u64 config1;
-       bool ok = false;
-
-       if (!uncore_box_is_fake(box) && reg1->alloc)
-               return NULL;
-
-       idx = reg1->idx % 6;
-       config1 = reg1->config;
-again:
-       er_idx = idx;
-       /* the 3rd and 4th events use the same extra register */
-       if (er_idx > 2)
-               er_idx--;
-       er_idx += (reg1->idx / 6) * 5;
-
-       er = &box->shared_regs[er_idx];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (idx < 2) {
-               if (!atomic_read(&er->ref) || er->config == reg1->config) {
-                       atomic_inc(&er->ref);
-                       er->config = reg1->config;
-                       ok = true;
-               }
-       } else if (idx == 2 || idx == 3) {
-               /*
-                * these two events use different fields in an extra register:
-                * bits 0~7 and bits 8~15 respectively.
-                */
-               u64 mask = 0xff << ((idx - 2) * 8);
-               if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
-                               !((er->config ^ config1) & mask)) {
-                       atomic_add(1 << ((idx - 2) * 8), &er->ref);
-                       er->config &= ~mask;
-                       er->config |= config1 & mask;
-                       ok = true;
-               }
-       } else {
-               if (!atomic_read(&er->ref) ||
-                               (er->config == (hwc->config >> 32) &&
-                                er->config1 == reg1->config &&
-                                er->config2 == reg2->config)) {
-                       atomic_inc(&er->ref);
-                       er->config = (hwc->config >> 32);
-                       er->config1 = reg1->config;
-                       er->config2 = reg2->config;
-                       ok = true;
-               }
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (!ok) {
-               /*
-                * The Rbox events are always in pairs. The paired
-                * events are functionally identical, but use different
-                * extra registers. If we failed to take an extra
-                * register, try the alternative.
-                */
-               idx ^= 1;
-               if (idx != reg1->idx % 6) {
-                       if (idx == 2)
-                               config1 >>= 8;
-                       else if (idx == 3)
-                               config1 <<= 8;
-                       goto again;
-               }
-       } else {
-               if (!uncore_box_is_fake(box)) {
-                       if (idx != reg1->idx % 6)
-                               nhmex_rbox_alter_er(box, event);
-                       reg1->alloc = 1;
-               }
-               return NULL;
-       }
-       return &uncore_constraint_empty;
-}
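/*
 * Illustrative sketch (not part of the deleted file): per the comment above
 * nhmex_rbox_get_constraint(), each set of 6 Rbox events maps onto 5 shared
 * extra registers because the 3rd and 4th events share one.  The er_idx
 * computation in the function reduces to the mapping printed by this
 * standalone demo:
 */
#include <stdio.h>

int main(void)
{
	int idx;

	for (idx = 0; idx < 12; idx++) {	/* two event sets, 6 events each */
		int in_set = idx % 6;
		int er_idx = in_set > 2 ? in_set - 1 : in_set;

		er_idx += (idx / 6) * 5;
		printf("event idx %2d -> shared reg %d\n", idx, er_idx);
	}
	return 0;
}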
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       int idx, er_idx;
-
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       idx = reg1->idx % 6;
-       er_idx = idx;
-       if (er_idx > 2)
-               er_idx--;
-       er_idx += (reg1->idx / 6) * 5;
-
-       er = &box->shared_regs[er_idx];
-       if (idx == 2 || idx == 3)
-               atomic_sub(1 << ((idx - 2) * 8), &er->ref);
-       else
-               atomic_dec(&er->ref);
-
-       reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       int idx;
-
-       idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
-               NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       if (idx >= 0x18)
-               return -EINVAL;
-
-       reg1->idx = idx;
-       reg1->config = event->attr.config1;
-
-       switch (idx % 6) {
-       case 4:
-       case 5:
-               hwc->config |= event->attr.config & (~0ULL << 32);
-               reg2->config = event->attr.config2;
-               break;
-       }
-       return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int idx, port;
-
-       idx = reg1->idx;
-       port = idx / 6 + box->pmu->pmu_idx * 4;
-
-       switch (idx % 6) {
-       case 0:
-               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
-               break;
-       case 1:
-               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
-               break;
-       case 2:
-       case 3:
-               wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
-                       uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
-               break;
-       case 4:
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
-                       hwc->config >> 32);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
-               break;
-       case 5:
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
-                       hwc->config >> 32);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
-               break;
-       }
-
-       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-               (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
-       &format_attr_event5.attr,
-       &format_attr_xbr_mm_cfg.attr,
-       &format_attr_xbr_match.attr,
-       &format_attr_xbr_mask.attr,
-       &format_attr_qlx_cfg.attr,
-       &format_attr_iperf_cfg.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,         "event=0x0,iperf_cfg=0x80000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,         "event=0x6,iperf_cfg=0x80000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,         "event=0x0,iperf_cfg=0x40000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,         "event=0x6,iperf_cfg=0x40000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi0_date_response,     "event=0x0,iperf_cfg=0xc4"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_date_response,     "event=0x6,iperf_cfg=0xc4"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_rbox_msr_enable_event,
-       .hw_config              = nhmex_rbox_hw_config,
-       .get_constraint         = nhmex_rbox_get_constraint,
-       .put_constraint         = nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
-       .name                   = "rbox",
-       .num_counters           = 8,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_R_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_R_MSR_PMON_CNT0,
-       .event_mask             = NHMEX_R_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_R_MSR_GLOBAL_CTL,
-       .msr_offset             = NHMEX_R_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 20,
-       .event_descs            = nhmex_uncore_rbox_events,
-       .ops                    = &nhmex_uncore_rbox_ops,
-       .format_group           = &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
-       &nhmex_uncore_ubox,
-       &nhmex_uncore_cbox,
-       &nhmex_uncore_bbox,
-       &nhmex_uncore_sbox,
-       &nhmex_uncore_mbox,
-       &nhmex_uncore_rbox,
-       &nhmex_uncore_wbox,
-       NULL,
-};
-
-void nhmex_uncore_cpu_init(void)
-{
-       if (boot_cpu_data.x86_model == 46)
-               uncore_nhmex = true;
-       else
-               nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
-       if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = nhmex_msr_uncores;
-}
-/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
deleted file mode 100644 (file)
index 2bd030d..0000000
+++ /dev/null
@@ -1,717 +0,0 @@
-/* Nehalem/SandyBridge/Haswell uncore support */
-#include "perf_event_intel_uncore.h"
-
-/* Uncore IMC PCI IDs */
-#define PCI_DEVICE_ID_INTEL_SNB_IMC    0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC    0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
-#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
-#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
-#define PCI_DEVICE_ID_INTEL_SKL_IMC    0x191f
-
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK                 0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET                   (1 << 18)
-#define SNB_UNC_CTL_EN                         (1 << 22)
-#define SNB_UNC_CTL_INVERT                     (1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK                 0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK                 0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN               (1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
-                                                SNB_UNC_CTL_UMASK_MASK | \
-                                                SNB_UNC_CTL_EDGE_DET | \
-                                                SNB_UNC_CTL_INVERT | \
-                                                SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
-                                                SNB_UNC_CTL_UMASK_MASK | \
-                                                SNB_UNC_CTL_EDGE_DET | \
-                                                SNB_UNC_CTL_INVERT | \
-                                                NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
-#define SNB_UNC_FIXED_CTR_CTRL                  0x394
-#define SNB_UNC_FIXED_CTR                       0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
-#define SNB_UNC_CBO_0_PER_CTR0                  0x706
-#define SNB_UNC_CBO_MSR_OFFSET                  0x10
-
-/* SNB ARB register */
-#define SNB_UNC_ARB_PER_CTR0                   0x3b0
-#define SNB_UNC_ARB_PERFEVTSEL0                        0x3b2
-#define SNB_UNC_ARB_MSR_OFFSET                 0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
-#define NHM_UNC_FIXED_CTR                       0x394
-#define NHM_UNC_FIXED_CTR_CTRL                  0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0                     0x3c0
-#define NHM_UNC_UNCORE_PMC0                     0x3b0
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
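
As an illustration of the format strings above, a minimal standalone sketch (assuming only the SNB bit layout given by the SNB_UNC_CTL_* masks) that packs the event/umask/edge/inv/cmask fields into a raw config value:

/* Standalone illustration: pack the sysfs format fields above into a raw
 * config value, following the SNB_UNC_CTL_* bit positions. */
#include <stdint.h>
#include <stdio.h>

static uint64_t snb_pack_config(uint8_t event, uint8_t umask,
                                int edge, int inv, uint8_t cmask)
{
    uint64_t cfg = 0;

    cfg |= (uint64_t)event;                 /* config:0-7   */
    cfg |= (uint64_t)umask << 8;            /* config:8-15  */
    cfg |= (uint64_t)(edge != 0) << 18;     /* config:18    */
    cfg |= (uint64_t)(inv != 0) << 23;      /* config:23    */
    cfg |= (uint64_t)(cmask & 0x1f) << 24;  /* config:24-28 (SNB cmask5) */

    return cfg;
}

int main(void)
{
    /* e.g. the clockticks event used below: event=0xff, umask=0x00 */
    printf("raw config = 0x%llx\n",
           (unsigned long long)snb_pack_config(0xff, 0x00, 0, 0, 0));
    return 0;
}
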
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-       else
-               wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->pmu_idx == 0) {
-               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
-                       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
-       }
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
-       { /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask5.attr,
-       NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
-       .name           = "format",
-       .attrs          = snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
-       .init_box       = snb_uncore_msr_init_box,
-       .disable_event  = snb_uncore_msr_disable_event,
-       .enable_event   = snb_uncore_msr_enable_event,
-       .read_counter   = uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_arb_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
-       .name           = "cbox",
-       .num_counters   = 2,
-       .num_boxes      = 4,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNB_UNC_CBO_0_PER_CTR0,
-       .event_ctl      = SNB_UNC_CBO_0_PERFEVTSEL0,
-       .fixed_ctr      = SNB_UNC_FIXED_CTR,
-       .fixed_ctl      = SNB_UNC_FIXED_CTR_CTRL,
-       .single_fixed   = 1,
-       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
-       .msr_offset     = SNB_UNC_CBO_MSR_OFFSET,
-       .ops            = &snb_uncore_msr_ops,
-       .format_group   = &snb_uncore_format_group,
-       .event_descs    = snb_uncore_events,
-};
-
-static struct intel_uncore_type snb_uncore_arb = {
-       .name           = "arb",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .perf_ctr       = SNB_UNC_ARB_PER_CTR0,
-       .event_ctl      = SNB_UNC_ARB_PERFEVTSEL0,
-       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
-       .msr_offset     = SNB_UNC_ARB_MSR_OFFSET,
-       .constraints    = snb_uncore_arb_constraints,
-       .ops            = &snb_uncore_msr_ops,
-       .format_group   = &snb_uncore_format_group,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
-       &snb_uncore_cbox,
-       &snb_uncore_arb,
-       NULL,
-};
-
-void snb_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = snb_msr_uncores;
-       if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-}
-
-enum {
-       SNB_PCI_UNCORE_IMC,
-};
-
-static struct uncore_event_desc snb_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
-       INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
-
-       INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
-       INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
-
-       { /* end: all zeroes */ },
-};
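
A short aside on the .scale strings above: 6.103515625e-5 is 64 / 2^20, i.e. one 64-byte cacheline expressed in MiB. A minimal sketch of the conversion:

/* Standalone illustration: each count is one 64-byte cacheline, and
 * 64 / (1 << 20) == 6.103515625e-5, so the scale converts counts to MiB. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t raw = 1000000;                /* example counter delta */
    double mib = raw * 6.103515625e-5;     /* == raw * 64.0 / (1 << 20) */

    printf("%llu cachelines ~= %.2f MiB\n", (unsigned long long)raw, mib);
    return 0;
}
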
-
-#define SNB_UNCORE_PCI_IMC_EVENT_MASK          0xff
-#define SNB_UNCORE_PCI_IMC_BAR_OFFSET          0x48
-
-/* page size multiple covering all config regs */
-#define SNB_UNCORE_PCI_IMC_MAP_SIZE            0x6000
-
-#define SNB_UNCORE_PCI_IMC_DATA_READS          0x1
-#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE     0x5050
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES         0x2
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE    0x5054
-#define SNB_UNCORE_PCI_IMC_CTR_BASE            SNB_UNCORE_PCI_IMC_DATA_READS_BASE
-
-static struct attribute *snb_uncore_imc_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group snb_uncore_imc_format_group = {
-       .name = "format",
-       .attrs = snb_uncore_imc_formats_attr,
-};
-
-static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
-       resource_size_t addr;
-       u32 pci_dword;
-
-       pci_read_config_dword(pdev, where, &pci_dword);
-       addr = pci_dword;
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-       pci_read_config_dword(pdev, where + 4, &pci_dword);
-       addr |= ((resource_size_t)pci_dword << 32);
-#endif
-
-       addr &= ~(PAGE_SIZE - 1);
-
-       box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
-       box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
-}
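
A minimal standalone sketch (made-up register values, PAGE_SIZE taken as 4096) of how the two 32-bit BAR reads above combine into a page-aligned MMIO base:

/* Standalone illustration with made-up register values: combine the low
 * and high BAR dwords and page-align the result (PAGE_SIZE taken as 4096). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t lo = 0xfed15001;                  /* hypothetical low dword  */
    uint32_t hi = 0x00000000;                  /* hypothetical high dword */
    uint64_t addr = ((uint64_t)hi << 32) | lo;

    addr &= ~((uint64_t)4096 - 1);             /* drop flag bits, align   */
    printf("IMC MMIO base = 0x%llx\n", (unsigned long long)addr);
    return 0;
}
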
-
-static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
-}
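
The MMIO counter read above is only 32 bits wide, so deltas must tolerate wraparound; this is why the box hrtimer samples it periodically. A minimal sketch of the wrap-safe delta:

/* Standalone illustration: wrap-safe delta of a 32-bit free-running counter. */
#include <stdint.h>
#include <stdio.h>

static uint64_t delta32(uint32_t prev, uint32_t now)
{
    return (uint32_t)(now - prev);     /* unsigned wrap handles rollover */
}

int main(void)
{
    /* counter wrapped from 0xfffffff0 to 0x10: the true delta is 0x20 */
    printf("delta = %llu\n",
           (unsigned long long)delta32(0xfffffff0u, 0x10u));
    return 0;
}
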
-
-/*
- * Custom event_init() function: we define our own fixed, free-running
- * counters, so we do not want to conflict with the generic uncore logic.
- * This also simplifies processing.
- */
-static int snb_uncore_imc_event_init(struct perf_event *event)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
-       int idx, base;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       pmu = uncore_event_to_pmu(event);
-       /* no device found for this pmu */
-       if (pmu->func_id < 0)
-               return -ENOENT;
-
-       /* Sampling not supported yet */
-       if (hwc->sample_period)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       /*
-        * Place all uncore events for a particular physical package
-        * onto a single cpu
-        */
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       /* check only supported bits are set */
-       if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
-               return -EINVAL;
-
-       box = uncore_pmu_to_box(pmu, event->cpu);
-       if (!box || box->cpu < 0)
-               return -EINVAL;
-
-       event->cpu = box->cpu;
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-       /*
-        * check event is known (whitelist, determines counter)
-        */
-       switch (cfg) {
-       case SNB_UNCORE_PCI_IMC_DATA_READS:
-               base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
-               idx = UNCORE_PMC_IDX_FIXED;
-               break;
-       case SNB_UNCORE_PCI_IMC_DATA_WRITES:
-               base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
-               idx = UNCORE_PMC_IDX_FIXED + 1;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       /* must be done before validate_group */
-       event->hw.event_base = base;
-       event->hw.config = cfg;
-       event->hw.idx = idx;
-
-       /* no group validation needed, we have free running counters */
-
-       return 0;
-}
-
-static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return 0;
-}
-
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       u64 count;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       event->hw.state = 0;
-       box->n_active++;
-
-       list_add_tail(&event->active_entry, &box->active_list);
-
-       count = snb_uncore_imc_read_counter(box, event);
-       local64_set(&event->hw.prev_count, count);
-
-       if (box->n_active == 1)
-               uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!(hwc->state & PERF_HES_STOPPED)) {
-               box->n_active--;
-
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-
-               list_del(&event->active_entry);
-
-               if (box->n_active == 0)
-                       uncore_pmu_cancel_hrtimer(box);
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of an event
-                * that we are disabling:
-                */
-               uncore_perf_event_update(box, event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!box)
-               return -ENODEV;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       snb_uncore_imc_event_start(event, 0);
-
-       box->n_events++;
-
-       return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int i;
-
-       snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < box->n_events; i++) {
-               if (event == box->event_list[i]) {
-                       --box->n_events;
-                       break;
-               }
-       }
-}
-
-int snb_pci2phy_map_init(int devid)
-{
-       struct pci_dev *dev = NULL;
-       struct pci2phy_map *map;
-       int bus, segment;
-
-       dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
-       if (!dev)
-               return -ENOTTY;
-
-       bus = dev->bus->number;
-       segment = pci_domain_nr(dev->bus);
-
-       raw_spin_lock(&pci2phy_map_lock);
-       map = __find_pci2phy_map(segment);
-       if (!map) {
-               raw_spin_unlock(&pci2phy_map_lock);
-               pci_dev_put(dev);
-               return -ENOMEM;
-       }
-       map->pbus_to_physid[bus] = 0;
-       raw_spin_unlock(&pci2phy_map_lock);
-
-       pci_dev_put(dev);
-
-       return 0;
-}
-
-static struct pmu snb_uncore_imc_pmu = {
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = snb_uncore_imc_event_init,
-       .add            = snb_uncore_imc_event_add,
-       .del            = snb_uncore_imc_event_del,
-       .start          = snb_uncore_imc_event_start,
-       .stop           = snb_uncore_imc_event_stop,
-       .read           = uncore_pmu_event_read,
-};
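
For context, a minimal user-space sketch, not driver code; it assumes the PMU registers as /sys/bus/event_source/devices/uncore_imc and that config 0x01 selects the data_reads free-running counter, as defined above:

/* Standalone user-space illustration, not driver code.  Assumes the PMU
 * shows up as /sys/bus/event_source/devices/uncore_imc and that config
 * 0x01 selects the data_reads free-running counter. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/sys/bus/event_source/devices/uncore_imc/type", "r");
    int type;

    if (!f || fscanf(f, "%d", &type) != 1)
        return 1;
    fclose(f);

    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.size   = sizeof(attr);
    attr.type   = type;
    attr.config = 0x01;                /* data_reads */

    /* uncore events must name a CPU (cpu >= 0) and no task (pid == -1) */
    int fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
    if (fd < 0)
        return 1;

    sleep(1);

    uint64_t count;
    if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
        printf("data_reads: %llu cachelines\n", (unsigned long long)count);

    close(fd);
    return 0;
}
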
-
-static struct intel_uncore_ops snb_uncore_imc_ops = {
-       .init_box       = snb_uncore_imc_init_box,
-       .enable_box     = snb_uncore_imc_enable_box,
-       .disable_box    = snb_uncore_imc_disable_box,
-       .disable_event  = snb_uncore_imc_disable_event,
-       .enable_event   = snb_uncore_imc_enable_event,
-       .hw_config      = snb_uncore_imc_hw_config,
-       .read_counter   = snb_uncore_imc_read_counter,
-};
-
-static struct intel_uncore_type snb_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .fixed_ctr_bits = 32,
-       .fixed_ctr      = SNB_UNCORE_PCI_IMC_CTR_BASE,
-       .event_descs    = snb_uncore_imc_events,
-       .format_group   = &snb_uncore_imc_format_group,
-       .perf_ctr       = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
-       .event_mask     = SNB_UNCORE_PCI_IMC_EVENT_MASK,
-       .ops            = &snb_uncore_imc_ops,
-       .pmu            = &snb_uncore_imc_pmu,
-};
-
-static struct intel_uncore_type *snb_pci_uncores[] = {
-       [SNB_PCI_UNCORE_IMC]    = &snb_uncore_imc,
-       NULL,
-};
-
-static const struct pci_device_id snb_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id ivb_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id hsw_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id bdw_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id skl_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static struct pci_driver snb_uncore_pci_driver = {
-       .name           = "snb_uncore",
-       .id_table       = snb_uncore_pci_ids,
-};
-
-static struct pci_driver ivb_uncore_pci_driver = {
-       .name           = "ivb_uncore",
-       .id_table       = ivb_uncore_pci_ids,
-};
-
-static struct pci_driver hsw_uncore_pci_driver = {
-       .name           = "hsw_uncore",
-       .id_table       = hsw_uncore_pci_ids,
-};
-
-static struct pci_driver bdw_uncore_pci_driver = {
-       .name           = "bdw_uncore",
-       .id_table       = bdw_uncore_pci_ids,
-};
-
-static struct pci_driver skl_uncore_pci_driver = {
-       .name           = "skl_uncore",
-       .id_table       = skl_uncore_pci_ids,
-};
-
-struct imc_uncore_pci_dev {
-       __u32 pci_id;
-       struct pci_driver *driver;
-};
-#define IMC_DEV(a, d) \
-       { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
-
-static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
-       IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
-       IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
-       IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
-       IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
-       IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
-       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
-       IMC_DEV(SKL_IMC, &skl_uncore_pci_driver),    /* 6th Gen Core */
-       {  /* end marker */ }
-};
-
-
-#define for_each_imc_pci_id(x, t) \
-       for (x = (t); (x)->pci_id; x++)
-
-static struct pci_driver *imc_uncore_find_dev(void)
-{
-       const struct imc_uncore_pci_dev *p;
-       int ret;
-
-       for_each_imc_pci_id(p, desktop_imc_pci_ids) {
-               ret = snb_pci2phy_map_init(p->pci_id);
-               if (ret == 0)
-                       return p->driver;
-       }
-       return NULL;
-}
-
-static int imc_uncore_pci_init(void)
-{
-       struct pci_driver *imc_drv = imc_uncore_find_dev();
-
-       if (!imc_drv)
-               return -ENODEV;
-
-       uncore_pci_uncores = snb_pci_uncores;
-       uncore_pci_driver = imc_drv;
-
-       return 0;
-}
-
-int snb_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int ivb_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-int hsw_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int bdw_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int skl_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-       else
-               wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask8.attr,
-       NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
-       .name = "format",
-       .attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
-       INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
-       .disable_box    = nhm_uncore_msr_disable_box,
-       .enable_box     = nhm_uncore_msr_enable_box,
-       .disable_event  = snb_uncore_msr_disable_event,
-       .enable_event   = nhm_uncore_msr_enable_event,
-       .read_counter   = uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
-       .name           = "",
-       .num_counters   = 8,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .event_ctl      = NHM_UNC_PERFEVTSEL0,
-       .perf_ctr       = NHM_UNC_UNCORE_PMC0,
-       .fixed_ctr      = NHM_UNC_FIXED_CTR,
-       .fixed_ctl      = NHM_UNC_FIXED_CTR_CTRL,
-       .event_mask     = NHM_UNC_RAW_EVENT_MASK,
-       .event_descs    = nhm_uncore_events,
-       .ops            = &nhm_uncore_msr_ops,
-       .format_group   = &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
-       &nhm_uncore,
-       NULL,
-};
-
-void nhm_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = nhm_msr_uncores;
-}
-
-/* end of Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
deleted file mode 100644 (file)
index 33acb88..0000000
+++ /dev/null
@@ -1,3126 +0,0 @@
-/* SandyBridge-EP/IvyTown uncore support */
-#include "perf_event_intel_uncore.h"
-
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ         (1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN      (1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
-                                        SNBEP_PMON_BOX_CTL_RST_CTRS | \
-                                        SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK     0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK      0x0000ff00
-#define SNBEP_PMON_CTL_RST             (1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET                (1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT      (1 << 21)
-#define SNBEP_PMON_CTL_EN              (1 << 22)
-#define SNBEP_PMON_CTL_INVERT          (1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK      0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK      (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                        SNBEP_PMON_CTL_UMASK_MASK | \
-                                        SNBEP_PMON_CTL_EDGE_DET | \
-                                        SNBEP_PMON_CTL_INVERT | \
-                                        SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK                0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK                \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_UMASK_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN              (1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK      (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK    0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK      0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT      (1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET    (1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_RAW_EVENT_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL                 0xf4
-#define SNBEP_PCI_PMON_CTL0                    0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0                    0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0       0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1       0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH      0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL                0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR                0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0         0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1         0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0          0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1          0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0                  0xc16
-#define SNBEP_U_MSR_PMON_CTL0                  0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL                0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR                0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0                 0xd16
-#define SNBEP_C0_MSR_PMON_CTL0                 0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL              0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER           0xd14
-#define SNBEP_CBO_MSR_OFFSET                   0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID      0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID      0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE    0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC      0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {   \
-       .event = (e),                           \
-       .msr = SNBEP_C0_MSR_PMON_BOX_FILTER,    \
-       .config_mask = (m),                     \
-       .idx = (i)                              \
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0                        0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0                        0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL             0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER          0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK     0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR              0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR              0x3fd
-
-/* IVBEP event control */
-#define IVBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
-                                        SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVBEP_PMON_RAW_EVENT_MASK              (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                        SNBEP_PMON_CTL_UMASK_MASK | \
-                                        SNBEP_PMON_CTL_EDGE_DET | \
-                                        SNBEP_PMON_CTL_TRESH_MASK)
-/* IVBEP Ubox */
-#define IVBEP_U_MSR_PMON_GLOBAL_CTL            0xc00
-#define IVBEP_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
-#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL          (1 << 29)
-
-#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK        \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_UMASK_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVBEP Cbo */
-#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK              (IVBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x1fULL << 0)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 5)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x3fULL << 17)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
-
-/* IVBEP home agent */
-#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST                (1 << 16)
-#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK               \
-                               (IVBEP_PMON_RAW_EVENT_MASK | \
-                                IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVBEP PCU */
-#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVBEP QPI */
-#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
-                               (IVBEP_PMON_RAW_EVENT_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT)
-
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-                               ((1ULL << (n)) - 1)))
-
-/* Haswell-EP Ubox */
-#define HSWEP_U_MSR_PMON_CTR0                  0x709
-#define HSWEP_U_MSR_PMON_CTL0                  0x705
-#define HSWEP_U_MSR_PMON_FILTER                        0x707
-
-#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
-#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR                0x704
-
-#define HSWEP_U_MSR_PMON_BOX_FILTER_TID                (0x1 << 0)
-#define HSWEP_U_MSR_PMON_BOX_FILTER_CID                (0x1fULL << 1)
-#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
-                                       (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
-                                        HSWEP_U_MSR_PMON_BOX_FILTER_CID)
-
-/* Haswell-EP CBo */
-#define HSWEP_C0_MSR_PMON_CTR0                 0xe08
-#define HSWEP_C0_MSR_PMON_CTL0                 0xe01
-#define HSWEP_C0_MSR_PMON_BOX_CTL                      0xe00
-#define HSWEP_C0_MSR_PMON_BOX_FILTER0          0xe05
-#define HSWEP_CBO_MSR_OFFSET                   0x10
-
-
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x3fULL << 0)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 6)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x7fULL << 17)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
-
-
-/* Haswell-EP Sbox */
-#define HSWEP_S0_MSR_PMON_CTR0                 0x726
-#define HSWEP_S0_MSR_PMON_CTL0                 0x721
-#define HSWEP_S0_MSR_PMON_BOX_CTL                      0x720
-#define HSWEP_SBOX_MSR_OFFSET                  0xa
-#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* Haswell-EP PCU */
-#define HSWEP_PCU_MSR_PMON_CTR0                        0x717
-#define HSWEP_PCU_MSR_PMON_CTL0                        0x711
-#define HSWEP_PCU_MSR_PMON_BOX_CTL             0x710
-#define HSWEP_PCU_MSR_PMON_BOX_FILTER          0x715
-
-/* KNL Ubox */
-#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
-                                       (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
-                                               SNBEP_CBO_PMON_CTL_TID_EN)
-/* KNL CHA */
-#define KNL_CHA_MSR_OFFSET                     0xc
-#define KNL_CHA_MSR_PMON_CTL_QOR               (1 << 16)
-#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
-                                       (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
-                                        KNL_CHA_MSR_PMON_CTL_QOR)
-#define KNL_CHA_MSR_PMON_BOX_FILTER_TID                0x1ff
-#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE      (7 << 18)
-#define KNL_CHA_MSR_PMON_BOX_FILTER_OP         (0xfffffe2aULL << 32)
-
-/* KNL EDC/MC UCLK */
-#define KNL_UCLK_MSR_PMON_CTR0_LOW             0x400
-#define KNL_UCLK_MSR_PMON_CTL0                 0x420
-#define KNL_UCLK_MSR_PMON_BOX_CTL              0x430
-#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW       0x44c
-#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL       0x454
-#define KNL_PMON_FIXED_CTL_EN                  0x1
-
-/* KNL EDC */
-#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW                0xa00
-#define KNL_EDC0_ECLK_MSR_PMON_CTL0            0xa20
-#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL         0xa30
-#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW  0xa3c
-#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL  0xa44
-
-/* KNL MC */
-#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW          0xb00
-#define KNL_MC0_CH0_MSR_PMON_CTL0              0xb20
-#define KNL_MC0_CH0_MSR_PMON_BOX_CTL           0xb30
-#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW         0xb3c
-#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL         0xb44
-
-/* KNL IRP */
-#define KNL_IRP_PCI_PMON_BOX_CTL               0xf0
-#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                KNL_CHA_MSR_PMON_CTL_QOR)
-/* KNL PCU */
-#define KNL_PCU_PMON_CTL_EV_SEL_MASK           0x0000007f
-#define KNL_PCU_PMON_CTL_USE_OCC_CTR           (1 << 7)
-#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK                0x3f000000
-#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK        \
-                               (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
-                                KNL_PCU_PMON_CTL_USE_OCC_CTR | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_CBO_PMON_CTL_TID_EN | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
-DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
-DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
-DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
-DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
-DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-       u32 config = 0;
-
-       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-               config |= SNBEP_PMON_BOX_CTL_FRZ;
-               pci_write_config_dword(pdev, box_ctl, config);
-       }
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-       u32 config = 0;
-
-       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-               pci_write_config_dword(pdev, box_ctl, config);
-       }
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
-       pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-
-       pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       u64 config;
-       unsigned msr;
-
-       msr = uncore_msr_box_ctl(box);
-       if (msr) {
-               rdmsrl(msr, config);
-               config |= SNBEP_PMON_BOX_CTL_FRZ;
-               wrmsrl(msr, config);
-       }
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       u64 config;
-       unsigned msr;
-
-       msr = uncore_msr_box_ctl(box);
-       if (msr) {
-               rdmsrl(msr, config);
-               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-               wrmsrl(msr, config);
-       }
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE)
-               wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
-                                       struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-
-       if (msr)
-               wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid.attr,
-       &format_attr_filter_nid.attr,
-       &format_attr_filter_state.attr,
-       &format_attr_filter_opc.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge.attr,
-       &format_attr_filter_band0.attr,
-       &format_attr_filter_band1.attr,
-       &format_attr_filter_band2.attr,
-       &format_attr_filter_band3.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match_rds.attr,
-       &format_attr_match_rnid30.attr,
-       &format_attr_match_rnid4.attr,
-       &format_attr_match_dnid.attr,
-       &format_attr_match_mc.attr,
-       &format_attr_match_opc.attr,
-       &format_attr_match_vnw.attr,
-       &format_attr_match0.attr,
-       &format_attr_match1.attr,
-       &format_attr_mask_rds.attr,
-       &format_attr_mask_rnid30.attr,
-       &format_attr_mask_rnid4.attr,
-       &format_attr_mask_dnid.attr,
-       &format_attr_mask_mc.attr,
-       &format_attr_mask_opc.attr,
-       &format_attr_mask_vnw.attr,
-       &format_attr_mask0.attr,
-       &format_attr_mask1.attr,
-       NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
-       { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
-       INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
-       INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
-       INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
-       { /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                   \
-       .disable_box    = snbep_uncore_msr_disable_box,         \
-       .enable_box     = snbep_uncore_msr_enable_box,          \
-       .disable_event  = snbep_uncore_msr_disable_event,       \
-       .enable_event   = snbep_uncore_msr_enable_event,        \
-       .read_counter   = uncore_msr_read_counter
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
-       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),                   \
-       .init_box       = snbep_uncore_msr_init_box             \
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()                     \
-       .init_box       = snbep_uncore_pci_init_box,            \
-       .disable_box    = snbep_uncore_pci_disable_box,         \
-       .enable_box     = snbep_uncore_pci_enable_box,          \
-       .disable_event  = snbep_uncore_pci_disable_event,       \
-       .read_counter   = snbep_uncore_pci_read_counter
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
-       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-       .enable_event   = snbep_uncore_pci_enable_event,
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
-       EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
-       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
-       .event_mask     = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops            = &snbep_uncore_msr_ops,
-       .format_group   = &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
-       EVENT_EXTRA_END
-};
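
For illustration, each entry in the table above pairs an (umask << 8 | event) encoding with the bitmap of Cbox filter fields that event needs; the meaning of the idx bits follows snbep_cbox_filter_mask() further down. A minimal stand-alone decode of one entry, with the field interpretation assumed from that mask helper:

#include <stdio.h>

int main(void)
{
	/* SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4) from the table above */
	unsigned int encoded = 0x0334;		/* (umask << 8) | event_select */
	unsigned int event   = encoded & 0xff;
	unsigned int umask   = (encoded >> 8) & 0xff;
	unsigned int idx     = 0x4;		/* bit 2: STATE filter field */

	printf("event=0x%02x umask=0x%02x -> needs filter field bitmap 0x%x\n",
	       event, umask, idx);
	return 0;
}
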
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       int i;
-
-       if (uncore_box_is_fake(box))
-               return;
-
-       for (i = 0; i < 5; i++) {
-               if (reg1->alloc & (0x1 << i))
-                       atomic_sub(1 << (i * 6), &er->ref);
-       }
-       reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
-                           u64 (*cbox_filter_mask)(int fields))
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       int i, alloc = 0;
-       unsigned long flags;
-       u64 mask;
-
-       if (reg1->idx == EXTRA_REG_NONE)
-               return NULL;
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       for (i = 0; i < 5; i++) {
-               if (!(reg1->idx & (0x1 << i)))
-                       continue;
-               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-                       continue;
-
-               mask = cbox_filter_mask(0x1 << i);
-               if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
-                   !((reg1->config ^ er->config) & mask)) {
-                       atomic_add(1 << (i * 6), &er->ref);
-                       er->config &= ~mask;
-                       er->config |= reg1->config & mask;
-                       alloc |= (0x1 << i);
-               } else {
-                       break;
-               }
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-       if (i < 5)
-               goto fail;
-
-       if (!uncore_box_is_fake(box))
-               reg1->alloc |= alloc;
-
-       return NULL;
-fail:
-       for (; i >= 0; i--) {
-               if (alloc & (0x1 << i))
-                       atomic_sub(1 << (i * 6), &er->ref);
-       }
-       return &uncore_constraint_empty;
-}
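
The shared-register bookkeeping above packs one reference counter per filter field into a single atomic, six bits per field. A self-contained sketch of that packing, using a plain helper in place of the kernel's __BITS_VALUE() (the five fields and 6-bit width are taken from the code above; everything else is illustrative):

#include <stdio.h>

/* Illustrative stand-in for __BITS_VALUE(): extract the n-bit field i of val. */
static unsigned int bits_value(unsigned int val, int i, int n)
{
	return (val >> (i * n)) & ((1u << n) - 1);
}

int main(void)
{
	unsigned int ref = 0;
	int i;

	/* Take a reference on filter fields 0 and 3, mirroring
	 * atomic_add(1 << (i * 6), &er->ref) in the code above. */
	ref += 1u << (0 * 6);
	ref += 1u << (3 * 6);

	for (i = 0; i < 5; i++)
		printf("filter field %d: refcount %u\n", i, bits_value(ref, i, 6));
	return 0;
}
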
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x4)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
-       return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_cbox_hw_config,
-       .get_constraint         = snbep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = snbep_uncore_cbox_constraints,
-       .ops                    = &snbep_uncore_cbox_ops,
-       .format_group           = &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       u64 config = reg1->config;
-
-       if (new_idx > reg1->idx)
-               config <<= 8 * (new_idx - reg1->idx);
-       else
-               config >>= 8 * (reg1->idx - new_idx);
-
-       if (modify) {
-               hwc->config += new_idx - reg1->idx;
-               reg1->config = config;
-               reg1->idx = new_idx;
-       }
-       return config;
-}
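
snbep_pcu_alter_er() above moves a PCU band-filter value from one byte lane of config1 to another by shifting it eight bits per index step. A tiny worked example with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int old_idx = 1, new_idx = 3;
	/* Band value 0x20 currently sitting in byte lane old_idx. */
	uint64_t config = 0x20ULL << (old_idx * 8);

	/* Same shift as above: one byte lane per index step. */
	if (new_idx > old_idx)
		config <<= 8 * (new_idx - old_idx);
	else
		config >>= 8 * (old_idx - new_idx);

	printf("config is now 0x%llx (value moved to byte lane %d)\n",
	       (unsigned long long)config, new_idx);
	return 0;
}
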
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       unsigned long flags;
-       int idx = reg1->idx;
-       u64 mask, config1 = reg1->config;
-       bool ok = false;
-
-       if (reg1->idx == EXTRA_REG_NONE ||
-           (!uncore_box_is_fake(box) && reg1->alloc))
-               return NULL;
-again:
-       mask = 0xffULL << (idx * 8);
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
-           !((config1 ^ er->config) & mask)) {
-               atomic_add(1 << (idx * 8), &er->ref);
-               er->config &= ~mask;
-               er->config |= config1 & mask;
-               ok = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (!ok) {
-               idx = (idx + 1) % 4;
-               if (idx != reg1->idx) {
-                       config1 = snbep_pcu_alter_er(event, idx, false);
-                       goto again;
-               }
-               return &uncore_constraint_empty;
-       }
-
-       if (!uncore_box_is_fake(box)) {
-               if (idx != reg1->idx)
-                       snbep_pcu_alter_er(event, idx, true);
-               reg1->alloc = 1;
-       }
-       return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       atomic_sub(1 << (reg1->idx * 8), &er->ref);
-       reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-       if (ev_sel >= 0xb && ev_sel <= 0xe) {
-               reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
-               reg1->idx = ev_sel - 0xb;
-               reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_pcu_ops,
-       .format_group           = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
-       &snbep_uncore_ubox,
-       &snbep_uncore_cbox,
-       &snbep_uncore_pcu,
-       NULL,
-};
-
-void snbep_uncore_cpu_init(void)
-{
-       if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = snbep_msr_uncores;
-}
-
-enum {
-       SNBEP_PCI_QPI_PORT0_FILTER,
-       SNBEP_PCI_QPI_PORT1_FILTER,
-       HSWEP_PCI_PCU_3,
-};
-
-static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
-               reg1->idx = 0;
-               reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
-               reg1->config = event->attr.config1;
-               reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
-               reg2->config = event->attr.config2;
-       }
-       return 0;
-}
-
-static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
-               struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx];
-               if (filter_pdev) {
-                       pci_write_config_dword(filter_pdev, reg1->reg,
-                                               (u32)reg1->config);
-                       pci_write_config_dword(filter_pdev, reg1->reg + 4,
-                                               (u32)(reg1->config >> 32));
-                       pci_write_config_dword(filter_pdev, reg2->reg,
-                                               (u32)reg2->config);
-                       pci_write_config_dword(filter_pdev, reg2->reg + 4,
-                                               (u32)(reg2->config >> 32));
-               }
-       }
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops snbep_uncore_qpi_ops = {
-       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-       .enable_event           = snbep_qpi_enable_event,
-       .hw_config              = snbep_qpi_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT()                         \
-       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
-       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
-       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,            \
-       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
-       .ops            = &snbep_uncore_pci_ops,                \
-       .format_group   = &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 4,
-       .num_boxes      = 4,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = snbep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .event_descs            = snbep_uncore_qpi_events,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       SNBEP_PCI_UNCORE_HA,
-       SNBEP_PCI_UNCORE_IMC,
-       SNBEP_PCI_UNCORE_QPI,
-       SNBEP_PCI_UNCORE_R2PCIE,
-       SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
-       [SNBEP_PCI_UNCORE_HA]           = &snbep_uncore_ha,
-       [SNBEP_PCI_UNCORE_IMC]          = &snbep_uncore_imc,
-       [SNBEP_PCI_UNCORE_QPI]          = &snbep_uncore_qpi,
-       [SNBEP_PCI_UNCORE_R2PCIE]       = &snbep_uncore_r2pcie,
-       [SNBEP_PCI_UNCORE_R3QPI]        = &snbep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id snbep_uncore_pci_ids[] = {
-       { /* Home Agent */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
-       },
-       { /* MC Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* QPI Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
-       .name           = "snbep_uncore",
-       .id_table       = snbep_uncore_pci_ids,
-};
-
-/*
- * build the PCI bus to physical socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
-       struct pci_dev *ubox_dev = NULL;
-       int i, bus, nodeid, segment;
-       struct pci2phy_map *map;
-       int err = 0;
-       u32 config = 0;
-
-       while (1) {
-               /* find the UBOX device */
-               ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
-               if (!ubox_dev)
-                       break;
-               bus = ubox_dev->bus->number;
-               /* get the Node ID of the local socket */
-               err = pci_read_config_dword(ubox_dev, 0x40, &config);
-               if (err)
-                       break;
-               nodeid = config;
-               /* get the Node ID mapping */
-               err = pci_read_config_dword(ubox_dev, 0x54, &config);
-               if (err)
-                       break;
-
-               segment = pci_domain_nr(ubox_dev->bus);
-               raw_spin_lock(&pci2phy_map_lock);
-               map = __find_pci2phy_map(segment);
-               if (!map) {
-                       raw_spin_unlock(&pci2phy_map_lock);
-                       err = -ENOMEM;
-                       break;
-               }
-
-               /*
-                * Each 3-bit field in the Node ID mapping register
-                * maps to a particular node.
-                */
-               for (i = 0; i < 8; i++) {
-                       if (nodeid == ((config >> (3 * i)) & 0x7)) {
-                               map->pbus_to_physid[bus] = i;
-                               break;
-                       }
-               }
-               raw_spin_unlock(&pci2phy_map_lock);
-       }
-
-       if (!err) {
-               /*
-                * For a PCI bus with no UBOX device, find the next bus
-                * that has a UBOX device and reuse its mapping.
-                */
-               raw_spin_lock(&pci2phy_map_lock);
-               list_for_each_entry(map, &pci2phy_map_head, list) {
-                       i = -1;
-                       for (bus = 255; bus >= 0; bus--) {
-                               if (map->pbus_to_physid[bus] >= 0)
-                                       i = map->pbus_to_physid[bus];
-                               else
-                                       map->pbus_to_physid[bus] = i;
-                       }
-               }
-               raw_spin_unlock(&pci2phy_map_lock);
-       }
-
-       pci_dev_put(ubox_dev);
-
-       return err ? pcibios_err_to_errno(err) : 0;
-}
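
A stand-alone sketch of the Node ID decode done in the loop above: the mapping register carries one 3-bit Node ID per physical package, and the bus is assigned the package whose field matches the local Node ID. The two register values here are made up for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Values as the two config-space reads above would return them
	 * (illustrative numbers only). */
	uint32_t nodeid  = 0x2;		/* local Node ID                 */
	uint32_t mapping = 0x00fac688;	/* Node ID mapping register      */
	int i, physid = -1;

	/* Each 3-bit field i of the mapping register holds the Node ID
	 * of physical package i. */
	for (i = 0; i < 8; i++) {
		if (nodeid == ((mapping >> (3 * i)) & 0x7)) {
			physid = i;
			break;
		}
	}
	printf("bus maps to physical package %d\n", physid);
	return 0;
}
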
-
-int snbep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x3ce0);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = snbep_pci_uncores;
-       uncore_pci_driver = &snbep_uncore_pci_driver;
-       return 0;
-}
-/* end of Sandy Bridge-EP uncore support */
-
-/* IvyTown uncore support */
-static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       if (msr)
-               wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
-}
-
-static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-
-       pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
-}
-
-#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
-       .init_box       = ivbep_uncore_msr_init_box,            \
-       .disable_box    = snbep_uncore_msr_disable_box,         \
-       .enable_box     = snbep_uncore_msr_enable_box,          \
-       .disable_event  = snbep_uncore_msr_disable_event,       \
-       .enable_event   = snbep_uncore_msr_enable_event,        \
-       .read_counter   = uncore_msr_read_counter
-
-static struct intel_uncore_ops ivbep_uncore_msr_ops = {
-       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivbep_uncore_pci_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = snbep_uncore_pci_disable_event,
-       .enable_event   = snbep_uncore_pci_enable_event,
-       .read_counter   = snbep_uncore_pci_read_counter,
-};
-
-#define IVBEP_UNCORE_PCI_COMMON_INIT()                         \
-       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
-       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
-       .event_mask     = IVBEP_PMON_RAW_EVENT_MASK,            \
-       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
-       .ops            = &ivbep_uncore_pci_ops,                        \
-       .format_group   = &ivbep_uncore_format_group
-
-static struct attribute *ivbep_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid.attr,
-       &format_attr_filter_link.attr,
-       &format_attr_filter_state2.attr,
-       &format_attr_filter_nid2.attr,
-       &format_attr_filter_opc2.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_c6.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge.attr,
-       &format_attr_filter_band0.attr,
-       &format_attr_filter_band1.attr,
-       &format_attr_filter_band2.attr,
-       &format_attr_filter_band3.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match_rds.attr,
-       &format_attr_match_rnid30.attr,
-       &format_attr_match_rnid4.attr,
-       &format_attr_match_dnid.attr,
-       &format_attr_match_mc.attr,
-       &format_attr_match_opc.attr,
-       &format_attr_match_vnw.attr,
-       &format_attr_match0.attr,
-       &format_attr_match1.attr,
-       &format_attr_mask_rds.attr,
-       &format_attr_mask_rnid30.attr,
-       &format_attr_mask_rnid4.attr,
-       &format_attr_mask_dnid.attr,
-       &format_attr_mask_mc.attr,
-       &format_attr_mask_opc.attr,
-       &format_attr_mask_vnw.attr,
-       &format_attr_mask0.attr,
-       &format_attr_mask1.attr,
-       NULL,
-};
-
-static struct attribute_group ivbep_uncore_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_qpi_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivbep_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
-       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
-       .event_mask     = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops            = &ivbep_uncore_msr_ops,
-       .format_group   = &ivbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-       EVENT_EXTRA_END
-};
-
-static u64 ivbep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
-       if (fields & 0x4)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x10) {
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
-       }
-
-       return mask;
-}
-
-static struct event_constraint *
-ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
-}
-
-static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               u64 filter = uncore_shared_reg_config(box, 0);
-               wrmsrl(reg1->reg, filter & 0xffffffff);
-               wrmsrl(reg1->reg + 6, filter >> 32);
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
-       .init_box               = ivbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = ivbep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = ivbep_cbox_hw_config,
-       .get_constraint         = ivbep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 15,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
-       .event_mask             = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = snbep_uncore_cbox_constraints,
-       .ops                    = &ivbep_uncore_cbox_ops,
-       .format_group           = &ivbep_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
-       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_pcu_ops,
-       .format_group           = &ivbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivbep_msr_uncores[] = {
-       &ivbep_uncore_ubox,
-       &ivbep_uncore_cbox,
-       &ivbep_uncore_pcu,
-       NULL,
-};
-
-void ivbep_uncore_cpu_init(void)
-{
-       if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = ivbep_msr_uncores;
-}
-
-static struct intel_uncore_type ivbep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivbep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 4,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = snbep_uncore_imc_events,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-/* registers in IRP boxes are not properly aligned */
-static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
-static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
-
-static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
-                              hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
-}
-
-static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-       return count;
-}
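
The read above assembles a 64-bit counter from two 32-bit config-space reads by writing through a u32 pointer into the u64, which relies on x86 being little-endian. An equivalent, endian-explicit sketch with illustrative dword values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* What the two pci_read_config_dword() calls would return
	 * (illustrative values): low half first, then high half. */
	uint32_t lo = 0x89abcdef, hi = 0x00000012;

	/* Explicit form of the (u32 *)&count trick used above. */
	uint64_t count = ((uint64_t)hi << 32) | lo;

	printf("64-bit counter = 0x%llx\n", (unsigned long long)count);
	return 0;
}
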
-
-static struct intel_uncore_ops ivbep_uncore_irp_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = ivbep_uncore_irp_disable_event,
-       .enable_event   = ivbep_uncore_irp_enable_event,
-       .read_counter   = ivbep_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type ivbep_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = IVBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &ivbep_uncore_irp_ops,
-       .format_group           = &ivbep_uncore_format_group,
-};
-
-static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = snbep_uncore_pci_disable_event,
-       .enable_event   = snbep_qpi_enable_event,
-       .read_counter   = snbep_uncore_pci_read_counter,
-       .hw_config      = snbep_qpi_hw_config,
-       .get_constraint = uncore_get_constraint,
-       .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_qpi_ops,
-       .format_group           = &ivbep_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivbep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r2pcie_constraints,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivbep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r3qpi_constraints,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       IVBEP_PCI_UNCORE_HA,
-       IVBEP_PCI_UNCORE_IMC,
-       IVBEP_PCI_UNCORE_IRP,
-       IVBEP_PCI_UNCORE_QPI,
-       IVBEP_PCI_UNCORE_R2PCIE,
-       IVBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivbep_pci_uncores[] = {
-       [IVBEP_PCI_UNCORE_HA]   = &ivbep_uncore_ha,
-       [IVBEP_PCI_UNCORE_IMC]  = &ivbep_uncore_imc,
-       [IVBEP_PCI_UNCORE_IRP]  = &ivbep_uncore_irp,
-       [IVBEP_PCI_UNCORE_QPI]  = &ivbep_uncore_qpi,
-       [IVBEP_PCI_UNCORE_R2PCIE]       = &ivbep_uncore_r2pcie,
-       [IVBEP_PCI_UNCORE_R3QPI]        = &ivbep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id ivbep_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 4 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 4 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver ivbep_uncore_pci_driver = {
-       .name           = "ivbep_uncore",
-       .id_table       = ivbep_uncore_pci_ids,
-};
-
-int ivbep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x0e1e);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = ivbep_pci_uncores;
-       uncore_pci_driver = &ivbep_uncore_pci_driver;
-       return 0;
-}
-/* end of IvyTown uncore support */
-
-/* KNL uncore support */
-static struct attribute *knl_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = KNL_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops                    = &snbep_uncore_msr_ops,
-       .format_group           = &knl_uncore_ubox_format_group,
-};
-
-static struct attribute *knl_uncore_cha_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_qor.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid4.attr,
-       &format_attr_filter_link3.attr,
-       &format_attr_filter_state4.attr,
-       &format_attr_filter_local.attr,
-       &format_attr_filter_all_op.attr,
-       &format_attr_filter_nnm.attr,
-       &format_attr_filter_opc3.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_cha_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_cha_formats_attr,
-};
-
-static struct event_constraint knl_uncore_cha_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg knl_uncore_cha_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
-       EVENT_EXTRA_END
-};
-
-static u64 knl_cha_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x4)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
-       return mask;
-}
-
-static struct event_constraint *
-knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
-}
-
-static int knl_cha_hw_config(struct intel_uncore_box *box,
-                            struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
-                           KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void hswep_cbox_enable_event(struct intel_uncore_box *box,
-                                   struct perf_event *event);
-
-static struct intel_uncore_ops knl_uncore_cha_ops = {
-       .init_box               = snbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = hswep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = knl_cha_hw_config,
-       .get_constraint         = knl_cha_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type knl_uncore_cha = {
-       .name                   = "cha",
-       .num_counters           = 4,
-       .num_boxes              = 38,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = KNL_CHA_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = knl_uncore_cha_constraints,
-       .ops                    = &knl_uncore_cha_ops,
-       .format_group           = &knl_uncore_cha_format_group,
-};
-
-static struct attribute *knl_uncore_pcu_formats_attr[] = {
-       &format_attr_event2.attr,
-       &format_attr_use_occ_ctr.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh6.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge_det.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_pcu_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
-       .ops                    = &snbep_uncore_msr_ops,
-       .format_group           = &knl_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *knl_msr_uncores[] = {
-       &knl_uncore_ubox,
-       &knl_uncore_cha,
-       &knl_uncore_pcu,
-       NULL,
-};
-
-void knl_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = knl_msr_uncores;
-}
-
-static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-
-       pci_write_config_dword(pdev, box_ctl, 0);
-}
-
-static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
-                                       struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
-                                                       == UNCORE_FIXED_EVENT)
-               pci_write_config_dword(pdev, hwc->config_base,
-                                      hwc->config | KNL_PMON_FIXED_CTL_EN);
-       else
-               pci_write_config_dword(pdev, hwc->config_base,
-                                      hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops knl_uncore_imc_ops = {
-       .init_box       = snbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = knl_uncore_imc_enable_box,
-       .read_counter   = snbep_uncore_pci_read_counter,
-       .enable_event   = knl_uncore_imc_enable_event,
-       .disable_event  = snbep_uncore_pci_disable_event,
-};
-
-static struct intel_uncore_type knl_uncore_imc_uclk = {
-       .name                   = "imc_uclk",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_imc_dclk = {
-       .name                   = "imc",
-       .num_counters           = 4,
-       .num_boxes              = 6,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_MC0_CH0_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
-       .fixed_ctl              = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
-       .box_ctl                = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_uclk = {
-       .name                   = "edc_uclk",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_eclk = {
-       .name                   = "edc_eclk",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_EDC0_ECLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
-       .fixed_ctl              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
-       .box_ctl                = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct event_constraint knl_uncore_m2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type knl_uncore_m2pcie = {
-       .name           = "m2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = knl_uncore_m2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct attribute *knl_uncore_irp_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_qor.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_irp_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_irp_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = KNL_IRP_PCI_PMON_BOX_CTL,
-       .ops                    = &snbep_uncore_pci_ops,
-       .format_group           = &knl_uncore_irp_format_group,
-};
-
-enum {
-       KNL_PCI_UNCORE_MC_UCLK,
-       KNL_PCI_UNCORE_MC_DCLK,
-       KNL_PCI_UNCORE_EDC_UCLK,
-       KNL_PCI_UNCORE_EDC_ECLK,
-       KNL_PCI_UNCORE_M2PCIE,
-       KNL_PCI_UNCORE_IRP,
-};
-
-static struct intel_uncore_type *knl_pci_uncores[] = {
-       [KNL_PCI_UNCORE_MC_UCLK]        = &knl_uncore_imc_uclk,
-       [KNL_PCI_UNCORE_MC_DCLK]        = &knl_uncore_imc_dclk,
-       [KNL_PCI_UNCORE_EDC_UCLK]       = &knl_uncore_edc_uclk,
-       [KNL_PCI_UNCORE_EDC_ECLK]       = &knl_uncore_edc_eclk,
-       [KNL_PCI_UNCORE_M2PCIE]         = &knl_uncore_m2pcie,
-       [KNL_PCI_UNCORE_IRP]            = &knl_uncore_irp,
-       NULL,
-};
-
-/*
- * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
- * device type. Prior to KNL, each instance of a PMU device type had a unique
- * device ID.
- *
- *     PCI Device ID   Uncore PMU Devices
- *     ----------------------------------
- *     0x7841          MC0 UClk, MC1 UClk
- *     0x7843          MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
- *                     MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
- *     0x7833          EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
- *                     EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
- *     0x7835          EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
- *                     EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
- *     0x7817          M2PCIe
- *     0x7814          IRP
- */
-
-static const struct pci_device_id knl_uncore_pci_ids[] = {
-       { /* MC UClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
-       },
-       { /* MC DClk Channel */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
-       },
-       { /* EDC UClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
-       },
-       { /* EDC EClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
-       },
-       { /* M2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver knl_uncore_pci_driver = {
-       .name           = "knl_uncore",
-       .id_table       = knl_uncore_pci_ids,
-};
-
-int knl_uncore_pci_init(void)
-{
-       int ret;
-
-       /* All KNL PCI-based PMON units are on the same PCI bus except IRP */
-       ret = snb_pci2phy_map_init(0x7814); /* IRP */
-       if (ret)
-               return ret;
-       ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
-       if (ret)
-               return ret;
-       uncore_pci_uncores = knl_pci_uncores;
-       uncore_pci_driver = &knl_uncore_pci_driver;
-       return 0;
-}
-
-/* end of KNL uncore support */
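
One practical consequence of the shared KNL device IDs documented above is that the driver_data word attached to each pci_device_id entry has to identify both which uncore type a probed device belongs to and which box instance it is. Below is a minimal user-space sketch of one way such a word can be packed and unpacked; the DEMO_* macros are illustrative stand-ins, not the in-tree UNCORE_PCI_DEV_DATA() definition.

#include <stdio.h>

/* Hypothetical packing: type index in the high byte, instance index in the low byte. */
#define DEMO_DEV_DATA(type, idx)	((unsigned long)(((type) << 8) | (idx)))
#define DEMO_DEV_TYPE(data)		(((data) >> 8) & 0xff)
#define DEMO_DEV_IDX(data)		((data) & 0xff)

int main(void)
{
	/* Pretend the probe callback received this driver_data for an M2PCIe box. */
	unsigned long data = DEMO_DEV_DATA(4 /* e.g. KNL_PCI_UNCORE_M2PCIE */, 0);

	printf("uncore type index = %lu, box index = %lu\n",
	       DEMO_DEV_TYPE(data), DEMO_DEV_IDX(data));
	return 0;
}
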
-
-/* Haswell-EP uncore support */
-static struct attribute *hswep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_filter_tid2.attr,
-       &format_attr_filter_cid.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_ubox_formats_attr,
-};
-
-static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       reg1->reg = HSWEP_U_MSR_PMON_FILTER;
-       reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
-       reg1->idx = 0;
-       return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_ubox_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = hswep_ubox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 44,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &hswep_uncore_ubox_ops,
-       .format_group           = &hswep_uncore_ubox_format_group,
-};
-
-static struct attribute *hswep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid3.attr,
-       &format_attr_filter_link2.attr,
-       &format_attr_filter_state3.attr,
-       &format_attr_filter_nid2.attr,
-       &format_attr_filter_opc2.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_c6.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_cbox_formats_attr,
-};
-
-static struct event_constraint hswep_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-       EVENT_EXTRA_END
-};
-
-static u64 hswep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-       if (fields & 0x1)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
-       if (fields & 0x4)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x10) {
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
-       }
-       return mask;
-}
-
-static struct event_constraint *
-hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
-}
-
-static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
-                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void hswep_cbox_enable_event(struct intel_uncore_box *box,
-                                 struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               u64 filter = uncore_shared_reg_config(box, 0);
-               wrmsrl(reg1->reg, filter & 0xffffffff);
-               wrmsrl(reg1->reg + 1, filter >> 32);
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops hswep_uncore_cbox_ops = {
-       .init_box               = snbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = hswep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = hswep_cbox_hw_config,
-       .get_constraint         = hswep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 18,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = hswep_uncore_cbox_constraints,
-       .ops                    = &hswep_uncore_cbox_ops,
-       .format_group           = &hswep_uncore_cbox_format_group,
-};
-
-/*
- * Write the SBOX initialization register bit by bit to avoid spurious #GPs
- */
-static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-
-       if (msr) {
-               u64 init = SNBEP_PMON_BOX_CTL_INT;
-               u64 flags = 0;
-               int i;
-
-               for_each_set_bit(i, (unsigned long *)&init, 64) {
-                       flags |= (1ULL << i);
-                       wrmsrl(msr, flags);
-               }
-       }
-}
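
The same write-one-new-bit-per-MSR-write pattern, reduced to a self-contained sketch with the MSR write stubbed out; the MSR number and init value here are made up for illustration only.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for wrmsrl(); just shows what each write would contain. */
static void demo_wrmsr(uint32_t msr, uint64_t val)
{
	printf("wrmsr(0x%x) <- 0x%llx\n", msr, (unsigned long long)val);
}

int main(void)
{
	uint64_t init = 0x10100;	/* hypothetical BOX_CTL init value */
	uint64_t flags = 0;
	int i;

	/* Accumulate the target value one set bit at a time, writing after each step. */
	for (i = 0; i < 64; i++) {
		if (!(init & (1ULL << i)))
			continue;
		flags |= 1ULL << i;
		demo_wrmsr(0x720 /* hypothetical SBOX ctl MSR */, flags);
	}
	return 0;
}
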
-
-static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
-       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .init_box               = hswep_uncore_sbox_msr_init_box
-};
-
-static struct attribute *hswep_uncore_sbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_sbox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_type hswep_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 4,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
-       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
-       .ops                    = &hswep_uncore_sbox_msr_ops,
-       .format_group           = &hswep_uncore_sbox_format_group,
-};
-
-static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-       if (ev_sel >= 0xb && ev_sel <= 0xe) {
-               reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
-               reg1->idx = ev_sel - 0xb;
-               reg1->config = event->attr.config1 & (0xff << reg1->idx);
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_pcu_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = hswep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &hswep_uncore_pcu_ops,
-       .format_group           = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *hswep_msr_uncores[] = {
-       &hswep_uncore_ubox,
-       &hswep_uncore_cbox,
-       &hswep_uncore_sbox,
-       &hswep_uncore_pcu,
-       NULL,
-};
-
-void hswep_uncore_cpu_init(void)
-{
-       if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-
-       /* Detect 6-8 core systems with only two SBOXes */
-       if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) {
-               u32 capid4;
-
-               pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3],
-                                     0x94, &capid4);
-               if (((capid4 >> 6) & 0x3) == 0)
-                       hswep_uncore_sbox.num_boxes = 2;
-       }
-
-       uncore_msr_uncores = hswep_msr_uncores;
-}
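
The SBOX detection above reduces to reading one config dword from the PCU device and testing a two-bit field. A sketch with a made-up register value; the field position follows the code above, while its exact meaning would come from the platform's capability-register documentation.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t capid4 = 0x00000080;	/* pretend pci_read_config_dword(..., 0x94, ...) result */
	unsigned int field = (capid4 >> 6) & 0x3;
	int num_sboxes = (field == 0) ? 2 : 4;

	printf("capid4[7:6] = %u -> %d SBOXes\n", field, num_sboxes);
	return 0;
}
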
-
-static struct intel_uncore_type hswep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 5,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct uncore_event_desc hswep_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0x00,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type hswep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 5,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = hswep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
-
-static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-       return count;
-}
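
The counter behind hswep_uncore_irp_read_counter() is exposed as two 32-bit dwords in PCI config space; the pointer arithmetic above fills the low and then the high half of the 64-bit count, which works because x86 is little-endian. The equivalent shift-and-or form as a standalone sketch with made-up dword values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t lo = 0xdeadbeef;	/* pretend read of the counter's low dword  */
	uint32_t hi = 0x00000012;	/* pretend read of the counter's high dword */
	uint64_t count = ((uint64_t)hi << 32) | lo;

	printf("count = 0x%llx\n", (unsigned long long)count);	/* 0x12deadbeef */
	return 0;
}
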
-
-static struct intel_uncore_ops hswep_uncore_irp_ops = {
-       .init_box       = snbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = ivbep_uncore_irp_disable_event,
-       .enable_event   = ivbep_uncore_irp_enable_event,
-       .read_counter   = hswep_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type hswep_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &hswep_uncore_irp_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type hswep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 5,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type hswep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = hswep_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type hswep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 4,
-       .num_boxes      = 3,
-       .perf_ctr_bits  = 44,
-       .constraints    = hswep_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       HSWEP_PCI_UNCORE_HA,
-       HSWEP_PCI_UNCORE_IMC,
-       HSWEP_PCI_UNCORE_IRP,
-       HSWEP_PCI_UNCORE_QPI,
-       HSWEP_PCI_UNCORE_R2PCIE,
-       HSWEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *hswep_pci_uncores[] = {
-       [HSWEP_PCI_UNCORE_HA]   = &hswep_uncore_ha,
-       [HSWEP_PCI_UNCORE_IMC]  = &hswep_uncore_imc,
-       [HSWEP_PCI_UNCORE_IRP]  = &hswep_uncore_irp,
-       [HSWEP_PCI_UNCORE_QPI]  = &hswep_uncore_qpi,
-       [HSWEP_PCI_UNCORE_R2PCIE]       = &hswep_uncore_r2pcie,
-       [HSWEP_PCI_UNCORE_R3QPI]        = &hswep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id hswep_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 1 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* PCU.3 (for Capability registers) */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  HSWEP_PCI_PCU_3),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver hswep_uncore_pci_driver = {
-       .name           = "hswep_uncore",
-       .id_table       = hswep_uncore_pci_ids,
-};
-
-int hswep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x2f1e);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = hswep_pci_uncores;
-       uncore_pci_driver = &hswep_uncore_pci_driver;
-       return 0;
-}
-/* end of Haswell-EP uncore support */
-
-/* BDX uncore support */
-
-static struct intel_uncore_type bdx_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_msr_ops,
-       .format_group           = &ivbep_uncore_ubox_format_group,
-};
-
-static struct event_constraint bdx_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 24,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = bdx_uncore_cbox_constraints,
-       .ops                    = &hswep_uncore_cbox_ops,
-       .format_group           = &hswep_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_type bdx_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 4,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
-       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
-       .ops                    = &hswep_uncore_sbox_msr_ops,
-       .format_group           = &hswep_uncore_sbox_format_group,
-};
-
-static struct intel_uncore_type *bdx_msr_uncores[] = {
-       &bdx_uncore_ubox,
-       &bdx_uncore_cbox,
-       &bdx_uncore_sbox,
-       &hswep_uncore_pcu,
-       NULL,
-};
-
-void bdx_uncore_cpu_init(void)
-{
-       if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = bdx_msr_uncores;
-}
-
-static struct intel_uncore_type bdx_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type bdx_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 5,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = hswep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type bdx_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &hswep_uncore_irp_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type bdx_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = bdx_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 3,
-       .perf_ctr_bits  = 48,
-       .constraints    = bdx_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       BDX_PCI_UNCORE_HA,
-       BDX_PCI_UNCORE_IMC,
-       BDX_PCI_UNCORE_IRP,
-       BDX_PCI_UNCORE_QPI,
-       BDX_PCI_UNCORE_R2PCIE,
-       BDX_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *bdx_pci_uncores[] = {
-       [BDX_PCI_UNCORE_HA]     = &bdx_uncore_ha,
-       [BDX_PCI_UNCORE_IMC]    = &bdx_uncore_imc,
-       [BDX_PCI_UNCORE_IRP]    = &bdx_uncore_irp,
-       [BDX_PCI_UNCORE_QPI]    = &bdx_uncore_qpi,
-       [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
-       [BDX_PCI_UNCORE_R3QPI]  = &bdx_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id bdx_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
-       },
-       { /* QPI Port 1 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
-       },
-       { /* QPI Port 2 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver bdx_uncore_pci_driver = {
-       .name           = "bdx_uncore",
-       .id_table       = bdx_uncore_pci_ids,
-};
-
-int bdx_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x6f1e);
-
-       if (ret)
-               return ret;
-       uncore_pci_uncores = bdx_pci_uncores;
-       uncore_pci_driver = &bdx_uncore_pci_driver;
-       return 0;
-}
-
-/* end of BDX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
deleted file mode 100644 (file)
index 5b0c232..0000000
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Driver for Intel Xeon Phi "Knights Corner" PMU */
-
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/hardirq.h>
-
-#include "perf_event.h"
-
-static const u64 knc_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x002a,
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x0016,
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0028,
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0029,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x0012,
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x002b,
-};
-
-static const u64 __initconst knc_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               /* On Xeon Phi, event "0" is a valid DATA_READ         */
-               /*   (L1 Data Cache Reads) instruction.                */
-               /* We code this as ARCH_PERFMON_EVENTSEL_INT as this   */
-               /* bit will always be set in x86_pmu_hw_config().      */
-               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
-                                               /* DATA_READ           */
-               [ C(RESULT_MISS)   ] = 0x0003,  /* DATA_READ_MISS      */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE          */
-               [ C(RESULT_MISS)   ] = 0x0004,  /* DATA_WRITE_MISS     */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0011,  /* L1_DATA_PF1         */
-               [ C(RESULT_MISS)   ] = 0x001c,  /* L1_DATA_PF1_MISS    */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ          */
-               [ C(RESULT_MISS)   ] = 0x000e,  /* CODE_CACHE_MISS    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x10cb,  /* L2_READ_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x10cc,  /* L2_WRITE_HIT */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x10fc,  /* L2_DATA_PF2      */
-               [ C(RESULT_MISS)   ] = 0x10fe,  /* L2_DATA_PF2_MISS */
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
-                                               /* DATA_READ */
-                                               /* see note on L1 OP_READ */
-               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE */
-               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ */
-               [ C(RESULT_MISS)   ] = 0x000d,  /* CODE_PAGE_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0012,  /* BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x002b,  /* BRANCHES_MISPREDICTED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
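
The DATA_READ entries above cannot be stored as a literal 0, because 0 means "not supported" in the generic cache-event tables; instead the table stores a bit that x86_pmu_hw_config() sets unconditionally anyway, so the stored value is non-zero yet changes nothing in the final event config. A sketch of that idea with an illustrative bit position:

#include <stdio.h>
#include <stdint.h>

#define DEMO_EVENTSEL_INT	(1ULL << 20)	/* a bit the config path would set anyway */

int main(void)
{
	uint64_t table_entry = DEMO_EVENTSEL_INT;		/* logically "event code 0x00" */
	uint64_t config = table_entry | DEMO_EVENTSEL_INT;	/* hw_config() ORs it in again */

	printf("table says supported: %s, event code field: 0x%llx\n",
	       table_entry ? "yes" : "no",
	       (unsigned long long)(config & 0xff));
	return 0;
}
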
-
-
-static u64 knc_pmu_event_map(int hw_event)
-{
-       return knc_perfmon_event_map[hw_event];
-}
-
-static struct event_constraint knc_event_constraints[] =
-{
-       INTEL_EVENT_CONSTRAINT(0xc3, 0x1),      /* HWP_L2HIT */
-       INTEL_EVENT_CONSTRAINT(0xc4, 0x1),      /* HWP_L2MISS */
-       INTEL_EVENT_CONSTRAINT(0xc8, 0x1),      /* L2_READ_HIT_E */
-       INTEL_EVENT_CONSTRAINT(0xc9, 0x1),      /* L2_READ_HIT_M */
-       INTEL_EVENT_CONSTRAINT(0xca, 0x1),      /* L2_READ_HIT_S */
-       INTEL_EVENT_CONSTRAINT(0xcb, 0x1),      /* L2_READ_MISS */
-       INTEL_EVENT_CONSTRAINT(0xcc, 0x1),      /* L2_WRITE_HIT */
-       INTEL_EVENT_CONSTRAINT(0xce, 0x1),      /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
-       INTEL_EVENT_CONSTRAINT(0xcf, 0x1),      /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
-       INTEL_EVENT_CONSTRAINT(0xd7, 0x1),      /* L2_VICTIM_REQ_WITH_DATA */
-       INTEL_EVENT_CONSTRAINT(0xe3, 0x1),      /* SNP_HITM_BUNIT */
-       INTEL_EVENT_CONSTRAINT(0xe6, 0x1),      /* SNP_HIT_L2 */
-       INTEL_EVENT_CONSTRAINT(0xe7, 0x1),      /* SNP_HITM_L2 */
-       INTEL_EVENT_CONSTRAINT(0xf1, 0x1),      /* L2_DATA_READ_MISS_CACHE_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf2, 0x1),      /* L2_DATA_WRITE_MISS_CACHE_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf6, 0x1),      /* L2_DATA_READ_MISS_MEM_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf7, 0x1),      /* L2_DATA_WRITE_MISS_MEM_FILL */
-       INTEL_EVENT_CONSTRAINT(0xfc, 0x1),      /* L2_DATA_PF2 */
-       INTEL_EVENT_CONSTRAINT(0xfd, 0x1),      /* L2_DATA_PF2_DROP */
-       INTEL_EVENT_CONSTRAINT(0xfe, 0x1),      /* L2_DATA_PF2_MISS */
-       INTEL_EVENT_CONSTRAINT(0xff, 0x1),      /* L2_DATA_HIT_INFLIGHT_PF2 */
-       EVENT_CONSTRAINT_END
-};
-
-#define MSR_KNC_IA32_PERF_GLOBAL_STATUS                0x0000002d
-#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL   0x0000002e
-#define MSR_KNC_IA32_PERF_GLOBAL_CTRL          0x0000002f
-
-#define KNC_ENABLE_COUNTER0                    0x00000001
-#define KNC_ENABLE_COUNTER1                    0x00000002
-
-static void knc_pmu_disable_all(void)
-{
-       u64 val;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-       val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static void knc_pmu_enable_all(int added)
-{
-       u64 val;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-       val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static inline void
-knc_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-
-       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static void knc_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
-       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static inline u64 knc_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void knc_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
-}
-
-static int knc_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       int handled = 0;
-       int bit, loops;
-       u64 status;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       knc_pmu_disable_all();
-
-       status = knc_pmu_get_status();
-       if (!status) {
-               knc_pmu_enable_all(0);
-               return handled;
-       }
-
-       loops = 0;
-again:
-       knc_pmu_ack_status(status);
-       if (++loops > 100) {
-               WARN_ONCE(1, "perf: irq loop stuck!\n");
-               perf_event_print_debug();
-               goto done;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-
-       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_event *event = cpuc->events[bit];
-
-               handled++;
-
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = knc_pmu_get_status();
-       if (status)
-               goto again;
-
-done:
-       knc_pmu_enable_all(0);
-
-       return handled;
-}
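
The handler above follows a common PMI shape: ack the overflow status, service each set bit, then re-read the status and repeat, with a hard iteration cap so a wedged status register cannot hang the CPU. A self-contained sketch of that loop with the hardware accesses replaced by stubs:

#include <stdio.h>
#include <stdint.h>

static int reads;

/* Stand-in for knc_pmu_get_status(): reports two overflowed counters twice, then nothing. */
static uint64_t demo_get_status(void)
{
	return (reads++ < 2) ? 0x3 : 0;
}

static void demo_handle(int bit)
{
	printf("service overflow on counter %d\n", bit);
}

int main(void)
{
	uint64_t status;
	int loops = 0;
	int bit;

	while ((status = demo_get_status()) != 0) {
		if (++loops > 100) {		/* guard against a stuck interrupt source */
			fprintf(stderr, "irq loop stuck!\n");
			break;
		}
		for (bit = 0; bit < 64; bit++)
			if (status & (1ULL << bit))
				demo_handle(bit);
	}
	return 0;
}
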
-
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *intel_knc_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-static const struct x86_pmu knc_pmu __initconst = {
-       .name                   = "knc",
-       .handle_irq             = knc_pmu_handle_irq,
-       .disable_all            = knc_pmu_disable_all,
-       .enable_all             = knc_pmu_enable_all,
-       .enable                 = knc_pmu_enable_event,
-       .disable                = knc_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_KNC_EVNTSEL0,
-       .perfctr                = MSR_KNC_PERFCTR0,
-       .event_map              = knc_pmu_event_map,
-       .max_events             = ARRAY_SIZE(knc_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 39) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       .cntval_bits            = 40,
-       .cntval_mask            = (1ULL << 40) - 1,
-       .get_event_constraints  = x86_get_event_constraints,
-       .event_constraints      = knc_event_constraints,
-       .format_attrs           = intel_knc_formats_attr,
-};
-
-__init int knc_pmu_init(void)
-{
-       x86_pmu = knc_pmu;
-
-       memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
deleted file mode 100644 (file)
index ec863b9..0000000
+++ /dev/null
@@ -1,241 +0,0 @@
-#include <linux/perf_event.h>
-
-enum perf_msr_id {
-       PERF_MSR_TSC                    = 0,
-       PERF_MSR_APERF                  = 1,
-       PERF_MSR_MPERF                  = 2,
-       PERF_MSR_PPERF                  = 3,
-       PERF_MSR_SMI                    = 4,
-
-       PERF_MSR_EVENT_MAX,
-};
-
-static bool test_aperfmperf(int idx)
-{
-       return boot_cpu_has(X86_FEATURE_APERFMPERF);
-}
-
-static bool test_intel(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_MSR_SMI)
-                       return true;
-               break;
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-struct perf_msr {
-       u64     msr;
-       struct  perf_pmu_events_attr *attr;
-       bool    (*test)(int idx);
-};
-
-PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
-PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
-PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
-PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
-PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
-
-static struct perf_msr msr[] = {
-       [PERF_MSR_TSC]   = { 0,                 &evattr_tsc,    NULL,            },
-       [PERF_MSR_APERF] = { MSR_IA32_APERF,    &evattr_aperf,  test_aperfmperf, },
-       [PERF_MSR_MPERF] = { MSR_IA32_MPERF,    &evattr_mperf,  test_aperfmperf, },
-       [PERF_MSR_PPERF] = { MSR_PPERF,         &evattr_pperf,  test_intel,      },
-       [PERF_MSR_SMI]   = { MSR_SMI_COUNT,     &evattr_smi,    test_intel,      },
-};
-
-static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group events_attr_group = {
-       .name = "events",
-       .attrs = events_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-static struct attribute *format_attrs[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-static struct attribute_group format_attr_group = {
-       .name = "format",
-       .attrs = format_attrs,
-};
-
-static const struct attribute_group *attr_groups[] = {
-       &events_attr_group,
-       &format_attr_group,
-       NULL,
-};
-
-static int msr_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       if (cfg >= PERF_MSR_EVENT_MAX)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       if (!msr[cfg].attr)
-               return -EINVAL;
-
-       event->hw.idx = -1;
-       event->hw.event_base = msr[cfg].msr;
-       event->hw.config = cfg;
-
-       return 0;
-}
-
-static inline u64 msr_read_counter(struct perf_event *event)
-{
-       u64 now;
-
-       if (event->hw.event_base)
-               rdmsrl(event->hw.event_base, now);
-       else
-               rdtscll(now);
-
-       return now;
-}
-static void msr_event_update(struct perf_event *event)
-{
-       u64 prev, now;
-       s64 delta;
-
-       /* Careful, an NMI might modify the previous event value. */
-again:
-       prev = local64_read(&event->hw.prev_count);
-       now = msr_read_counter(event);
-
-       if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
-               goto again;
-
-       delta = now - prev;
-       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
-               delta = sign_extend64(delta, 31);
-
-       local64_add(delta, &event->count);
-}
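
Two details above are worth spelling out: prev_count is updated with a compare-and-swap loop, so an NMI that races the update simply forces a re-read, and the SMI counter's delta is sign-extended from bit 31 because MSR_SMI_COUNT is only 32 bits wide. A user-space sketch of both, using GCC/Clang atomic builtins in place of local64_cmpxchg():

#include <stdio.h>
#include <stdint.h>

/* Same idea as the kernel's sign_extend64(): treat 'index' as the sign bit. */
static int64_t demo_sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;

	return (int64_t)(value << shift) >> shift;
}

static uint64_t prev_count;
static uint64_t event_count;

static void demo_update(uint64_t now, int is_32bit_counter)
{
	uint64_t prev;
	int64_t delta;

	/* Retry until prev_count is swapped from the value we actually read. */
	do {
		prev = __atomic_load_n(&prev_count, __ATOMIC_RELAXED);
	} while (!__atomic_compare_exchange_n(&prev_count, &prev, now, 0,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));

	delta = now - prev;
	if (is_32bit_counter)			/* e.g. the SMI count MSR */
		delta = demo_sign_extend64(delta, 31);

	event_count += delta;
}

int main(void)
{
	demo_update(10, 0);
	demo_update(25, 0);
	printf("accumulated count = %llu\n", (unsigned long long)event_count);
	return 0;
}
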
-
-static void msr_event_start(struct perf_event *event, int flags)
-{
-       u64 now;
-
-       now = msr_read_counter(event);
-       local64_set(&event->hw.prev_count, now);
-}
-
-static void msr_event_stop(struct perf_event *event, int flags)
-{
-       msr_event_update(event);
-}
-
-static void msr_event_del(struct perf_event *event, int flags)
-{
-       msr_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int msr_event_add(struct perf_event *event, int flags)
-{
-       if (flags & PERF_EF_START)
-               msr_event_start(event, flags);
-
-       return 0;
-}
-
-static struct pmu pmu_msr = {
-       .task_ctx_nr    = perf_sw_context,
-       .attr_groups    = attr_groups,
-       .event_init     = msr_event_init,
-       .add            = msr_event_add,
-       .del            = msr_event_del,
-       .start          = msr_event_start,
-       .stop           = msr_event_stop,
-       .read           = msr_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static int __init msr_init(void)
-{
-       int i, j = 0;
-
-       if (!boot_cpu_has(X86_FEATURE_TSC)) {
-               pr_cont("no MSR PMU driver.\n");
-               return 0;
-       }
-
-       /* Probe the MSRs. */
-       for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
-               u64 val;
-
-               /*
-                * Virt sucks arse; you cannot tell if an R/O MSR is present :/
-                */
-               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
-                       msr[i].attr = NULL;
-       }
-
-       /* List remaining MSRs in the sysfs attrs. */
-       for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
-               if (msr[i].attr)
-                       events_attrs[j++] = &msr[i].attr->attr.attr;
-       }
-       events_attrs[j] = NULL;
-
-       perf_pmu_register(&pmu_msr, "msr", -1);
-
-       return 0;
-}
-device_initcall(msr_init);
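
The init path above probes each optional MSR with a faulting-safe read, NULLs out the attribute pointers that fail, and then packs the survivors into the NULL-terminated events list. That compaction step as a standalone sketch with fake probe results:

#include <stdio.h>
#include <stddef.h>

#define NR_DEMO_EVENTS 5

int main(void)
{
	/* Pretend these are the candidate event attrs; NULL means "probe failed". */
	const char *candidate[NR_DEMO_EVENTS] = { "tsc", "aperf", "mperf", NULL, "smi" };
	const char *exported[NR_DEMO_EVENTS + 1];
	int i, j = 0;

	for (i = 0; i < NR_DEMO_EVENTS; i++)
		if (candidate[i])
			exported[j++] = candidate[i];
	exported[j] = NULL;			/* keep the list NULL-terminated */

	for (i = 0; exported[i]; i++)
		printf("exporting event: %s\n", exported[i]);
	return 0;
}
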
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
deleted file mode 100644 (file)
index f2e5678..0000000
+++ /dev/null
@@ -1,1376 +0,0 @@
-/*
- * Netburst Performance Events (P4, old Xeon)
- *
- *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
- *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-#include <asm/perf_event_p4.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-#define P4_CNTR_LIMIT 3
-/*
- * array indices: 0,1 - HT threads, used on HT-enabled CPUs
- */
-struct p4_event_bind {
-       unsigned int opcode;                    /* Event code and ESCR selector */
-       unsigned int escr_msr[2];               /* ESCR MSR for this event */
-       unsigned int escr_emask;                /* valid ESCR EventMask bits */
-       unsigned int shared;                    /* event is shared across threads */
-       char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on absence */
-};
-
-struct p4_pebs_bind {
-       unsigned int metric_pebs;
-       unsigned int metric_vert;
-};
-
-/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
-#define P4_GEN_PEBS_BIND(name, pebs, vert)                     \
-       [P4_PEBS_METRIC__##name] = {                            \
-               .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,   \
-               .metric_vert = vert,                            \
-       }
-
-/*
- * Note that we have P4_PEBS_ENABLE_UOP_TAG always set here.
- *
- * It's needed for mapping the P4_PEBS_CONFIG_METRIC_MASK bits of
- * the event configuration to find out which values are to be
- * written into the MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
- * registers.
- */
-static struct p4_pebs_bind p4_pebs_bind_map[] = {
-       P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,  0x0000001, 0x0000001),
-       P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,  0x0000002, 0x0000001),
-       P4_GEN_PEBS_BIND(dtlb_load_miss_retired,        0x0000004, 0x0000001),
-       P4_GEN_PEBS_BIND(dtlb_store_miss_retired,       0x0000004, 0x0000002),
-       P4_GEN_PEBS_BIND(dtlb_all_miss_retired,         0x0000004, 0x0000003),
-       P4_GEN_PEBS_BIND(tagged_mispred_branch,         0x0018000, 0x0000010),
-       P4_GEN_PEBS_BIND(mob_load_replay_retired,       0x0000200, 0x0000001),
-       P4_GEN_PEBS_BIND(split_load_retired,            0x0000400, 0x0000001),
-       P4_GEN_PEBS_BIND(split_store_retired,           0x0000400, 0x0000002),
-};
-
-/*
- * Note that we don't use CCCR1 here; there is an
- * exception for P4_BSQ_ALLOCATION but we just have
- * no workaround
- *
- * consider this binding as the resources which a particular
- * event may borrow; it doesn't contain EventMask,
- * Tags and friends -- they are left to the caller
- */
-static struct p4_event_bind p4_event_bind_map[] = {
-       [P4_EVENT_TC_DELIVER_MODE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
-               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
-               .shared         = 1,
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_BPU_FETCH_REQUEST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
-               .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_ITLB_REFERENCE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
-               .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_MEMORY_CANCEL] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
-               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_MEMORY_COMPLETE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_LOAD_PORT_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_STORE_PORT_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_MOB_LOAD_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
-               .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_PAGE_WALK_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
-               .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
-               .shared         = 1,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BSQ_CACHE_REFERENCE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
-               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_IOQ_ALLOCATION] = {
-               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
-               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
-               .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
-               .cntr           = { {2, -1, -1}, {3, -1, -1} },
-       },
-       [P4_EVENT_FSB_DATA_ACTIVITY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
-               .shared         = 1,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
-               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
-               .cntr           = { {0, -1, -1}, {1, -1, -1} },
-       },
-       [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
-               .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
-               .cntr           = { {2, -1, -1}, {3, -1, -1} },
-       },
-       [P4_EVENT_SSE_INPUT_ASSIST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_PACKED_SP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_PACKED_DP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_SCALAR_SP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_SCALAR_DP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_64BIT_MMX_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_128BIT_MMX_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_X87_FP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_TC_MISC] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
-               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_GLOBAL_POWER_EVENTS] = {
-               .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_TC_MS_XFER] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
-               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_UOP_QUEUE_WRITES] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
-               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
-               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RETIRED_BRANCH_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
-               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RESOURCE_STALL] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
-               .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_WC_BUFFER] = {
-               .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
-               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_B2B_CYCLES] = {
-               .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BNR] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BNR),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_SNOOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_RESPONSE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_FRONT_END_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_EXECUTION_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_REPLAY_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_INSTR_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_UOPS_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_UOP_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
-               .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_BRANCH_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_X87_ASSIST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_MACHINE_CLEAR] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_INSTR_COMPLETED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-};
-
-#define P4_GEN_CACHE_EVENT(event, bit, metric)                           \
-       p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
-                           P4_ESCR_EMASK_BIT(event, bit))              | \
-       p4_config_pack_cccr(metric                                      | \
-                           P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
-
-static __initconst const u64 p4_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__1stl_cache_load_miss_retired),
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
-       },
-},
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__dtlb_load_miss_retired),
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__dtlb_store_miss_retired),
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
-                                               P4_PEBS_METRIC__none),
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
-                                               P4_PEBS_METRIC__none),
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-/*
- * Because Netburst is quite restricted in how many identical
- * events may run simultaneously, we introduce event aliases,
- * i.e. different events which have the same functionality but
- * utilize non-intersecting resources (ESCR/CCCR/counter registers).
- *
- * This allows us to relax the restrictions a bit and run two or more
- * identical events together.
- *
- * Never set any custom internal bits such as P4_CONFIG_HT,
- * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC; they are
- * either kept up to date automatically or not applicable at all.
- */
-struct p4_event_alias {
-       u64 original;
-       u64 alternative;
-} p4_event_aliases[] = {
-       {
-               /*
-                * Non-halted cycles can be substituted with non-sleeping cycles (see
-                * Intel SDM Vol3b for details). We need this alias to be able
-                * to run nmi-watchdog and 'perf top' (or any other user space tool
-                * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
-                * simultaneously.
-                */
-       .original       =
-               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
-       .alternative    =
-               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)             |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
-               p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT          |
-                                   P4_CCCR_COMPARE),
-       },
-};
-
-static u64 p4_get_alias_event(u64 config)
-{
-       u64 config_match;
-       int i;
-
-       /*
-        * Only an event with the special mark is allowed;
-        * we need to be sure it didn't come in as a malformed
-        * RAW event.
-        */
-       if (!(config & P4_CONFIG_ALIASABLE))
-               return 0;
-
-       config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
-
-       for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
-               if (config_match == p4_event_aliases[i].original) {
-                       config_match = p4_event_aliases[i].alternative;
-                       break;
-               } else if (config_match == p4_event_aliases[i].alternative) {
-                       config_match = p4_event_aliases[i].original;
-                       break;
-               }
-       }
-
-       if (i >= ARRAY_SIZE(p4_event_aliases))
-               return 0;
-
-       return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
-}
-
-static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
-  /* non-halted CPU clocks */
-  [PERF_COUNT_HW_CPU_CYCLES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))       |
-               P4_CONFIG_ALIASABLE,
-
-  /*
-   * retired instructions
-   * for the sake of simplicity we don't use the FSB tagging
-   */
-  [PERF_COUNT_HW_INSTRUCTIONS] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
-               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
-
-  /* cache hits */
-  [PERF_COUNT_HW_CACHE_REFERENCES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
-
-  /* cache misses */
-  [PERF_COUNT_HW_CACHE_MISSES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
-
-  /* branch instructions retired */
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
-
-  /* mispredicted branches retired */
-  [PERF_COUNT_HW_BRANCH_MISSES]        =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
-               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
-
-  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
-  [PERF_COUNT_HW_BUS_CYCLES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
-       p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-};
-
-static struct p4_event_bind *p4_config_get_bind(u64 config)
-{
-       unsigned int evnt = p4_config_unpack_event(config);
-       struct p4_event_bind *bind = NULL;
-
-       if (evnt < ARRAY_SIZE(p4_event_bind_map))
-               bind = &p4_event_bind_map[evnt];
-
-       return bind;
-}
-
-static u64 p4_pmu_event_map(int hw_event)
-{
-       struct p4_event_bind *bind;
-       unsigned int esel;
-       u64 config;
-
-       config = p4_general_events[hw_event];
-       bind = p4_config_get_bind(config);
-       esel = P4_OPCODE_ESEL(bind->opcode);
-       config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-
-       return config;
-}
-
-/* check cpu model specifics */
-static bool p4_event_match_cpu_model(unsigned int event_idx)
-{
-       /* The INSTR_COMPLETED event only exists for models 3, 4 and 6 (Prescott) */
-       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
-               if (boot_cpu_data.x86_model != 3 &&
-                       boot_cpu_data.x86_model != 4 &&
-                       boot_cpu_data.x86_model != 6)
-                       return false;
-       }
-
-       /*
-        * For info
-        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
-        */
-
-       return true;
-}
-
-static int p4_validate_raw_event(struct perf_event *event)
-{
-       unsigned int v, emask;
-
-       /* User data may have an out-of-bounds event index */
-       v = p4_config_unpack_event(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_event_bind_map))
-               return -EINVAL;
-
-       /* It may be unsupported: */
-       if (!p4_event_match_cpu_model(v))
-               return -EINVAL;
-
-       /*
-        * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
-        * in Architectural Performance Monitoring; it specifies not
-        * on _which_ logical cpu to count but rather _when_, i.e. it
-        * depends on the logical cpu state -- count the event if one cpu
-        * is active, none, both or any, so we just allow the user to pass
-        * any value desired.
-        *
-        * In turn we always set the Tx_OS/Tx_USR bits bound to the logical
-        * cpu without propagating them to the other cpu
-        */
-
-       /*
-        * if an event is shared across the logical threads
-        * the user needs special permissions to be able to use it
-        */
-       if (p4_ht_active() && p4_event_bind_map[v].shared) {
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                       return -EACCES;
-       }
-
-       /* ESCR EventMask bits may be invalid */
-       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
-       if (emask & ~p4_event_bind_map[v].escr_emask)
-               return -EINVAL;
-
-       /*
-        * it may have some invalid PEBS bits
-        */
-       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
-               return -EINVAL;
-
-       v = p4_config_unpack_metric(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
-               return -EINVAL;
-
-       return 0;
-}
-
-static int p4_hw_config(struct perf_event *event)
-{
-       int cpu = get_cpu();
-       int rc = 0;
-       u32 escr, cccr;
-
-       /*
-        * the reason we grab the cpu this early is that if we get scheduled
-        * for the first time on the same cpu we will not need to swap the
-        * thread-specific flags in config (and will save some cpu cycles)
-        */
-
-       cccr = p4_default_cccr_conf(cpu);
-       escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
-                                        event->attr.exclude_user);
-       event->hw.config = p4_config_pack_escr(escr) |
-                          p4_config_pack_cccr(cccr);
-
-       if (p4_ht_active() && p4_ht_thread(cpu))
-               event->hw.config = p4_set_ht_bit(event->hw.config);
-
-       if (event->attr.type == PERF_TYPE_RAW) {
-               struct p4_event_bind *bind;
-               unsigned int esel;
-               /*
-                * Clear bits we reserve to be managed by the kernel itself
-                * and are never allowed from user space
-                */
-                event->attr.config &= P4_CONFIG_MASK;
-
-               rc = p4_validate_raw_event(event);
-               if (rc)
-                       goto out;
-
-               /*
-                * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
-                * bits since we keep additional info here (for cache events etc.)
-                */
-               event->hw.config |= event->attr.config;
-               bind = p4_config_get_bind(event->attr.config);
-               if (!bind) {
-                       rc = -EINVAL;
-                       goto out;
-               }
-               esel = P4_OPCODE_ESEL(bind->opcode);
-               event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-       }
-
-       rc = x86_setup_perfctr(event);
-out:
-       put_cpu();
-       return rc;
-}
-
-static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-{
-       u64 v;
-
-       /* an official way for overflow indication */
-       rdmsrl(hwc->config_base, v);
-       if (v & P4_CCCR_OVF) {
-               wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
-               return 1;
-       }
-
-       /*
-        * In some circumstances the overflow might issue an NMI but not
-        * set the P4_CCCR_OVF bit. Because the counter holds a negative value
-        * we simply check whether the high bit is set; if it's cleared it means
-        * the counter reached zero and continued counting before the
-        * real NMI signal was received:
-        */
-       rdmsrl(hwc->event_base, v);
-       if (!(v & ARCH_P4_UNFLAGGED_BIT))
-               return 1;
-
-       return 0;
-}
-
-static void p4_pmu_disable_pebs(void)
-{
-       /*
-        * FIXME
-        *
-        * Two threads are still allowed to set up the same cache
-        * events, so we can't simply clear the metrics until we know
-        * no one is depending on us; we'd need some kind of counter
-        * for "ReplayEvent" users.
-        *
-        * What is more complex are RAW events: if the user (for some
-        * reason) passes a cache event metric with an improper
-        * event opcode, it's fine from the hardware point of view
-        * but complete nonsense as far as the meaning of such an action goes.
-        *
-        * So for the moment leave the metrics turned on forever -- it's
-        * ok for now but needs to be revisited!
-        *
-        * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
-        * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
-        */
-}
-
-static inline void p4_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       /*
-        * If the event gets disabled while the counter is in the overflowed
-        * state we need to clear P4_CCCR_OVF, otherwise the interrupt gets
-        * asserted again and again
-        */
-       (void)wrmsrl_safe(hwc->config_base,
-               p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
-}
-
-static void p4_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_event *event = cpuc->events[idx];
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               p4_pmu_disable_event(event);
-       }
-
-       p4_pmu_disable_pebs();
-}
-
-/* configuration must be valid */
-static void p4_pmu_enable_pebs(u64 config)
-{
-       struct p4_pebs_bind *bind;
-       unsigned int idx;
-
-       BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
-
-       idx = p4_config_unpack_metric(config);
-       if (idx == P4_PEBS_METRIC__none)
-               return;
-
-       bind = &p4_pebs_bind_map[idx];
-
-       (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
-       (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,      (u64)bind->metric_vert);
-}
-
-static void p4_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int thread = p4_ht_config_thread(hwc->config);
-       u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
-       unsigned int idx = p4_config_unpack_event(hwc->config);
-       struct p4_event_bind *bind;
-       u64 escr_addr, cccr;
-
-       bind = &p4_event_bind_map[idx];
-       escr_addr = bind->escr_msr[thread];
-
-       /*
-        * - we don't support cascaded counters yet
-        * - and counter 1 is broken (erratum)
-        */
-       WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
-       WARN_ON_ONCE(hwc->idx == 1);
-
-       /* we need a real Event value */
-       escr_conf &= ~P4_ESCR_EVENT_MASK;
-       escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
-
-       cccr = p4_config_unpack_cccr(hwc->config);
-
-       /*
-        * it could be a cache event so we need to write the metrics
-        * into additional MSRs
-        */
-       p4_pmu_enable_pebs(hwc->config);
-
-       (void)wrmsrl_safe(escr_addr, escr_conf);
-       (void)wrmsrl_safe(hwc->config_base,
-                               (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
-}
-
-static void p4_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_event *event = cpuc->events[idx];
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               p4_pmu_enable_event(event);
-       }
-}
-
-static int p4_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       struct perf_event *event;
-       struct hw_perf_event *hwc;
-       int idx, handled = 0;
-       u64 val;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               int overflow;
-
-               if (!test_bit(idx, cpuc->active_mask)) {
-                       /* catch in-flight IRQs */
-                       if (__test_and_clear_bit(idx, cpuc->running))
-                               handled++;
-                       continue;
-               }
-
-               event = cpuc->events[idx];
-               hwc = &event->hw;
-
-               WARN_ON_ONCE(hwc->idx != idx);
-
-               /* it might be unflagged overflow */
-               overflow = p4_pmu_clear_cccr_ovf(hwc);
-
-               val = x86_perf_event_update(event);
-               if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
-                       continue;
-
-               handled += overflow;
-
-               /* event overflow for sure */
-               perf_sample_data_init(&data, 0, hwc->last_period);
-
-               if (!x86_perf_event_set_period(event))
-                       continue;
-
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       /*
-        * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
-        * been observed that the OVF bit flag has to be cleared first _before_
-        * the LVTPC can be unmasked.
-        *
-        * The reason is the NMI line will continue to be asserted while the OVF
-        * bit is set.  This causes a second NMI to be generated if the LVTPC is
-        * unmasked before the OVF bit is cleared, leading to unknown NMI
-        * messages.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       return handled;
-}
-
-/*
- * swap thread specific fields according to a thread
- * we are going to run on
- */
-static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
-{
-       u32 escr, cccr;
-
-       /*
-        * either we're lucky and continue on the same cpu, or there is no HT support
-        */
-       if (!p4_should_swap_ts(hwc->config, cpu))
-               return;
-
-       /*
-        * the event has migrated from another logical
-        * cpu, so we need to swap the thread-specific flags
-        */
-
-       escr = p4_config_unpack_escr(hwc->config);
-       cccr = p4_config_unpack_cccr(hwc->config);
-
-       if (p4_ht_thread(cpu)) {
-               cccr &= ~P4_CCCR_OVF_PMI_T0;
-               cccr |= P4_CCCR_OVF_PMI_T1;
-               if (escr & P4_ESCR_T0_OS) {
-                       escr &= ~P4_ESCR_T0_OS;
-                       escr |= P4_ESCR_T1_OS;
-               }
-               if (escr & P4_ESCR_T0_USR) {
-                       escr &= ~P4_ESCR_T0_USR;
-                       escr |= P4_ESCR_T1_USR;
-               }
-               hwc->config  = p4_config_pack_escr(escr);
-               hwc->config |= p4_config_pack_cccr(cccr);
-               hwc->config |= P4_CONFIG_HT;
-       } else {
-               cccr &= ~P4_CCCR_OVF_PMI_T1;
-               cccr |= P4_CCCR_OVF_PMI_T0;
-               if (escr & P4_ESCR_T1_OS) {
-                       escr &= ~P4_ESCR_T1_OS;
-                       escr |= P4_ESCR_T0_OS;
-               }
-               if (escr & P4_ESCR_T1_USR) {
-                       escr &= ~P4_ESCR_T1_USR;
-                       escr |= P4_ESCR_T0_USR;
-               }
-               hwc->config  = p4_config_pack_escr(escr);
-               hwc->config |= p4_config_pack_cccr(cccr);
-               hwc->config &= ~P4_CONFIG_HT;
-       }
-}
-
-/*
- * ESCR address hashing is tricky: ESCRs are not sequential
- * in memory, but they all start from MSR_P4_BSU_ESCR0 (0x03a0) and
- * every ESCR offset lies in the range [0xa0,0xe1]
- *
- * so we end up with a ~70% filled hashtable
- */
-
-#define P4_ESCR_MSR_BASE               0x000003a0
-#define P4_ESCR_MSR_MAX                        0x000003e1
-#define P4_ESCR_MSR_TABLE_SIZE         (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
-#define P4_ESCR_MSR_IDX(msr)           (msr - P4_ESCR_MSR_BASE)
-#define P4_ESCR_MSR_TABLE_ENTRY(msr)   [P4_ESCR_MSR_IDX(msr)] = msr
-
-static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
-};
-
-static int p4_get_escr_idx(unsigned int addr)
-{
-       unsigned int idx = P4_ESCR_MSR_IDX(addr);
-
-       if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE      ||
-                       !p4_escr_table[idx]             ||
-                       p4_escr_table[idx] != addr)) {
-               WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
-               return -1;
-       }
-
-       return idx;
-}
-
-static int p4_next_cntr(int thread, unsigned long *used_mask,
-                       struct p4_event_bind *bind)
-{
-       int i, j;
-
-       for (i = 0; i < P4_CNTR_LIMIT; i++) {
-               j = bind->cntr[thread][i];
-               if (j != -1 && !test_bit(j, used_mask))
-                       return j;
-       }
-
-       return -1;
-}
-
-static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
-       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
-       int cpu = smp_processor_id();
-       struct hw_perf_event *hwc;
-       struct p4_event_bind *bind;
-       unsigned int i, thread, num;
-       int cntr_idx, escr_idx;
-       u64 config_alias;
-       int pass;
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-       bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
-
-       for (i = 0, num = n; i < n; i++, num--) {
-
-               hwc = &cpuc->event_list[i]->hw;
-               thread = p4_ht_thread(cpu);
-               pass = 0;
-
-again:
-               /*
-                * It's possible to hit a circular lock
-                * between original and alternative events
-                * if both are scheduled already.
-                */
-               if (pass > 2)
-                       goto done;
-
-               bind = p4_config_get_bind(hwc->config);
-               escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
-               if (unlikely(escr_idx == -1))
-                       goto done;
-
-               if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
-                       cntr_idx = hwc->idx;
-                       if (assign)
-                               assign[i] = hwc->idx;
-                       goto reserve;
-               }
-
-               cntr_idx = p4_next_cntr(thread, used_mask, bind);
-               if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
-                       /*
-                        * Check whether an event alias is still available.
-                        */
-                       config_alias = p4_get_alias_event(hwc->config);
-                       if (!config_alias)
-                               goto done;
-                       hwc->config = config_alias;
-                       pass++;
-                       goto again;
-               }
-               /*
-                * Perf does test runs to see if a whole group can be assigned
-                * together successfully.  There can be multiple rounds of this.
-                * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
-                * bits, such that the next round of group assignments will
-                * cause the above p4_should_swap_ts to pass instead of fail.
-                * This leads to counters exclusive to thread0 being used by
-                * thread1.
-                *
-                * Solve this with a cheap hack, reset the idx back to -1 to
-                * force a new lookup (p4_next_cntr) to get the right counter
-                * for the right thread.
-                *
-                * This probably doesn't comply with the general spirit of how
-                * perf wants to work, but P4 is special. :-(
-                */
-               if (p4_should_swap_ts(hwc->config, cpu))
-                       hwc->idx = -1;
-               p4_pmu_swap_config_ts(hwc, cpu);
-               if (assign)
-                       assign[i] = cntr_idx;
-reserve:
-               set_bit(cntr_idx, used_mask);
-               set_bit(escr_idx, escr_mask);
-       }
-
-done:
-       return num ? -EINVAL : 0;
-}
-
-PMU_FORMAT_ATTR(cccr, "config:0-31" );
-PMU_FORMAT_ATTR(escr, "config:32-62");
-PMU_FORMAT_ATTR(ht,   "config:63"   );
-
-static struct attribute *intel_p4_formats_attr[] = {
-       &format_attr_cccr.attr,
-       &format_attr_escr.attr,
-       &format_attr_ht.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu p4_pmu = {
-       .name                   = "Netburst P4/Xeon",
-       .handle_irq             = p4_pmu_handle_irq,
-       .disable_all            = p4_pmu_disable_all,
-       .enable_all             = p4_pmu_enable_all,
-       .enable                 = p4_pmu_enable_event,
-       .disable                = p4_pmu_disable_event,
-       .eventsel               = MSR_P4_BPU_CCCR0,
-       .perfctr                = MSR_P4_BPU_PERFCTR0,
-       .event_map              = p4_pmu_event_map,
-       .max_events             = ARRAY_SIZE(p4_general_events),
-       .get_event_constraints  = x86_get_event_constraints,
-       /*
-        * If HT is disabled we may need to use all
-        * ARCH_P4_MAX_CCCR counters simultaneously,
-        * but leave it restricted for now, assuming
-        * HT is on.
-        */
-       .num_counters           = ARCH_P4_MAX_CCCR,
-       .apic                   = 1,
-       .cntval_bits            = ARCH_P4_CNTRVAL_BITS,
-       .cntval_mask            = ARCH_P4_CNTRVAL_MASK,
-       .max_period             = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
-       .hw_config              = p4_hw_config,
-       .schedule_events        = p4_pmu_schedule_events,
-       /*
-        * This handles erratum N15 in Intel doc 249199-029:
-        * the counter may not be updated correctly on write,
-        * so we need a second write operation to do the trick
-        * (the official workaround didn't work).
-        *
-        * The idea is taken from the OProfile code.
-        */
-       .perfctr_second_write   = 1,
-
-       .format_attrs           = intel_p4_formats_attr,
-};
-
-__init int p4_pmu_init(void)
-{
-       unsigned int low, high;
-       int i, reg;
-
-       /* If we get stripped -- indexing fails */
-       BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
-
-       rdmsr(MSR_IA32_MISC_ENABLE, low, high);
-       if (!(low & (1 << 7))) {
-               pr_cont("unsupported Netburst CPU model %d ",
-                       boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       pr_cont("Netburst events, ");
-
-       x86_pmu = p4_pmu;
-
-       /*
-        * Even though the counters are configured to interrupt a particular
-        * logical processor when an overflow happens, testing has shown that
-        * on kdump kernels (which use a single CPU), thread1's counter
-        * continues to run and will report an NMI on thread0.  Due to the
-        * overflow bug, this leads to a stream of unknown NMIs.
-        *
-        * Solve this by zeroing out the registers to mimic a reset.
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               reg = x86_pmu_config_addr(i);
-               wrmsrl_safe(reg, 0ULL);
-       }
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
deleted file mode 100644 (file)
index 7c1a0c0..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include "perf_event.h"
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,       /* CPU_CLK_UNHALTED */
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,       /* INST_RETIRED     */
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,       /* L2_RQSTS:M:E:S:I */
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,       /* L2_RQSTS:I       */
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,       /* BR_INST_RETIRED  */
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,       /* BR_MISS_PRED_RETIRED */
-  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,       /* BUS_DRDY_CLOCKS  */
-  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2,    /* RESOURCE_STALLS  */
-
-};
-
-static const u64 __initconst p6_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS       */
-                [ C(RESULT_MISS)   ] = 0x0045, /* DCU_LINES_IN        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0f29,  /* L2_LD:M:E:S:I       */
-       },
-        [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-        },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
-               [ C(RESULT_MISS)   ] = 0x0f28,  /* L2_IFETCH:M:E:S:I  */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0025,  /* L2_M_LINES_INM     */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
-               [ C(RESULT_MISS)   ] = 0x0085,  /* ITLB_MISS          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4,  /* BR_INST_RETIRED      */
-               [ C(RESULT_MISS)   ] = 0x00c5,  /* BR_MISS_PRED_RETIRED */
-        },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
-       return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT                   0x0000002EULL
-
-static struct event_constraint p6_event_constraints[] =
-{
-       INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
-       INTEL_EVENT_CONSTRAINT(0x10, 0x1),      /* FP_COMP_OPS_EXE */
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2),      /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2),      /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2),      /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1),      /* CYCLES_DIV_BUSY */
-       EVENT_CONSTRAINT_END
-};
-
-static void p6_pmu_disable_all(void)
-{
-       u64 val;
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void p6_pmu_enable_all(int added)
-{
-       unsigned long val;
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static inline void
-p6_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val = P6_NOP_EVENT;
-
-       (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-static void p6_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-
-       /*
-        * p6 only has a global event enable, set on PerfEvtSel0
-        * We "disable" events by programming P6_NOP_EVENT
-        * and we rely on p6_pmu_enable_all() being called
-        * to actually enable the events.
-        */
-
-       (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(pc,    "config:19"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *intel_p6_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu p6_pmu = {
-       .name                   = "p6",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = p6_pmu_disable_all,
-       .enable_all             = p6_pmu_enable_all,
-       .enable                 = p6_pmu_enable_event,
-       .disable                = p6_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_P6_EVNTSEL0,
-       .perfctr                = MSR_P6_PERFCTR0,
-       .event_map              = p6_pmu_event_map,
-       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 31) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       /*
-        * Events have 40 bits implemented. However they are designed such
-        * that bits [32-39] are sign extensions of bit 31. As such the
-        * effective width of an event for a P6-like PMU is 32 bits only.
-        *
-        * See the IA-32 Intel Architecture Software Developer's Manual, Vol 3B.
-        */
-       .cntval_bits            = 32,
-       .cntval_mask            = (1ULL << 32) - 1,
-       .get_event_constraints  = x86_get_event_constraints,
-       .event_constraints      = p6_event_constraints,
-
-       .format_attrs           = intel_p6_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-};
-
-static __init void p6_pmu_rdpmc_quirk(void)
-{
-       if (boot_cpu_data.x86_mask < 9) {
-               /*
-                * PPro erratum 26; fixed in stepping 9 and above.
-                */
-               pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
-               x86_pmu.attr_rdpmc_broken = 1;
-               x86_pmu.attr_rdpmc = 0;
-       }
-}
-
-__init int p6_pmu_init(void)
-{
-       x86_pmu = p6_pmu;
-
-       switch (boot_cpu_data.x86_model) {
-       case  1: /* Pentium Pro */
-               x86_add_quirk(p6_pmu_rdpmc_quirk);
-               break;
-
-       case  3: /* Pentium II - Klamath */
-       case  5: /* Pentium II - Deschutes */
-       case  6: /* Pentium II - Mendocino */
-               break;
-
-       case  7: /* Pentium III - Katmai */
-       case  8: /* Pentium III - Coppermine */
-       case 10: /* Pentium III Xeon */
-       case 11: /* Pentium III - Tualatin */
-               break;
-
-       case  9: /* Pentium M - Banias */
-       case 13: /* Pentium M - Dothan */
-               break;
-
-       default:
-               pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       return 0;
-}
index 819d94982e078b8597f084f8409fcd3106361293..f6f50c4ceaeceef16170d14d6d55376823bbd63f 100644 (file)
@@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
        for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
                if (!rdrand_long(&tmp)) {
                        clear_cpu_cap(c, X86_FEATURE_RDRAND);
-                       printk_once(KERN_WARNING "rdrand: disabled\n");
+                       pr_warn_once("rdrand: disabled\n");
                        return;
                }
        }
index 4c60eaf0571c2fb1a6d73258861398fc6bcc963e..cd531355e8386b4177d4dcecaf1031ba8c2086c8 100644 (file)
@@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
        c->x86_max_cores = (core_level_siblings / smp_num_siblings);
 
        if (!printed) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+               pr_info("CPU: Physical Processor ID: %d\n",
                       c->phys_proc_id);
                if (c->x86_max_cores > 1)
-                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+                       pr_info("CPU: Processor Core ID: %d\n",
                               c->cpu_core_id);
                printed = 1;
        }
index a19a663282b57e9a77891ee3d26d37fcd2ef3a23..34178564be2a70adbeaf3fd890cac32d97a51635 100644 (file)
@@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
        if (max >= 0x80860001) {
                cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
                if (cpu_rev != 0x02000000) {
-                       printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+                       pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
                                (cpu_rev >> 24) & 0xff,
                                (cpu_rev >> 16) & 0xff,
                                (cpu_rev >> 8) & 0xff,
@@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c)
        if (max >= 0x80860002) {
                cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
                if (cpu_rev == 0x02000000) {
-                       printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+                       pr_info("CPU: Processor revision %08X, %u MHz\n",
                                new_cpu_rev, cpu_freq);
                }
-               printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+               pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
                       (cms_rev1 >> 24) & 0xff,
                       (cms_rev1 >> 16) & 0xff,
                       (cms_rev1 >> 8) & 0xff,
@@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
                      (void *)&cpu_info[56],
                      (void *)&cpu_info[60]);
                cpu_info[64] = '\0';
-               printk(KERN_INFO "CPU: %s\n", cpu_info);
+               pr_info("CPU: %s\n", cpu_info);
        }
 
        /* Unhide possibly hidden capability flags */
index 628a059a9a0663ef7cbdf07e4750c9e1e4f53006..364e5834689753fc7da34c6dc8a34cca22ab92db 100644 (file)
@@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void)
        tsc_hz = eax | (((uint64_t)ebx) << 32);
        do_div(tsc_hz, 1000);
        BUG_ON(tsc_hz >> 32);
-       printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
+       pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
                         (unsigned long) tsc_hz / 1000,
                         (unsigned long) tsc_hz % 1000);
 
@@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void)
        if (ebx != UINT_MAX)
                x86_platform.calibrate_tsc = vmware_get_tsc_khz;
        else
-               printk(KERN_WARNING
-                      "Failed to get TSC freq from the hypervisor\n");
+               pr_warn("Failed to get TSC freq from the hypervisor\n");
 }
 
 /*
index 58f34319b29ab64aee4bd21e6ff30367d478c262..9ef978d69c22e00cffdce3592cbe5a495027a058 100644 (file)
@@ -57,10 +57,9 @@ struct crash_elf_data {
        struct kimage *image;
        /*
         * Total number of ram ranges we have after various adjustments for
-        * GART, crash reserved region etc.
+        * crash reserved region, etc.
         */
        unsigned int max_nr_ranges;
-       unsigned long gart_start, gart_end;
 
        /* Pointer to elf header */
        void *ehdr;
@@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
        return 0;
 }
 
-static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
-{
-       struct crash_elf_data *ced = arg;
-
-       ced->gart_start = start;
-       ced->gart_end = end;
-
-       /* Not expecting more than 1 gart aperture */
-       return 1;
-}
-
 
 /* Gather all the required information to prepare elf headers for ram regions */
 static void fill_up_crash_elf_data(struct crash_elf_data *ced,
@@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced,
 
        ced->max_nr_ranges = nr_ranges;
 
-       /*
-        * We don't create ELF headers for GART aperture as an attempt
-        * to dump this memory in second kernel leads to hang/crash.
-        * If gart aperture is present, one needs to exclude that region
-        * and that could lead to need of extra phdr.
-        */
-       walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
-                               ced, get_gart_ranges_callback);
-
-       /*
-        * If we have gart region, excluding that could potentially split
-        * a memory range, resulting in extra header. Account for  that.
-        */
-       if (ced->gart_end)
-               ced->max_nr_ranges++;
-
        /* Exclusion of crash region could split memory ranges */
        ced->max_nr_ranges++;
 
@@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
                        return ret;
        }
 
-       /* Exclude GART region */
-       if (ced->gart_end) {
-               ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
-               if (ret)
-                       return ret;
-       }
-
        return ret;
 }
 
@@ -599,12 +564,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
        /* Add ACPI tables */
        cmd.type = E820_ACPI;
        flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-       walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+       walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
                       memmap_entry_callback);
 
        /* Add ACPI Non-volatile Storage */
        cmd.type = E820_NVS;
-       walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+       walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
                        memmap_entry_callback);
 
        /* Add crashk_low_res region */
index b3c2a697820aab012118adf56ee0c67188188fe7..621b501f89351146b84b090b7160166bb9d5907e 100644 (file)
@@ -926,6 +926,41 @@ static const char *e820_type_to_string(int e820_type)
        }
 }
 
+static unsigned long e820_type_to_iomem_type(int e820_type)
+{
+       switch (e820_type) {
+       case E820_RESERVED_KERN:
+       case E820_RAM:
+               return IORESOURCE_SYSTEM_RAM;
+       case E820_ACPI:
+       case E820_NVS:
+       case E820_UNUSABLE:
+       case E820_PRAM:
+       case E820_PMEM:
+       default:
+               return IORESOURCE_MEM;
+       }
+}
+
+static unsigned long e820_type_to_iores_desc(int e820_type)
+{
+       switch (e820_type) {
+       case E820_ACPI:
+               return IORES_DESC_ACPI_TABLES;
+       case E820_NVS:
+               return IORES_DESC_ACPI_NV_STORAGE;
+       case E820_PMEM:
+               return IORES_DESC_PERSISTENT_MEMORY;
+       case E820_PRAM:
+               return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+       case E820_RESERVED_KERN:
+       case E820_RAM:
+       case E820_UNUSABLE:
+       default:
+               return IORES_DESC_NONE;
+       }
+}
+
 static bool do_mark_busy(u32 type, struct resource *res)
 {
        /* this is the legacy bios/dos rom-shadow + mmio region */
@@ -968,7 +1003,8 @@ void __init e820_reserve_resources(void)
                res->start = e820.map[i].addr;
                res->end = end;
 
-               res->flags = IORESOURCE_MEM;
+               res->flags = e820_type_to_iomem_type(e820.map[i].type);
+               res->desc = e820_type_to_iores_desc(e820.map[i].type);
 
                /*
                 * don't register the region that could be conflicted with
index d25097c3fc1d1af8af35c156f05121f9f4d46a94..d5804adfa6da6c3d495b124faf47c288e2c0c6cf 100644 (file)
@@ -409,8 +409,10 @@ static inline void copy_init_fpstate_to_fpregs(void)
 {
        if (use_xsave())
                copy_kernel_to_xregs(&init_fpstate.xsave, -1);
-       else
+       else if (static_cpu_has(X86_FEATURE_FXSR))
                copy_kernel_to_fxregs(&init_fpstate.fxsave);
+       else
+               copy_kernel_to_fregs(&init_fpstate.fsave);
 }
 
 /*
index 6d9f0a7ef4c8e5da3d596444242de7478d7b9356..bd08fb77073d833ec8f1e27bc67d20fa37b559dc 100644 (file)
@@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
        cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
        write_cr0(cr0);
 
-       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-                    : "+m" (fsw), "+m" (fcw));
+       if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+               asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+                            : "+m" (fsw), "+m" (fcw));
 
-       if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-               set_cpu_cap(c, X86_FEATURE_FPU);
-       else
-               clear_cpu_cap(c, X86_FEATURE_FPU);
+               if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+                       set_cpu_cap(c, X86_FEATURE_FPU);
+               else
+                       clear_cpu_cap(c, X86_FEATURE_FPU);
+       }
 
 #ifndef CONFIG_MATH_EMULATION
        if (!cpu_has_fpu) {
@@ -132,7 +134,7 @@ static void __init fpu__init_system_generic(void)
         * Set up the legacy init FPU context. (xstate init might overwrite this
         * with a more modern format, if the CPU supports it.)
         */
-       fpstate_init_fxstate(&init_fpstate.fxsave);
+       fpstate_init(&init_fpstate);
 
        fpu__init_system_mxcsr();
 }
@@ -300,12 +302,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void)
 static void __init fpu__clear_eager_fpu_features(void)
 {
        setup_clear_cpu_cap(X86_FEATURE_MPX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX2);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 }
 
 /*
index 04f9641e0cb6bd22f0c054c981402b0575a33ab5..702547ce33c9c7a33fd321fdb94367af5515dd8b 100644 (file)
@@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
 static unsigned long text_ip_addr(unsigned long ip)
 {
        /*
-        * On x86_64, kernel text mappings are mapped read-only with
-        * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
-        * of the kernel text mapping to modify the kernel text.
+        * On x86_64, kernel text mappings are mapped read-only, so we use
+        * the kernel identity mapping instead of the kernel text mapping
+        * to modify the kernel text.
         *
         * For 32-bit kernels, these mappings are the same and we can use the
         * kernel identity mapping to modify code.
index 44256a62702b2c51077fc0b8b82a904ed122b9f6..ed15cd486d06347626c080a0081e3eed01ac9128 100644 (file)
@@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 {
        int err;
-#ifdef CONFIG_DEBUG_RODATA
        char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
 
        bpt->type = BP_BREAKPOINT;
        err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
                return err;
        err = probe_kernel_write((char *)bpt->bpt_addr,
                                 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
        if (!err)
                return err;
        /*
@@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
        if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
                return -EINVAL;
        bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
        return err;
 }
 
 int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 {
-#ifdef CONFIG_DEBUG_RODATA
        int err;
        char opc[BREAK_INSTR_SIZE];
 
@@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
        if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
                goto knl_write;
        return err;
+
 knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
        return probe_kernel_write((char *)bpt->bpt_addr,
                                  (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
 }
index 1deffe6cc87367631fe23b4c5d4b6be697ff4b0e..0f05deeff5ce2e2af5aa7fb1f2fe1f59f4ed8348 100644 (file)
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
                 * In case the user-specified fault handler returned
                 * zero, try to fix up.
                 */
-               if (fixup_exception(regs))
+               if (fixup_exception(regs, trapnr))
                        return 1;
 
                /*
index 30ca7607cbbbbcae4793aa5c14d8f73bbd784d71..97340f2c437c64def7a83f75a8f9b0bc6a968451 100644 (file)
@@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
        processor.cpuflag = CPU_ENABLED;
        processor.cpufeature = (boot_cpu_data.x86 << 8) |
            (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-       processor.featureflag = boot_cpu_data.x86_capability[0];
+       processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
        processor.reserved[0] = 0;
        processor.reserved[1] = 0;
        for (i = 0; i < 2; i++) {
index 8a2cdd736fa4da82374fa9392e76b5716cc0f89a..04b132a767f116e8ff35efcbbc036e331a145a4b 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
+#include <asm/cache.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nmi.h>
@@ -69,7 +70,7 @@ struct nmi_stats {
 
 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
 
-static int ignore_nmis;
+static int ignore_nmis __read_mostly;
 
 int unknown_nmi_panic;
 /*
index 14415aff18136524cfa3d4c9b07bdde6ee0d6f54..92f70147a9a673e74b3753637adbb08e2a4869fa 100644 (file)
@@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data)
 
 static __init int register_e820_pmem(void)
 {
-       char *pmem = "Persistent Memory (legacy)";
        struct platform_device *pdev;
        int rc;
 
-       rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
+       rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+                                IORESOURCE_MEM, 0, -1, NULL, found);
        if (rc <= 0)
                return 0;
 
index ee9a9792caeb3c8a2f75ec56fdd7fb96828e2f4f..2915d54e9dd5f730558fba3ed2e7f3c9d4cd5a5b 100644 (file)
@@ -421,9 +421,9 @@ static void mwait_idle(void)
        if (!current_set_polling_and_test()) {
                trace_cpu_idle_rcuidle(1, smp_processor_id());
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
-                       smp_mb(); /* quirk */
+                       mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
-                       smp_mb(); /* quirk */
+                       mb(); /* quirk */
                }
 
                __monitor((void *)&current_thread_info()->flags, 0, 0);
index d3d80e6d42a20ea665325b4cf2a5e7be3c43289a..aa52c10094755e2fa9fdd5ac7a39b0c36effac8a 100644 (file)
@@ -152,21 +152,21 @@ static struct resource data_resource = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 
index 24d57f77b3c19615840ac4f09c8c0fd299864698..3bf1e0b5f827ae43dca2b2c0c3c4040e56d15c56 100644 (file)
@@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
+/* Logical package management. We might want to allocate that dynamically */
+static int *physical_to_logical_pkg __read_mostly;
+static unsigned long *physical_package_map __read_mostly;
+static unsigned long *logical_package_map  __read_mostly;
+static unsigned int max_physical_pkg_id __read_mostly;
+unsigned int __max_logical_packages __read_mostly;
+EXPORT_SYMBOL(__max_logical_packages);
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
        unsigned long flags;
@@ -251,6 +259,97 @@ static void notrace start_secondary(void *unused)
        cpu_startup_entry(CPUHP_ONLINE);
 }
 
+int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+{
+       unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+
+       /* Called from early boot? */
+       if (!physical_package_map)
+               return 0;
+
+       if (pkg >= max_physical_pkg_id)
+               return -EINVAL;
+
+       /* Set the logical package id */
+       if (test_and_set_bit(pkg, physical_package_map))
+               goto found;
+
+       if (pkg < __max_logical_packages) {
+               set_bit(pkg, logical_package_map);
+               physical_to_logical_pkg[pkg] = pkg;
+               goto found;
+       }
+       new = find_first_zero_bit(logical_package_map, __max_logical_packages);
+       if (new >= __max_logical_packages) {
+               physical_to_logical_pkg[pkg] = -1;
+               pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+                       apicid, pkg);
+               return -ENOSPC;
+       }
+       set_bit(new, logical_package_map);
+       pr_info("APIC(%x) Converting physical %u to logical package %u\n",
+               apicid, pkg, new);
+       physical_to_logical_pkg[pkg] = new;
+
+found:
+       cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg];
+       return 0;
+}
+
+/**
+ * topology_phys_to_logical_pkg - Map a physical package id to a logical
+ *
+ * Returns logical package id or -1 if not found
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+       if (phys_pkg >= max_physical_pkg_id)
+               return -1;
+       return physical_to_logical_pkg[phys_pkg];
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+static void __init smp_init_package_map(void)
+{
+       unsigned int ncpus, cpu;
+       size_t size;
+
+       /*
+        * Today neither Intel nor AMD supports heterogeneous systems. That
+        * might change in the future....
+        */
+       ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings;
+       __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+
+       /*
+        * Possibly larger than what we need as the number of apic ids per
+        * package can be smaller than the actual used apic ids.
+        */
+       max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus);
+       size = max_physical_pkg_id * sizeof(unsigned int);
+       physical_to_logical_pkg = kmalloc(size, GFP_KERNEL);
+       memset(physical_to_logical_pkg, 0xff, size);
+       size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
+       physical_package_map = kzalloc(size, GFP_KERNEL);
+       size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
+       logical_package_map = kzalloc(size, GFP_KERNEL);
+
+       pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+       for_each_present_cpu(cpu) {
+               unsigned int apicid = apic->cpu_present_to_apicid(cpu);
+
+               if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
+                       continue;
+               if (!topology_update_package_map(apicid, cpu))
+                       continue;
+               pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
+               per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
+               set_cpu_possible(cpu, false);
+               set_cpu_present(cpu, false);
+       }
+}
+
 void __init smp_store_boot_cpu_info(void)
 {
        int id = 0; /* CPU 0 */
@@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void)
 
        *c = boot_cpu_data;
        c->cpu_index = id;
+       smp_init_package_map();
 }
 
 /*
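
A hedged usage sketch (the array and helper below are hypothetical, not part of this patch): per-package state can now be indexed by the dense logical package id rather than by the sparse physical one.

/* Hypothetical per-package counters, sized by __max_logical_packages. */
static u64 *pkg_counters;

static u64 *get_pkg_counter(unsigned int cpu)
{
	int pkg = topology_phys_to_logical_pkg(cpu_data(cpu).phys_proc_id);

	if (pkg < 0)
		return NULL;	/* package not (yet) enumerated */
	return &pkg_counters[pkg];
}
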
index 3f92ce07e525fd41167a64f9d7c4df1bfbe673bf..27538f183c3b15d9d59e225f2612b12660ee979b 100644 (file)
@@ -142,7 +142,6 @@ static int test_NX(void)
         * by the error message
         */
 
-#ifdef CONFIG_DEBUG_RODATA
        /* Test 3: Check if the .rodata section is executable */
        if (rodata_test_data != 0xC3) {
                printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
                printk(KERN_ERR "test_nx: .rodata section is executable\n");
                ret = -ENODEV;
        }
-#endif
 
 #if 0
        /* Test 4: Check if the .data section of a module is executable */
index 5ecbfe5099dad68e140b85ffdaefc75818afc6d9..cb4a01b41e277887b6769e4a17d60cb4139f9864 100644 (file)
@@ -76,5 +76,5 @@ int rodata_test(void)
 }
 
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
index 1baf08187ddf057fe8cbb57bb771d163ee565ee9..5c9ca2bb9fe95c249680f69e076a2320e1855aa1 100644 (file)
@@ -185,7 +185,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
        }
 
        if (!user_mode(regs)) {
-               if (!fixup_exception(regs)) {
+               if (!fixup_exception(regs, trapnr)) {
                        tsk->thread.error_code = error_code;
                        tsk->thread.trap_nr = trapnr;
                        die(str, regs, error_code);
@@ -437,7 +437,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
        tsk = current;
        if (!user_mode(regs)) {
-               if (fixup_exception(regs))
+               if (fixup_exception(regs, X86_TRAP_GP))
                        return;
 
                tsk->thread.error_code = error_code;
@@ -741,7 +741,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
        cond_local_irq_enable(regs);
 
        if (!user_mode(regs)) {
-               if (!fixup_exception(regs)) {
+               if (!fixup_exception(regs, trapnr)) {
                        task->thread.error_code = error_code;
                        task->thread.trap_nr = trapnr;
                        die(str, regs, error_code);
index 92dc211c11db236a94581158244e37bf12ef75ca..5af9958cbdb6be4293b62181ffe104f435caf830 100644 (file)
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
 /*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
  *
  * However, kernel identity mappings will have different RWX permissions
  * to the pages mapping to text and to the pages padding (which are freed) the
  * text section. Hence kernel identity mappings will be broken to smaller
  * pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
  */
-#define X64_ALIGN_DEBUG_RODATA_BEGIN   . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
 
-#define X64_ALIGN_DEBUG_RODATA_END                             \
+#define X64_ALIGN_RODATA_END                                   \
                . = ALIGN(HPAGE_SIZE);                          \
                __end_rodata_hpage_align = .;
 
 #else
 
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
 
 #endif
 
@@ -112,13 +111,11 @@ SECTIONS
 
        EXCEPTION_TABLE(16) :text = 0x9090
 
-#if defined(CONFIG_DEBUG_RODATA)
        /* .text should occupy whole number of pages */
        . = ALIGN(PAGE_SIZE);
-#endif
-       X64_ALIGN_DEBUG_RODATA_BEGIN
+       X64_ALIGN_RODATA_BEGIN
        RO_DATA(PAGE_SIZE)
-       X64_ALIGN_DEBUG_RODATA_END
+       X64_ALIGN_RODATA_END
 
        /* Data */
        .data : AT(ADDR(.data) - LOAD_OFFSET) {
index a0695be19864eda009742b142ba2046e13ad05f9..cd05942bc9189452d8ec7c0cebd96431cf8dd394 100644 (file)
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
index 36591faed13be04d12c13fa520d46ca9df0dfcf8..3a045f39ed8114e24e375521135cb7d2296e9e7e 100644 (file)
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 static void apic_timer_expired(struct kvm_lapic *apic)
 {
        struct kvm_vcpu *vcpu = apic->vcpu;
-       wait_queue_head_t *q = &vcpu->wq;
+       struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
 
        if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
 
-       if (waitqueue_active(q))
-               wake_up_interruptible(q);
+       if (swait_active(q))
+               swake_up(q);
 
        if (apic_lvtt_tscdeadline(apic))
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
index 95a955de5964bcc3f4aa6791a004e29b28504c13..1e7a49bfc94fb323cbb11782693cab89743f1133 100644 (file)
@@ -3721,13 +3721,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
+       bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+
        /*
         * Passing "true" to the last argument is okay; it adds a check
         * on bit 8 of the SPTEs which KVM doesn't use anyway.
         */
        __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
                                boot_cpu_data.x86_phys_bits,
-                               context->shadow_root_level, context->nx,
+                               context->shadow_root_level, uses_nx,
                                guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
                                true);
 }
index 0ff453749a909b9f0140b5de5d5d2dbb133e99d9..9bd8f44baded2318e8b5bc2a4773068a45a8723b 100644 (file)
@@ -1813,6 +1813,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                        return;
                }
                break;
+       case MSR_IA32_PEBS_ENABLE:
+               /* PEBS needs a quiescent period after being disabled (to write
+                * a record).  Disabling PEBS through VMX MSR swapping doesn't
+                * provide that period, so a CPU could write host's record into
+                * guest's memory.
+                */
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
        for (i = 0; i < m->nr; ++i)
@@ -1850,26 +1857,31 @@ static void reload_tss(void)
 
 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 {
-       u64 guest_efer;
-       u64 ignore_bits;
+       u64 guest_efer = vmx->vcpu.arch.efer;
+       u64 ignore_bits = 0;
 
-       guest_efer = vmx->vcpu.arch.efer;
+       if (!enable_ept) {
+               /*
+                * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
+                * host CPUID is more efficient than testing guest CPUID
+                * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
+                */
+               if (boot_cpu_has(X86_FEATURE_SMEP))
+                       guest_efer |= EFER_NX;
+               else if (!(guest_efer & EFER_NX))
+                       ignore_bits |= EFER_NX;
+       }
 
        /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
-        * outside long mode
+        * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
-       ignore_bits = EFER_NX | EFER_SCE;
+       ignore_bits |= EFER_SCE;
 #ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
 #endif
-       guest_efer &= ~ignore_bits;
-       guest_efer |= host_efer & ignore_bits;
-       vmx->guest_msrs[efer_offset].data = guest_efer;
-       vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 
        clear_atomic_switch_msr(vmx, MSR_EFER);
 
@@ -1880,16 +1892,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
         */
        if (cpu_has_load_ia32_efer ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
-               guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer);
                return false;
-       }
+       } else {
+               guest_efer &= ~ignore_bits;
+               guest_efer |= host_efer & ignore_bits;
 
-       return true;
+               vmx->guest_msrs[efer_offset].data = guest_efer;
+               vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+               return true;
+       }
 }
 
 static unsigned long segment_base(u16 selector)
index 4ba229ac3f4ff127dddf0c33fb91cc42f2e60b22..fd57d3ae7e16daf24f8cfdb4e708c5e4e9a9f3f9 100644 (file)
@@ -1520,12 +1520,6 @@ __init void lguest_init(void)
         */
        reserve_top_address(lguest_data.reserve_mem);
 
-       /*
-        * If we don't initialize the lock dependency checker now, it crashes
-        * atomic_notifier_chain_register, then paravirt_disable_iospace.
-        */
-       lockdep_init();
-
        /* Hook in our special panic hypercall code. */
        atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
@@ -1535,7 +1529,7 @@ __init void lguest_init(void)
         */
        cpu_detect(&new_cpu_data);
        /* head.S usually sets up the first capability word, so do it here. */
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
 
        /* Math is always hard! */
        set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
index e912b2f6d36e5392b33bf6749f19a53cff6d7a06..2f07c291dcc855af5c483c55133d91a4dff21c92 100644 (file)
@@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops)
                 * Use cpu_tss as a cacheline-aligned, seldom
                 * accessed per-cpu variable as the monitor target.
                 */
-               __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+               __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
 
                /*
                 * AMD, like Intel, supports the EAX hint and EAX=0xf
index a0de849435ad69dda8b50e79c79a4bc9fc8d3d49..cbb8ee5830ff134f131533fd91ea8be35a853dba 100644 (file)
@@ -177,3 +177,120 @@ ENTRY(memcpy_orig)
 .Lend:
        retq
 ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * memcpy_mcsafe - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(memcpy_mcsafe)
+       cmpl $8, %edx
+       /* Less than 8 bytes? Go to byte copy loop */
+       jb .L_no_whole_words
+
+       /* Check for bad alignment of source */
+       testl $7, %esi
+       /* Already aligned */
+       jz .L_8byte_aligned
+
+       /* Copy one byte at a time until source is 8-byte aligned */
+       movl %esi, %ecx
+       andl $7, %ecx
+       subl $8, %ecx
+       negl %ecx
+       subl %ecx, %edx
+.L_copy_leading_bytes:
+       movb (%rsi), %al
+       movb %al, (%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .L_copy_leading_bytes
+
+.L_8byte_aligned:
+       /* Figure out how many whole cache lines (64-bytes) to copy */
+       movl %edx, %ecx
+       andl $63, %edx
+       shrl $6, %ecx
+       jz .L_no_whole_cache_lines
+
+       /* Loop copying whole cache lines */
+.L_cache_w0: movq (%rsi), %r8
+.L_cache_w1: movq 1*8(%rsi), %r9
+.L_cache_w2: movq 2*8(%rsi), %r10
+.L_cache_w3: movq 3*8(%rsi), %r11
+       movq %r8, (%rdi)
+       movq %r9, 1*8(%rdi)
+       movq %r10, 2*8(%rdi)
+       movq %r11, 3*8(%rdi)
+.L_cache_w4: movq 4*8(%rsi), %r8
+.L_cache_w5: movq 5*8(%rsi), %r9
+.L_cache_w6: movq 6*8(%rsi), %r10
+.L_cache_w7: movq 7*8(%rsi), %r11
+       movq %r8, 4*8(%rdi)
+       movq %r9, 5*8(%rdi)
+       movq %r10, 6*8(%rdi)
+       movq %r11, 7*8(%rdi)
+       leaq 64(%rsi), %rsi
+       leaq 64(%rdi), %rdi
+       decl %ecx
+       jnz .L_cache_w0
+
+       /* Are there any trailing 8-byte words? */
+.L_no_whole_cache_lines:
+       movl %edx, %ecx
+       andl $7, %edx
+       shrl $3, %ecx
+       jz .L_no_whole_words
+
+       /* Copy trailing words */
+.L_copy_trailing_words:
+       movq (%rsi), %r8
+       mov %r8, (%rdi)
+       leaq 8(%rsi), %rsi
+       leaq 8(%rdi), %rdi
+       decl %ecx
+       jnz .L_copy_trailing_words
+
+       /* Any trailing bytes? */
+.L_no_whole_words:
+       andl %edx, %edx
+       jz .L_done_memcpy_trap
+
+       /* Copy trailing bytes */
+       movl %edx, %ecx
+.L_copy_trailing_bytes:
+       movb (%rsi), %al
+       movb %al, (%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .L_copy_trailing_bytes
+
+       /* Copy successful. Return zero */
+.L_done_memcpy_trap:
+       xorq %rax, %rax
+       ret
+ENDPROC(memcpy_mcsafe)
+
+       .section .fixup, "ax"
+       /* Return non-zero for any failure */
+.L_memcpy_mcsafe_fail:
+       mov     $1, %rax
+       ret
+
+       .previous
+
+       _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+#endif
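
As a hedged illustration of the calling convention only (the helper below and its error mapping are assumptions, not part of this patch), a consumer of memcpy_mcsafe() treats a zero return as a completed copy and any non-zero return as a machine check taken on a source read:

#include <linux/errno.h>
#include <linux/string.h>	/* assumed location of the memcpy_mcsafe() prototype */

/* Hypothetical helper: copy from possibly-poisoned persistent memory. */
static int read_pmem_block(void *dst, const void *pmem_src, size_t len)
{
	if (memcpy_mcsafe(dst, pmem_src, len))
		return -EIO;	/* a machine check fired while reading the source */

	return 0;
}
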
index 903ec1e9c3263f7f7ce6e1f5a59e7a32b224dc85..9dd7e4b7fcdee4a847f98937b9a287b826366a2d 100644 (file)
@@ -3,6 +3,9 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
+                           struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x)
 {
        return (unsigned long)&x->fixup + x->fixup;
 }
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+       return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
 
-int fixup_exception(struct pt_regs *regs)
+bool ex_handler_default(const struct exception_table_entry *fixup,
+                      struct pt_regs *regs, int trapnr)
 {
-       const struct exception_table_entry *fixup;
-       unsigned long new_ip;
+       regs->ip = ex_fixup_addr(fixup);
+       return true;
+}
+EXPORT_SYMBOL(ex_handler_default);
+
+bool ex_handler_fault(const struct exception_table_entry *fixup,
+                    struct pt_regs *regs, int trapnr)
+{
+       regs->ip = ex_fixup_addr(fixup);
+       regs->ax = trapnr;
+       return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fault);
+
+bool ex_handler_ext(const struct exception_table_entry *fixup,
+                  struct pt_regs *regs, int trapnr)
+{
+       /* Special hack for uaccess_err */
+       current_thread_info()->uaccess_err = 1;
+       regs->ip = ex_fixup_addr(fixup);
+       return true;
+}
+EXPORT_SYMBOL(ex_handler_ext);
+
+bool ex_has_fault_handler(unsigned long ip)
+{
+       const struct exception_table_entry *e;
+       ex_handler_t handler;
+
+       e = search_exception_tables(ip);
+       if (!e)
+               return false;
+       handler = ex_fixup_handler(e);
+
+       return handler == ex_handler_fault;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+       const struct exception_table_entry *e;
+       ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
        if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs)
        }
 #endif
 
-       fixup = search_exception_tables(regs->ip);
-       if (fixup) {
-               new_ip = ex_fixup_addr(fixup);
-
-               if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-                       /* Special hack for uaccess_err */
-                       current_thread_info()->uaccess_err = 1;
-                       new_ip -= 0x7ffffff0;
-               }
-               regs->ip = new_ip;
-               return 1;
-       }
+       e = search_exception_tables(regs->ip);
+       if (!e)
+               return 0;
 
-       return 0;
+       handler = ex_fixup_handler(e);
+       return handler(e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-       const struct exception_table_entry *fixup;
+       const struct exception_table_entry *e;
        unsigned long new_ip;
+       ex_handler_t handler;
 
-       fixup = search_exception_tables(*ip);
-       if (fixup) {
-               new_ip = ex_fixup_addr(fixup);
+       e = search_exception_tables(*ip);
+       if (!e)
+               return 0;
 
-               if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-                       /* uaccess handling not supported during early boot */
-                       return 0;
-               }
+       new_ip  = ex_fixup_addr(e);
+       handler = ex_fixup_handler(e);
 
-               *ip = new_ip;
-               return 1;
-       }
+       /* special handling not supported during early boot */
+       if (handler != ex_handler_default)
+               return 0;
 
-       return 0;
+       *ip = new_ip;
+       return 1;
 }
 
 /*
@@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start,
                i += 4;
                p->fixup += i;
                i += 4;
+               p->handler += i;
+               i += 4;
        }
 
        sort(start, finish - start, sizeof(struct exception_table_entry),
@@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start,
                i += 4;
                p->fixup -= i;
                i += 4;
+               p->handler -= i;
+               i += 4;
        }
 }
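
For illustration only, a custom fixup handler (the one below is hypothetical and not defined anywhere in this series) follows the same pattern as ex_handler_default() and ex_handler_fault() above: steer regs->ip to the fixup target and return true once the fault is handled. An exception table entry would then carry a relative offset to it in the new handler field, just as _ASM_EXTABLE_FAULT() does for ex_handler_fault().

/* Hypothetical handler, shown only to illustrate the new dispatch scheme. */
static bool ex_handler_log_and_skip(const struct exception_table_entry *fixup,
				    struct pt_regs *regs, int trapnr)
{
	pr_warn_once("extable: handled trap %d at %p\n", trapnr, (void *)regs->ip);
	regs->ip = ex_fixup_addr(fixup);	/* resume at the fixup target */
	return true;				/* fault handled, do not oops */
}
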
 
index e830c71a13232f4305adf9ef1eb8ce6ffe201d5b..03898aea6e0f2697e2196e9de7f8fe3aeb200ec9 100644 (file)
@@ -663,7 +663,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
        int sig;
 
        /* Are we prepared to handle this kernel fault? */
-       if (fixup_exception(regs)) {
+       if (fixup_exception(regs, X86_TRAP_PF)) {
                /*
                 * Any interrupt that takes a fault gets the fixup. This makes
                 * the below recursive fault logic only apply to a faults from
index cb4ef3de61f9ae9c95249876965a71a5d7b58cf8..2ebfbaf611424be1937c4e2288776d50d1c8ff9e 100644 (file)
@@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void)
        return flag;
 }
 
-#ifdef CONFIG_DEBUG_RODATA
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
@@ -960,5 +959,3 @@ void mark_rodata_ro(void)
        if (__supported_pte_mask & _PAGE_NX)
                debug_checkwx();
 }
-#endif
-
index 5488d21123bd2edb3f71ded119495474f3e8b728..a40b755c67e31280ded6fe10d81a18927523059c 100644 (file)
@@ -1074,7 +1074,6 @@ void __init mem_init(void)
        mem_init_print_info(NULL);
 }
 
-#ifdef CONFIG_DEBUG_RODATA
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
@@ -1166,8 +1165,6 @@ void mark_rodata_ro(void)
        debug_checkwx();
 }
 
-#endif
-
 int kern_addr_valid(unsigned long addr)
 {
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
index 9cf96d82147a8da38ce4f57413d75075003cb244..1c37e650acacff422460b1d9b705a867e01a09a5 100644 (file)
@@ -283,7 +283,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_RW;
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
        /*
         * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
         * kernel text mappings for the large page aligned text, rodata sections
index 2d66db8f80f992d3b0609373502031aacf91f161..ed30e79347e86377198009dcafb1c0b26a9865df 100644 (file)
@@ -130,6 +130,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(efi_query_variable_store);
 
+/*
+ * Helper function for efi_reserve_boot_services() to figure out if we
+ * can free regions in efi_free_boot_services().
+ *
+ * Use this function to ensure we do not free regions owned by somebody
+ * else. We must only reserve (and then free) regions:
+ *
+ * - Not within any part of the kernel
+ * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc)
+ */
+static bool can_free_region(u64 start, u64 size)
+{
+       if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
+               return false;
+
+       if (!e820_all_mapped(start, start+size, E820_RAM))
+               return false;
+
+       return true;
+}
+
 /*
  * The UEFI specification makes it clear that the operating system is free to do
  * whatever it wants with boot services code after ExitBootServices() has been
@@ -147,26 +168,50 @@ void __init efi_reserve_boot_services(void)
                efi_memory_desc_t *md = p;
                u64 start = md->phys_addr;
                u64 size = md->num_pages << EFI_PAGE_SHIFT;
+               bool already_reserved;
 
                if (md->type != EFI_BOOT_SERVICES_CODE &&
                    md->type != EFI_BOOT_SERVICES_DATA)
                        continue;
-               /* Only reserve where possible:
-                * - Not within any already allocated areas
-                * - Not over any memory area (really needed, if above?)
-                * - Not within any part of the kernel
-                * - Not the bios reserved area
-               */
-               if ((start + size > __pa_symbol(_text)
-                               && start <= __pa_symbol(_end)) ||
-                       !e820_all_mapped(start, start+size, E820_RAM) ||
-                       memblock_is_region_reserved(start, size)) {
-                       /* Could not reserve, skip it */
-                       md->num_pages = 0;
-                       memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
-                                    start, start+size-1);
-               } else
+
+               already_reserved = memblock_is_region_reserved(start, size);
+
+               /*
+                * Because the following memblock_reserve() is paired
+                * with free_bootmem_late() for this region in
+                * efi_free_boot_services(), we must be extremely
+                * careful not to reserve, and subsequently free,
+                * critical regions of memory (like the kernel image) or
+                * those regions that somebody else has already
+                * reserved.
+                *
+                * A good example of a critical region that must not be
+                * freed is page zero (first 4KB of memory), which may
+                * contain boot services code/data but is marked
+                * E820_RESERVED by trim_bios_range().
+                */
+               if (!already_reserved) {
                        memblock_reserve(start, size);
+
+                       /*
+                        * If we are the first to reserve the region, no
+                        * one else cares about it. We own it and can
+                        * free it later.
+                        */
+                       if (can_free_region(start, size))
+                               continue;
+               }
+
+               /*
+                * We don't own the region. We must not free it.
+                *
+                * Setting this bit for a boot services region really
+                * doesn't make sense as far as the firmware is
+                * concerned, but it does provide us with a way to tag
+                * those regions that must not be paired with
+                * free_bootmem_late().
+                */
+               md->attribute |= EFI_MEMORY_RUNTIME;
        }
 }
 
@@ -183,8 +228,8 @@ void __init efi_free_boot_services(void)
                    md->type != EFI_BOOT_SERVICES_DATA)
                        continue;
 
-               /* Could not reserve boot area */
-               if (!size)
+               /* Do not free, someone else owns it: */
+               if (md->attribute & EFI_MEMORY_RUNTIME)
                        continue;
 
                free_bootmem_late(start, size);
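
The reservation logic above boils down to one decision per boot services region: reserve it if nobody has yet, own it (and free it later) only when it is plain RAM outside the kernel image, and otherwise tag it with EFI_MEMORY_RUNTIME so that efi_free_boot_services() skips it. The following is a minimal standalone sketch of that decision, not kernel code; the memblock and e820 checks are reduced to plain booleans.

#include <stdbool.h>
#include <stdio.h>

/* Possible outcomes for one boot services region (model only). */
enum region_fate {
	REGION_OWNED_FREE_LATER,   /* we reserved it and may free it later  */
	REGION_TAGGED_DO_NOT_FREE, /* someone else owns it; tag it and skip */
};

/*
 * Minimal model of the decision in the hunk above: a region is ours to
 * free only if it was not already reserved (we reserve it ourselves) and
 * it is safe to free, i.e. plain RAM outside the kernel image.  Anything
 * else gets the do-not-free tag.
 */
static enum region_fate classify_region(bool already_reserved, bool can_free)
{
	if (!already_reserved && can_free)
		return REGION_OWNED_FREE_LATER;
	return REGION_TAGGED_DO_NOT_FREE;
}

int main(void)
{
	printf("%d\n", classify_region(false, true));  /* 0: reserve, free later */
	printf("%d\n", classify_region(true,  true));  /* 1: already reserved    */
	printf("%d\n", classify_region(false, false)); /* 1: e.g. page zero      */
	return 0;
}
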
index d09e4c9d7cc5b4044c4421bde8692aaa6f8b21be..2c261082eadf82f693d062e7f600660fcd56817f 100644 (file)
@@ -1654,7 +1654,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
        cpu_detect(&new_cpu_data);
        set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
        new_cpu_data.wp_works_ok = 1;
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
 #endif
 
        if (xen_start_info->mod_start) {
index 724a08740a04b4a255ca080c103ec3b7381bc4f7..9466354d3e4962f14cdae33b09378a73f9c51a5d 100644 (file)
@@ -11,7 +11,7 @@
 #include "pmu.h"
 
 /* x86_pmu.handle_irq definition */
-#include "../kernel/cpu/perf_event.h"
+#include "../events/perf_event.h"
 
 #define XENPMU_IRQ_PROCESSING    1
 struct xenpmu {
index 296b7a14893aabba895c578e8671e36b34875a37..b6f7fa3a1d403ea95428808f49edab475422e80a 100644 (file)
@@ -62,7 +62,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
        if (count < 0) {
                return NULL;
        } else if (count > 0) {
-               resources = kmalloc(count * sizeof(struct resource),
+               resources = kzalloc(count * sizeof(struct resource),
                                    GFP_KERNEL);
                if (!resources) {
                        dev_err(&adev->dev, "No memory for resources\n");
index 305218539df28cda17b97992ff44a0dbaeb613de..d48cbed342c1dda500d5020c746a9caf1c72ad09 100644 (file)
@@ -269,8 +269,7 @@ acpi_ps_get_next_namepath(struct acpi_walk_state *walk_state,
         */
        if (ACPI_SUCCESS(status) &&
            possible_method_call && (node->type == ACPI_TYPE_METHOD)) {
-               if (GET_CURRENT_ARG_TYPE(walk_state->arg_types) ==
-                   ARGP_SUPERNAME) {
+               if (walk_state->opcode == AML_UNLOAD_OP) {
                        /*
                         * acpi_ps_get_next_namestring has increased the AML pointer,
                         * so we need to restore the saved AML pointer for method call.
@@ -697,7 +696,7 @@ static union acpi_parse_object *acpi_ps_get_next_field(struct acpi_parse_state
  *
  * PARAMETERS:  walk_state          - Current state
  *              parser_state        - Current parser state object
- *              arg_type            - The parser argument type (ARGP_*)
+ *              arg_type            - The argument type (AML_*_ARG)
  *              return_arg          - Where the next arg is returned
  *
  * RETURN:      Status, and an op object containing the next argument.
@@ -817,9 +816,9 @@ acpi_ps_get_next_arg(struct acpi_walk_state *walk_state,
                                return_ACPI_STATUS(AE_NO_MEMORY);
                        }
 
-                       /* super_name allows argument to be a method call */
+                       /* To support super_name arg of Unload */
 
-                       if (arg_type == ARGP_SUPERNAME) {
+                       if (walk_state->opcode == AML_UNLOAD_OP) {
                                status =
                                    acpi_ps_get_next_namepath(walk_state,
                                                              parser_state, arg,
index 0431883653bed9d3c0394368fe87ced99043f185..559c1173de1c99177ba702c60d27993826510e9c 100644 (file)
@@ -519,7 +519,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
                             u64 param3, u64 param4)
 {
        int rc;
-       unsigned long pfn;
+       u64 base_addr, size;
 
        /* If user manually set "flags", make sure it is legal */
        if (flags && (flags &
@@ -545,10 +545,17 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
        /*
         * Disallow crazy address masks that give BIOS leeway to pick
         * injection address almost anywhere. Insist on page or
-        * better granularity and that target address is normal RAM.
+        * better granularity and that target address is normal RAM or
+        * NVDIMM.
         */
-       pfn = PFN_DOWN(param1 & param2);
-       if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK))
+       base_addr = param1 & param2;
+       size = ~param2 + 1;
+
+       if (((param2 & PAGE_MASK) != PAGE_MASK) ||
+           ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
+                               != REGION_INTERSECTS) &&
+            (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
+                               != REGION_INTERSECTS)))
                return -EINVAL;
 
 inject:
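
The address/mask arithmetic above is compact but easy to misread: the injection target begins at param1 & param2, the mask defines a region of ~param2 + 1 bytes, and the mask must cover at least one full page. Below is a minimal standalone sketch of that arithmetic, assuming the usual 4 KiB page size; the region_intersects() checks against System RAM and persistent memory are out of scope here.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((UINT64_C(1) << PAGE_SHIFT) - 1)) /* assumes 4 KiB pages */

int main(void)
{
	uint64_t param1 = 0x12345678abcdULL; /* example target address       */
	uint64_t param2 = ~UINT64_C(0xfff);  /* example mask: one 4 KiB page */

	uint64_t base_addr = param1 & param2; /* aligned start of the region    */
	uint64_t size      = ~param2 + 1;     /* number of bytes the mask spans */

	/* Same granularity check as the hunk above: mask covers >= one page. */
	int page_granular = (param2 & PAGE_MASK) == PAGE_MASK;

	printf("base=0x%" PRIx64 " size=0x%" PRIx64 " page_granular=%d\n",
	       base_addr, size, page_granular);
	return 0;
}
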
index c359351d50f1c99e9758b74ea84d97e76b402b0f..a163f2c59aa36571359e75661d52cfdc55d4ef3b 100644 (file)
@@ -218,7 +218,7 @@ bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname)
        bool ret;
 
        ret = __fwnode_property_present(fwnode, propname);
-       if (ret == false && fwnode && fwnode->secondary)
+       if (ret == false && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                ret = __fwnode_property_present(fwnode->secondary, propname);
        return ret;
 }
@@ -423,7 +423,7 @@ EXPORT_SYMBOL_GPL(device_property_match_string);
        int _ret_;                                                                      \
        _ret_ = FWNODE_PROP_READ(_fwnode_, _propname_, _type_, _proptype_,              \
                                 _val_, _nval_);                                        \
-       if (_ret_ == -EINVAL && _fwnode_ && _fwnode_->secondary)                        \
+       if (_ret_ == -EINVAL && _fwnode_ && !IS_ERR_OR_NULL(_fwnode_->secondary))       \
                _ret_ = FWNODE_PROP_READ(_fwnode_->secondary, _propname_, _type_,       \
                                _proptype_, _val_, _nval_);                             \
        _ret_;                                                                          \
@@ -593,7 +593,7 @@ int fwnode_property_read_string_array(struct fwnode_handle *fwnode,
        int ret;
 
        ret = __fwnode_property_read_string_array(fwnode, propname, val, nval);
-       if (ret == -EINVAL && fwnode && fwnode->secondary)
+       if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                ret = __fwnode_property_read_string_array(fwnode->secondary,
                                                          propname, val, nval);
        return ret;
@@ -621,7 +621,7 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode,
        int ret;
 
        ret = __fwnode_property_read_string(fwnode, propname, val);
-       if (ret == -EINVAL && fwnode && fwnode->secondary)
+       if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                ret = __fwnode_property_read_string(fwnode->secondary,
                                                    propname, val);
        return ret;
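
All four hunks above tighten the same fallback pattern: query the primary fwnode first, and consult the secondary only when the primary returned -EINVAL and the secondary is a real pointer (IS_ERR_OR_NULL() rejects both NULL and ERR_PTR() values, either of which fwnode->secondary may hold). A minimal standalone sketch of the pattern, with the fwnode machinery replaced by stand-in types:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define EINVAL    22
#define MAX_ERRNO 4095

/* Stand-in for the kernel's IS_ERR_OR_NULL(): NULL or an error pointer. */
static bool is_err_or_null(const void *ptr)
{
	return !ptr || (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct node {
	struct node *secondary; /* may be NULL or an error pointer */
	int value;
	bool has_value;
};

/* Stand-in for a property read: -EINVAL means "property not present". */
static int read_one(const struct node *n, int *out)
{
	if (!n->has_value)
		return -EINVAL;
	*out = n->value;
	return 0;
}

/* Primary first; fall back to a *valid* secondary only on -EINVAL. */
static int read_with_fallback(const struct node *n, int *out)
{
	int ret = read_one(n, out);

	if (ret == -EINVAL && !is_err_or_null(n->secondary))
		ret = read_one(n->secondary, out);
	return ret;
}

int main(void)
{
	struct node sec = { .secondary = NULL, .value = 42, .has_value = true };
	struct node pri = { .secondary = &sec, .has_value = false };
	int v = 0;

	printf("ret=%d v=%d\n", read_with_fallback(&pri, &v), v); /* ret=0 v=42 */
	return 0;
}
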
index 64f5d1bdbb48defdddc4c7ed7ee0c5b988b92e1b..8e304b1befc56385b8fc34d7697a529289d8ae3b 100644 (file)
 #define AT_XDMAC_MAX_CHAN      0x20
 #define AT_XDMAC_MAX_CSIZE     16      /* 16 data */
 #define AT_XDMAC_MAX_DWIDTH    8       /* 64 bits */
+#define AT_XDMAC_RESIDUE_MAX_RETRIES   5
 
 #define AT_XDMAC_DMA_BUSWIDTHS\
        (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\
@@ -1395,8 +1396,8 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
        struct at_xdmac_desc    *desc, *_desc;
        struct list_head        *descs_list;
        enum dma_status         ret;
-       int                     residue;
-       u32                     cur_nda, mask, value;
+       int                     residue, retry;
+       u32                     cur_nda, check_nda, cur_ubc, mask, value;
        u8                      dwidth = 0;
        unsigned long           flags;
 
@@ -1433,7 +1434,42 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                        cpu_relax();
        }
 
+       /*
+        * When computing the residue we need to read two registers, and we
+        * cannot do so atomically. AT_XDMAC_CNDA tells us where we stand in
+        * the descriptor list and AT_XDMAC_CUBC tells us how much data is
+        * left to transfer for the current descriptor.
+        * Since the DMA channel is not paused (so as not to lose data), the
+        * controller may move on to another descriptor between the
+        * AT_XDMAC_CNDA and AT_XDMAC_CUBC reads.
+        * For that reason, after reading AT_XDMAC_CUBC we read AT_XDMAC_CNDA
+        * a second time: if it has changed, AT_XDMAC_CUBC must be read again.
+        * Memory barriers are used to enforce the read ordering of the
+        * registers.
+        * A maximum number of retries is imposed because, however unlikely,
+        * this loop might otherwise never terminate when transferring a lot
+        * of data with small buffers.
+        */
        cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+       rmb();
+       cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+               rmb();
+               check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+
+               if (likely(cur_nda == check_nda))
+                       break;
+
+               cur_nda = check_nda;
+               rmb();
+               cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       }
+
+       if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
+               ret = DMA_ERROR;
+               goto spin_unlock;
+       }
+
        /*
         * Remove size of all microblocks already transferred and the current
         * one. Then add the remaining size to transfer of the current
@@ -1446,7 +1482,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
                        break;
        }
-       residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth;
+       residue += cur_ubc << dwidth;
 
        dma_set_residue(txstate, residue);
 
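
The retry loop added above is a generic consistent-snapshot pattern: read CNDA, read CUBC, then re-read CNDA; if it changed in between, the CUBC value may belong to a different descriptor, so both are read again, up to a bounded number of attempts. A minimal standalone sketch of the pattern follows; the hardware registers become plain fields and the rmb() barriers used by the real driver are omitted.

#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES 5

/* Stand-ins for the two registers read by the driver. */
struct regs {
	unsigned int nda; /* which descriptor the controller is working on */
	unsigned int ubc; /* bytes remaining in that descriptor            */
};

/*
 * Take a consistent (nda, ubc) pair: re-read nda after ubc and retry when
 * it changed, so ubc is never paired with the wrong descriptor.  Returns
 * false if no stable snapshot was obtained within MAX_RETRIES attempts.
 */
static bool snapshot(const volatile struct regs *r,
		     unsigned int *nda, unsigned int *ubc)
{
	unsigned int cur_nda = r->nda, check_nda;
	unsigned int cur_ubc = r->ubc;
	int retry;

	for (retry = 0; retry < MAX_RETRIES; retry++) {
		check_nda = r->nda;
		if (cur_nda == check_nda) {
			*nda = cur_nda;
			*ubc = cur_ubc;
			return true;
		}
		cur_nda = check_nda;
		cur_ubc = r->ubc;
	}
	return false;
}

int main(void)
{
	struct regs r = { .nda = 3, .ubc = 128 };
	unsigned int nda, ubc;

	if (snapshot(&r, &nda, &ubc))
		printf("descriptor %u, %u bytes left\n", nda, ubc);
	return 0;
}
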
index 2209f75fdf05bf29114f0fb3ffedff89f55b43f3..aac85c30c2cf6fc64669841c5b6abb5317df1b94 100644 (file)
@@ -522,6 +522,8 @@ static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
                        chan_dbg(chan, "LD %p callback\n", desc);
                        txd->callback(txd->callback_param);
                }
+
+               dma_descriptor_unmap(txd);
        }
 
        /* Run any dependencies */
index e4f43125e0fbbf5017cd848478eeb01550a87508..f039cfadf17b307c32bd88ef0e4a4a5ac749b590 100644 (file)
@@ -1300,10 +1300,10 @@ static int iop_adma_probe(struct platform_device *pdev)
         * note: writecombine gives slightly better performance, but
         * requires that we explicitly flush the writes
         */
-       adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev,
-                                                         plat_data->pool_size,
-                                                         &adev->dma_desc_pool,
-                                                         GFP_KERNEL);
+       adev->dma_desc_pool_virt = dma_alloc_wc(&pdev->dev,
+                                               plat_data->pool_size,
+                                               &adev->dma_desc_pool,
+                                               GFP_KERNEL);
        if (!adev->dma_desc_pool_virt) {
                ret = -ENOMEM;
                goto err_free_adev;
index 14091f878f80a5ca7cd0843e70284d890ad49f0c..3922a5d5680604f831308dc02a50b93685a089ce 100644 (file)
@@ -964,8 +964,8 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
         * requires that we explicitly flush the writes
         */
        mv_chan->dma_desc_pool_virt =
-         dma_alloc_writecombine(&pdev->dev, MV_XOR_POOL_SIZE,
-                                &mv_chan->dma_desc_pool, GFP_KERNEL);
+         dma_alloc_wc(&pdev->dev, MV_XOR_POOL_SIZE, &mv_chan->dma_desc_pool,
+                      GFP_KERNEL);
        if (!mv_chan->dma_desc_pool_virt)
                return ERR_PTR(-ENOMEM);
 
index 5a250cdc83769f0d4aaff65530c76b705a474e25..d34aef7a101b669994e2a92b88e448437f0bb6f8 100644 (file)
@@ -502,8 +502,8 @@ static int bam_alloc_chan(struct dma_chan *chan)
                return 0;
 
        /* allocate FIFO descriptor space, but only if necessary */
-       bchan->fifo_virt = dma_alloc_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
-                               &bchan->fifo_phys, GFP_KERNEL);
+       bchan->fifo_virt = dma_alloc_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+                                       &bchan->fifo_phys, GFP_KERNEL);
 
        if (!bchan->fifo_virt) {
                dev_err(bdev->dev, "Failed to allocate desc fifo\n");
@@ -538,8 +538,8 @@ static void bam_free_chan(struct dma_chan *chan)
        bam_reset_channel(bchan);
        spin_unlock_irqrestore(&bchan->vc.lock, flags);
 
-       dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
-                               bchan->fifo_phys);
+       dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
+                   bchan->fifo_phys);
        bchan->fifo_virt = NULL;
 
        /* mask irq for pipe/channel */
@@ -1231,9 +1231,9 @@ static int bam_dma_remove(struct platform_device *pdev)
                bam_dma_terminate_all(&bdev->channels[i].vc.chan);
                tasklet_kill(&bdev->channels[i].vc.task);
 
-               dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
-                       bdev->channels[i].fifo_virt,
-                       bdev->channels[i].fifo_phys);
+               dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+                           bdev->channels[i].fifo_virt,
+                           bdev->channels[i].fifo_phys);
        }
 
        tasklet_kill(&bdev->task);
index e3a945ce374b8318526102852b28535948284ae7..49768c08ac0734345007a1faaae7eef70cd34f60 100644 (file)
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
        "Status Register File",
 };
 
+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+       "Load queue parity",
+       "Store queue parity",
+       "Miss address buffer payload parity",
+       "L1 TLB parity",
+       "",                                             /* reserved */
+       "DC tag error type 6",
+       "DC tag error type 1",
+       "Internal error type 1",
+       "Internal error type 2",
+       "Sys Read data error thread 0",
+       "Sys read data error thread 1",
+       "DC tag error type 2",
+       "DC data error type 1 (poison consumption)",
+       "DC data error type 2",
+       "DC data error type 3",
+       "DC tag error type 4",
+       "L2 TLB parity",
+       "PDC parity error",
+       "DC tag error type 3",
+       "DC tag error type 5",
+       "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+       "microtag probe port parity error",
+       "IC microtag or full tag multi-hit error",
+       "IC full tag parity",
+       "IC data array parity",
+       "Decoupling queue phys addr parity error",
+       "L0 ITLB parity error",
+       "L1 ITLB parity error",
+       "L2 ITLB parity error",
+       "BPQ snoop parity on Thread 0",
+       "BPQ snoop parity on Thread 1",
+       "L1 BTB multi-match error",
+       "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+       "L2M tag multi-way-hit error",
+       "L2M tag ECC error",
+       "L2M data ECC error",
+       "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+       "uop cache tag parity error",
+       "uop cache data parity error",
+       "Insn buffer parity error",
+       "Insn dispatch queue parity error",
+       "Fetch address FIFO parity",
+       "Patch RAM data parity",
+       "Patch RAM sequencer parity",
+       "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+       "Watchdog timeout error",
+       "Phy register file parity",
+       "Flag register file parity",
+       "Immediate displacement register file parity",
+       "Address generator payload parity",
+       "EX payload parity",
+       "Checkpoint queue parity",
+       "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+       "Physical register file parity",
+       "Freelist parity error",
+       "Schedule queue parity",
+       "NSQ parity error",
+       "Retire queue parity",
+       "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+       "Shadow tag macro ECC error",
+       "Shadow tag macro multi-way-hit error",
+       "L3M tag ECC error",
+       "L3M tag multi-way-hit error",
+       "L3M data ECC error",
+       "XI parity, L3 fill done channel error",
+       "L3 victim queue parity",
+       "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+       "Illegal request from transport layer",
+       "Address violation",
+       "Security violation",
+       "Illegal response from transport layer",
+       "Unexpected response",
+       "Parity error on incoming request or probe response data",
+       "Parity error on incoming read response data",
+       "Atomic request parity",
+       "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+       "HW assert",
+       "Internal PIE register security violation",
+       "Error on GMI link",
+       "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+       "DRAM ECC error",
+       "Data poison error on DRAM",
+       "SDP parity error",
+       "Advanced peripheral bus error",
+       "Command/address parity error",
+       "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+       "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+       "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+       "SMU RAM ECC or parity error",
+};
+
 static bool f12h_mc0_mce(u16 ec, u8 xec)
 {
        bool ret = false;
@@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m)
        pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 }
 
+static void decode_f17h_core_errors(const char *ip_name, u8 xec,
+                                  unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       size_t len;
+
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       switch (mca_type) {
+       case SMCA_LS:
+               error_desc_array = f17h_ls_mce_desc;
+               len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+               if (xec == 0x4) {
+                       pr_cont("Unrecognized LS MCA error code.\n");
+                       return;
+               }
+               break;
+
+       case SMCA_IF:
+               error_desc_array = f17h_if_mce_desc;
+               len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+               break;
+
+       case SMCA_L2_CACHE:
+               error_desc_array = f17h_l2_mce_desc;
+               len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+               break;
+
+       case SMCA_DE:
+               error_desc_array = f17h_de_mce_desc;
+               len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+               break;
+
+       case SMCA_EX:
+               error_desc_array = f17h_ex_mce_desc;
+               len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+               break;
+
+       case SMCA_FP:
+               error_desc_array = f17h_fp_mce_desc;
+               len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+               break;
+
+       case SMCA_L3_CACHE:
+               error_desc_array = f17h_l3_mce_desc;
+               len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Corrupted MCA core error info.\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n",
+                        amd_core_mcablock_names[mca_type]);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       size_t len;
+
+       pr_emerg(HW_ERR "Data Fabric Error: ");
+
+       switch (mca_type) {
+       case  SMCA_CS:
+               error_desc_array = f17h_cs_mce_desc;
+               len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+               break;
+
+       case SMCA_PIE:
+               error_desc_array = f17h_pie_mce_desc;
+               len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Corrupted MCA Data Fabric info.\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n",
+                        amd_df_mcablock_names[mca_type]);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+       u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+       unsigned int hwid, mca_type, i;
+       u8 xec = XEC(m->status, xec_mask);
+       const char * const *error_desc_array;
+       const char *ip_name;
+       u32 low, high;
+       size_t len;
+
+       if (rdmsr_safe(addr, &low, &high)) {
+               pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+               return;
+       }
+
+       hwid = high & MCI_IPID_HWID;
+       mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+       pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+       /*
+        * Based on hwid and mca_type values, decode errors from respective IPs.
+        * Note: mca_type values make sense only in the context of an hwid.
+        */
+       for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
+               if (amd_hwids[i].hwid == hwid)
+                       break;
+
+       switch (i) {
+       case SMCA_F17H_CORE:
+               ip_name = (mca_type == SMCA_L3_CACHE) ?
+                         "L3 Cache" : "F17h Core";
+               return decode_f17h_core_errors(ip_name, xec, mca_type);
+               break;
+
+       case SMCA_DF:
+               return decode_df_errors(xec, mca_type);
+               break;
+
+       case SMCA_UMC:
+               error_desc_array = f17h_umc_mce_desc;
+               len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+               break;
+
+       case SMCA_PB:
+               error_desc_array = f17h_pb_mce_desc;
+               len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+               break;
+
+       case SMCA_PSP:
+               error_desc_array = f17h_psp_mce_desc;
+               len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+               break;
+
+       case SMCA_SMU:
+               error_desc_array = f17h_smu_mce_desc;
+               len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+               break;
+
+       default:
+               pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
+               return;
+       }
+
+       ip_name = amd_hwids[i].name;
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
 static inline void amd_decode_err_code(u16 ec)
 {
        if (INT_ERROR(ec)) {
@@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        struct mce *m = (struct mce *)data;
        struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
        int ecc;
+       u32 ebx = cpuid_ebx(0x80000007);
 
        if (amd_filter_mce(m))
                return NOTIFY_STOP;
@@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
                ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
                ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
 
-       if (c->x86 == 0x15 || c->x86 == 0x16)
+       if (c->x86 >= 0x15)
                pr_cont("|%s|%s",
                        ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
                        ((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
 
+       if (!!(ebx & BIT(3))) {
+               u32 low, high;
+               u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+               if (!rdmsr_safe(addr, &low, &high) &&
+                   (low & MCI_CONFIG_MCAX))
+                       pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+       }
+
        /* do the two bits[14:13] together */
        ecc = (m->status >> 45) & 0x3;
        if (ecc)
@@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        if (m->status & MCI_STATUS_ADDRV)
                pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
 
+       if (!!(ebx & BIT(3))) {
+               decode_smca_errors(m);
+               goto err_code;
+       }
+
        if (!fam_ops)
                goto err_code;
 
@@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = {
 static int __init mce_amd_init(void)
 {
        struct cpuinfo_x86 *c = &boot_cpu_data;
+       u32 ebx;
 
        if (c->x86_vendor != X86_VENDOR_AMD)
                return -ENODEV;
@@ -888,10 +1204,18 @@ static int __init mce_amd_init(void)
                fam_ops->mc2_mce = f16h_mc2_mce;
                break;
 
+       case 0x17:
+               ebx = cpuid_ebx(0x80000007);
+               xec_mask = 0x3f;
+               if (!(ebx & BIT(3))) {
+                       printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
+                       goto err_out;
+               }
+               break;
+
        default:
                printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
-               kfree(fam_ops);
-               fam_ops = NULL;
+               goto err_out;
        }
 
        pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1223,11 @@ static int __init mce_amd_init(void)
        mce_register_decode_chain(&amd_mce_dec_nb);
 
        return 0;
+
+err_out:
+       kfree(fam_ops);
+       fam_ops = NULL;
+       return -EINVAL;
 }
 early_initcall(mce_amd_init);
 
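
The Scalable MCA decoding added above is, at its core, a family of bounded table lookups: pick the description array for the reporting IP block, check the extended error code (XEC) against the table size, and print the matching string. A minimal standalone sketch of that lookup, using a shortened copy of one description table purely as example data:

#include <stddef.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Example strings, mirroring the shape of the f17h_*_mce_desc[] tables. */
static const char * const example_umc_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
};

/* Bound-checked lookup of an extended error code in a description table. */
static void decode(const char * const *desc, size_t len, unsigned int xec)
{
	if (xec > len) {
		printf("Unrecognized error code 0x%x\n", xec);
		return;
	}
	printf("%s.\n", desc[xec]);
}

int main(void)
{
	decode(example_umc_desc, ARRAY_SIZE(example_umc_desc) - 1, 1);
	decode(example_umc_desc, ARRAY_SIZE(example_umc_desc) - 1, 9);
	return 0;
}
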
index e438ee5b433f3f498cba20df890730a3fc77220c..93f0d4120289fa92d01a6873fac6d2efd37337e7 100644 (file)
@@ -1574,7 +1574,7 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes)
                                for (cha = 0; cha < KNL_MAX_CHAS; cha++) {
                                        if (knl_get_mc_route(target,
                                                mc_route_reg[cha]) == channel
-                                               && participants[channel]) {
+                                               && !participants[channel]) {
                                                participant_count++;
                                                participants[channel] = 1;
                                                break;
@@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
                edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n",
                         n_tads, gb, (mb*1000)/1024,
                         ((u64)tmp_mb) << 20L,
-                        (u32)TAD_SOCK(reg),
-                        (u32)TAD_CH(reg),
+                        (u32)(1 << TAD_SOCK(reg)),
+                        (u32)TAD_CH(reg) + 1,
                         (u32)TAD_TGT0(reg),
                         (u32)TAD_TGT1(reg),
                         (u32)TAD_TGT2(reg),
@@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
        }
 
        ch_way = TAD_CH(reg) + 1;
-       sck_way = TAD_SOCK(reg) + 1;
+       sck_way = 1 << TAD_SOCK(reg);
 
        if (ch_way == 3)
                idx = addr >> 6;
@@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                 n_tads,
                 addr,
                 limit,
-                (u32)TAD_SOCK(reg),
+                sck_way,
                 ch_way,
                 offset,
                 idx,
@@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                        offset, addr);
                return -EINVAL;
        }
-       addr -= offset;
-       /* Store the low bits [0:6] of the addr */
-       ch_addr = addr & 0x7f;
-       /* Remove socket wayness and remove 6 bits */
-       addr >>= 6;
-       addr = div_u64(addr, sck_xch);
-#if 0
-       /* Divide by channel way */
-       addr = addr / ch_way;
-#endif
-       /* Recover the last 6 bits */
-       ch_addr |= addr << 6;
+
+       ch_addr = addr - offset;
+       ch_addr >>= (6 + shiftup);
+       ch_addr /= ch_way * sck_way;
+       ch_addr <<= (6 + shiftup);
+       ch_addr |= addr & ((1 << (6 + shiftup)) - 1);
 
        /*
         * Step 3) Decode rank
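
The rewritten ch_addr computation above replaces the old divide-and-reassemble sequence: subtract the TAD offset, drop the low (6 + shiftup) bits, divide by ch_way * sck_way to undo the channel and socket interleaving (socket interleave is now a power of two, hence the 1 << TAD_SOCK() changes), shift back, and restore the low bits from the original address. A standalone sketch of that arithmetic with made-up example values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Model of the channel-address computation: strip the interleaving across
 * ch_way channels and sck_way sockets from a system address while keeping
 * the low (6 + shiftup) bits untouched.
 */
static uint64_t channel_addr(uint64_t addr, uint64_t offset,
			     unsigned int ch_way, unsigned int sck_way,
			     unsigned int shiftup)
{
	uint64_t low_mask = (UINT64_C(1) << (6 + shiftup)) - 1;
	uint64_t ch_addr = addr - offset;

	ch_addr >>= 6 + shiftup;      /* drop the untouched low bits      */
	ch_addr /= ch_way * sck_way;  /* undo channel/socket interleaving */
	ch_addr <<= 6 + shiftup;      /* put the remaining bits back      */
	ch_addr |= addr & low_mask;   /* restore the original low bits    */

	return ch_addr;
}

int main(void)
{
	/* Example: 2-way channel and 2-way socket interleave, no shiftup. */
	uint64_t ca = channel_addr(0x12345678ULL, 0x1000ULL, 2, 2, 0);

	printf("channel address = 0x%" PRIx64 "\n", ca);
	return 0;
}
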
index 8297bc319369d6e4e4dd1579195ca1b6161a57d1..1846d65b72859284b31c536dc2fdc1a29cae0d9e 100644 (file)
@@ -96,7 +96,7 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
         * In practice this won't execute very often unless on very fast
         * machines because the time window for this to happen is very small.
         */
-       while (amdgpuCrtc->enabled && repcnt--) {
+       while (amdgpuCrtc->enabled && --repcnt) {
                /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
                 * start in hpos, and to the "fudged earlier" vblank start in
                 * vpos.
@@ -112,13 +112,13 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
                        break;
 
                /* Sleep at least until estimated real start of hw vblank */
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5);
                if (min_udelay > vblank->framedur_ns / 2000) {
                        /* Don't wait ridiculously long - something is wrong */
                        repcnt = 0;
                        break;
                }
+               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                usleep_range(min_udelay, 2 * min_udelay);
                spin_lock_irqsave(&crtc->dev->event_lock, flags);
        };
index 21aacc1f45c1fd7afded7f6088c8a365e9005224..bf731e9f643e9ecddd367034179fae6a40d294ce 100644 (file)
@@ -265,15 +265,27 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector
        unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
        unsigned lane_num, i, max_pix_clock;
 
-       for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
-               for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
-                       max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+       if (amdgpu_connector_encoder_get_dp_bridge_encoder_id(connector) ==
+           ENCODER_OBJECT_ID_NUTMEG) {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       max_pix_clock = (lane_num * 270000 * 8) / bpp;
                        if (max_pix_clock >= pix_clock) {
                                *dp_lanes = lane_num;
-                               *dp_rate = link_rates[i];
+                               *dp_rate = 270000;
                                return 0;
                        }
                }
+       } else {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+                               max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+                               if (max_pix_clock >= pix_clock) {
+                                       *dp_lanes = lane_num;
+                                       *dp_rate = link_rates[i];
+                                       return 0;
+                               }
+                       }
+               }
        }
 
        return -EINVAL;
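
The change above special-cases DP-to-LVDS bridge chips (NUTMEG), which are limited to the single 270000 link-rate value hard-coded in the new branch; for everything else the existing search is kept: walk the lane counts (1, 2, 4) and the supported link rates and return the first pair whose bandwidth estimate, lane_num * rate * 8 / bpp, covers the requested pixel clock. A minimal standalone sketch of that non-bridge search; the rate table matches the shape of the driver's, but the example pixel clock and units are illustrative assumptions:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Candidate DP link rates, lowest first, as in the drivers above. */
static const unsigned int link_rates[] = { 162000, 270000, 540000 };

/*
 * Return the first (lane count, link rate) pair whose bandwidth estimate
 * lane_num * rate * 8 / bpp reaches the requested pixel clock, or -1 if
 * no combination within the sink's limits is sufficient.
 */
static int pick_link_config(unsigned int pix_clock, unsigned int bpp,
			    unsigned int max_lanes, unsigned int max_rate,
			    unsigned int *lanes, unsigned int *rate)
{
	unsigned int lane_num, i, max_pix_clock;

	for (lane_num = 1; lane_num <= max_lanes; lane_num <<= 1) {
		for (i = 0; i < ARRAY_SIZE(link_rates) &&
			    link_rates[i] <= max_rate; i++) {
			max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
			if (max_pix_clock >= pix_clock) {
				*lanes = lane_num;
				*rate = link_rates[i];
				return 0;
			}
		}
	}
	return -1;
}

int main(void)
{
	unsigned int lanes, rate;

	/* Example request: a 1080p-class pixel clock at 24 bpp. */
	if (!pick_link_config(148500, 24, 4, 540000, &lanes, &rate))
		printf("%u lane(s) at link rate %u\n", lanes, rate);
	return 0;
}
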
index e5df53b6e229ff0348541eeaa921acc6cf4700ef..1f500a1b99695a6bb83b3d596b9a2101f6f1f8ef 100644 (file)
@@ -109,8 +109,8 @@ struct drm_gem_cma_object *drm_gem_cma_create(struct drm_device *drm,
        if (IS_ERR(cma_obj))
                return cma_obj;
 
-       cma_obj->vaddr = dma_alloc_writecombine(drm->dev, size,
-                       &cma_obj->paddr, GFP_KERNEL | __GFP_NOWARN);
+       cma_obj->vaddr = dma_alloc_wc(drm->dev, size, &cma_obj->paddr,
+                                     GFP_KERNEL | __GFP_NOWARN);
        if (!cma_obj->vaddr) {
                dev_err(drm->dev, "failed to allocate buffer with size %zu\n",
                        size);
@@ -192,8 +192,8 @@ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj)
        cma_obj = to_drm_gem_cma_obj(gem_obj);
 
        if (cma_obj->vaddr) {
-               dma_free_writecombine(gem_obj->dev->dev, cma_obj->base.size,
-                                     cma_obj->vaddr, cma_obj->paddr);
+               dma_free_wc(gem_obj->dev->dev, cma_obj->base.size,
+                           cma_obj->vaddr, cma_obj->paddr);
        } else if (gem_obj->import_attach) {
                drm_prime_gem_destroy(gem_obj, cma_obj->sgt);
        }
@@ -324,9 +324,8 @@ static int drm_gem_cma_mmap_obj(struct drm_gem_cma_object *cma_obj,
        vma->vm_flags &= ~VM_PFNMAP;
        vma->vm_pgoff = 0;
 
-       ret = dma_mmap_writecombine(cma_obj->base.dev->dev, vma,
-                                   cma_obj->vaddr, cma_obj->paddr,
-                                   vma->vm_end - vma->vm_start);
+       ret = dma_mmap_wc(cma_obj->base.dev->dev, vma, cma_obj->vaddr,
+                         cma_obj->paddr, vma->vm_end - vma->vm_start);
        if (ret)
                drm_gem_vm_close(vma);
 
index a33162cf4f4cc6c4f0aee66433680a15f64cfffc..3c1ce44483d991416fed77f8945dc6686f3a3341 100644 (file)
@@ -1113,8 +1113,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
        if (!cmdbuf)
                return NULL;
 
-       cmdbuf->vaddr = dma_alloc_writecombine(gpu->dev, size, &cmdbuf->paddr,
-                                              GFP_KERNEL);
+       cmdbuf->vaddr = dma_alloc_wc(gpu->dev, size, &cmdbuf->paddr,
+                                    GFP_KERNEL);
        if (!cmdbuf->vaddr) {
                kfree(cmdbuf);
                return NULL;
@@ -1128,8 +1128,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
 
 void etnaviv_gpu_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf)
 {
-       dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size,
-                             cmdbuf->vaddr, cmdbuf->paddr);
+       dma_free_wc(cmdbuf->gpu->dev, cmdbuf->size, cmdbuf->vaddr,
+                   cmdbuf->paddr);
        kfree(cmdbuf);
 }
 
index 34e38749a8176f87001e54cb4ea8a7cc86fa744e..f8ee740c0e264d488b937503745ac9de06bbc9b1 100644 (file)
@@ -1382,8 +1382,16 @@ static void tda998x_connector_destroy(struct drm_connector *connector)
        drm_connector_cleanup(connector);
 }
 
+static int tda998x_connector_dpms(struct drm_connector *connector, int mode)
+{
+       if (drm_core_check_feature(connector->dev, DRIVER_ATOMIC))
+               return drm_atomic_helper_connector_dpms(connector, mode);
+       else
+               return drm_helper_connector_dpms(connector, mode);
+}
+
 static const struct drm_connector_funcs tda998x_connector_funcs = {
-       .dpms = drm_atomic_helper_connector_dpms,
+       .dpms = tda998x_connector_dpms,
        .reset = drm_atomic_helper_connector_reset,
        .fill_modes = drm_helper_probe_single_connector_modes,
        .detect = tda998x_connector_detect,
index 31f6d212fb1bbbfa1ce64efe8cebde2a5f9f575d..30f921421b0c944217832ba86856a6904f8fef11 100644 (file)
@@ -527,6 +527,8 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder)
 
        mutex_lock(&dev_priv->av_mutex);
        intel_dig_port->audio_connector = connector;
+       /* referred to in audio callbacks */
+       dev_priv->dig_port_map[port] = intel_encoder;
        mutex_unlock(&dev_priv->av_mutex);
 
        if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify)
@@ -554,6 +556,7 @@ void intel_audio_codec_disable(struct intel_encoder *intel_encoder)
 
        mutex_lock(&dev_priv->av_mutex);
        intel_dig_port->audio_connector = NULL;
+       dev_priv->dig_port_map[port] = NULL;
        mutex_unlock(&dev_priv->av_mutex);
 
        if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify)
index 0f3df2c39f7cdd3a69d8f0044cbd725e3b9bac6d..084d5586585d012891423067703fde0f9daec0b1 100644 (file)
@@ -3358,7 +3358,6 @@ void intel_ddi_init(struct drm_device *dev, enum port port)
        intel_encoder->get_config = intel_ddi_get_config;
 
        intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
        intel_dig_port->saved_port_bits = I915_READ(DDI_BUF_CTL(port)) &
                                          (DDI_BUF_PORT_REVERSAL |
                                           DDI_A_4_LANES);
index 1d8de43bed56839fd1fee3a75b6d61b6221c7c0a..cdc2c15873dcc71189ed3263aad55e3c63493ed3 100644 (file)
@@ -6045,7 +6045,6 @@ intel_dp_init(struct drm_device *dev,
        }
 
        intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
        intel_dig_port->dp.output_reg = output_reg;
 
        intel_encoder->type = INTEL_OUTPUT_DISPLAYPORT;
index cb5d1b15755c3b19a496105c7a8df99982e83cb9..616108c4bc3e5741f59c9a1066aa2df3b63d5fc5 100644 (file)
@@ -2154,7 +2154,6 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port,
 void intel_hdmi_init(struct drm_device *dev,
                     i915_reg_t hdmi_reg, enum port port)
 {
-       struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_digital_port *intel_dig_port;
        struct intel_encoder *intel_encoder;
        struct intel_connector *intel_connector;
@@ -2223,7 +2222,6 @@ void intel_hdmi_init(struct drm_device *dev,
                intel_encoder->cloneable |= 1 << INTEL_OUTPUT_HDMI;
 
        intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
        intel_dig_port->hdmi.hdmi_reg = hdmi_reg;
        intel_dig_port->dp.output_reg = INVALID_MMIO_REG;
 
index deb8282c26d83f952473ae145c4fef0b3112b9f1..52fbe530fc9eac207093271a05d44dbf3515e11c 100644 (file)
@@ -664,6 +664,12 @@ int intel_setup_gmbus(struct drm_device *dev)
 
                bus->adapter.algo = &gmbus_algorithm;
 
+               /*
+                * We wish to retry with bit banging
+                * after a timed out GMBUS attempt.
+                */
+               bus->adapter.retries = 1;
+
                /* By default use a conservative clock rate */
                bus->reg0 = pin | GMBUS_RATE_100KHZ;
 
index 30a57185bdb4e41e7ff26a0d4af36577a218b0ea..287226311413c7036cecb02e973e6109d0b26548 100644 (file)
@@ -64,6 +64,7 @@ static void ipu_fb_enable(struct ipu_crtc *ipu_crtc)
        /* Start DC channel and DI after IDMAC */
        ipu_dc_enable_channel(ipu_crtc->dc);
        ipu_di_enable(ipu_crtc->di);
+       drm_crtc_vblank_on(&ipu_crtc->base);
 
        ipu_crtc->enabled = 1;
 }
@@ -80,6 +81,7 @@ static void ipu_fb_disable(struct ipu_crtc *ipu_crtc)
        ipu_di_disable(ipu_crtc->di);
        ipu_plane_disable(ipu_crtc->plane[0]);
        ipu_dc_disable(ipu);
+       drm_crtc_vblank_off(&ipu_crtc->base);
 
        ipu_crtc->enabled = 0;
 }
index 591ba2f1ae03674224b4719660d547ad4e64c0b1..26bb1b626fe3d817812e18e19e0bfb64a4247c3c 100644 (file)
@@ -42,6 +42,7 @@ static const uint32_t ipu_plane_formats[] = {
        DRM_FORMAT_YVYU,
        DRM_FORMAT_YUV420,
        DRM_FORMAT_YVU420,
+       DRM_FORMAT_RGB565,
 };
 
 int ipu_plane_irq(struct ipu_plane *ipu_plane)
index dfebdc4aa0f24ac00f47ab3062d1e24e9b455648..85dfe3674b4138ef02e1b3f043699459d9decb25 100644 (file)
@@ -573,10 +573,9 @@ static int omap_dmm_remove(struct platform_device *dev)
 
                kfree(omap_dmm->engines);
                if (omap_dmm->refill_va)
-                       dma_free_writecombine(omap_dmm->dev,
-                               REFILL_BUFFER_SIZE * omap_dmm->num_engines,
-                               omap_dmm->refill_va,
-                               omap_dmm->refill_pa);
+                       dma_free_wc(omap_dmm->dev,
+                                   REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+                                   omap_dmm->refill_va, omap_dmm->refill_pa);
                if (omap_dmm->dummy_page)
                        __free_page(omap_dmm->dummy_page);
 
@@ -701,9 +700,9 @@ static int omap_dmm_probe(struct platform_device *dev)
        omap_dmm->dummy_pa = page_to_phys(omap_dmm->dummy_page);
 
        /* alloc refill memory */
-       omap_dmm->refill_va = dma_alloc_writecombine(&dev->dev,
-                               REFILL_BUFFER_SIZE * omap_dmm->num_engines,
-                               &omap_dmm->refill_pa, GFP_KERNEL);
+       omap_dmm->refill_va = dma_alloc_wc(&dev->dev,
+                                          REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+                                          &omap_dmm->refill_pa, GFP_KERNEL);
        if (!omap_dmm->refill_va) {
                dev_err(&dev->dev, "could not allocate refill memory\n");
                goto fail;
index 8495a1a4b61745524dd04045616d859b2ac566bb..359b0d7e8ef78faf6cd9d6e044cfb5088f99f0be 100644 (file)
@@ -1330,8 +1330,8 @@ void omap_gem_free_object(struct drm_gem_object *obj)
                        omap_gem_detach_pages(obj);
 
                if (!is_shmem(obj)) {
-                       dma_free_writecombine(dev->dev, obj->size,
-                                       omap_obj->vaddr, omap_obj->paddr);
+                       dma_free_wc(dev->dev, obj->size, omap_obj->vaddr,
+                                   omap_obj->paddr);
                } else if (omap_obj->vaddr) {
                        vunmap(omap_obj->vaddr);
                }
@@ -1395,8 +1395,8 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev,
                /* attempt to allocate contiguous memory if we don't
                 * have DMM for remapping discontiguous buffers
                 */
-               omap_obj->vaddr =  dma_alloc_writecombine(dev->dev, size,
-                               &omap_obj->paddr, GFP_KERNEL);
+               omap_obj->vaddr =  dma_alloc_wc(dev->dev, size,
+                                               &omap_obj->paddr, GFP_KERNEL);
                if (!omap_obj->vaddr) {
                        kfree(omap_obj);
 
index 44ee72e04df9e953bafe64cdfdadf2f01c1f9bce..6af832545bc5b76b44e836553e72005f01012bbe 100644 (file)
@@ -315,15 +315,27 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector,
        unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
        unsigned lane_num, i, max_pix_clock;
 
-       for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
-               for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
-                       max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+       if (radeon_connector_encoder_get_dp_bridge_encoder_id(connector) ==
+           ENCODER_OBJECT_ID_NUTMEG) {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       max_pix_clock = (lane_num * 270000 * 8) / bpp;
                        if (max_pix_clock >= pix_clock) {
                                *dp_lanes = lane_num;
-                               *dp_rate = link_rates[i];
+                               *dp_rate = 270000;
                                return 0;
                        }
                }
+       } else {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+                               max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+                               if (max_pix_clock >= pix_clock) {
+                                       *dp_lanes = lane_num;
+                                       *dp_rate = link_rates[i];
+                                       return 0;
+                               }
+                       }
+               }
        }
 
        return -EINVAL;
index 902b59cebac584b075639a2b7d34f031907cc3d1..4197ca1bb1e4d3f5d1455cc7c282787778660e9c 100644 (file)
@@ -1744,7 +1744,6 @@ int radeon_resume_kms(struct drm_device *dev, bool resume, bool fbcon)
        }
 
        drm_kms_helper_poll_enable(dev);
-       drm_helper_hpd_irq_event(dev);
 
        /* set the power state here in case we are a PX system or headless */
        if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled)
index 2b9ba03a7c1a84bc00564fe0f6391771977c2e7b..2d9196a447fdc94a49140dcad365b6d3957d43f6 100644 (file)
@@ -455,7 +455,7 @@ static void radeon_flip_work_func(struct work_struct *__work)
         * In practice this won't execute very often unless on very fast
         * machines because the time window for this to happen is very small.
         */
-       while (radeon_crtc->enabled && repcnt--) {
+       while (radeon_crtc->enabled && --repcnt) {
                /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
                 * start in hpos, and to the "fudged earlier" vblank start in
                 * vpos.
@@ -471,13 +471,13 @@ static void radeon_flip_work_func(struct work_struct *__work)
                        break;
 
                /* Sleep at least until estimated real start of hw vblank */
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5);
                if (min_udelay > vblank->framedur_ns / 2000) {
                        /* Don't wait ridiculously long - something is wrong */
                        repcnt = 0;
                        break;
                }
+               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                usleep_range(min_udelay, 2 * min_udelay);
                spin_lock_irqsave(&crtc->dev->event_lock, flags);
        };
index 0f14d897baf9b32974a97eaa59153363e3d8c97b..7a98823bacd1cd5da33c9e11dc7f798ae1a866d9 100644 (file)
@@ -1079,6 +1079,8 @@ force:
 
        /* update display watermarks based on new power state */
        radeon_bandwidth_update(rdev);
+       /* update displays */
+       radeon_dpm_display_configuration_changed(rdev);
 
        /* wait for the rings to drain */
        for (i = 0; i < RADEON_NUM_RINGS; i++) {
@@ -1095,9 +1097,6 @@ force:
 
        radeon_dpm_post_set_power_state(rdev);
 
-       /* update displays */
-       radeon_dpm_display_configuration_changed(rdev);
-
        rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs;
        rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count;
        rdev->pm.dpm.single_display = single_display;
index 807863106b8da813949383f291830c60d087d4e0..bd736ace3f8147b25c3362cb2e46aeeef2ef87b8 100644 (file)
@@ -157,17 +157,15 @@ static void sti_cursor_atomic_update(struct drm_plane *drm_plane,
                cursor->height = src_h;
 
                if (cursor->pixmap.base)
-                       dma_free_writecombine(cursor->dev,
-                                             cursor->pixmap.size,
-                                             cursor->pixmap.base,
-                                             cursor->pixmap.paddr);
+                       dma_free_wc(cursor->dev, cursor->pixmap.size,
+                                   cursor->pixmap.base, cursor->pixmap.paddr);
 
                cursor->pixmap.size = cursor->width * cursor->height;
 
-               cursor->pixmap.base = dma_alloc_writecombine(cursor->dev,
-                                                       cursor->pixmap.size,
-                                                       &cursor->pixmap.paddr,
-                                                       GFP_KERNEL | GFP_DMA);
+               cursor->pixmap.base = dma_alloc_wc(cursor->dev,
+                                                  cursor->pixmap.size,
+                                                  &cursor->pixmap.paddr,
+                                                  GFP_KERNEL | GFP_DMA);
                if (!cursor->pixmap.base) {
                        DRM_ERROR("Failed to allocate memory for pixmap\n");
                        return;
@@ -252,8 +250,8 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
 
        /* Allocate clut buffer */
        size = 0x100 * sizeof(unsigned short);
-       cursor->clut = dma_alloc_writecombine(dev, size, &cursor->clut_paddr,
-                                             GFP_KERNEL | GFP_DMA);
+       cursor->clut = dma_alloc_wc(dev, size, &cursor->clut_paddr,
+                                   GFP_KERNEL | GFP_DMA);
 
        if (!cursor->clut) {
                DRM_ERROR("Failed to allocate memory for cursor clut\n");
@@ -286,7 +284,7 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
        return &cursor->plane.drm_plane;
 
 err_plane:
-       dma_free_writecombine(dev, size, cursor->clut, cursor->clut_paddr);
+       dma_free_wc(dev, size, cursor->clut, cursor->clut_paddr);
 err_clut:
        devm_kfree(dev, cursor);
        return NULL;
index f9a1d92c9d9519c3ddbeefe33277f26af4f37a32..514551c857bbda87b8fcbbca215bdc17c7bdfe67 100644 (file)
@@ -312,8 +312,7 @@ static void sti_gdp_init(struct sti_gdp *gdp)
        /* Allocate all the nodes within a single memory page */
        size = sizeof(struct sti_gdp_node) *
            GDP_NODE_PER_FIELD * GDP_NODE_NB_BANK;
-       base = dma_alloc_writecombine(gdp->dev,
-                                     size, &dma_addr, GFP_KERNEL | GFP_DMA);
+       base = dma_alloc_wc(gdp->dev, size, &dma_addr, GFP_KERNEL | GFP_DMA);
 
        if (!base) {
                DRM_ERROR("Failed to allocate memory for GDP node\n");
index 43861b52261d7e4fee17839351577e5d4b90080f..1d3c3d029603d030ec02809fd14026d6d5df08da 100644 (file)
@@ -617,9 +617,9 @@ static void sti_hqvdp_init(struct sti_hqvdp *hqvdp)
 
        /* Allocate memory for the VDP commands */
        size = NB_VDP_CMD * sizeof(struct sti_hqvdp_cmd);
-       hqvdp->hqvdp_cmd = dma_alloc_writecombine(hqvdp->dev, size,
-                                        &hqvdp->hqvdp_cmd_paddr,
-                                        GFP_KERNEL | GFP_DMA);
+       hqvdp->hqvdp_cmd = dma_alloc_wc(hqvdp->dev, size,
+                                       &hqvdp->hqvdp_cmd_paddr,
+                                       GFP_KERNEL | GFP_DMA);
        if (!hqvdp->hqvdp_cmd) {
                DRM_ERROR("Failed to allocate memory for VDP cmd\n");
                return;
index 33add93b4ed96826a3f7da9bfc5d11d0018eff8b..3b0d8c392b707e93e56a0b391345d58d3a5eef4c 100644 (file)
@@ -175,8 +175,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo)
                sg_free_table(bo->sgt);
                kfree(bo->sgt);
        } else if (bo->vaddr) {
-               dma_free_writecombine(drm->dev, bo->gem.size, bo->vaddr,
-                                     bo->paddr);
+               dma_free_wc(drm->dev, bo->gem.size, bo->vaddr, bo->paddr);
        }
 }
 
@@ -233,8 +232,8 @@ static int tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo)
        } else {
                size_t size = bo->gem.size;
 
-               bo->vaddr = dma_alloc_writecombine(drm->dev, size, &bo->paddr,
-                                                  GFP_KERNEL | __GFP_NOWARN);
+               bo->vaddr = dma_alloc_wc(drm->dev, size, &bo->paddr,
+                                        GFP_KERNEL | __GFP_NOWARN);
                if (!bo->vaddr) {
                        dev_err(drm->dev,
                                "failed to allocate buffer of size %zu\n",
@@ -472,8 +471,8 @@ int tegra_drm_mmap(struct file *file, struct vm_area_struct *vma)
                vma->vm_flags &= ~VM_PFNMAP;
                vma->vm_pgoff = 0;
 
-               ret = dma_mmap_writecombine(gem->dev->dev, vma, bo->vaddr,
-                                           bo->paddr, gem->size);
+               ret = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->paddr,
+                                 gem->size);
                if (ret) {
                        drm_gem_vm_close(vma);
                        return ret;
index 22278bcfc60eac4ed40fac0e31dda840aa4b703e..034ef2de903769f521c50c6a6936a5099e69578f 100644 (file)
@@ -398,9 +398,8 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
        vma->vm_flags &= ~VM_PFNMAP;
        vma->vm_pgoff = 0;
 
-       ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma,
-                                   bo->base.vaddr, bo->base.paddr,
-                                   vma->vm_end - vma->vm_start);
+       ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
+                         bo->base.paddr, vma->vm_end - vma->vm_start);
        if (ret)
                drm_gem_vm_close(vma);
 
index db082bea8dafd023fa6ad19719f7fadf5ebfcad5..c5a1a08b0449004670b79947e68f0ba7fb8afb50 100644 (file)
@@ -563,6 +563,8 @@ static void vmw_sou_connector_destroy(struct drm_connector *connector)
 
 static const struct drm_connector_funcs vmw_sou_connector_funcs = {
        .dpms = vmw_du_connector_dpms,
+       .detect = vmw_du_connector_detect,
+       .fill_modes = vmw_du_connector_fill_modes,
        .set_property = vmw_du_connector_set_property,
        .destroy = vmw_sou_connector_destroy,
 };
index 5a8c8d55317ab4f622c5754f509d5d3660a3e47c..a18db4d5347cefa99847df944be3a3b364de089a 100644 (file)
@@ -52,8 +52,8 @@ static void host1x_pushbuffer_destroy(struct push_buffer *pb)
        struct host1x *host1x = cdma_to_host1x(cdma);
 
        if (pb->phys != 0)
-               dma_free_writecombine(host1x->dev, pb->size_bytes + 4,
-                                     pb->mapped, pb->phys);
+               dma_free_wc(host1x->dev, pb->size_bytes + 4, pb->mapped,
+                           pb->phys);
 
        pb->mapped = NULL;
        pb->phys = 0;
@@ -76,8 +76,8 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
        pb->pos = 0;
 
        /* allocate and map pushbuffer memory */
-       pb->mapped = dma_alloc_writecombine(host1x->dev, pb->size_bytes + 4,
-                                           &pb->phys, GFP_KERNEL);
+       pb->mapped = dma_alloc_wc(host1x->dev, pb->size_bytes + 4, &pb->phys,
+                                 GFP_KERNEL);
        if (!pb->mapped)
                goto fail;
 
index 63bd63f3c7dfd2da2fd3d9ae58bba1473bceb70b..defa7995f2131a06ddc88e09d13d914f8a244ea1 100644 (file)
@@ -467,9 +467,8 @@ static inline int copy_gathers(struct host1x_job *job, struct device *dev)
                size += g->words * sizeof(u32);
        }
 
-       job->gather_copy_mapped = dma_alloc_writecombine(dev, size,
-                                                        &job->gather_copy,
-                                                        GFP_KERNEL);
+       job->gather_copy_mapped = dma_alloc_wc(dev, size, &job->gather_copy,
+                                              GFP_KERNEL);
        if (!job->gather_copy_mapped) {
                job->gather_copy_mapped = NULL;
                return -ENOMEM;
@@ -578,9 +577,8 @@ void host1x_job_unpin(struct host1x_job *job)
        job->num_unpins = 0;
 
        if (job->gather_copy_size)
-               dma_free_writecombine(job->channel->dev, job->gather_copy_size,
-                                     job->gather_copy_mapped,
-                                     job->gather_copy);
+               dma_free_wc(job->channel->dev, job->gather_copy_size,
+                           job->gather_copy_mapped, job->gather_copy);
 }
 EXPORT_SYMBOL(host1x_job_unpin);
 
index f2e13eb8339ffc1287110e247e0e54e922b7d72a..e00db3f510dd425c62565d913c937c5638a072d5 100644 (file)
@@ -1050,6 +1050,17 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
        for (i = 0; i < ARRAY_SIZE(client_reg); i++) {
                const struct ipu_platform_reg *reg = &client_reg[i];
                struct platform_device *pdev;
+               struct device_node *of_node;
+
+               /* Associate subdevice with the corresponding port node */
+               of_node = of_graph_get_port_by_id(dev->of_node, i);
+               if (!of_node) {
+                       dev_info(dev,
+                                "no port@%d node in %s, not using %s%d\n",
+                                i, dev->of_node->full_name,
+                                (i / 2) ? "DI" : "CSI", i % 2);
+                       continue;
+               }
 
                pdev = platform_device_alloc(reg->name, id++);
                if (!pdev) {
@@ -1057,17 +1068,9 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
                        goto err_register;
                }
 
+               pdev->dev.of_node = of_node;
                pdev->dev.parent = dev;
 
-               /* Associate subdevice with the corresponding port node */
-               pdev->dev.of_node = of_graph_get_port_by_id(dev->of_node, i);
-               if (!pdev->dev.of_node) {
-                       dev_err(dev, "missing port@%d node in %s\n", i,
-                               dev->of_node->full_name);
-                       ret = -ENODEV;
-                       goto err_register;
-               }
-
                ret = platform_device_add_data(pdev, &reg->pdata,
                                               sizeof(reg->pdata));
                if (!ret)
@@ -1289,10 +1292,6 @@ static int ipu_probe(struct platform_device *pdev)
        ipu->irq_sync = irq_sync;
        ipu->irq_err = irq_err;
 
-       ret = ipu_irq_init(ipu);
-       if (ret)
-               goto out_failed_irq;
-
        ret = device_reset(&pdev->dev);
        if (ret) {
                dev_err(&pdev->dev, "failed to reset: %d\n", ret);
@@ -1302,6 +1301,10 @@ static int ipu_probe(struct platform_device *pdev)
        if (ret)
                goto out_failed_reset;
 
+       ret = ipu_irq_init(ipu);
+       if (ret)
+               goto out_failed_irq;
+
        /* Set MCU_T to divide MCU access window into 2 */
        ipu_cm_write(ipu, 0x00400000L | (IPU_MCU_T_DEFAULT << 18),
                        IPU_DISP_GEN);
@@ -1324,9 +1327,9 @@ static int ipu_probe(struct platform_device *pdev)
 failed_add_clients:
        ipu_submodules_exit(ipu);
 failed_submodules_init:
-out_failed_reset:
        ipu_irq_exit(ipu);
 out_failed_irq:
+out_failed_reset:
        clk_disable_unprepare(ipu->clk);
        return ret;
 }
index 7dae0ac0f3aec12c5d44875111d3a69818e43c92..e9219f528d7e8ada4876649c28b9d0c54ce68117 100644 (file)
@@ -20,6 +20,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+/* We need to access legacy defines from linux/media.h */
+#define __NEED_MEDIA_LEGACY_API
+
 #include <linux/compat.h>
 #include <linux/export.h>
 #include <linux/idr.h>
@@ -115,6 +118,26 @@ static long media_device_enum_entities(struct media_device *mdev,
        u_ent.group_id = 0;             /* Unused */
        u_ent.pads = ent->num_pads;
        u_ent.links = ent->num_links - ent->num_backlinks;
+
+       /*
+        * Workaround for a bug in media-ctl <= v1.10 that makes it do
+        * the wrong thing if the entity function doesn't belong to
+        * either the MEDIA_ENT_F_OLD_BASE or MEDIA_ENT_F_OLD_SUBDEV_BASE
+        * range.
+        *
+        * Non-subdevices are expected to be in the MEDIA_ENT_F_OLD_BASE
+        * range; otherwise they will be silently ignored by media-ctl
+        * when printing the graphviz diagram. So, map them into the old
+        * devnode range.
+        */
+       if (ent->function < MEDIA_ENT_F_OLD_BASE ||
+           ent->function > MEDIA_ENT_T_DEVNODE_UNKNOWN) {
+               if (is_media_entity_v4l2_subdev(ent))
+                       u_ent.type = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN;
+               else if (ent->function != MEDIA_ENT_F_IO_V4L)
+                       u_ent.type = MEDIA_ENT_T_DEVNODE_UNKNOWN;
+       }
+
        memcpy(&u_ent.raw, &ent->info, sizeof(ent->info));
        if (copy_to_user(uent, &u_ent, sizeof(u_ent)))
                return -EFAULT;
index 7d28899f89ce16c00cd9f00f424a948fa57e2ae8..38aacc7fc692024b58afd94708218b022bafd318 100644 (file)
@@ -1455,9 +1455,9 @@ static int coda_alloc_bitstream_buffer(struct coda_ctx *ctx,
                return 0;
 
        ctx->bitstream.size = roundup_pow_of_two(q_data->sizeimage * 2);
-       ctx->bitstream.vaddr = dma_alloc_writecombine(
-                       &ctx->dev->plat_dev->dev, ctx->bitstream.size,
-                       &ctx->bitstream.paddr, GFP_KERNEL);
+       ctx->bitstream.vaddr = dma_alloc_wc(&ctx->dev->plat_dev->dev,
+                                           ctx->bitstream.size,
+                                           &ctx->bitstream.paddr, GFP_KERNEL);
        if (!ctx->bitstream.vaddr) {
                v4l2_err(&ctx->dev->v4l2_dev,
                         "failed to allocate bitstream ringbuffer");
@@ -1474,8 +1474,8 @@ static void coda_free_bitstream_buffer(struct coda_ctx *ctx)
        if (ctx->bitstream.vaddr == NULL)
                return;
 
-       dma_free_writecombine(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
-                             ctx->bitstream.vaddr, ctx->bitstream.paddr);
+       dma_free_wc(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
+                   ctx->bitstream.vaddr, ctx->bitstream.paddr);
        ctx->bitstream.vaddr = NULL;
        kfifo_init(&ctx->bitstream_fifo, NULL, 0);
 }
index 11fdadc68e53e57722b4d58892fdf2c644beba34..2a6eaf1122b4e9b742eb3777fb4b6b317c07b201 100644 (file)
@@ -103,6 +103,7 @@ enum ctype {
        CT_EXEC_USERSPACE,
        CT_ACCESS_USERSPACE,
        CT_WRITE_RO,
+       CT_WRITE_RO_AFTER_INIT,
        CT_WRITE_KERN,
 };
 
@@ -140,6 +141,7 @@ static char* cp_type[] = {
        "EXEC_USERSPACE",
        "ACCESS_USERSPACE",
        "WRITE_RO",
+       "WRITE_RO_AFTER_INIT",
        "WRITE_KERN",
 };
 
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
 static u8 data_area[EXEC_SIZE];
 
 static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
 
 module_param(recur_count, int, 0644);
 MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which)
                break;
        }
        case CT_WRITE_RO: {
-               unsigned long *ptr;
+               /* Explicitly cast away "const" for the test. */
+               unsigned long *ptr = (unsigned long *)&rodata;
 
-               ptr = (unsigned long *)&rodata;
+               pr_info("attempting bad rodata write at %p\n", ptr);
+               *ptr ^= 0xabcd1234;
 
-               pr_info("attempting bad write at %p\n", ptr);
+               break;
+       }
+       case CT_WRITE_RO_AFTER_INIT: {
+               unsigned long *ptr = &ro_after_init;
+
+               /*
+                * Verify the value was written during init. Since an
+                * Oops is considered a "success", on failure just skip
+                * the real test.
+                */
+               if ((*ptr & 0xAA) != 0xAA) {
+                       pr_info("%p was NOT written during init!?\n", ptr);
+                       break;
+               }
+
+               pr_info("attempting bad ro_after_init write at %p\n", ptr);
                *ptr ^= 0xabcd1234;
 
                break;
@@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void)
        int n_debugfs_entries = 1; /* Assume only the direct entry */
        int i;
 
+       /* Make sure we can write to __ro_after_init values during __init */
+       ro_after_init |= 0xAA;
+
        /* Register debugfs interface */
        lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
        if (!lkdtm_debugfs_root) {
index 79316159eec63cbca36aa06d27bfaab561b62dc0..88b6c81cebbe68123da96775b3a42c3f96da4a11 100644 (file)
@@ -187,7 +187,7 @@ static int double_bit_error_detect(void *error_data, void *error_ecc,
        __nand_calculate_ecc(error_data, size, calc_ecc);
        ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size);
 
-       return (ret == -1) ? 0 : -EINVAL;
+       return (ret == -EBADMSG) ? 0 : -EINVAL;
 }
 
 static const struct nand_ecc_test nand_ecc_test[] = {
index 575790e8a75af8e6184f5611fe133b1d73825679..74a7dfecee2783ac609b9fefbd32f7817b48c43c 100644 (file)
@@ -843,7 +843,7 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id)
                if (clear_intf)
                        mcp251x_write_bits(spi, CANINTF, clear_intf, 0x00);
 
-               if (eflag)
+               if (eflag & (EFLG_RX0OVR | EFLG_RX1OVR))
                        mcp251x_write_bits(spi, EFLG, eflag, 0x00);
 
                /* Update can state */
index 5eee62badf45457798c2fabe5005faa26c6286e2..cbc99d5649afa3877158a0ac37a50b8865a7e8ac 100644 (file)
@@ -826,9 +826,8 @@ static struct gs_can *gs_make_candev(unsigned int channel, struct usb_interface
 static void gs_destroy_candev(struct gs_can *dev)
 {
        unregister_candev(dev->netdev);
-       free_candev(dev->netdev);
        usb_kill_anchored_urbs(&dev->tx_submitted);
-       kfree(dev);
+       free_candev(dev->netdev);
 }
 
 static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *id)
@@ -913,12 +912,15 @@ static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *
        for (i = 0; i < icount; i++) {
                dev->canch[i] = gs_make_candev(i, intf);
                if (IS_ERR_OR_NULL(dev->canch[i])) {
+                       /* save error code to return later */
+                       rc = PTR_ERR(dev->canch[i]);
+
                        /* on failure destroy previously created candevs */
                        icount = i;
-                       for (i = 0; i < icount; i++) {
+                       for (i = 0; i < icount; i++)
                                gs_destroy_candev(dev->canch[i]);
-                               dev->canch[i] = NULL;
-                       }
+
+                       usb_kill_anchored_urbs(&dev->rx_submitted);
                        kfree(dev);
                        return rc;
                }
@@ -939,16 +941,12 @@ static void gs_usb_disconnect(struct usb_interface *intf)
                return;
        }
 
-       for (i = 0; i < GS_MAX_INTF; i++) {
-               struct gs_can *can = dev->canch[i];
-
-               if (!can)
-                       continue;
-
-               gs_destroy_candev(can);
-       }
+       for (i = 0; i < GS_MAX_INTF; i++)
+               if (dev->canch[i])
+                       gs_destroy_candev(dev->canch[i]);
 
        usb_kill_anchored_urbs(&dev->rx_submitted);
+       kfree(dev);
 }
 
 static const struct usb_device_id gs_usb_table[] = {
index 79e1a0282163db200916e0f862fa1560eab2fd03..17b2126075e01afde20ee2de1718dfc7db1e4641 100644 (file)
@@ -2461,7 +2461,7 @@ boomerang_interrupt(int irq, void *dev_id)
                                        int i;
                                        pci_unmap_single(VORTEX_PCI(vp),
                                                        le32_to_cpu(vp->tx_ring[entry].frag[0].addr),
-                                                       le32_to_cpu(vp->tx_ring[entry].frag[0].length),
+                                                       le32_to_cpu(vp->tx_ring[entry].frag[0].length)&0xFFF,
                                                        PCI_DMA_TODEVICE);
 
                                        for (i=1; i<=skb_shinfo(skb)->nr_frags; i++)
index 17472851674f19f949f97e04fccdc5a6fb874a3d..f749e4d389eb163132d5ab51ad7a9821c07577e4 100644 (file)
@@ -193,7 +193,6 @@ static void altera_tse_mdio_destroy(struct net_device *dev)
                            priv->mdio->id);
 
        mdiobus_unregister(priv->mdio);
-       kfree(priv->mdio->irq);
        mdiobus_free(priv->mdio);
        priv->mdio = NULL;
 }
index f71ab2647a3bc1f62f6b7450b34dd325c710549d..08a23e6b60e947894783f2115c2fe16abece84bc 100644 (file)
@@ -1460,7 +1460,19 @@ static int nb8800_probe(struct platform_device *pdev)
                goto err_disable_clk;
        }
 
-       priv->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
+       if (of_phy_is_fixed_link(pdev->dev.of_node)) {
+               ret = of_phy_register_fixed_link(pdev->dev.of_node);
+               if (ret < 0) {
+                       dev_err(&pdev->dev, "bad fixed-link spec\n");
+                       goto err_free_bus;
+               }
+               priv->phy_node = of_node_get(pdev->dev.of_node);
+       }
+
+       if (!priv->phy_node)
+               priv->phy_node = of_parse_phandle(pdev->dev.of_node,
+                                                 "phy-handle", 0);
+
        if (!priv->phy_node) {
                dev_err(&pdev->dev, "no PHY specified\n");
                ret = -ENODEV;
index 27aa0802d87d5f46d813184649dd9107fdeb541b..91874d24fd560c2d25afbf7ab7e3251774627a5b 100644 (file)
@@ -4896,9 +4896,9 @@ struct c2s_pri_trans_table_entry {
  * cfc delete event data
  */
 struct cfc_del_event_data {
-       u32 cid;
-       u32 reserved0;
-       u32 reserved1;
+       __le32 cid;
+       __le32 reserved0;
+       __le32 reserved1;
 };
 
 
@@ -5114,15 +5114,9 @@ struct vf_pf_channel_zone_trigger {
  * zone that triggers the in-bound interrupt
  */
 struct trigger_vf_zone {
-#if defined(__BIG_ENDIAN)
-       u16 reserved1;
-       u8 reserved0;
-       struct vf_pf_channel_zone_trigger vf_pf_channel;
-#elif defined(__LITTLE_ENDIAN)
        struct vf_pf_channel_zone_trigger vf_pf_channel;
        u8 reserved0;
        u16 reserved1;
-#endif
        u32 reserved2;
 };
 
@@ -5207,9 +5201,9 @@ struct e2_integ_data {
  * set mac event data
  */
 struct eth_event_data {
-       u32 echo;
-       u32 reserved0;
-       u32 reserved1;
+       __le32 echo;
+       __le32 reserved0;
+       __le32 reserved1;
 };
 
 
@@ -5219,9 +5213,9 @@ struct eth_event_data {
 struct vf_pf_event_data {
        u8 vf_id;
        u8 reserved0;
-       u16 reserved1;
-       u32 msg_addr_lo;
-       u32 msg_addr_hi;
+       __le16 reserved1;
+       __le32 msg_addr_lo;
+       __le32 msg_addr_hi;
 };
 
 /*
@@ -5230,9 +5224,9 @@ struct vf_pf_event_data {
 struct vf_flr_event_data {
        u8 vf_id;
        u8 reserved0;
-       u16 reserved1;
-       u32 reserved2;
-       u32 reserved3;
+       __le16 reserved1;
+       __le32 reserved2;
+       __le32 reserved3;
 };
 
 /*
@@ -5241,9 +5235,9 @@ struct vf_flr_event_data {
 struct malicious_vf_event_data {
        u8 vf_id;
        u8 err_id;
-       u16 reserved1;
-       u32 reserved2;
-       u32 reserved3;
+       __le16 reserved1;
+       __le32 reserved2;
+       __le32 reserved3;
 };
 
 /*
index 6c4e3a69976fcaf4ecc0bafcebbcb2e725a40575..2bf9c871144f714c56a72e4e584aa4db431d218c 100644 (file)
@@ -5280,14 +5280,14 @@ static void bnx2x_handle_classification_eqe(struct bnx2x *bp,
 {
        unsigned long ramrod_flags = 0;
        int rc = 0;
-       u32 cid = elem->message.data.eth_event.echo & BNX2X_SWCID_MASK;
+       u32 echo = le32_to_cpu(elem->message.data.eth_event.echo);
+       u32 cid = echo & BNX2X_SWCID_MASK;
        struct bnx2x_vlan_mac_obj *vlan_mac_obj;
 
        /* Always push next commands out, don't wait here */
        __set_bit(RAMROD_CONT, &ramrod_flags);
 
-       switch (le32_to_cpu((__force __le32)elem->message.data.eth_event.echo)
-                           >> BNX2X_SWCID_SHIFT) {
+       switch (echo >> BNX2X_SWCID_SHIFT) {
        case BNX2X_FILTER_MAC_PENDING:
                DP(BNX2X_MSG_SP, "Got SETUP_MAC completions\n");
                if (CNIC_LOADED(bp) && (cid == BNX2X_ISCSI_ETH_CID(bp)))
@@ -5308,8 +5308,7 @@ static void bnx2x_handle_classification_eqe(struct bnx2x *bp,
                bnx2x_handle_mcast_eqe(bp);
                return;
        default:
-               BNX2X_ERR("Unsupported classification command: %d\n",
-                         elem->message.data.eth_event.echo);
+               BNX2X_ERR("Unsupported classification command: 0x%x\n", echo);
                return;
        }
 
@@ -5478,9 +5477,6 @@ static void bnx2x_eq_int(struct bnx2x *bp)
                        goto next_spqe;
                }
 
-               /* elem CID originates from FW; actually LE */
-               cid = SW_CID((__force __le32)
-                            elem->message.data.cfc_del_event.cid);
                opcode = elem->message.opcode;
 
                /* handle eq element */
@@ -5503,6 +5499,10 @@ static void bnx2x_eq_int(struct bnx2x *bp)
                         * we may want to verify here that the bp state is
                         * HALTING
                         */
+
+                       /* elem CID originates from FW; actually LE */
+                       cid = SW_CID(elem->message.data.cfc_del_event.cid);
+
                        DP(BNX2X_MSG_SP,
                           "got delete ramrod for MULTI[%d]\n", cid);
 
@@ -5596,10 +5596,8 @@ static void bnx2x_eq_int(struct bnx2x *bp)
                      BNX2X_STATE_OPENING_WAIT4_PORT):
                case (EVENT_RING_OPCODE_RSS_UPDATE_RULES |
                      BNX2X_STATE_CLOSING_WAIT4_HALT):
-                       cid = elem->message.data.eth_event.echo &
-                               BNX2X_SWCID_MASK;
                        DP(BNX2X_MSG_SP, "got RSS_UPDATE ramrod. CID %d\n",
-                          cid);
+                          SW_CID(elem->message.data.eth_event.echo));
                        rss_raw->clear_pending(rss_raw);
                        break;
 
@@ -5684,7 +5682,7 @@ static void bnx2x_sp_task(struct work_struct *work)
                if (status & BNX2X_DEF_SB_IDX) {
                        struct bnx2x_fastpath *fp = bnx2x_fcoe_fp(bp);
 
-               if (FCOE_INIT(bp) &&
+                       if (FCOE_INIT(bp) &&
                            (bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) {
                                /* Prevent local bottom-halves from running as
                                 * we are going to change the local NAPI list.
index 9d027348cd09b90fcca14843f13e278d54dd89e6..632daff117d305a822f95cc1bcd84b47437b2334 100644 (file)
@@ -1672,11 +1672,12 @@ void bnx2x_vf_handle_classification_eqe(struct bnx2x *bp,
 {
        unsigned long ramrod_flags = 0;
        int rc = 0;
+       u32 echo = le32_to_cpu(elem->message.data.eth_event.echo);
 
        /* Always push next commands out, don't wait here */
        set_bit(RAMROD_CONT, &ramrod_flags);
 
-       switch (elem->message.data.eth_event.echo >> BNX2X_SWCID_SHIFT) {
+       switch (echo >> BNX2X_SWCID_SHIFT) {
        case BNX2X_FILTER_MAC_PENDING:
                rc = vfq->mac_obj.complete(bp, &vfq->mac_obj, elem,
                                           &ramrod_flags);
@@ -1686,8 +1687,7 @@ void bnx2x_vf_handle_classification_eqe(struct bnx2x *bp,
                                            &ramrod_flags);
                break;
        default:
-               BNX2X_ERR("Unsupported classification command: %d\n",
-                         elem->message.data.eth_event.echo);
+               BNX2X_ERR("Unsupported classification command: 0x%x\n", echo);
                return;
        }
        if (rc < 0)
@@ -1747,16 +1747,14 @@ int bnx2x_iov_eq_sp_event(struct bnx2x *bp, union event_ring_elem *elem)
 
        switch (opcode) {
        case EVENT_RING_OPCODE_CFC_DEL:
-               cid = SW_CID((__force __le32)
-                            elem->message.data.cfc_del_event.cid);
+               cid = SW_CID(elem->message.data.cfc_del_event.cid);
                DP(BNX2X_MSG_IOV, "checking cfc-del comp cid=%d\n", cid);
                break;
        case EVENT_RING_OPCODE_CLASSIFICATION_RULES:
        case EVENT_RING_OPCODE_MULTICAST_RULES:
        case EVENT_RING_OPCODE_FILTERS_RULES:
        case EVENT_RING_OPCODE_RSS_UPDATE_RULES:
-               cid = (elem->message.data.eth_event.echo &
-                      BNX2X_SWCID_MASK);
+               cid = SW_CID(elem->message.data.eth_event.echo);
                DP(BNX2X_MSG_IOV, "checking filtering comp cid=%d\n", cid);
                break;
        case EVENT_RING_OPCODE_VF_FLR:
index 1374e5394a7970ba20ad54ddfc6271ce09e40744..bfae300cf25ff881292dc36ad56e51e37132cd76 100644 (file)
@@ -2187,8 +2187,10 @@ void bnx2x_vf_mbx_schedule(struct bnx2x *bp,
 
        /* Update VFDB with current message and schedule its handling */
        mutex_lock(&BP_VFDB(bp)->event_mutex);
-       BP_VF_MBX(bp, vf_idx)->vf_addr_hi = vfpf_event->msg_addr_hi;
-       BP_VF_MBX(bp, vf_idx)->vf_addr_lo = vfpf_event->msg_addr_lo;
+       BP_VF_MBX(bp, vf_idx)->vf_addr_hi =
+               le32_to_cpu(vfpf_event->msg_addr_hi);
+       BP_VF_MBX(bp, vf_idx)->vf_addr_lo =
+               le32_to_cpu(vfpf_event->msg_addr_lo);
        BP_VFDB(bp)->event_occur |= (1ULL << vf_idx);
        mutex_unlock(&BP_VFDB(bp)->event_mutex);
 
index 8ab000dd52d958317323c8486fc0538576727803..82f191382989b04877177075ea1906f0e941dfb9 100644 (file)
@@ -248,7 +248,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
                tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
                tx_push1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
 
-               end = PTR_ALIGN(pdata + length + 1, 8) - 1;
+               end = pdata + length;
+               end = PTR_ALIGN(end, 8) - 1;
                *end = 0;
 
                skb_copy_from_linear_data(skb, pdata, len);
index 04b0d16b210e89dd6eae3aa6704987e8543edeca..95bc470ae441af824f7b6ce29df5ff17842d7fee 100644 (file)
@@ -987,7 +987,7 @@ bna_rxf_ucast_cfg_apply(struct bna_rxf *rxf)
        if (!list_empty(&rxf->ucast_pending_add_q)) {
                mac = list_first_entry(&rxf->ucast_pending_add_q,
                                       struct bna_mac, qe);
-               list_add_tail(&mac->qe, &rxf->ucast_active_q);
+               list_move_tail(&mac->qe, &rxf->ucast_active_q);
                bna_bfi_ucast_req(rxf, mac, BFI_ENET_H2I_MAC_UCAST_ADD_REQ);
                return 1;
        }
index 688828865c48253d5f96e0bd4cab4c315a635e37..34e9acea87479db83b852c4e233c3e26b532a6a1 100644 (file)
 #define NIC_PF_INTR_ID_MBOX0           8
 #define NIC_PF_INTR_ID_MBOX1           9
 
+/* Minimum FIFO level before all packets for the CQ are dropped
+ *
+ * This value ensures that once a packet has been "accepted"
+ * for reception it will not get dropped due to unavailability
+ * of a CQ descriptor. A HW erratum mandates this value to be
+ * at least 0x100.
+ */
+#define NICPF_CQM_MIN_DROP_LEVEL       0x100
+
 /* Global timer for CQ timer thresh interrupts
  * Calculated for SCLK of 700Mhz
  * value written should be a 1/16th of what is expected
index 4dded90076c8783b95c7944131e6b2cd3b828e40..95f17f8cadacc08637c485e555242e78aafb48c5 100644 (file)
@@ -304,6 +304,7 @@ static void nic_set_lmac_vf_mapping(struct nicpf *nic)
 static void nic_init_hw(struct nicpf *nic)
 {
        int i;
+       u64 cqm_cfg;
 
        /* Enable NIC HW block */
        nic_reg_write(nic, NIC_PF_CFG, 0x3);
@@ -340,6 +341,11 @@ static void nic_init_hw(struct nicpf *nic)
        /* Enable VLAN ethertype matching and stripping */
        nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7,
                      (2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETH_P_8021Q);
+
+       /* Check if the HW expected value is higher (could be so in future chips) */
+       cqm_cfg = nic_reg_read(nic, NIC_PF_CQM_CFG);
+       if (cqm_cfg < NICPF_CQM_MIN_DROP_LEVEL)
+               nic_reg_write(nic, NIC_PF_CQM_CFG, NICPF_CQM_MIN_DROP_LEVEL);
 }
 
 /* Channel parse index configuration */
index dd536be20193119c3465cd772a4bff6d649e3319..afb10e326b4fc46043b4d3f4cddb7d17f1cc341c 100644 (file)
@@ -21,7 +21,7 @@
 #define   NIC_PF_TCP_TIMER                     (0x0060)
 #define   NIC_PF_BP_CFG                                (0x0080)
 #define   NIC_PF_RRM_CFG                       (0x0088)
-#define   NIC_PF_CQM_CF                                (0x00A0)
+#define   NIC_PF_CQM_CFG                       (0x00A0)
 #define   NIC_PF_CNM_CF                                (0x00A8)
 #define   NIC_PF_CNM_STATUS                    (0x00B0)
 #define   NIC_PF_CQ_AVG_CFG                    (0x00C0)
index cf837831304be2f4d7edaebf68880a44f96bd057..f9751294ece79e54eff07e20ad9122f585bda32a 100644 (file)
@@ -531,6 +531,7 @@ struct be_adapter {
 
        struct delayed_work be_err_detection_work;
        u8 err_flags;
+       bool pcicfg_mapped;     /* pcicfg obtained via pci_iomap() */
        u32 flags;
        u32 cmd_privileges;
        /* Ethtool knobs and info */
index 241819b36ca72ac6c133875f88d17f4359c0c5d4..6d9a8d78e8ad8413f075960ee45f0033e222861c 100644 (file)
@@ -622,10 +622,13 @@ enum be_if_flags {
                                         BE_IF_FLAGS_VLAN_PROMISCUOUS |\
                                         BE_IF_FLAGS_MCAST_PROMISCUOUS)
 
-#define BE_IF_EN_FLAGS (BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |\
-                       BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_UNTAGGED)
+#define BE_IF_FILT_FLAGS_BASIC (BE_IF_FLAGS_BROADCAST | \
+                               BE_IF_FLAGS_PASS_L3L4_ERRORS | \
+                               BE_IF_FLAGS_UNTAGGED)
 
-#define BE_IF_ALL_FILT_FLAGS   (BE_IF_EN_FLAGS | BE_IF_FLAGS_ALL_PROMISCUOUS)
+#define BE_IF_ALL_FILT_FLAGS   (BE_IF_FILT_FLAGS_BASIC | \
+                                BE_IF_FLAGS_MULTICAST | \
+                                BE_IF_FLAGS_ALL_PROMISCUOUS)
 
 /* An RX interface is an object with one or more MAC addresses and
  * filtering capabilities. */
index f99de3657ce3b5f58b6f08f1f97f470bb11d7788..d1cf1274fc2f4de4b763e683b182e485e39452e6 100644 (file)
@@ -125,6 +125,11 @@ static const char * const ue_status_hi_desc[] = {
        "Unknown"
 };
 
+#define BE_VF_IF_EN_FLAGS      (BE_IF_FLAGS_UNTAGGED | \
+                                BE_IF_FLAGS_BROADCAST | \
+                                BE_IF_FLAGS_MULTICAST | \
+                                BE_IF_FLAGS_PASS_L3L4_ERRORS)
+
 static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q)
 {
        struct be_dma_mem *mem = &q->dma_mem;
@@ -3537,7 +3542,7 @@ static int be_enable_if_filters(struct be_adapter *adapter)
 {
        int status;
 
-       status = be_cmd_rx_filter(adapter, BE_IF_EN_FLAGS, ON);
+       status = be_cmd_rx_filter(adapter, BE_IF_FILT_FLAGS_BASIC, ON);
        if (status)
                return status;
 
@@ -3857,8 +3862,7 @@ static int be_vfs_if_create(struct be_adapter *adapter)
        int status;
 
        /* If a FW profile exists, then cap_flags are updated */
-       cap_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
-                   BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS;
+       cap_flags = BE_VF_IF_EN_FLAGS;
 
        for_all_vfs(adapter, vf_cfg, vf) {
                if (!BE3_chip(adapter)) {
@@ -3874,10 +3878,8 @@ static int be_vfs_if_create(struct be_adapter *adapter)
                        }
                }
 
-               en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED |
-                                       BE_IF_FLAGS_BROADCAST |
-                                       BE_IF_FLAGS_MULTICAST |
-                                       BE_IF_FLAGS_PASS_L3L4_ERRORS);
+               /* PF should enable IF flags during proxy if_create call */
+               en_flags = cap_flags & BE_VF_IF_EN_FLAGS;
                status = be_cmd_if_create(adapter, cap_flags, en_flags,
                                          &vf_cfg->if_handle, vf + 1);
                if (status)
@@ -4968,6 +4970,8 @@ static void be_unmap_pci_bars(struct be_adapter *adapter)
                pci_iounmap(adapter->pdev, adapter->csr);
        if (adapter->db)
                pci_iounmap(adapter->pdev, adapter->db);
+       if (adapter->pcicfg && adapter->pcicfg_mapped)
+               pci_iounmap(adapter->pdev, adapter->pcicfg);
 }
 
 static int db_bar(struct be_adapter *adapter)
@@ -5019,8 +5023,10 @@ static int be_map_pci_bars(struct be_adapter *adapter)
                        if (!addr)
                                goto pci_map_err;
                        adapter->pcicfg = addr;
+                       adapter->pcicfg_mapped = true;
                } else {
                        adapter->pcicfg = adapter->db + SRIOV_VF_PCICFG_OFFSET;
+                       adapter->pcicfg_mapped = false;
                }
        }
 
index 62fa136554ac29a49074e5f176b3fc9ac8d37042..41b01064510098c00a66d86bb49cf2f3fe02f9c3 100644 (file)
@@ -1265,7 +1265,6 @@ static int ethoc_remove(struct platform_device *pdev)
 
                if (priv->mdio) {
                        mdiobus_unregister(priv->mdio);
-                       kfree(priv->mdio->irq);
                        mdiobus_free(priv->mdio);
                }
                if (priv->clk)
index 623aa1c8ebc6ba3149fc529e04994496b2e7be18..79a210aaf0bbd69d86ce830a45909ae8ede07f83 100644 (file)
@@ -2791,6 +2791,8 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
                goto fman_free;
        }
 
+       fman->dev = &of_dev->dev;
+
        return fman;
 
 fman_node_put:
@@ -2845,8 +2847,6 @@ static int fman_probe(struct platform_device *of_dev)
 
        dev_set_drvdata(dev, fman);
 
-       fman->dev = dev;
-
        dev_dbg(dev, "FMan%d probed\n", fman->dts_params.id);
 
        return 0;
index 2aa7b401cc3be29eb24a8fb6d1065bff6caa6e39..b9ecf197ad117754245a290964d7cd55d9be5d4e 100644 (file)
@@ -1111,8 +1111,10 @@ static void __gfar_detect_errata_85xx(struct gfar_private *priv)
 
        if ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) == 0x20))
                priv->errata |= GFAR_ERRATA_12;
+       /* P2020/P2010 Rev 1; MPC8548 Rev 2 */
        if (((SVR_SOC_VER(svr) == SVR_P2020) && (SVR_REV(svr) < 0x20)) ||
-           ((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20)))
+           ((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20)) ||
+           ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) < 0x31)))
                priv->errata |= GFAR_ERRATA_76; /* aka eTSEC 20 */
 }
 #endif
index 74beb1867230c9cf62a60a2843825316e6d9cdbc..4ccc032633c4e57576b316cca8f0486a3e8b4a98 100644 (file)
@@ -25,6 +25,7 @@ config HIX5HD2_GMAC
 
 config HIP04_ETH
        tristate "HISILICON P04 Ethernet support"
+       depends on HAS_IOMEM    # For MFD_SYSCON
        select MARVELL_PHY
        select MFD_SYSCON
        select HNS_MDIO
index a0070d0e740dae5b3de8f2b2a92a772854b08321..d4f92ed322d6e5c4ac10a3f8d208d94445760e27 100644 (file)
@@ -675,8 +675,12 @@ static int hns_ae_config_loopback(struct hnae_handle *handle,
 {
        int ret;
        struct hnae_vf_cb *vf_cb = hns_ae_get_vf_cb(handle);
+       struct hns_mac_cb *mac_cb = hns_get_mac_cb(handle);
 
        switch (loop) {
+       case MAC_INTERNALLOOP_PHY:
+               ret = 0;
+               break;
        case MAC_INTERNALLOOP_SERDES:
                ret = hns_mac_config_sds_loopback(vf_cb->mac_cb, en);
                break;
@@ -686,6 +690,10 @@ static int hns_ae_config_loopback(struct hnae_handle *handle,
        default:
                ret = -EINVAL;
        }
+
+       if (!ret)
+               hns_dsaf_set_inner_lb(mac_cb->dsaf_dev, mac_cb->mac_id, en);
+
        return ret;
 }
 
index 9439f04962e1d96bf480d8aedf6472c912e28813..38fc5be3870cc3258755e79d13ac501ffa0826e4 100644 (file)
@@ -230,6 +230,30 @@ static void hns_dsaf_mix_def_qid_cfg(struct dsaf_device *dsaf_dev)
        }
 }
 
+static void hns_dsaf_inner_qid_cfg(struct dsaf_device *dsaf_dev)
+{
+       u16 max_q_per_vf, max_vfn;
+       u32 q_id, q_num_per_port;
+       u32 mac_id;
+
+       if (AE_IS_VER1(dsaf_dev->dsaf_ver))
+               return;
+
+       hns_rcb_get_queue_mode(dsaf_dev->dsaf_mode,
+                              HNS_DSAF_COMM_SERVICE_NW_IDX,
+                              &max_vfn, &max_q_per_vf);
+       q_num_per_port = max_vfn * max_q_per_vf;
+
+       for (mac_id = 0, q_id = 0; mac_id < DSAF_SERVICE_NW_NUM; mac_id++) {
+               dsaf_set_dev_field(dsaf_dev,
+                                  DSAFV2_SERDES_LBK_0_REG + 4 * mac_id,
+                                  DSAFV2_SERDES_LBK_QID_M,
+                                  DSAFV2_SERDES_LBK_QID_S,
+                                  q_id);
+               q_id += q_num_per_port;
+       }
+}
+
 /**
  * hns_dsaf_sw_port_type_cfg - cfg sw type
  * @dsaf_id: dsa fabric id
@@ -691,6 +715,16 @@ void hns_dsaf_set_promisc_mode(struct dsaf_device *dsaf_dev, u32 en)
        dsaf_set_dev_bit(dsaf_dev, DSAF_CFG_0_REG, DSAF_CFG_MIX_MODE_S, !!en);
 }
 
+void hns_dsaf_set_inner_lb(struct dsaf_device *dsaf_dev, u32 mac_id, u32 en)
+{
+       if (AE_IS_VER1(dsaf_dev->dsaf_ver) ||
+           dsaf_dev->mac_cb[mac_id].mac_type == HNAE_PORT_DEBUG)
+               return;
+
+       dsaf_set_dev_bit(dsaf_dev, DSAFV2_SERDES_LBK_0_REG + 4 * mac_id,
+                        DSAFV2_SERDES_LBK_EN_B, !!en);
+}
+
 /**
  * hns_dsaf_tbl_stat_en - tbl
  * @dsaf_id: dsa fabric id
@@ -1022,6 +1056,9 @@ static void hns_dsaf_comm_init(struct dsaf_device *dsaf_dev)
        /* set promisc def queue id */
        hns_dsaf_mix_def_qid_cfg(dsaf_dev);
 
+       /* set inner loopback queue id */
+       hns_dsaf_inner_qid_cfg(dsaf_dev);
+
        /* in non switch mode, set all port to access mode */
        hns_dsaf_sw_port_type_cfg(dsaf_dev, DSAF_SW_PORT_TYPE_NON_VLAN);
 
index 40205b910f80e66a439c14b53e7f2c907b0fbeb1..5fea226efaf330f968b2942a55065c7fc5c01112 100644 (file)
@@ -417,5 +417,6 @@ void hns_dsaf_get_strings(int stringset, u8 *data, int port);
 void hns_dsaf_get_regs(struct dsaf_device *ddev, u32 port, void *data);
 int hns_dsaf_get_regs_count(void);
 void hns_dsaf_set_promisc_mode(struct dsaf_device *dsaf_dev, u32 en);
+void hns_dsaf_set_inner_lb(struct dsaf_device *dsaf_dev, u32 mac_id, u32 en);
 
 #endif /* __HNS_DSAF_MAIN_H__ */
index f0c4f9b09d5b0477365c862816bc8b8d4dd19b88..60d695daa471eb23d5284778300807f8b235e546 100644 (file)
 #define DSAF_XGE_INT_STS_0_REG         0x1C0
 #define DSAF_PPE_INT_STS_0_REG         0x1E0
 #define DSAF_ROCEE_INT_STS_0_REG       0x200
+#define DSAFV2_SERDES_LBK_0_REG         0x220
 #define DSAF_PPE_QID_CFG_0_REG         0x300
 #define DSAF_SW_PORT_TYPE_0_REG                0x320
 #define DSAF_STP_PORT_TYPE_0_REG       0x340
 #define PPEV2_CFG_RSS_TBL_4N3_S        24
 #define PPEV2_CFG_RSS_TBL_4N3_M        (((1UL << 5) - 1) << PPEV2_CFG_RSS_TBL_4N3_S)
 
+#define DSAFV2_SERDES_LBK_EN_B  8
+#define DSAFV2_SERDES_LBK_QID_S 0
+#define DSAFV2_SERDES_LBK_QID_M        (((1UL << 8) - 1) << DSAFV2_SERDES_LBK_QID_S)
+
 #define PPE_CNT_CLR_CE_B       0
 #define PPE_CNT_CLR_SNAP_EN_B  1
 
index 3df22840fcd15370393e7e6a5c331b72dd87f476..3c4a3bc31a89230c2453c115316306739e28c2a3 100644 (file)
@@ -295,8 +295,10 @@ static int __lb_setup(struct net_device *ndev,
 
        switch (loop) {
        case MAC_INTERNALLOOP_PHY:
-               if ((phy_dev) && (!phy_dev->is_c45))
+               if ((phy_dev) && (!phy_dev->is_c45)) {
                        ret = hns_nic_config_phy_loopback(phy_dev, 0x1);
+                       ret |= h->dev->ops->set_loopback(h, loop, 0x1);
+               }
                break;
        case MAC_INTERNALLOOP_MAC:
                if ((h->dev->ops->set_loopback) &&
@@ -376,6 +378,7 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data,
                               struct sk_buff *skb)
 {
        struct net_device *ndev;
+       struct hns_nic_priv *priv;
        struct hnae_ring *ring;
        struct netdev_queue *dev_queue;
        struct sk_buff *new_skb;
@@ -385,8 +388,17 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data,
        char buff[33]; /* 32B data and the last character '\0' */
 
        if (!ring_data) { /* Just for doing create frame*/
+               ndev = skb->dev;
+               priv = netdev_priv(ndev);
+
                frame_size = skb->len;
                memset(skb->data, 0xFF, frame_size);
+               if ((!AE_IS_VER1(priv->enet_ver)) &&
+                   (priv->ae_handle->port_type == HNAE_PORT_SERVICE)) {
+                       memcpy(skb->data, ndev->dev_addr, 6);
+                       skb->data[5] += 0x1f;
+               }
+
                frame_size &= ~1ul;
                memset(&skb->data[frame_size / 2], 0xAA, frame_size / 2 - 1);
                memset(&skb->data[frame_size / 2 + 10], 0xBE,
@@ -486,6 +498,7 @@ static int __lb_run_test(struct net_device *ndev,
 
        /* place data into test skb */
        (void)skb_put(skb, size);
+       skb->dev = ndev;
        __lb_other_process(NULL, skb);
        skb->queue_mapping = NIC_LB_TEST_RING_ID;
 
index 335417b4756b73d3d2d9f1558b10b2556443bee7..ebe60719e489cd1fbdc12248ee48eed6d0979fd4 100644 (file)
@@ -1166,7 +1166,10 @@ map_failed:
        if (!firmware_has_feature(FW_FEATURE_CMO))
                netdev_err(netdev, "tx: unable to map xmit buffer\n");
        adapter->tx_map_failed++;
-       skb_linearize(skb);
+       if (skb_linearize(skb)) {
+               netdev->stats.tx_dropped++;
+               goto out;
+       }
        force_bounce = 1;
        goto retry_bounce;
 }
index 7d65708437238b48f5c5cdfcb1d901b202e2d547..6e9e16eee5d0eff7d74a2eff4a8a4bb91368afcd 100644 (file)
@@ -1348,44 +1348,44 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry)
        crq.request_capability.cmd = REQUEST_CAPABILITY;
 
        crq.request_capability.capability = cpu_to_be16(REQ_TX_QUEUES);
-       crq.request_capability.number = cpu_to_be32(adapter->req_tx_queues);
+       crq.request_capability.number = cpu_to_be64(adapter->req_tx_queues);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability = cpu_to_be16(REQ_RX_QUEUES);
-       crq.request_capability.number = cpu_to_be32(adapter->req_rx_queues);
+       crq.request_capability.number = cpu_to_be64(adapter->req_rx_queues);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_QUEUES);
-       crq.request_capability.number = cpu_to_be32(adapter->req_rx_add_queues);
+       crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_queues);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability =
            cpu_to_be16(REQ_TX_ENTRIES_PER_SUBCRQ);
        crq.request_capability.number =
-           cpu_to_be32(adapter->req_tx_entries_per_subcrq);
+           cpu_to_be64(adapter->req_tx_entries_per_subcrq);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability =
            cpu_to_be16(REQ_RX_ADD_ENTRIES_PER_SUBCRQ);
        crq.request_capability.number =
-           cpu_to_be32(adapter->req_rx_add_entries_per_subcrq);
+           cpu_to_be64(adapter->req_rx_add_entries_per_subcrq);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability = cpu_to_be16(REQ_MTU);
-       crq.request_capability.number = cpu_to_be32(adapter->req_mtu);
+       crq.request_capability.number = cpu_to_be64(adapter->req_mtu);
        ibmvnic_send_crq(adapter, &crq);
 
        if (adapter->netdev->flags & IFF_PROMISC) {
                if (adapter->promisc_supported) {
                        crq.request_capability.capability =
                            cpu_to_be16(PROMISC_REQUESTED);
-                       crq.request_capability.number = cpu_to_be32(1);
+                       crq.request_capability.number = cpu_to_be64(1);
                        ibmvnic_send_crq(adapter, &crq);
                }
        } else {
                crq.request_capability.capability =
                    cpu_to_be16(PROMISC_REQUESTED);
-               crq.request_capability.number = cpu_to_be32(0);
+               crq.request_capability.number = cpu_to_be64(0);
                ibmvnic_send_crq(adapter, &crq);
        }
 
@@ -2312,93 +2312,93 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
        switch (be16_to_cpu(crq->query_capability.capability)) {
        case MIN_TX_QUEUES:
                adapter->min_tx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_tx_queues = %lld\n",
                           adapter->min_tx_queues);
                break;
        case MIN_RX_QUEUES:
                adapter->min_rx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_rx_queues = %lld\n",
                           adapter->min_rx_queues);
                break;
        case MIN_RX_ADD_QUEUES:
                adapter->min_rx_add_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_rx_add_queues = %lld\n",
                           adapter->min_rx_add_queues);
                break;
        case MAX_TX_QUEUES:
                adapter->max_tx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_tx_queues = %lld\n",
                           adapter->max_tx_queues);
                break;
        case MAX_RX_QUEUES:
                adapter->max_rx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_rx_queues = %lld\n",
                           adapter->max_rx_queues);
                break;
        case MAX_RX_ADD_QUEUES:
                adapter->max_rx_add_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_rx_add_queues = %lld\n",
                           adapter->max_rx_add_queues);
                break;
        case MIN_TX_ENTRIES_PER_SUBCRQ:
                adapter->min_tx_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_tx_entries_per_subcrq = %lld\n",
                           adapter->min_tx_entries_per_subcrq);
                break;
        case MIN_RX_ADD_ENTRIES_PER_SUBCRQ:
                adapter->min_rx_add_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_rx_add_entrs_per_subcrq = %lld\n",
                           adapter->min_rx_add_entries_per_subcrq);
                break;
        case MAX_TX_ENTRIES_PER_SUBCRQ:
                adapter->max_tx_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_tx_entries_per_subcrq = %lld\n",
                           adapter->max_tx_entries_per_subcrq);
                break;
        case MAX_RX_ADD_ENTRIES_PER_SUBCRQ:
                adapter->max_rx_add_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_rx_add_entrs_per_subcrq = %lld\n",
                           adapter->max_rx_add_entries_per_subcrq);
                break;
        case TCP_IP_OFFLOAD:
                adapter->tcp_ip_offload =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "tcp_ip_offload = %lld\n",
                           adapter->tcp_ip_offload);
                break;
        case PROMISC_SUPPORTED:
                adapter->promisc_supported =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "promisc_supported = %lld\n",
                           adapter->promisc_supported);
                break;
        case MIN_MTU:
-               adapter->min_mtu = be32_to_cpu(crq->query_capability.number);
+               adapter->min_mtu = be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_mtu = %lld\n", adapter->min_mtu);
                break;
        case MAX_MTU:
-               adapter->max_mtu = be32_to_cpu(crq->query_capability.number);
+               adapter->max_mtu = be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_mtu = %lld\n", adapter->max_mtu);
                break;
        case MAX_MULTICAST_FILTERS:
                adapter->max_multicast_filters =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_multicast_filters = %lld\n",
                           adapter->max_multicast_filters);
                break;
        case VLAN_HEADER_INSERTION:
                adapter->vlan_header_insertion =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                if (adapter->vlan_header_insertion)
                        netdev->features |= NETIF_F_HW_VLAN_STAG_TX;
                netdev_dbg(netdev, "vlan_header_insertion = %lld\n",
@@ -2406,43 +2406,43 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
                break;
        case MAX_TX_SG_ENTRIES:
                adapter->max_tx_sg_entries =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_tx_sg_entries = %lld\n",
                           adapter->max_tx_sg_entries);
                break;
        case RX_SG_SUPPORTED:
                adapter->rx_sg_supported =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "rx_sg_supported = %lld\n",
                           adapter->rx_sg_supported);
                break;
        case OPT_TX_COMP_SUB_QUEUES:
                adapter->opt_tx_comp_sub_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_tx_comp_sub_queues = %lld\n",
                           adapter->opt_tx_comp_sub_queues);
                break;
        case OPT_RX_COMP_QUEUES:
                adapter->opt_rx_comp_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_rx_comp_queues = %lld\n",
                           adapter->opt_rx_comp_queues);
                break;
        case OPT_RX_BUFADD_Q_PER_RX_COMP_Q:
                adapter->opt_rx_bufadd_q_per_rx_comp_q =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_rx_bufadd_q_per_rx_comp_q = %lld\n",
                           adapter->opt_rx_bufadd_q_per_rx_comp_q);
                break;
        case OPT_TX_ENTRIES_PER_SUBCRQ:
                adapter->opt_tx_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_tx_entries_per_subcrq = %lld\n",
                           adapter->opt_tx_entries_per_subcrq);
                break;
        case OPT_RXBA_ENTRIES_PER_SUBCRQ:
                adapter->opt_rxba_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_rxba_entries_per_subcrq = %lld\n",
                           adapter->opt_rxba_entries_per_subcrq);
                break;
index 1242925ad34cb6b0336524b1c3c1e569158aee24..1a9993cc79b572f58c172a5f4a071594cf4a93b1 100644 (file)
@@ -319,10 +319,8 @@ struct ibmvnic_capability {
        u8 first;
        u8 cmd;
        __be16 capability; /* one of ibmvnic_capabilities */
+       __be64 number;
        struct ibmvnic_rc rc;
-       __be32 number; /*FIX: should be __be64, but I'm getting the least
-                       * significant word first
-                       */
 } __packed __aligned(8);
 
 struct ibmvnic_login {
index b1de7afd41166cc9ef67ed7e43191aaff897ec4e..3ddf657bc10bcc51d5a217595ba8e98ed909e0cd 100644 (file)
@@ -270,11 +270,17 @@ jme_reset_mac_processor(struct jme_adapter *jme)
 }
 
 static inline void
-jme_clear_pm(struct jme_adapter *jme)
+jme_clear_pm_enable_wol(struct jme_adapter *jme)
 {
        jwrite32(jme, JME_PMCS, PMCS_STMASK | jme->reg_pmcs);
 }
 
+static inline void
+jme_clear_pm_disable_wol(struct jme_adapter *jme)
+{
+       jwrite32(jme, JME_PMCS, PMCS_STMASK);
+}
+
 static int
 jme_reload_eeprom(struct jme_adapter *jme)
 {
@@ -1853,7 +1859,7 @@ jme_open(struct net_device *netdev)
        struct jme_adapter *jme = netdev_priv(netdev);
        int rc;
 
-       jme_clear_pm(jme);
+       jme_clear_pm_disable_wol(jme);
        JME_NAPI_ENABLE(jme);
 
        tasklet_init(&jme->linkch_task, jme_link_change_tasklet,
@@ -1925,11 +1931,11 @@ jme_wait_link(struct jme_adapter *jme)
 static void
 jme_powersave_phy(struct jme_adapter *jme)
 {
-       if (jme->reg_pmcs) {
+       if (jme->reg_pmcs && device_may_wakeup(&jme->pdev->dev)) {
                jme_set_100m_half(jme);
                if (jme->reg_pmcs & (PMCS_LFEN | PMCS_LREN))
                        jme_wait_link(jme);
-               jme_clear_pm(jme);
+               jme_clear_pm_enable_wol(jme);
        } else {
                jme_phy_off(jme);
        }
@@ -2646,9 +2652,6 @@ jme_set_wol(struct net_device *netdev,
        if (wol->wolopts & WAKE_MAGIC)
                jme->reg_pmcs |= PMCS_MFEN;
 
-       jwrite32(jme, JME_PMCS, jme->reg_pmcs);
-       device_set_wakeup_enable(&jme->pdev->dev, !!(jme->reg_pmcs));
-
        return 0;
 }
 
@@ -3172,8 +3175,8 @@ jme_init_one(struct pci_dev *pdev,
        jme->mii_if.mdio_read = jme_mdio_read;
        jme->mii_if.mdio_write = jme_mdio_write;
 
-       jme_clear_pm(jme);
-       device_set_wakeup_enable(&pdev->dev, true);
+       jme_clear_pm_disable_wol(jme);
+       device_init_wakeup(&pdev->dev, true);
 
        jme_set_phyfifo_5level(jme);
        jme->pcirev = pdev->revision;
@@ -3304,7 +3307,7 @@ jme_resume(struct device *dev)
        if (!netif_running(netdev))
                return 0;
 
-       jme_clear_pm(jme);
+       jme_clear_pm_disable_wol(jme);
        jme_phy_on(jme);
        if (test_bit(JME_FLAG_SSET, &jme->flags))
                jme_set_settings(netdev, &jme->old_ecmd);
@@ -3312,13 +3315,14 @@ jme_resume(struct device *dev)
                jme_reset_phy_processor(jme);
        jme_phy_calibration(jme);
        jme_phy_setEA(jme);
-       jme_start_irq(jme);
        netif_device_attach(netdev);
 
        atomic_inc(&jme->link_changing);
 
        jme_reset_link(jme);
 
+       jme_start_irq(jme);
+
        return 0;
 }
 
index f191a16125893e5b742c5d99ea7bd814dba433a5..21e2c09602716351a21dc2762f1631a298bf05eb 100644 (file)
@@ -2245,7 +2245,7 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
        struct mlx4_en_dev *mdev = en_priv->mdev;
        u64 mac_u64 = mlx4_mac_to_u64(mac);
 
-       if (!is_valid_ether_addr(mac))
+       if (is_multicast_ether_addr(mac))
                return -EINVAL;
 
        return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64);
index 2cc3c626c3fea732d0862ff5f0ce520d442a6205..f8674ae62752d53bc768c16ac9b2de3053e183eb 100644 (file)
@@ -1256,6 +1256,7 @@ err_set_port:
 static int mlx4_mf_bond(struct mlx4_dev *dev)
 {
        int err = 0;
+       int nvfs;
        struct mlx4_slaves_pport slaves_port1;
        struct mlx4_slaves_pport slaves_port2;
        DECLARE_BITMAP(slaves_port_1_2, MLX4_MFUNC_MAX);
@@ -1272,11 +1273,18 @@ static int mlx4_mf_bond(struct mlx4_dev *dev)
                return -EINVAL;
        }
 
+       /* The number of virtual functions is the total number of functions
+        * minus one physical function per port.
+        */
+       nvfs = bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) +
+               bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1) - 2;
+
        /* limit on maximum allowed VFs */
-       if ((bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) +
-           bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1)) >
-           MAX_MF_BOND_ALLOWED_SLAVES)
+       if (nvfs > MAX_MF_BOND_ALLOWED_SLAVES) {
+               mlx4_warn(dev, "HA mode is not supported for %d VFs (max %d are allowed)\n",
+                         nvfs, MAX_MF_BOND_ALLOWED_SLAVES);
                return -EINVAL;
+       }
 
        if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) {
                mlx4_warn(dev, "HA mode unsupported for NON DMFS steering\n");
index 787b7bb54d52acd094570283bf6793f1bfb0dab0..211c65087997dd5a92cc53a13f9fd3869927d09d 100644 (file)
@@ -193,10 +193,10 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
        if (need_mf_bond) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -389,10 +389,10 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
        if (dup) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -479,10 +479,10 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
        if (dup) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -588,10 +588,10 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan,
        if (need_mf_bond) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -764,10 +764,10 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
        if (dup) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
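
The mutex_lock_nested() conversions in this file take two MAC/VLAN table mutexes that lockdep sees as the same lock class, so a plain second mutex_lock() would be reported as possible recursive locking. SINGLE_DEPTH_NESTING annotates the inner acquisition as intentional. A minimal sketch of the pattern (names hypothetical, not the mlx4 code):

#include <linux/mutex.h>

struct port_table {
        struct mutex mutex;
        /* ... table entries ... */
};

/* Both mutexes come from the same mutex_init() call site, so lockdep
 * places them in a single lock class. */
static struct port_table tables[2];

static void tables_init(void)
{
        int i;

        for (i = 0; i < 2; i++)
                mutex_init(&tables[i].mutex);
}

/* Take both tables; annotate the inner lock so lockdep does not warn
 * about recursive locking of the same class. */
static void tables_lock(int first)
{
        mutex_lock(&tables[first].mutex);
        mutex_lock_nested(&tables[1 - first].mutex, SINGLE_DEPTH_NESTING);
}

static void tables_unlock(int first)
{
        mutex_unlock(&tables[1 - first].mutex);
        mutex_unlock(&tables[first].mutex);
}
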
index aac071a7e830b5fd777da7a5e4d014d802ad70d0..5b1753233c5dd8c30c1dd110a952c6d9c4360e39 100644 (file)
@@ -223,6 +223,7 @@ struct mlx5e_pport_stats {
 
 static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
        "packets",
+       "bytes",
        "csum_none",
        "csum_sw",
        "lro_packets",
@@ -232,16 +233,18 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 
 struct mlx5e_rq_stats {
        u64 packets;
+       u64 bytes;
        u64 csum_none;
        u64 csum_sw;
        u64 lro_packets;
        u64 lro_bytes;
        u64 wqe_err;
-#define NUM_RQ_STATS 6
+#define NUM_RQ_STATS 7
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
        "packets",
+       "bytes",
        "tso_packets",
        "tso_bytes",
        "csum_offload_none",
@@ -253,6 +256,7 @@ static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
 
 struct mlx5e_sq_stats {
        u64 packets;
+       u64 bytes;
        u64 tso_packets;
        u64 tso_bytes;
        u64 csum_offload_none;
@@ -260,7 +264,7 @@ struct mlx5e_sq_stats {
        u64 wake;
        u64 dropped;
        u64 nop;
-#define NUM_SQ_STATS 8
+#define NUM_SQ_STATS 9
 };
 
 struct mlx5e_stats {
@@ -304,14 +308,9 @@ enum {
        MLX5E_RQ_STATE_POST_WQES_ENABLE,
 };
 
-enum cq_flags {
-       MLX5E_CQ_HAS_CQES = 1,
-};
-
 struct mlx5e_cq {
        /* data path - accessed per cqe */
        struct mlx5_cqwq           wq;
-       unsigned long              flags;
 
        /* data path - accessed per napi poll */
        struct napi_struct        *napi;
@@ -452,6 +451,8 @@ enum mlx5e_traffic_types {
        MLX5E_NUM_TT,
 };
 
+#define IS_HASHING_TT(tt) (tt != MLX5E_TT_ANY)
+
 enum mlx5e_rqt_ix {
        MLX5E_INDIRECTION_RQT,
        MLX5E_SINGLE_RQ_RQT,
@@ -618,9 +619,12 @@ void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv);
 void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv);
 
 int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix);
+void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv);
 
 int mlx5e_open_locked(struct net_device *netdev);
 int mlx5e_close_locked(struct net_device *netdev);
+void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+                                  int num_channels);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
                                      struct mlx5e_tx_wqe *wqe, int bf_sz)
index be6543570b2be18d476773f6b530dfeab6057800..2018eebe1531563e036c41c19372331f8a576b14 100644 (file)
@@ -62,10 +62,11 @@ static void mlx5e_timestamp_overflow(struct work_struct *work)
        struct delayed_work *dwork = to_delayed_work(work);
        struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp,
                                                   overflow_work);
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_read(&tstamp->clock);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
        schedule_delayed_work(&tstamp->overflow_work, tstamp->overflow_period);
 }
 
@@ -136,10 +137,11 @@ static int mlx5e_ptp_settime(struct ptp_clock_info *ptp,
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                   ptp_info);
        u64 ns = timespec64_to_ns(ts);
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_init(&tstamp->clock, &tstamp->cycles, ns);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        return 0;
 }
@@ -150,10 +152,11 @@ static int mlx5e_ptp_gettime(struct ptp_clock_info *ptp,
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                   ptp_info);
        u64 ns;
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        ns = timecounter_read(&tstamp->clock);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        *ts = ns_to_timespec64(ns);
 
@@ -164,10 +167,11 @@ static int mlx5e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 {
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                   ptp_info);
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_adjtime(&tstamp->clock, delta);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        return 0;
 }
@@ -176,6 +180,7 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
 {
        u64 adj;
        u32 diff;
+       unsigned long flags;
        int neg_adj = 0;
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                  ptp_info);
@@ -189,11 +194,11 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
        adj *= delta;
        diff = div_u64(adj, 1000000000ULL);
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_read(&tstamp->clock);
        tstamp->cycles.mult = neg_adj ? tstamp->nominal_c_mult - diff :
                                        tstamp->nominal_c_mult + diff;
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        return 0;
 }
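
The write_lock() to write_lock_irqsave() conversions above suggest that tstamp->lock is also taken from interrupt context (the RX/TX timestamping path); without disabling local interrupts, an interrupt taking the same lock on the same CPU would deadlock against the process-context holder. A minimal sketch of the pattern, with hypothetical names:

#include <linux/spinlock.h>
#include <linux/timecounter.h>

struct example_clock {
        rwlock_t lock;                  /* also taken from irq context */
        struct timecounter tc;
};

/* Process-context update: save and disable local interrupts around the
 * write lock so an irq handler on this CPU cannot spin on the lock we
 * are holding. */
static void example_clock_refresh(struct example_clock *clk)
{
        unsigned long flags;

        write_lock_irqsave(&clk->lock, flags);
        timecounter_read(&clk->tc);
        write_unlock_irqrestore(&clk->lock, flags);
}
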
index 65624ac65b4c347ac5dedf55f3b209de799e7e5f..5abeb00fceb8b0876d17f4aab813054d55414cc8 100644 (file)
@@ -385,6 +385,8 @@ static int mlx5e_set_channels(struct net_device *dev,
                mlx5e_close_locked(dev);
 
        priv->params.num_channels = count;
+       mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
+                                     MLX5E_INDIR_RQT_SIZE, count);
 
        if (was_opened)
                err = mlx5e_open_locked(dev);
@@ -703,18 +705,36 @@ static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
        return 0;
 }
 
+static void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen)
+{
+       struct mlx5_core_dev *mdev = priv->mdev;
+       void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx);
+       int i;
+
+       MLX5_SET(modify_tir_in, in, bitmask.hash, 1);
+       mlx5e_build_tir_ctx_hash(tirc, priv);
+
+       for (i = 0; i < MLX5E_NUM_TT; i++)
+               if (IS_HASHING_TT(i))
+                       mlx5_core_modify_tir(mdev, priv->tirn[i], in, inlen);
+}
+
 static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
                          const u8 *key, const u8 hfunc)
 {
        struct mlx5e_priv *priv = netdev_priv(dev);
-       bool close_open;
-       int err = 0;
+       int inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+       void *in;
 
        if ((hfunc != ETH_RSS_HASH_NO_CHANGE) &&
            (hfunc != ETH_RSS_HASH_XOR) &&
            (hfunc != ETH_RSS_HASH_TOP))
                return -EINVAL;
 
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
        mutex_lock(&priv->state_lock);
 
        if (indir) {
@@ -723,11 +743,6 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
                mlx5e_redirect_rqt(priv, MLX5E_INDIRECTION_RQT);
        }
 
-       close_open = (key || (hfunc != ETH_RSS_HASH_NO_CHANGE)) &&
-                    test_bit(MLX5E_STATE_OPENED, &priv->state);
-       if (close_open)
-               mlx5e_close_locked(dev);
-
        if (key)
                memcpy(priv->params.toeplitz_hash_key, key,
                       sizeof(priv->params.toeplitz_hash_key));
@@ -735,12 +750,13 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
        if (hfunc != ETH_RSS_HASH_NO_CHANGE)
                priv->params.rss_hfunc = hfunc;
 
-       if (close_open)
-               err = mlx5e_open_locked(priv->netdev);
+       mlx5e_modify_tirs_hash(priv, in, inlen);
 
        mutex_unlock(&priv->state_lock);
 
-       return err;
+       kvfree(in);
+
+       return 0;
 }
 
 static int mlx5e_get_rxnfc(struct net_device *netdev,
index d4e1c30452009718d9761a8a4949f5195d7f4c59..402994bf7e167d1ae23a8943789f712f82862f46 100644 (file)
@@ -141,6 +141,10 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
                return;
 
        /* Collect first the SW counters and then HW for consistency */
+       s->rx_packets           = 0;
+       s->rx_bytes             = 0;
+       s->tx_packets           = 0;
+       s->tx_bytes             = 0;
        s->tso_packets          = 0;
        s->tso_bytes            = 0;
        s->tx_queue_stopped     = 0;
@@ -155,6 +159,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
        for (i = 0; i < priv->params.num_channels; i++) {
                rq_stats = &priv->channel[i]->rq.stats;
 
+               s->rx_packets   += rq_stats->packets;
+               s->rx_bytes     += rq_stats->bytes;
                s->lro_packets  += rq_stats->lro_packets;
                s->lro_bytes    += rq_stats->lro_bytes;
                s->rx_csum_none += rq_stats->csum_none;
@@ -164,6 +170,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
                for (j = 0; j < priv->params.num_tc; j++) {
                        sq_stats = &priv->channel[i]->sq[j].stats;
 
+                       s->tx_packets           += sq_stats->packets;
+                       s->tx_bytes             += sq_stats->bytes;
                        s->tso_packets          += sq_stats->tso_packets;
                        s->tso_bytes            += sq_stats->tso_bytes;
                        s->tx_queue_stopped     += sq_stats->stopped;
@@ -225,23 +233,6 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
        s->tx_broadcast_bytes   =
                MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
 
-       s->rx_packets =
-               s->rx_unicast_packets +
-               s->rx_multicast_packets +
-               s->rx_broadcast_packets;
-       s->rx_bytes =
-               s->rx_unicast_bytes +
-               s->rx_multicast_bytes +
-               s->rx_broadcast_bytes;
-       s->tx_packets =
-               s->tx_unicast_packets +
-               s->tx_multicast_packets +
-               s->tx_broadcast_packets;
-       s->tx_bytes =
-               s->tx_unicast_bytes +
-               s->tx_multicast_bytes +
-               s->tx_broadcast_bytes;
-
        /* Update calculated offload counters */
        s->tx_csum_offload = s->tx_packets - tx_offload_none;
        s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
@@ -1199,7 +1190,6 @@ static void mlx5e_fill_indir_rqt_rqns(struct mlx5e_priv *priv, void *rqtc)
                        ix = mlx5e_bits_invert(i, MLX5E_LOG_INDIR_RQT_SIZE);
 
                ix = priv->params.indirection_rqt[ix];
-               ix = ix % priv->params.num_channels;
                MLX5_SET(rqtc, rqtc, rq_num[i],
                         test_bit(MLX5E_STATE_OPENED, &priv->state) ?
                         priv->channel[ix]->rq.rqn :
@@ -1317,7 +1307,22 @@ static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv)
                              lro_timer_supported_periods[2]));
 }
 
-static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
+void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv)
+{
+       MLX5_SET(tirc, tirc, rx_hash_fn,
+                mlx5e_rx_hash_fn(priv->params.rss_hfunc));
+       if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) {
+               void *rss_key = MLX5_ADDR_OF(tirc, tirc,
+                                            rx_hash_toeplitz_key);
+               size_t len = MLX5_FLD_SZ_BYTES(tirc,
+                                              rx_hash_toeplitz_key);
+
+               MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+               memcpy(rss_key, priv->params.toeplitz_hash_key, len);
+       }
+}
+
+static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv)
 {
        struct mlx5_core_dev *mdev = priv->mdev;
 
@@ -1325,6 +1330,7 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
        void *tirc;
        int inlen;
        int err;
+       int tt;
 
        inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
        in = mlx5_vzalloc(inlen);
@@ -1336,7 +1342,11 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
 
        mlx5e_build_tir_ctx_lro(tirc, priv);
 
-       err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen);
+       for (tt = 0; tt < MLX5E_NUM_TT; tt++) {
+               err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen);
+               if (err)
+                       break;
+       }
 
        kvfree(in);
 
@@ -1672,17 +1682,7 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
        default:
                MLX5_SET(tirc, tirc, indirect_table,
                         priv->rqtn[MLX5E_INDIRECTION_RQT]);
-               MLX5_SET(tirc, tirc, rx_hash_fn,
-                        mlx5e_rx_hash_fn(priv->params.rss_hfunc));
-               if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) {
-                       void *rss_key = MLX5_ADDR_OF(tirc, tirc,
-                                                    rx_hash_toeplitz_key);
-                       size_t len = MLX5_FLD_SZ_BYTES(tirc,
-                                                      rx_hash_toeplitz_key);
-
-                       MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
-                       memcpy(rss_key, priv->params.toeplitz_hash_key, len);
-               }
+               mlx5e_build_tir_ctx_hash(tirc, priv);
                break;
        }
 
@@ -1885,8 +1885,10 @@ static int mlx5e_set_features(struct net_device *netdev,
                        mlx5e_close_locked(priv->netdev);
 
                priv->params.lro_en = !!(features & NETIF_F_LRO);
-               mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV4_TCP);
-               mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV6_TCP);
+               err = mlx5e_modify_tirs_lro(priv);
+               if (err)
+                       mlx5_core_warn(priv->mdev, "lro modify failed, %d\n",
+                                      err);
 
                if (was_opened)
                        err = mlx5e_open_locked(priv->netdev);
@@ -2089,12 +2091,20 @@ u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev)
               2 /*sizeof(mlx5e_tx_wqe.inline_hdr_start)*/;
 }
 
+void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+                                  int num_channels)
+{
+       int i;
+
+       for (i = 0; i < len; i++)
+               indirection_rqt[i] = i % num_channels;
+}
+
 static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
                                    struct net_device *netdev,
                                    int num_channels)
 {
        struct mlx5e_priv *priv = netdev_priv(netdev);
-       int i;
 
        priv->params.log_sq_size           =
                MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
@@ -2118,8 +2128,8 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
        netdev_rss_key_fill(priv->params.toeplitz_hash_key,
                            sizeof(priv->params.toeplitz_hash_key));
 
-       for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++)
-               priv->params.indirection_rqt[i] = i % num_channels;
+       mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
+                                     MLX5E_INDIR_RQT_SIZE, num_channels);
 
        priv->params.lro_wqe_sz            =
                MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
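
mlx5e_build_default_indir_rqt() is just a round-robin spread of indirection-table slots over the active channels, and calling it again from the ethtool set_channels path keeps every slot pointing at a channel that still exists. The mapping itself is easy to verify standalone:

#include <stdio.h>

/* Same round-robin fill as mlx5e_build_default_indir_rqt(): slot i of
 * the RSS indirection table maps to channel (i % num_channels). */
static void build_default_indir_rqt(unsigned int *indirection_rqt, int len,
                                    int num_channels)
{
        int i;

        for (i = 0; i < len; i++)
                indirection_rqt[i] = i % num_channels;
}

int main(void)
{
        unsigned int rqt[8];
        int i;

        build_default_indir_rqt(rqt, 8, 3);     /* 8-entry table, 3 channels */
        for (i = 0; i < 8; i++)
                printf("slot %d -> channel %u\n", i, rqt[i]);
        return 0;
}
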
index dd959d929aadd561ab78c252fe2127e1beb24fad..59658b9d05d1fc57c9da02f39f96a245929f42e4 100644 (file)
@@ -230,10 +230,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
        struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
        int work_done;
 
-       /* avoid accessing cq (dma coherent memory) if not needed */
-       if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags))
-               return 0;
-
        for (work_done = 0; work_done < budget; work_done++) {
                struct mlx5e_rx_wqe *wqe;
                struct mlx5_cqe64 *cqe;
@@ -267,6 +263,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 
                mlx5e_build_rx_skb(cqe, rq, skb);
                rq->stats.packets++;
+               rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
                napi_gro_receive(cq->napi, skb);
 
 wq_ll_pop:
@@ -279,8 +276,5 @@ wq_ll_pop:
        /* ensure cq space is freed before enabling more cqes */
        wmb();
 
-       if (work_done == budget)
-               set_bit(MLX5E_CQ_HAS_CQES, &cq->flags);
-
        return work_done;
 }
index 2c3fba0fff546179f68a53ab23fcdf7620ba188e..bb4eeeb007dec48eb022a6917f69d7fd5e1e5071 100644 (file)
@@ -179,6 +179,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
        unsigned int skb_len = skb->len;
        u8  opcode = MLX5_OPCODE_SEND;
        dma_addr_t dma_addr = 0;
+       unsigned int num_bytes;
        bool bf = false;
        u16 headlen;
        u16 ds_cnt;
@@ -204,8 +205,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
                opcode       = MLX5_OPCODE_LSO;
                ihs          = skb_transport_offset(skb) + tcp_hdrlen(skb);
                payload_len  = skb->len - ihs;
-               wi->num_bytes = skb->len +
-                               (skb_shinfo(skb)->gso_segs - 1) * ihs;
+               num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
                sq->stats.tso_packets++;
                sq->stats.tso_bytes += payload_len;
        } else {
@@ -213,9 +213,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
                     !skb->xmit_more &&
                     !skb_shinfo(skb)->nr_frags;
                ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
-               wi->num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+               num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
        }
 
+       wi->num_bytes = num_bytes;
+
        if (skb_vlan_tag_present(skb)) {
                mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, &skb_data,
                                  &skb_len);
@@ -307,6 +309,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
        sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
 
        sq->stats.packets++;
+       sq->stats.bytes += num_bytes;
        return NETDEV_TX_OK;
 
 dma_unmap_wqe_err:
@@ -335,10 +338,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq)
        u16 sqcc;
        int i;
 
-       /* avoid accessing cq (dma coherent memory) if not needed */
-       if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags))
-               return false;
-
        sq = container_of(cq, struct mlx5e_sq, cq);
 
        npkts = 0;
@@ -422,10 +421,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq)
                                netif_tx_wake_queue(sq->txq);
                                sq->stats.wake++;
        }
-       if (i == MLX5E_TX_CQ_POLL_BUDGET) {
-               set_bit(MLX5E_CQ_HAS_CQES, &cq->flags);
-               return true;
-       }
 
-       return false;
+       return (i == MLX5E_TX_CQ_POLL_BUDGET);
 }
index 4ac8d716dbddfd9650d50c9c2af15efe7089a7d6..66d51a77609e890263cd91277c6b8e7315d660e3 100644 (file)
@@ -88,7 +88,6 @@ void mlx5e_completion_event(struct mlx5_core_cq *mcq)
 {
        struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
 
-       set_bit(MLX5E_CQ_HAS_CQES, &cq->flags);
        set_bit(MLX5E_CHANNEL_NAPI_SCHED, &cq->channel->flags);
        barrier();
        napi_schedule(cq->napi);
index c071077aafbdb53ad959538b290b203f2256335e..7992c553c1f5ce2e4de9cec4caa798e0da676531 100644 (file)
@@ -215,7 +215,7 @@ mlxsw_pci_queue_elem_info_producer_get(struct mlxsw_pci_queue *q)
 {
        int index = q->producer_counter & (q->count - 1);
 
-       if ((q->producer_counter - q->consumer_counter) == q->count)
+       if ((u16) (q->producer_counter - q->consumer_counter) == q->count)
                return NULL;
        return mlxsw_pci_queue_elem_info_get(q, index);
 }
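
The (u16) cast matters because the producer and consumer counters are free-running 16-bit values: after integer promotion, their plain difference goes negative once the producer has wrapped past zero while the consumer has not, and the "ring full" test silently stops working. A standalone check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Free-running 16-bit ring counters: the number of elements in flight
 * is the difference reduced modulo 2^16, which is what the (uint16_t)
 * cast provides after the implicit promotion to int. */
static int ring_is_full(uint16_t producer, uint16_t consumer, uint16_t count)
{
        return (uint16_t)(producer - consumer) == count;
}

int main(void)
{
        /* producer has wrapped past zero, consumer has not */
        uint16_t producer = 5, consumer = 65413, count = 128;

        printf("without cast: %d\n", (producer - consumer) == count);          /* 0 */
        printf("with cast:    %d\n", ring_is_full(producer, consumer, count)); /* 1 */
        return 0;
}
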
index 09ce451c283bb8dfb5dd9c4c69bdeb04c0e796b3..a94daa8c346ca11ca10f6eed09b0c15a28dde2c4 100644 (file)
@@ -2358,9 +2358,7 @@ static int mlxsw_sp_port_lag_leave(struct mlxsw_sp_port *mlxsw_sp_port,
        if (mlxsw_sp_port->bridged) {
                mlxsw_sp_port_active_vlans_del(mlxsw_sp_port);
                mlxsw_sp_port_bridge_leave(mlxsw_sp_port, false);
-
-               if (lag->ref_count == 1)
-                       mlxsw_sp_master_bridge_dec(mlxsw_sp, NULL);
+               mlxsw_sp_master_bridge_dec(mlxsw_sp, NULL);
        }
 
        if (lag->ref_count == 1) {
index 00cfd95ca59d53fe040ef5b002e7d30604d35e96..3e67f451f2ab918c95d7a6c3429d45b359e18978 100644 (file)
@@ -474,9 +474,9 @@ static int moxart_mac_probe(struct platform_device *pdev)
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        ndev->base_addr = res->start;
        priv->base = devm_ioremap_resource(p_dev, res);
-       ret = IS_ERR(priv->base);
-       if (ret) {
+       if (IS_ERR(priv->base)) {
                dev_err(p_dev, "devm_ioremap_resource failed\n");
+               ret = PTR_ERR(priv->base);
                goto init_fail;
        }
 
index 689a4a5c8dcfc30fba2675e17293e98691648dd2..1ef03939d25f478356112fbe376ff79cfd1e8eaa 100644 (file)
@@ -811,7 +811,7 @@ qcaspi_netdev_setup(struct net_device *dev)
        dev->netdev_ops = &qcaspi_netdev_ops;
        qcaspi_set_ethtool_ops(dev);
        dev->watchdog_timeo = QCASPI_TX_TIMEOUT;
-       dev->flags = IFF_MULTICAST;
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->tx_queue_len = 100;
 
        qca = netdev_priv(dev);
index 537974cfd427091442acc9098a0a40535bbda431..dd2cf3738b738bdf5b21e73587c5ab9aeecbe3c7 100644 (file)
@@ -4933,8 +4933,6 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp)
                RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST);
                break;
        case RTL_GIGA_MAC_VER_40:
-               RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF);
-               break;
        case RTL_GIGA_MAC_VER_41:
        case RTL_GIGA_MAC_VER_42:
        case RTL_GIGA_MAC_VER_43:
@@ -4943,8 +4941,6 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp)
        case RTL_GIGA_MAC_VER_46:
        case RTL_GIGA_MAC_VER_47:
        case RTL_GIGA_MAC_VER_48:
-               RTL_W32(RxConfig, RX128_INT_EN | RX_DMA_BURST | RX_EARLY_OFF);
-               break;
        case RTL_GIGA_MAC_VER_49:
        case RTL_GIGA_MAC_VER_50:
        case RTL_GIGA_MAC_VER_51:
@@ -7730,10 +7726,13 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
+       struct pci_dev *pdev = tp->pci_dev;
        struct rtl8169_counters *counters = tp->counters;
        unsigned int start;
 
-       if (netif_running(dev))
+       pm_runtime_get_noresume(&pdev->dev);
+
+       if (netif_running(dev) && pm_runtime_active(&pdev->dev))
                rtl8169_rx_missed(dev, ioaddr);
 
        do {
@@ -7761,7 +7760,8 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
         * Fetch additional counter values missing in stats collected by driver
         * from tally counters.
         */
-       rtl8169_update_counters(dev);
+       if (pm_runtime_active(&pdev->dev))
+               rtl8169_update_counters(dev);
 
        /*
         * Subtract values fetched during initialization.
@@ -7774,6 +7774,8 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
        stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted) -
                le16_to_cpu(tp->tc_offset.tx_aborted);
 
+       pm_runtime_put_noidle(&pdev->dev);
+
        return stats;
 }
 
@@ -7853,6 +7855,10 @@ static int rtl8169_runtime_suspend(struct device *device)
 
        rtl8169_net_suspend(dev);
 
+       /* Update counters before going runtime suspend */
+       rtl8169_rx_missed(dev, tp->mmio_addr);
+       rtl8169_update_counters(dev);
+
        return 0;
 }
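
The r8169 change follows a common runtime-PM rule: ndo_get_stats64 may run while the device is runtime-suspended, so the driver pins the power state with pm_runtime_get_noresume(), touches MMIO only when pm_runtime_active() says the device is powered, and releases with pm_runtime_put_noidle(). A hedged sketch of that shape, with a hypothetical register helper:

#include <linux/pm_runtime.h>
#include <linux/netdevice.h>

/* Hypothetical helper standing in for the driver's MMIO accessors. */
static void hw_sync_counters(struct net_device *dev)
{
        /* read and cache hardware counters */
}

static void example_get_stats(struct net_device *dev, struct device *pmdev)
{
        /* Pin the current power state without waking the device. */
        pm_runtime_get_noresume(pmdev);

        /* Only touch hardware registers if the device is actually on;
         * otherwise report whatever was cached before suspend. */
        if (netif_running(dev) && pm_runtime_active(pmdev))
                hw_sync_counters(dev);

        /* ... fill in stats from cached/software counters ... */

        pm_runtime_put_noidle(pmdev);
}
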
 
index 744d7806a9eec63e1c4fb18cc7b57fa96d73fcbc..86449c357168ebb4cd6c6fa25d1503583b8cd82d 100644 (file)
@@ -1722,7 +1722,6 @@ static int ravb_set_gti(struct net_device *ndev)
 static int ravb_probe(struct platform_device *pdev)
 {
        struct device_node *np = pdev->dev.of_node;
-       const struct of_device_id *match;
        struct ravb_private *priv;
        enum ravb_chip_id chip_id;
        struct net_device *ndev;
@@ -1754,8 +1753,7 @@ static int ravb_probe(struct platform_device *pdev)
        ndev->base_addr = res->start;
        ndev->dma = -1;
 
-       match = of_match_device(of_match_ptr(ravb_match_table), &pdev->dev);
-       chip_id = (enum ravb_chip_id)match->data;
+       chip_id = (enum ravb_chip_id)of_device_get_match_data(&pdev->dev);
 
        if (chip_id == RCAR_GEN3)
                irq = platform_get_irq_byname(pdev, "ch22");
index dfa9e59c9442884dfe6a52a5c6df0c4c70fa4737..7384499928761612775af0f8b1eaf701fcbb4d18 100644 (file)
@@ -3061,15 +3061,11 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
        mdp->ether_link_active_low = pd->ether_link_active_low;
 
        /* set cpu data */
-       if (id) {
+       if (id)
                mdp->cd = (struct sh_eth_cpu_data *)id->driver_data;
-       } else  {
-               const struct of_device_id *match;
+       else
+               mdp->cd = (struct sh_eth_cpu_data *)of_device_get_match_data(&pdev->dev);
 
-               match = of_match_device(of_match_ptr(sh_eth_match_table),
-                                       &pdev->dev);
-               mdp->cd = (struct sh_eth_cpu_data *)match->data;
-       }
        mdp->reg_offset = sh_eth_get_register_offset(mdp->cd->register_type);
        if (!mdp->reg_offset) {
                dev_err(&pdev->dev, "Unknown register type (%d)\n",
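
Both the ravb and sh_eth hunks replace the open-coded of_match_device() lookup with of_device_get_match_data(), which returns the .data pointer of the matching OF table entry directly (or NULL when there is no OF match). A minimal sketch of how a probe routine typically uses it, with hypothetical names:

#include <linux/module.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>

struct chip_cfg {
        int num_queues;
};

static const struct chip_cfg cfg_gen2 = { .num_queues = 1 };
static const struct chip_cfg cfg_gen3 = { .num_queues = 4 };

static const struct of_device_id example_match[] = {
        { .compatible = "vendor,chip-gen2", .data = &cfg_gen2 },
        { .compatible = "vendor,chip-gen3", .data = &cfg_gen3 },
        { /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, example_match);

static int example_probe(struct platform_device *pdev)
{
        /* Returns the matching entry's .data, or NULL without an OF match. */
        const struct chip_cfg *cfg = of_device_get_match_data(&pdev->dev);

        if (!cfg)
                return -ENODEV;

        dev_info(&pdev->dev, "%d queues\n", cfg->num_queues);
        return 0;
}
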
index 0faf1633603531a709358388b116d596d746e3e9..efb54f356a67c03e36856ed0047f5e3d9b2bf0f1 100644 (file)
@@ -199,21 +199,12 @@ int stmmac_mdio_register(struct net_device *ndev)
        struct stmmac_priv *priv = netdev_priv(ndev);
        struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data;
        int addr, found;
-       struct device_node *mdio_node = NULL;
-       struct device_node *child_node = NULL;
+       struct device_node *mdio_node = priv->plat->mdio_node;
 
        if (!mdio_bus_data)
                return 0;
 
        if (IS_ENABLED(CONFIG_OF)) {
-               for_each_child_of_node(priv->device->of_node, child_node) {
-                       if (of_device_is_compatible(child_node,
-                                                   "snps,dwmac-mdio")) {
-                               mdio_node = child_node;
-                               break;
-                       }
-               }
-
                if (mdio_node) {
                        netdev_dbg(ndev, "FOUND MDIO subnode\n");
                } else {
index 6a52fa18cbf2e94958bdfe44c4db258ababcdd02..4514ba73d96116317ca5ff8b797b037ae7434ffd 100644 (file)
@@ -110,6 +110,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
        struct device_node *np = pdev->dev.of_node;
        struct plat_stmmacenet_data *plat;
        struct stmmac_dma_cfg *dma_cfg;
+       struct device_node *child_node = NULL;
 
        plat = devm_kzalloc(&pdev->dev, sizeof(*plat), GFP_KERNEL);
        if (!plat)
@@ -140,13 +141,19 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
                plat->phy_node = of_node_get(np);
        }
 
+       for_each_child_of_node(np, child_node)
+               if (of_device_is_compatible(child_node, "snps,dwmac-mdio")) {
+                       plat->mdio_node = child_node;
+                       break;
+               }
+
        /* "snps,phy-addr" is not a standard property. Mark it as deprecated
         * and warn of its use. Remove this when phy node support is added.
         */
        if (of_property_read_u32(np, "snps,phy-addr", &plat->phy_addr) == 0)
                dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n");
 
-       if ((plat->phy_node && !of_phy_is_fixed_link(np)) || plat->phy_bus_name)
+       if ((plat->phy_node && !of_phy_is_fixed_link(np)) || !plat->mdio_node)
                plat->mdio_bus_data = NULL;
        else
                plat->mdio_bus_data =
index fc8bbff2d7e37ec19d807008c1e9b70040551ea2..af11ed1e0bcc09b4ed4fa5e94df8945fab4d581e 100644 (file)
 #define DWC_MMC_RXOCTETCOUNT_GB          0x0784
 #define DWC_MMC_RXPACKETCOUNT_GB         0x0780
 
-static int debug = 3;
+static int debug = -1;
 module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "DWC_eth_qos debug level (0=none,...,16=all)");
 
@@ -650,6 +650,11 @@ struct net_local {
        u32 mmc_tx_counters_mask;
 
        struct dwceqos_flowcontrol flowcontrol;
+
+       /* Tracks the intermediate state of phy started but hardware
+        * init not finished yet.
+        */
+       bool phy_defer;
 };
 
 static void dwceqos_read_mmc_counters(struct net_local *lp, u32 rx_mask,
@@ -901,6 +906,9 @@ static void dwceqos_adjust_link(struct net_device *ndev)
        struct phy_device *phydev = lp->phy_dev;
        int status_change = 0;
 
+       if (lp->phy_defer)
+               return;
+
        if (phydev->link) {
                if ((lp->speed != phydev->speed) ||
                    (lp->duplex != phydev->duplex)) {
@@ -1113,7 +1121,7 @@ static int dwceqos_descriptor_init(struct net_local *lp)
        /* Allocate DMA descriptors */
        size = DWCEQOS_RX_DCNT * sizeof(struct dwceqos_dma_desc);
        lp->rx_descs = dma_alloc_coherent(lp->ndev->dev.parent, size,
-                       &lp->rx_descs_addr, 0);
+                       &lp->rx_descs_addr, GFP_KERNEL);
        if (!lp->rx_descs)
                goto err_out;
        lp->rx_descs_tail_addr = lp->rx_descs_addr +
@@ -1121,7 +1129,7 @@ static int dwceqos_descriptor_init(struct net_local *lp)
 
        size = DWCEQOS_TX_DCNT * sizeof(struct dwceqos_dma_desc);
        lp->tx_descs = dma_alloc_coherent(lp->ndev->dev.parent, size,
-                       &lp->tx_descs_addr, 0);
+                       &lp->tx_descs_addr, GFP_KERNEL);
        if (!lp->tx_descs)
                goto err_out;
        lp->tx_descs_tail_addr = lp->tx_descs_addr +
@@ -1635,6 +1643,12 @@ static void dwceqos_init_hw(struct net_local *lp)
        regval = dwceqos_read(lp, REG_DWCEQOS_MAC_CFG);
        dwceqos_write(lp, REG_DWCEQOS_MAC_CFG,
                      regval | DWCEQOS_MAC_CFG_TE | DWCEQOS_MAC_CFG_RE);
+
+       lp->phy_defer = false;
+       mutex_lock(&lp->phy_dev->lock);
+       phy_read_status(lp->phy_dev);
+       dwceqos_adjust_link(lp->ndev);
+       mutex_unlock(&lp->phy_dev->lock);
 }
 
 static void dwceqos_tx_reclaim(unsigned long data)
@@ -1880,9 +1894,13 @@ static int dwceqos_open(struct net_device *ndev)
        }
        netdev_reset_queue(ndev);
 
+       /* The dwceqos reset state machine requires all phy clocks to complete,
+        * hence the unusual init order with phy_start first.
+        */
+       lp->phy_defer = true;
+       phy_start(lp->phy_dev);
        dwceqos_init_hw(lp);
        napi_enable(&lp->napi);
-       phy_start(lp->phy_dev);
 
        netif_start_queue(ndev);
        tasklet_enable(&lp->tx_bdreclaim_tasklet);
@@ -1915,18 +1933,19 @@ static int dwceqos_stop(struct net_device *ndev)
 {
        struct net_local *lp = netdev_priv(ndev);
 
-       phy_stop(lp->phy_dev);
-
        tasklet_disable(&lp->tx_bdreclaim_tasklet);
-       netif_stop_queue(ndev);
        napi_disable(&lp->napi);
 
-       dwceqos_drain_dma(lp);
+       /* Stop all tx before we drain the tx dma. */
+       netif_tx_lock_bh(lp->ndev);
+       netif_stop_queue(ndev);
+       netif_tx_unlock_bh(lp->ndev);
 
-       netif_tx_lock(lp->ndev);
+       dwceqos_drain_dma(lp);
        dwceqos_reset_hw(lp);
+       phy_stop(lp->phy_dev);
+
        dwceqos_descriptor_free(lp);
-       netif_tx_unlock(lp->ndev);
 
        return 0;
 }
@@ -2178,12 +2197,10 @@ static int dwceqos_start_xmit(struct sk_buff *skb, struct net_device *ndev)
                ((trans.initial_descriptor + trans.nr_descriptors) %
                 DWCEQOS_TX_DCNT));
 
-       dwceqos_tx_finalize(skb, lp, &trans);
-
-       netdev_sent_queue(ndev, skb->len);
-
        spin_lock_bh(&lp->tx_lock);
        lp->tx_free -= trans.nr_descriptors;
+       dwceqos_tx_finalize(skb, lp, &trans);
+       netdev_sent_queue(ndev, skb->len);
        spin_unlock_bh(&lp->tx_lock);
 
        ndev->trans_start = jiffies;
index 03833dbfca67d1ab53ca1af42398e7ea9dd04f01..dc85f7095e51038c0ebbb1182ba0d169ec0b2dd7 100644 (file)
@@ -297,6 +297,17 @@ static int kszphy_config_init(struct phy_device *phydev)
        if (priv->led_mode >= 0)
                kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode);
 
+       if (phy_interrupt_is_valid(phydev)) {
+               int ctl = phy_read(phydev, MII_BMCR);
+
+               if (ctl < 0)
+                       return ctl;
+
+               ret = phy_write(phydev, MII_BMCR, ctl & ~BMCR_ANENABLE);
+               if (ret < 0)
+                       return ret;
+       }
+
        return 0;
 }
 
@@ -635,6 +646,21 @@ static void kszphy_get_stats(struct phy_device *phydev,
                data[i] = kszphy_get_stat(phydev, i);
 }
 
+static int kszphy_resume(struct phy_device *phydev)
+{
+       int value;
+
+       mutex_lock(&phydev->lock);
+
+       value = phy_read(phydev, MII_BMCR);
+       phy_write(phydev, MII_BMCR, value & ~BMCR_PDOWN);
+
+       kszphy_config_intr(phydev);
+       mutex_unlock(&phydev->lock);
+
+       return 0;
+}
+
 static int kszphy_probe(struct phy_device *phydev)
 {
        const struct kszphy_type *type = phydev->drv->driver_data;
@@ -844,7 +870,7 @@ static struct phy_driver ksphy_driver[] = {
        .get_strings    = kszphy_get_strings,
        .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
-       .resume         = genphy_resume,
+       .resume         = kszphy_resume,
 }, {
        .phy_id         = PHY_ID_KSZ8061,
        .name           = "Micrel KSZ8061",
index fc8ad001bc949e894e7659301af65565a5a21046..d61da9ece3ba021a7aa68253efe8dd82198cb575 100644 (file)
@@ -443,9 +443,14 @@ static ssize_t ppp_read(struct file *file, char __user *buf,
                         * network traffic (demand mode).
                         */
                        struct ppp *ppp = PF_TO_PPP(pf);
+
+                       ppp_recv_lock(ppp);
                        if (ppp->n_channels == 0 &&
-                           (ppp->flags & SC_LOOP_TRAFFIC) == 0)
+                           (ppp->flags & SC_LOOP_TRAFFIC) == 0) {
+                               ppp_recv_unlock(ppp);
                                break;
+                       }
+                       ppp_recv_unlock(ppp);
                }
                ret = -EAGAIN;
                if (file->f_flags & O_NONBLOCK)
@@ -532,9 +537,12 @@ static unsigned int ppp_poll(struct file *file, poll_table *wait)
        else if (pf->kind == INTERFACE) {
                /* see comment in ppp_read */
                struct ppp *ppp = PF_TO_PPP(pf);
+
+               ppp_recv_lock(ppp);
                if (ppp->n_channels == 0 &&
                    (ppp->flags & SC_LOOP_TRAFFIC) == 0)
                        mask |= POLLIN | POLLRDNORM;
+               ppp_recv_unlock(ppp);
        }
 
        return mask;
@@ -2808,6 +2816,7 @@ static struct ppp *ppp_create_interface(struct net *net, int unit,
 
 out2:
        mutex_unlock(&pn->all_ppp_mutex);
+       rtnl_unlock();
        free_netdev(dev);
 out1:
        *retp = ret;
index 224e7d82de6d2552121a2af643ea57d927e5b05b..cf77f2dffa698fa8399f5ae7b04162be8a1b34fe 100644 (file)
@@ -134,7 +134,6 @@ static void ax88172a_remove_mdio(struct usbnet *dev)
 
        netdev_info(dev->net, "deregistering mdio bus %s\n", priv->mdio->id);
        mdiobus_unregister(priv->mdio);
-       kfree(priv->mdio->irq);
        mdiobus_free(priv->mdio);
 }
 
index dc0212c3cc28cc36267d62cfebe726f51fe0c910..86ba30ba35e8fcf45f9eba287334da04b54203e2 100644 (file)
@@ -837,7 +837,11 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_
 
        iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber;
 
-       /* reset data interface */
+       /* Reset data interface. Some devices will not reset properly
+        * unless they are configured first.  Toggle the altsetting to
+        * force a reset
+        */
+       usb_set_interface(dev->udev, iface_no, data_altsetting);
        temp = usb_set_interface(dev->udev, iface_no, 0);
        if (temp) {
                dev_dbg(&intf->dev, "set interface failed\n");
@@ -984,8 +988,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting);
 
 static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
 {
-       int ret;
-
        /* MBIM backwards compatible function? */
        if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM)
                return -ENODEV;
@@ -994,16 +996,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
         * Additionally, generic NCM devices are assumed to accept arbitrarily
         * placed NDP.
         */
-       ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
-
-       /*
-        * We should get an event when network connection is "connected" or
-        * "disconnected". Set network connection in "disconnected" state
-        * (carrier is OFF) during attach, so the IP network stack does not
-        * start IPv6 negotiation and more.
-        */
-       usbnet_link_change(dev, 0, 0);
-       return ret;
+       return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
 }
 
 static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max)
@@ -1586,7 +1579,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb)
 
 static const struct driver_info cdc_ncm_info = {
        .description = "CDC NCM",
-       .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET,
+       .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
+                       | FLAG_LINK_INTR,
        .bind = cdc_ncm_bind,
        .unbind = cdc_ncm_unbind,
        .manage_power = usbnet_manage_power,
@@ -1599,7 +1593,7 @@ static const struct driver_info cdc_ncm_info = {
 static const struct driver_info wwan_info = {
        .description = "Mobile Broadband Network Device",
        .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
-                       | FLAG_WWAN,
+                       | FLAG_LINK_INTR | FLAG_WWAN,
        .bind = cdc_ncm_bind,
        .unbind = cdc_ncm_unbind,
        .manage_power = usbnet_manage_power,
@@ -1612,7 +1606,7 @@ static const struct driver_info wwan_info = {
 static const struct driver_info wwan_noarp_info = {
        .description = "Mobile Broadband Network Device (NO ARP)",
        .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
-                       | FLAG_WWAN | FLAG_NOARP,
+                       | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP,
        .bind = cdc_ncm_bind,
        .unbind = cdc_ncm_unbind,
        .manage_power = usbnet_manage_power,
index 570deef53f74301896cfbb8e2ca5c622a5b67185..a3a4ccf7cf5272530ac8addf182c479feffd47fc 100644 (file)
@@ -861,8 +861,10 @@ static const struct usb_device_id products[] = {
        {QMI_FIXED_INTF(0x1199, 0x9056, 8)},    /* Sierra Wireless Modem */
        {QMI_FIXED_INTF(0x1199, 0x9057, 8)},
        {QMI_FIXED_INTF(0x1199, 0x9061, 8)},    /* Sierra Wireless Modem */
-       {QMI_FIXED_INTF(0x1199, 0x9071, 8)},    /* Sierra Wireless MC74xx/EM74xx */
-       {QMI_FIXED_INTF(0x1199, 0x9071, 10)},   /* Sierra Wireless MC74xx/EM74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9071, 8)},    /* Sierra Wireless MC74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9071, 10)},   /* Sierra Wireless MC74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9079, 8)},    /* Sierra Wireless EM74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9079, 10)},   /* Sierra Wireless EM74xx */
        {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)},    /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
        {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)},    /* Alcatel L800MA */
        {QMI_FIXED_INTF(0x2357, 0x0201, 4)},    /* TP-LINK HSUPA Modem MA180 */
@@ -885,6 +887,7 @@ static const struct usb_device_id products[] = {
        {QMI_FIXED_INTF(0x413c, 0x81a8, 8)},    /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */
        {QMI_FIXED_INTF(0x413c, 0x81a9, 8)},    /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */
        {QMI_FIXED_INTF(0x413c, 0x81b1, 8)},    /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */
+       {QMI_FIXED_INTF(0x413c, 0x81b3, 8)},    /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */
        {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},    /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
        {QMI_FIXED_INTF(0x22de, 0x9061, 3)},    /* WeTelecom WPD-600N */
        {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},    /* SIMCom 7230E */
index 0b0ba7ef14e4e9054adeb1251c29cddba08c5187..10798128c03fc64881c5a437df3510f016461b7c 100644 (file)
@@ -1769,6 +1769,13 @@ out3:
        if (info->unbind)
                info->unbind (dev, udev);
 out1:
+       /* subdrivers must undo all they did in bind() if they
+        * fail it, but we may fail later and a deferred kevent
+        * may trigger an error resubmitting itself and, worse,
+        * schedule a timer. So we kill it all just in case.
+        */
+       cancel_work_sync(&dev->kevent);
+       del_timer_sync(&dev->delay);
        free_netdev(net);
 out:
        return status;
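
The usbnet hunk applies the usual rule for error paths: any deferred work or timer armed during a partially successful bind must be cancelled synchronously before the netdev is freed, or it can fire on freed memory. A hedged sketch of the shape, with hypothetical driver state:

#include <linux/workqueue.h>
#include <linux/timer.h>
#include <linux/netdevice.h>

struct example_priv {
        struct work_struct kevent;
        struct timer_list delay;
};

static void example_teardown_on_error(struct net_device *net)
{
        struct example_priv *p = netdev_priv(net);

        /* Make sure nothing scheduled earlier can still run and touch
         * the private data after free_netdev(). */
        cancel_work_sync(&p->kevent);
        del_timer_sync(&p->delay);

        free_netdev(net);
}
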
index 0cbf520cea778fc703c9657ab2d085eee522634b..fc895d0e85d9cafced5d7e9d1166e8bc0de833bd 100644 (file)
@@ -814,7 +814,7 @@ vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter)
 
 
 /*
- *    parse and copy relevant protocol headers:
+ *    parse relevant protocol headers:
  *      For a tso pkt, relevant headers are L2/3/4 including options
  *      For a pkt requesting csum offloading, they are L2/3 and may include L4
  *      if it's a TCP/UDP pkt
@@ -827,15 +827,14 @@ vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter)
  * Other effects:
  *    1. related *ctx fields are updated.
  *    2. ctx->copy_size is # of bytes copied
- *    3. the portion copied is guaranteed to be in the linear part
+ *    3. the portion to be copied is guaranteed to be in the linear part
  *
  */
 static int
-vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
-                          struct vmxnet3_tx_ctx *ctx,
-                          struct vmxnet3_adapter *adapter)
+vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
+                 struct vmxnet3_tx_ctx *ctx,
+                 struct vmxnet3_adapter *adapter)
 {
-       struct Vmxnet3_TxDataDesc *tdd;
        u8 protocol = 0;
 
        if (ctx->mss) { /* TSO */
@@ -892,16 +891,34 @@ vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
                return 0;
        }
 
+       return 1;
+err:
+       return -1;
+}
+
+/*
+ *    copy relevant protocol headers to the transmit ring:
+ *      For a tso pkt, relevant headers are L2/3/4 including options
+ *      For a pkt requesting csum offloading, they are L2/3 and may include L4
+ *      if it's a TCP/UDP pkt
+ *
+ *
+ *    Note that this requires that vmxnet3_parse_hdr be called first to set the
+ *      appropriate bits in ctx
+ */
+static void
+vmxnet3_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
+                struct vmxnet3_tx_ctx *ctx,
+                struct vmxnet3_adapter *adapter)
+{
+       struct Vmxnet3_TxDataDesc *tdd;
+
        tdd = tq->data_ring.base + tq->tx_ring.next2fill;
 
        memcpy(tdd->data, skb->data, ctx->copy_size);
        netdev_dbg(adapter->netdev,
                "copy %u bytes to dataRing[%u]\n",
                ctx->copy_size, tq->tx_ring.next2fill);
-       return 1;
-
-err:
-       return -1;
 }
 
 
@@ -998,22 +1015,7 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
                }
        }
 
-       spin_lock_irqsave(&tq->tx_lock, flags);
-
-       if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) {
-               tq->stats.tx_ring_full++;
-               netdev_dbg(adapter->netdev,
-                       "tx queue stopped on %s, next2comp %u"
-                       " next2fill %u\n", adapter->netdev->name,
-                       tq->tx_ring.next2comp, tq->tx_ring.next2fill);
-
-               vmxnet3_tq_stop(tq, adapter);
-               spin_unlock_irqrestore(&tq->tx_lock, flags);
-               return NETDEV_TX_BUSY;
-       }
-
-
-       ret = vmxnet3_parse_and_copy_hdr(skb, tq, &ctx, adapter);
+       ret = vmxnet3_parse_hdr(skb, tq, &ctx, adapter);
        if (ret >= 0) {
                BUG_ON(ret <= 0 && ctx.copy_size != 0);
                /* hdrs parsed, check against other limits */
@@ -1033,9 +1035,26 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
                }
        } else {
                tq->stats.drop_hdr_inspect_err++;
-               goto unlock_drop_pkt;
+               goto drop_pkt;
        }
 
+       spin_lock_irqsave(&tq->tx_lock, flags);
+
+       if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) {
+               tq->stats.tx_ring_full++;
+               netdev_dbg(adapter->netdev,
+                       "tx queue stopped on %s, next2comp %u"
+                       " next2fill %u\n", adapter->netdev->name,
+                       tq->tx_ring.next2comp, tq->tx_ring.next2fill);
+
+               vmxnet3_tq_stop(tq, adapter);
+               spin_unlock_irqrestore(&tq->tx_lock, flags);
+               return NETDEV_TX_BUSY;
+       }
+
+
+       vmxnet3_copy_hdr(skb, tq, &ctx, adapter);
+
        /* fill tx descs related to addr & len */
        if (vmxnet3_map_pkt(skb, &ctx, tq, adapter->pdev, adapter))
                goto unlock_drop_pkt;
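
The vmxnet3 reordering separates header parsing (which can decide to drop the packet) from the copy into the data ring, so tx_lock is only taken once the packet is known to be transmittable and the descriptor count is known. A generic sketch of the "validate outside, commit inside the lock" shape, with hypothetical names:

#include <linux/spinlock.h>
#include <linux/skbuff.h>

struct example_txq {
        spinlock_t lock;
        unsigned int free_descs;
};

static int example_parse(struct sk_buff *skb, unsigned int *needed)
{
        /* inspect headers and work out how many descriptors are needed */
        *needed = 1 + skb_shinfo(skb)->nr_frags;
        return 0;
}

static int example_xmit(struct example_txq *tq, struct sk_buff *skb)
{
        unsigned int needed;
        unsigned long flags;

        /* all parsing/validation happens before the lock is taken */
        if (example_parse(skb, &needed) < 0)
                return -EINVAL;

        spin_lock_irqsave(&tq->lock, flags);
        if (needed > tq->free_descs) {
                spin_unlock_irqrestore(&tq->lock, flags);
                return -EBUSY;          /* caller stops the queue */
        }
        tq->free_descs -= needed;
        /* ... copy headers and fill descriptors under the lock ... */
        spin_unlock_irqrestore(&tq->lock, flags);
        return 0;
}
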
index 66addb7a7911beb33f17c916caa7bb48ae84507e..bdcf617a9d52b86eb41c13dad6df1c3c42d3d319 100644 (file)
@@ -104,20 +104,23 @@ static struct dst_ops vrf_dst_ops = {
 #if IS_ENABLED(CONFIG_IPV6)
 static bool check_ipv6_frame(const struct sk_buff *skb)
 {
-       const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data;
-       size_t hlen = sizeof(*ipv6h);
+       const struct ipv6hdr *ipv6h;
+       struct ipv6hdr _ipv6h;
        bool rc = true;
 
-       if (skb->len < hlen)
+       ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
+       if (!ipv6h)
                goto out;
 
        if (ipv6h->nexthdr == NEXTHDR_ICMP) {
                const struct icmp6hdr *icmph;
+               struct icmp6hdr _icmph;
 
-               if (skb->len < hlen + sizeof(*icmph))
+               icmph = skb_header_pointer(skb, sizeof(_ipv6h),
+                                          sizeof(_icmph), &_icmph);
+               if (!icmph)
                        goto out;
 
-               icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h));
                switch (icmph->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
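
The vrf fix replaces direct skb->data arithmetic with skb_header_pointer(), which copies the requested bytes into a caller-supplied buffer when they are not all in the linear part of the skb and returns NULL if the packet is simply too short. A minimal sketch of the idiom:

#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>

/* Safely read the IPv6 header at offset 0 of an skb whose headers may
 * live in fragments rather than the linear area. */
static int example_ipv6_nexthdr(const struct sk_buff *skb, u8 *nexthdr)
{
        struct ipv6hdr _ip6h;
        const struct ipv6hdr *ip6h;

        ip6h = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
        if (!ip6h)
                return -EINVAL; /* packet shorter than an IPv6 header */

        *nexthdr = ip6h->nexthdr;
        return 0;
}
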
index e6944b29588e1bd329c5b331988373d5297c0bd6..1c32bd10479730a73f2832fa7f15ef3d845941db 100644 (file)
@@ -931,8 +931,10 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                                                     cb->nlh->nlmsg_seq,
                                                     RTM_NEWNEIGH,
                                                     NLM_F_MULTI, rd);
-                               if (err < 0)
+                               if (err < 0) {
+                                       cb->args[1] = err;
                                        goto out;
+                               }
 skip:
                                ++idx;
                        }
@@ -1306,8 +1308,10 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                gbp = (struct vxlanhdr_gbp *)vxh;
                md->gbp = ntohs(gbp->policy_id);
 
-               if (tun_dst)
+               if (tun_dst) {
                        tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
+                       tun_dst->u.tun_info.options_len = sizeof(*md);
+               }
 
                if (gbp->dont_learn)
                        md->gbp |= VXLAN_GBP_DONT_LEARN;
index 4ed5180c547bb6ce8af288a6653220b5cf2c61b4..0ccc697fef76cf25c94443c4cde9d0fb1185df05 100644 (file)
@@ -107,7 +107,7 @@ static int iwl_send_tx_ant_cfg(struct iwl_mvm *mvm, u8 valid_tx_ant)
                                    sizeof(tx_ant_cmd), &tx_ant_cmd);
 }
 
-static void iwl_free_fw_paging(struct iwl_mvm *mvm)
+void iwl_free_fw_paging(struct iwl_mvm *mvm)
 {
        int i;
 
@@ -127,6 +127,8 @@ static void iwl_free_fw_paging(struct iwl_mvm *mvm)
                             get_order(mvm->fw_paging_db[i].fw_paging_size));
        }
        kfree(mvm->trans->paging_download_buf);
+       mvm->trans->paging_download_buf = NULL;
+
        memset(mvm->fw_paging_db, 0, sizeof(mvm->fw_paging_db));
 }
 
index 5f3ac8cccf49d2c5a07644f894b1ba920dca8bfc..ff7c6df9f9418ebac294709f63c21911459cfac1 100644 (file)
@@ -1225,6 +1225,9 @@ void iwl_mvm_rx_umac_scan_complete_notif(struct iwl_mvm *mvm,
 void iwl_mvm_rx_umac_scan_iter_complete_notif(struct iwl_mvm *mvm,
                                              struct iwl_rx_cmd_buffer *rxb);
 
+/* Paging */
+void iwl_free_fw_paging(struct iwl_mvm *mvm);
+
 /* MVM debugfs */
 #ifdef CONFIG_IWLWIFI_DEBUGFS
 int iwl_mvm_dbgfs_register(struct iwl_mvm *mvm, struct dentry *dbgfs_dir);
index 89ea70deeb84410edd0e3ad07ae77abc08476107..e80be9a595207bcf962551cc2bde42d14b898a03 100644 (file)
@@ -684,6 +684,8 @@ static void iwl_op_mode_mvm_stop(struct iwl_op_mode *op_mode)
        for (i = 0; i < NVM_MAX_NUM_SECTIONS; i++)
                kfree(mvm->nvm_sections[i].data);
 
+       iwl_free_fw_paging(mvm);
+
        iwl_mvm_tof_clean(mvm);
 
        ieee80211_free_hw(mvm->hw);
index 0914ec2fd57467023b0f5336b2e630856c06cf81..a040edc550570a11373ed30b4916b7930c7fa832 100644 (file)
@@ -423,6 +423,15 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
                return -1;
        }
 
+       /*
+        * Increase the pending frames counter, so that when a reply later comes
+        * in and the counter is decreased we don't start getting negative
+        * values.
+        * Note that we don't need to make sure it isn't agg'd, since we're
+        * TXing non-sta
+        */
+       atomic_inc(&mvm->pending_frames[sta_id]);
+
        return 0;
 }
 
index b0045a505dc88c9ef56dcc1094a198bd011d24a7..95825b38559addb8a0dc5cca22bc305e228e65f6 100644 (file)
@@ -55,7 +55,7 @@ static int e820_pmem_probe(struct platform_device *pdev)
        for (p = iomem_resource.child; p ; p = p->sibling) {
                struct nd_region_desc ndr_desc;
 
-               if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0)
+               if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY)
                        continue;
 
                memset(&ndr_desc, 0, sizeof(ndr_desc));
index 39c4be41ef83d6cf23302fb9af6270f8857f85de..365dc7e83ab43cd3473d0da57d0ea768784869c8 100644 (file)
@@ -305,6 +305,7 @@ EXPORT_SYMBOL(of_phy_find_device);
  * @dev: pointer to net_device claiming the phy
  * @phy_np: Pointer to device tree node for the PHY
  * @hndlr: Link state callback for the network device
+ * @flags: flags to pass to the PHY
  * @iface: PHY data interface type
  *
  * If successful, returns a pointer to the phy_device with the embedded
index a656d9e8334302c2ec4f013ad6c016034ace5f27..21905fef2cbf0d34259e37069e50140a66101441 100644 (file)
@@ -91,7 +91,7 @@ static int configure_memory(const unsigned char *buf,
        for (i=0;i<HPEE_MEMORY_MAX_ENT;i++) {
                c = get_8(buf+len);
                
-               if (NULL != (res = kmalloc(sizeof(struct resource), GFP_KERNEL))) {
+               if (NULL != (res = kzalloc(sizeof(struct resource), GFP_KERNEL))) {
                        int result;
                        
                        res->name = name;
@@ -183,7 +183,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent,
        for (i=0;i<HPEE_PORT_MAX_ENT;i++) {
                c = get_8(buf+len);
                
-               if (NULL != (res = kmalloc(sizeof(struct resource), GFP_KERNEL))) {
+               if (NULL != (res = kzalloc(sizeof(struct resource), GFP_KERNEL))) {
                        res->name = board;
                        res->start = get_16(buf+len+1);
                        res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1;
index 602eb422351060c611967538359b8409885397a1..f89db3af0607263648c9a64432bcb1efdaeb8326 100644 (file)
@@ -4772,8 +4772,10 @@ int pci_get_new_domain_nr(void)
 void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
 {
        static int use_dt_domains = -1;
-       int domain = of_get_pci_domain_nr(parent->of_node);
+       int domain = -1;
 
+       if (parent)
+               domain = of_get_pci_domain_nr(parent->of_node);
        /*
         * Check DT domain and use_dt_domains values.
         *
index d7b87c64b7cd261c6ec6a1b22ddce040747679d9..e220edc85c68a12bda653b0177255cf38459c7d2 100644 (file)
@@ -117,7 +117,7 @@ int rio_request_inb_mbox(struct rio_mport *mport,
        if (mport->ops->open_inb_mbox == NULL)
                goto out;
 
-       res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_mbox_res(res, mbox, mbox);
@@ -185,7 +185,7 @@ int rio_request_outb_mbox(struct rio_mport *mport,
        if (mport->ops->open_outb_mbox == NULL)
                goto out;
 
-       res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_mbox_res(res, mbox, mbox);
@@ -285,7 +285,7 @@ int rio_request_inb_dbell(struct rio_mport *mport,
 {
        int rc = 0;
 
-       struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_dbell_res(res, start, end);
@@ -360,7 +360,7 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end)
 struct resource *rio_request_outb_dbell(struct rio_dev *rdev, u16 start,
                                        u16 end)
 {
-       struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_dbell_res(res, start, end);
index cb61f300f8b5d111ce7871a39be43a1b7fe3ca80..277b5c8c825ca4a63864a01ba67cbd8765b5a041 100644 (file)
@@ -67,7 +67,7 @@ static const u8 DASD_DIAG_CMS1[] = { 0xc3, 0xd4, 0xe2, 0xf1 };/* EBCDIC CMS1 */
  * and function code cmd.
  * In case of an exception return 3. Otherwise return result of bitwise OR of
  * resulting condition code and DIAG return code. */
-static inline int dia250(void *iob, int cmd)
+static inline int __dia250(void *iob, int cmd)
 {
        register unsigned long reg2 asm ("2") = (unsigned long) iob;
        typedef union {
@@ -77,7 +77,6 @@ static inline int dia250(void *iob, int cmd)
        int rc;
 
        rc = 3;
-       diag_stat_inc(DIAG_STAT_X250);
        asm volatile(
                "       diag    2,%2,0x250\n"
                "0:     ipm     %0\n"
@@ -91,6 +90,12 @@ static inline int dia250(void *iob, int cmd)
        return rc;
 }
 
+static inline int dia250(void *iob, int cmd)
+{
+       diag_stat_inc(DIAG_STAT_X250);
+       return __dia250(iob, cmd);
+}
+
 /* Initialize block I/O to DIAG device using the specified blocksize and
  * block offset. On success, return zero and set end_block to contain the
  * number of blocks on the device minus the specified offset. Return non-zero
index 2d9e7f3d56115aeaddea8eb250008dba7729d98e..bb1fb771213401a8cbf67b98011f464409ecb699 100644 (file)
@@ -66,7 +66,7 @@ int superhyway_add_device(unsigned long base, struct superhyway_device *sdev,
        superhyway_read_vcr(dev, base, &dev->vcr);
 
        if (!dev->resource) {
-               dev->resource = kmalloc(sizeof(struct resource), GFP_KERNEL);
+               dev->resource = kzalloc(sizeof(struct resource), GFP_KERNEL);
                if (!dev->resource) {
                        kfree(dev);
                        return -ENOMEM;
index 6a4ff27f4357eb229815c18b535327487f2df580..c688efa95e29b0561ac14be901cc4e4faa5aaf33 100644 (file)
@@ -204,8 +204,8 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi,
 {
        struct spi_imx_data *spi_imx = spi_master_get_devdata(master);
 
-       if (spi_imx->dma_is_inited &&
-           transfer->len > spi_imx->wml * sizeof(u32))
+       if (spi_imx->dma_is_inited && transfer->len >= spi_imx->wml &&
+           (transfer->len % spi_imx->wml) == 0)
                return true;
        return false;
 }
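
The reworked spi_imx_can_dma() only accepts transfers that are at least one watermark level long and an exact multiple of it; anything else now falls back to PIO instead of reprogramming the RX watermark for the tail, which is the code removed from spi_imx_dma_transfer() below. A tiny sketch of the check, with a hypothetical watermark value of 32:

    /* Illustrative only: the watermark value is hypothetical. */
    static bool can_use_dma(unsigned int len, unsigned int wml)
    {
            return len >= wml && (len % wml) == 0;
    }
    /* can_use_dma(64, 32) -> true, can_use_dma(40, 32) -> false */
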
@@ -919,8 +919,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
        struct dma_async_tx_descriptor *desc_tx = NULL, *desc_rx = NULL;
        int ret;
        unsigned long timeout;
-       u32 dma;
-       int left;
        struct spi_master *master = spi_imx->bitbang.master;
        struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
 
@@ -954,13 +952,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
        /* Trigger the cspi module. */
        spi_imx->dma_finished = 0;
 
-       dma = readl(spi_imx->base + MX51_ECSPI_DMA);
-       dma = dma & (~MX51_ECSPI_DMA_RXT_WML_MASK);
-       /* Change RX_DMA_LENGTH trigger dma fetch tail data */
-       left = transfer->len % spi_imx->wml;
-       if (left)
-               writel(dma | (left << MX51_ECSPI_DMA_RXT_WML_OFFSET),
-                               spi_imx->base + MX51_ECSPI_DMA);
        /*
         * Set these order to avoid potential RX overflow. The overflow may
         * happen if we enable SPI HW before starting RX DMA due to rescheduling
@@ -992,10 +983,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
                        spi_imx->devtype_data->reset(spi_imx);
                        dmaengine_terminate_all(master->dma_rx);
                }
-               dma &= ~MX51_ECSPI_DMA_RXT_WML_MASK;
-               writel(dma |
-                      spi_imx->wml << MX51_ECSPI_DMA_RXT_WML_OFFSET,
-                      spi_imx->base + MX51_ECSPI_DMA);
        }
 
        spi_imx->dma_finished = 1;
index 79a8bc4f6cec9e32ec9c78627a2e68fc85ab5e12..7cb1b2d710c10f7aab6c245a31d34e5439dfcbf0 100644 (file)
@@ -749,6 +749,7 @@ static int rockchip_spi_probe(struct platform_device *pdev)
        return 0;
 
 err_register_master:
+       pm_runtime_disable(&pdev->dev);
        if (rs->dma_tx.ch)
                dma_release_channel(rs->dma_tx.ch);
        if (rs->dma_rx.ch)
@@ -778,6 +779,8 @@ static int rockchip_spi_remove(struct platform_device *pdev)
        if (rs->dma_rx.ch)
                dma_release_channel(rs->dma_rx.ch);
 
+       spi_master_put(master);
+
        return 0;
 }
 
index 0c675861623f4abeb2054b65c87fe185d8188bf3..d8e4219c2324900c583245407a7c78f20e20325e 100644 (file)
@@ -83,6 +83,7 @@ config SSB_SDIOHOST
 config SSB_HOST_SOC
        bool "Support for SSB bus on SoC"
        depends on SSB && BCM47XX_NVRAM
+       select SSB_SPROM
        help
          Host interface for a SSB directly mapped into memory. This is
          for some Broadcom SoCs from the BCM47xx and BCM53xx lines.
index 82a663ba98009fb5f286a88651c9b3a54cf68bbb..4f229e711e1c1cfc0134abb71b617f9511a6e606 100644 (file)
@@ -177,7 +177,6 @@ void core_tmr_abort_task(
 
                if (!__target_check_io_state(se_cmd, se_sess, 0)) {
                        spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
-                       target_put_sess_cmd(se_cmd);
                        goto out;
                }
                list_del_init(&se_cmd->se_cmd_list);
index a305caea58eea717fdffaac4c5364552673ad5ae..fb75b7e5a19ab52748e4a1980d56046e15716908 100644 (file)
@@ -1040,8 +1040,8 @@ static int acornfb_probe(struct platform_device *dev)
                 * for the framebuffer if we are not using
                 * VRAM.
                 */
-               base = dma_alloc_writecombine(current_par.dev, size, &handle,
-                                             GFP_KERNEL);
+               base = dma_alloc_wc(current_par.dev, size, &handle,
+                                   GFP_KERNEL);
                if (base == NULL) {
                        printk(KERN_ERR "acornfb: unable to allocate screen "
                               "memory\n");
index 7a8afcd4573e5c1e595550f25985ea1181367157..a8a22daa3f9d2a70a6fa1ca5f4e0f526e7380375 100644 (file)
@@ -154,8 +154,8 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
 {
        dma_addr_t dma;
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize,
-                                                   &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, framesize, &dma,
+                                         GFP_KERNEL);
        if (!fb->fb.screen_base) {
                pr_err("CLCD: unable to map framebuffer\n");
                return -ENOMEM;
@@ -169,14 +169,12 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
 
 int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-                                    fb->fb.screen_base,
-                                    fb->fb.fix.smem_start,
-                                    fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 void versatile_clcd_remove_dma(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-                             fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
index 9362424c2340490585fe02e4dfe950f53f2097af..fe274b5851c77ff5e59b7c88f205cbf09ec61114 100644 (file)
@@ -774,8 +774,8 @@ static int clcdfb_of_dma_setup(struct clcd_fb *fb)
 
 static int clcdfb_of_dma_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma, fb->fb.screen_base,
-                       fb->fb.fix.smem_start, fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 static void clcdfb_of_dma_remove(struct clcd_fb *fb)
index 19eb42b57d8742089d2faadf5c8a64ce81cc01b3..56c60e67316a57457c1741ea62d614e113d0a00e 100644 (file)
@@ -414,8 +414,8 @@ static inline void atmel_lcdfb_free_video_memory(struct atmel_lcdfb_info *sinfo)
 {
        struct fb_info *info = sinfo->info;
 
-       dma_free_writecombine(info->device, info->fix.smem_len,
-                               info->screen_base, info->fix.smem_start);
+       dma_free_wc(info->device, info->fix.smem_len, info->screen_base,
+                   info->fix.smem_start);
 }
 
 /**
@@ -435,8 +435,9 @@ static int atmel_lcdfb_alloc_video_memory(struct atmel_lcdfb_info *sinfo)
                    * ((var->bits_per_pixel + 7) / 8));
        info->fix.smem_len = max(smem_len, sinfo->smem_len);
 
-       info->screen_base = dma_alloc_writecombine(info->device, info->fix.smem_len,
-                                       (dma_addr_t *)&info->fix.smem_start, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(info->device, info->fix.smem_len,
+                                        (dma_addr_t *)&info->fix.smem_start,
+                                        GFP_KERNEL);
 
        if (!info->screen_base) {
                return -ENOMEM;
index 5b1081030cbbbefb3f7a8ff1f5f7dbed03ebb2fa..75f0db25d19fef774b4903f71b43f9beb5a578f2 100644 (file)
@@ -316,9 +316,8 @@ static int ep93xxfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
        unsigned int offset = vma->vm_pgoff << PAGE_SHIFT;
 
        if (offset < info->fix.smem_len) {
-               return dma_mmap_writecombine(info->dev, vma, info->screen_base,
-                                            info->fix.smem_start,
-                                            info->fix.smem_len);
+               return dma_mmap_wc(info->dev, vma, info->screen_base,
+                                  info->fix.smem_start, info->fix.smem_len);
        }
 
        return -EINVAL;
@@ -428,8 +427,7 @@ static int ep93xxfb_alloc_videomem(struct fb_info *info)
        /* Maximum 16bpp -> used memory is maximum x*y*2 bytes */
        fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES * 2;
 
-       virt_addr = dma_alloc_writecombine(info->dev, fb_size,
-                                          &phys_addr, GFP_KERNEL);
+       virt_addr = dma_alloc_wc(info->dev, fb_size, &phys_addr, GFP_KERNEL);
        if (!virt_addr)
                return -ENOMEM;
 
index b63d55f481fae421cbea5cb71466401835e23a3d..1a242b1338e9a75ff4436e8532f64a37124055cd 100644 (file)
@@ -1185,8 +1185,8 @@ static int gbefb_probe(struct platform_device *p_dev)
        } else {
                /* try to allocate memory with the classical allocator
                 * this has high chance to fail on low memory machines */
-               gbe_mem = dma_alloc_writecombine(NULL, gbe_mem_size,
-                                                &gbe_dma_addr, GFP_KERNEL);
+               gbe_mem = dma_alloc_wc(NULL, gbe_mem_size, &gbe_dma_addr,
+                                      GFP_KERNEL);
                if (!gbe_mem) {
                        printk(KERN_ERR "gbefb: couldn't allocate framebuffer memory\n");
                        ret = -ENOMEM;
@@ -1238,7 +1238,7 @@ static int gbefb_probe(struct platform_device *p_dev)
 out_gbe_unmap:
        arch_phys_wc_del(par->wc_cookie);
        if (gbe_dma_addr)
-               dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+               dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
 out_tiles_free:
        dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
                          (void *)gbe_tiles.cpu, gbe_tiles.dma);
@@ -1259,7 +1259,7 @@ static int gbefb_remove(struct platform_device* p_dev)
        gbe_turn_off();
        arch_phys_wc_del(par->wc_cookie);
        if (gbe_dma_addr)
-               dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+               dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
        dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
                          (void *)gbe_tiles.cpu, gbe_tiles.dma);
        release_mem_region(GBE_BASE, sizeof(struct sgi_gbe));
index bb2f1e866020199d31b8ba0b1940cb9e6b87923f..76b6a7784b06c7c752ba79862c26b2edcf51f932 100644 (file)
@@ -937,8 +937,8 @@ static int imxfb_probe(struct platform_device *pdev)
        }
 
        fbi->map_size = PAGE_ALIGN(info->fix.smem_len);
-       info->screen_base = dma_alloc_writecombine(&pdev->dev, fbi->map_size,
-                                                  &fbi->map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(&pdev->dev, fbi->map_size,
+                                        &fbi->map_dma, GFP_KERNEL);
 
        if (!info->screen_base) {
                dev_err(&pdev->dev, "Failed to allocate video RAM: %d\n", ret);
@@ -1005,8 +1005,8 @@ failed_cmap:
        if (pdata && pdata->exit)
                pdata->exit(fbi->pdev);
 failed_platform_init:
-       dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
-                             fbi->map_dma);
+       dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+                   fbi->map_dma);
 failed_map:
        iounmap(fbi->regs);
 failed_ioremap:
@@ -1041,8 +1041,8 @@ static int imxfb_remove(struct platform_device *pdev)
        kfree(info->pseudo_palette);
        framebuffer_release(info);
 
-       dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
-                             fbi->map_dma);
+       dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+                   fbi->map_dma);
 
        iounmap(fbi->regs);
        release_mem_region(res->start, resource_size(res));
index 7947634ee6b06c6f63707c87ef754d22342e975c..f91b1db262b04a2b131911009e3c69914b254013 100644 (file)
@@ -1336,9 +1336,8 @@ static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len,
        int retval = 0;
        dma_addr_t addr;
 
-       fbi->screen_base = dma_alloc_writecombine(fbi->device,
-                                                 mem_len,
-                                                 &addr, GFP_DMA | GFP_KERNEL);
+       fbi->screen_base = dma_alloc_wc(fbi->device, mem_len, &addr,
+                                       GFP_DMA | GFP_KERNEL);
 
        if (!fbi->screen_base) {
                dev_err(fbi->device, "Cannot allocate %u bytes framebuffer memory\n",
@@ -1378,8 +1377,8 @@ err0:
  */
 static int mx3fb_unmap_video_memory(struct fb_info *fbi)
 {
-       dma_free_writecombine(fbi->device, fbi->fix.smem_len,
-                             fbi->screen_base, fbi->fix.smem_start);
+       dma_free_wc(fbi->device, fbi->fix.smem_len, fbi->screen_base,
+                   fbi->fix.smem_start);
 
        fbi->screen_base = NULL;
        mutex_lock(&fbi->mm_lock);
index 389fa2cbb713108623ca07b67941572ccdc3fa85..6680edae4696d1b0dc4dff547dda27ca8c159ab5 100644 (file)
@@ -396,8 +396,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
        dev_dbg(fbi->dev, "nuc900fb_map_video_memory(fbi=%p) map_size %lu\n",
                fbi, map_size);
 
-       info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
-                                                       &map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+                                        GFP_KERNEL);
 
        if (!info->screen_base)
                return -ENOMEM;
@@ -411,8 +411,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
 static inline void nuc900fb_unmap_video_memory(struct fb_info *info)
 {
        struct nuc900fb_info *fbi = info->par;
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                             info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
 }
 
 static irqreturn_t nuc900fb_irqhandler(int irq, void *dev_id)
index 6efa2591eaa8dfa6d72a22cd18eaf7f07c36174d..e3d9b9ea5498e0f840c5189bd1401200494d9ae2 100644 (file)
@@ -612,8 +612,8 @@ static void lcdc_dma_handler(u16 status, void *data)
 
 static int alloc_palette_ram(void)
 {
-       lcdc.palette_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
-               MAX_PALETTE_SIZE, &lcdc.palette_phys, GFP_KERNEL);
+       lcdc.palette_virt = dma_alloc_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
+                                        &lcdc.palette_phys, GFP_KERNEL);
        if (lcdc.palette_virt == NULL) {
                dev_err(lcdc.fbdev->dev, "failed to alloc palette memory\n");
                return -ENOMEM;
@@ -625,8 +625,8 @@ static int alloc_palette_ram(void)
 
 static void free_palette_ram(void)
 {
-       dma_free_writecombine(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
-                       lcdc.palette_virt, lcdc.palette_phys);
+       dma_free_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE, lcdc.palette_virt,
+                   lcdc.palette_phys);
 }
 
 static int alloc_fbmem(struct omapfb_mem_region *region)
@@ -642,8 +642,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
        if (region->size > frame_size)
                frame_size = region->size;
        lcdc.vram_size = frame_size;
-       lcdc.vram_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
-                       lcdc.vram_size, &lcdc.vram_phys, GFP_KERNEL);
+       lcdc.vram_virt = dma_alloc_wc(lcdc.fbdev->dev, lcdc.vram_size,
+                                     &lcdc.vram_phys, GFP_KERNEL);
        if (lcdc.vram_virt == NULL) {
                dev_err(lcdc.fbdev->dev, "unable to allocate FB DMA memory\n");
                return -ENOMEM;
@@ -660,8 +660,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
 
 static void free_fbmem(void)
 {
-       dma_free_writecombine(lcdc.fbdev->dev, lcdc.vram_size,
-                             lcdc.vram_virt, lcdc.vram_phys);
+       dma_free_wc(lcdc.fbdev->dev, lcdc.vram_size, lcdc.vram_virt,
+                   lcdc.vram_phys);
 }
 
 static int setup_fbmem(struct omapfb_mem_desc *req_md)
index efb57c059997641baed972fe2b25fd85715cf291..def3a501acd64484342f2315c9b32fe697b166a5 100644 (file)
@@ -680,8 +680,8 @@ static int pxa168fb_probe(struct platform_device *pdev)
         */
        info->fix.smem_len = PAGE_ALIGN(DEFAULT_FB_SIZE);
 
-       info->screen_base = dma_alloc_writecombine(fbi->dev, info->fix.smem_len,
-                                               &fbi->fb_start_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, info->fix.smem_len,
+                                        &fbi->fb_start_dma, GFP_KERNEL);
        if (info->screen_base == NULL) {
                ret = -ENOMEM;
                goto failed_free_info;
@@ -804,8 +804,8 @@ static int pxa168fb_remove(struct platform_device *pdev)
 
        irq = platform_get_irq(pdev, 0);
 
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                               info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
 
        clk_disable(fbi->clk);
 
index 33b2bb315a2a462daefa5bc2c3181806747dc919..2c0487f4f805cf7d4c9153d4c966f7cc67c45c81 100644 (file)
@@ -2446,8 +2446,8 @@ static int pxafb_remove(struct platform_device *dev)
 
        free_pages_exact(fbi->video_mem, fbi->video_mem_size);
 
-       dma_free_writecombine(&dev->dev, fbi->dma_buff_size,
-                       fbi->dma_buff, fbi->dma_buff_phys);
+       dma_free_wc(&dev->dev, fbi->dma_buff_size, fbi->dma_buff,
+                   fbi->dma_buff_phys);
 
        iounmap(fbi->mmio_base);
 
index f72dd12456f962eba79eec43fb706be9dc1cdd39..5f4f696c2ecfd5e677575675be2969b1ba09947d 100644 (file)
@@ -1105,8 +1105,7 @@ static int s3c_fb_alloc_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
 
        dev_dbg(sfb->dev, "want %u bytes for window\n", size);
 
-       fbi->screen_base = dma_alloc_writecombine(sfb->dev, size,
-                                                 &map_dma, GFP_KERNEL);
+       fbi->screen_base = dma_alloc_wc(sfb->dev, size, &map_dma, GFP_KERNEL);
        if (!fbi->screen_base)
                return -ENOMEM;
 
@@ -1131,8 +1130,8 @@ static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
        struct fb_info *fbi = win->fbinfo;
 
        if (fbi->screen_base)
-               dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
-                             fbi->screen_base, fbi->fix.smem_start);
+               dma_free_wc(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
+                           fbi->screen_base, fbi->fix.smem_start);
 }
 
 /**
index d6704add1601c184d217dfbe19dcaa9d91aace27..0dd86be36afbb5bdff4d268ae58c17337a98c6b9 100644 (file)
@@ -645,8 +645,8 @@ static int s3c2410fb_map_video_memory(struct fb_info *info)
 
        dprintk("map_video_memory(fbi=%p) map_size %u\n", fbi, map_size);
 
-       info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
-                                                  &map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+                                        GFP_KERNEL);
 
        if (info->screen_base) {
                /* prevent initial garbage on screen */
@@ -667,8 +667,8 @@ static inline void s3c2410fb_unmap_video_memory(struct fb_info *info)
 {
        struct s3c2410fb_info *fbi = info->par;
 
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                             info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
 }
 
 static inline void modify_gpio(void __iomem *reg,
index dcf774c1588965032eb93ac3b18cc822f419de18..fc2aaa5aca2347e705c6eb1623ec188b5262e498 100644 (file)
@@ -567,8 +567,8 @@ static int sa1100fb_mmap(struct fb_info *info,
 
        if (off < info->fix.smem_len) {
                vma->vm_pgoff += 1; /* skip over the palette */
-               return dma_mmap_writecombine(fbi->dev, vma, fbi->map_cpu,
-                                            fbi->map_dma, fbi->map_size);
+               return dma_mmap_wc(fbi->dev, vma, fbi->map_cpu, fbi->map_dma,
+                                  fbi->map_size);
        }
 
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
@@ -1099,8 +1099,8 @@ static int sa1100fb_map_video_memory(struct sa1100fb_info *fbi)
         * of the framebuffer.
         */
        fbi->map_size = PAGE_ALIGN(fbi->fb.fix.smem_len + PAGE_SIZE);
-       fbi->map_cpu = dma_alloc_writecombine(fbi->dev, fbi->map_size,
-                                             &fbi->map_dma, GFP_KERNEL);
+       fbi->map_cpu = dma_alloc_wc(fbi->dev, fbi->map_size, &fbi->map_dma,
+                                   GFP_KERNEL);
 
        if (fbi->map_cpu) {
                fbi->fb.screen_base = fbi->map_cpu + PAGE_SIZE;
index 12eab503efd1bb8793fff55eca83649e1f564426..dc4305b407bf6756791ec9b31e7929c1fe5c3ca8 100644 (file)
@@ -257,7 +257,7 @@ static struct resource *additional_memory_resource(phys_addr_t size)
                return NULL;
 
        res->name = "System RAM";
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
        ret = allocate_resource(&iomem_resource, res,
                                size, 0, -1,
index 711172450da642ccdcf4ac7dcceb7546faf66fad..bbb2ad78377020ac85158fb61df067aceadaafe5 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1056,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
+       int error;
 
        /*
         * We pass NO_SECTOR to dax_radix_entry() because we expect that a
@@ -1065,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         * saves us from having to make a call to get_block() here to look
         * up the sector.
         */
-       dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
+       error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+                       true);
+
+       if (error == -ENOMEM)
+               return VM_FAULT_OOM;
+       if (error)
+               return VM_FAULT_SIGBUS;
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
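
This hunk and the ocfs2_page_mkwrite() change further down apply the same translation, because ->pfn_mkwrite()/->page_mkwrite() handlers must return VM_FAULT_* codes rather than negative errnos. A hedged sketch of the mapping as a standalone helper (the helper name is hypothetical; neither fix actually introduces one):

    /* Illustrative only: translate an errno from the filesystem into the
     * fault codes the MM core expects from a mkwrite handler. */
    static int errno_to_fault(int error)
    {
            if (error == -ENOMEM)
                    return VM_FAULT_OOM;
            if (error)
                    return VM_FAULT_SIGBUS;
            return VM_FAULT_NOPAGE;
    }
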
index e032a0423e351cabfd6fe6156cdfc9c53ebff216..4098acc701c3e66d5e8a229ed49828d69997655f 100644 (file)
@@ -390,6 +390,7 @@ data_copy:
                *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
                if (*err < 0)
                        break;
+               bh = bh->b_this_page;
        }
        if (!*err)
                *err = block_commit_write(pagep[0], from, from + replaced_size);
index d211b8e18566719c873838268217b143966e5965..30c4c9ebb693faaecf249df4c913c01bbaa58f20 100644 (file)
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
                pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
                          __func__, ret);
-               /* Might as well let the VFS know */
-               d_instantiate(new_dentry, d_inode(old_dentry));
-               ihold(d_inode(old_dentry));
+               /*
+                * We can't keep the target in dcache after that.
+                * For one thing, we can't afford dentry aliases for directories.
+                * For another, if there was a victim, we _can't_ set new inode
+                * for that sucker and we have to trigger mount eviction - the
+                * caller won't do it on its own since we are returning an error.
+                */
+               d_invalidate(new_dentry);
                new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                return ret;
        }
index 26c2de2de13fd0ce7bb55287c58e268cf4bf37e6..b7f8eaeea5d83d8a1cb66e2d697de5341e7361cd 100644 (file)
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
                                d_rehash(newdent);
                } else {
                        spin_lock(&dentry->d_lock);
-                       NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+                       NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
                        spin_unlock(&dentry->d_lock);
                }
        } else {
index 9581d190f6e12346e70226c93458df0a1791abe0..77ebc2bc1cca112056501fe4dc160d48cc7069cd 100644 (file)
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret < 0) {
                mlog_errno(ret);
+               if (ret == -ENOMEM)
+                       ret = VM_FAULT_OOM;
+               else
+                       ret = VM_FAULT_SIGBUS;
                goto out;
        }
 
index ed95272d57a61af3f26a58c71cf789d3c94e28ce..52f6de5d40a9211baa29a39fc2a178598a86a92f 100644 (file)
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
         * sole user of this dentry.  Too tricky...  Just unhash for
         * now.
         */
-       d_drop(dentry);
+       if (!err)
+               d_drop(dentry);
        inode_unlock(dir);
 
        return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
        if (!overwrite && new_is_dir && !old_opaque && new_opaque)
                ovl_remove_opaque(newdentry);
 
+       /*
+        * Old dentry now lives in different location. Dentries in
+        * lowerstack are stale. We cannot drop them here because
+        * access to them is lockless. This could be only pure upper
+        * or opaque directory - numlower is zero. Or upper non-dir
+        * entry - its pureness is tracked by flag opaque.
+        */
        if (old_opaque != new_opaque) {
                ovl_dentry_set_opaque(old, new_opaque);
                if (!overwrite)
index 49e204560655a8f1737a2a03a38b7a88604a5a4c..a4ff5d0d7db91605880b4eb350ea37aa397c9c9d 100644 (file)
@@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 
                inode_lock(upperdentry->d_inode);
                err = notify_change(upperdentry, attr, NULL);
+               if (!err)
+                       ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
                inode_unlock(upperdentry->d_inode);
        }
        ovl_drop_write(dentry);
index 8d826bd56b26b10d641d7ce1f7a47b747851fac9..619ad4b016d209adcb2b6649d6f654049c860b07 100644 (file)
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
        if (oe->__upperdentry) {
                type = __OVL_PATH_UPPER;
 
-               if (oe->numlower) {
-                       if (S_ISDIR(dentry->d_inode->i_mode))
-                               type |= __OVL_PATH_MERGE;
-               } else if (!oe->opaque) {
+               /*
+                * Non-dir dentry can hold lower dentry from previous
+                * location. Its purity depends only on opaque flag.
+                */
+               if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+                       type |= __OVL_PATH_MERGE;
+               else if (!oe->opaque)
                        type |= __OVL_PATH_PURE;
-               }
        } else {
                if (oe->numlower > 1)
                        type |= __OVL_PATH_MERGE;
@@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = {
 
 static const struct dentry_operations ovl_reval_dentry_operations = {
        .d_release = ovl_dentry_release,
+       .d_select_inode = ovl_d_select_inode,
        .d_revalidate = ovl_dentry_revalidate,
        .d_weak_revalidate = ovl_dentry_weak_revalidate,
 };
index 594f7e63b432427fd5b6448afa0a75eb0b71d558..be5568839442d1ab50bf5cae293b7b2b133b525e 100644 (file)
@@ -1109,27 +1109,10 @@ xlog_verify_head(
        bool                    tmp_wrapped;
 
        /*
-        * Search backwards through the log looking for the log record header
-        * block. This wraps all the way back around to the head so something is
-        * seriously wrong if we can't find it.
-        */
-       found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
-                                     rhead, wrapped);
-       if (found < 0)
-               return found;
-       if (!found) {
-               xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
-               return -EIO;
-       }
-
-       *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
-
-       /*
-        * Now that we have a tail block, check the head of the log for torn
-        * writes. Search again until we hit the tail or the maximum number of
-        * log record I/Os that could have been in flight at one time. Use a
-        * temporary buffer so we don't trash the rhead/bp pointer from the
-        * call above.
+        * Check the head of the log for torn writes. Search backwards from the
+        * head until we hit the tail or the maximum number of log record I/Os
+        * that could have been in flight at one time. Use a temporary buffer so
+        * we don't trash the rhead/bp pointers from the caller.
         */
        tmp_bp = xlog_get_bp(log, 1);
        if (!tmp_bp)
@@ -1215,6 +1198,115 @@ xlog_verify_head(
        return error;
 }
 
+/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+       struct xlog             *log,
+       xfs_daddr_t             *head_blk,
+       xfs_daddr_t             *tail_blk,
+       struct xlog_rec_header  *rhead,
+       xfs_daddr_t             rhead_blk,
+       struct xfs_buf          *bp,
+       bool                    *clean)
+{
+       struct xlog_op_header   *op_head;
+       xfs_daddr_t             umount_data_blk;
+       xfs_daddr_t             after_umount_blk;
+       int                     hblks;
+       int                     error;
+       char                    *offset;
+
+       *clean = false;
+
+       /*
+        * Look for unmount record. If we find it, then we know there was a
+        * clean unmount. Since 'i' could be the last block in the physical
+        * log, we convert to a log block before comparing to the head_blk.
+        *
+        * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+        * below. We won't want to clear the unmount record if there is one, so
+        * we pass the lsn of the unmount record rather than the block after it.
+        */
+       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+               int     h_size = be32_to_cpu(rhead->h_size);
+               int     h_version = be32_to_cpu(rhead->h_version);
+
+               if ((h_version & XLOG_VERSION_2) &&
+                   (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+                       hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+                       if (h_size % XLOG_HEADER_CYCLE_SIZE)
+                               hblks++;
+               } else {
+                       hblks = 1;
+               }
+       } else {
+               hblks = 1;
+       }
+       after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+       after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
+       if (*head_blk == after_umount_blk &&
+           be32_to_cpu(rhead->h_num_logops) == 1) {
+               umount_data_blk = rhead_blk + hblks;
+               umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
+               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               if (error)
+                       return error;
+
+               op_head = (struct xlog_op_header *)offset;
+               if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+                       /*
+                        * Set tail and last sync so that newly written log
+                        * records will point recovery to after the current
+                        * unmount record.
+                        */
+                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       *tail_blk = after_umount_blk;
+
+                       *clean = true;
+               }
+       }
+
+       return 0;
+}
+
+static void
+xlog_set_state(
+       struct xlog             *log,
+       xfs_daddr_t             head_blk,
+       struct xlog_rec_header  *rhead,
+       xfs_daddr_t             rhead_blk,
+       bool                    bump_cycle)
+{
+       /*
+        * Reset log values according to the state of the log when we
+        * crashed.  In the case where head_blk == 0, we bump curr_cycle
+        * one because the next write starts a new cycle rather than
+        * continuing the cycle of the last good log record.  At this
+        * point we have guaranteed that all partial log records have been
+        * accounted for.  Therefore, we know that the last good log record
+        * written was complete and ended exactly on the end boundary
+        * of the physical log.
+        */
+       log->l_prev_block = rhead_blk;
+       log->l_curr_block = (int)head_blk;
+       log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+       if (bump_cycle)
+               log->l_curr_cycle++;
+       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+}
+
 /*
  * Find the sync block number or the tail of the log.
  *
@@ -1238,22 +1330,20 @@ xlog_find_tail(
        xfs_daddr_t             *tail_blk)
 {
        xlog_rec_header_t       *rhead;
-       xlog_op_header_t        *op_head;
        char                    *offset = NULL;
        xfs_buf_t               *bp;
        int                     error;
-       xfs_daddr_t             umount_data_blk;
-       xfs_daddr_t             after_umount_blk;
        xfs_daddr_t             rhead_blk;
        xfs_lsn_t               tail_lsn;
-       int                     hblks;
        bool                    wrapped = false;
+       bool                    clean = false;
 
        /*
         * Find previous log record
         */
        if ((error = xlog_find_head(log, head_blk)))
                return error;
+       ASSERT(*head_blk < INT_MAX);
 
        bp = xlog_get_bp(log, 1);
        if (!bp)
@@ -1271,99 +1361,74 @@ xlog_find_tail(
        }
 
        /*
-        * Trim the head block back to skip over torn records. We can have
-        * multiple log I/Os in flight at any time, so we assume CRC failures
-        * back through the previous several records are torn writes and skip
-        * them.
+        * Search backwards through the log looking for the log record header
+        * block. This wraps all the way back around to the head so something is
+        * seriously wrong if we can't find it.
         */
-       ASSERT(*head_blk < INT_MAX);
-       error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
-                                &rhead, &wrapped);
-       if (error)
-               goto done;
+       error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+                                     &rhead_blk, &rhead, &wrapped);
+       if (error < 0)
+               return error;
+       if (!error) {
+               xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+               return -EIO;
+       }
+       *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 
        /*
-        * Reset log values according to the state of the log when we
-        * crashed.  In the case where head_blk == 0, we bump curr_cycle
-        * one because the next write starts a new cycle rather than
-        * continuing the cycle of the last good log record.  At this
-        * point we have guaranteed that all partial log records have been
-        * accounted for.  Therefore, we know that the last good log record
-        * written was complete and ended exactly on the end boundary
-        * of the physical log.
+        * Set the log state based on the current head record.
         */
-       log->l_prev_block = rhead_blk;
-       log->l_curr_block = (int)*head_blk;
-       log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
-       if (wrapped)
-               log->l_curr_cycle++;
-       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
-       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
-                                       BBTOB(log->l_curr_block));
-       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
-                                       BBTOB(log->l_curr_block));
+       xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+       tail_lsn = atomic64_read(&log->l_tail_lsn);
 
        /*
-        * Look for unmount record.  If we find it, then we know there
-        * was a clean unmount.  Since 'i' could be the last block in
-        * the physical log, we convert to a log block before comparing
-        * to the head_blk.
+        * Look for an unmount record at the head of the log. This sets the log
+        * state to determine whether recovery is necessary.
+        */
+       error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+                                      rhead_blk, bp, &clean);
+       if (error)
+               goto done;
+
+       /*
+        * Verify the log head if the log is not clean (e.g., we have anything
+        * but an unmount record at the head). This uses CRC verification to
+        * detect and trim torn writes. If discovered, CRC failures are
+        * considered torn writes and the log head is trimmed accordingly.
         *
-        * Save the current tail lsn to use to pass to
-        * xlog_clear_stale_blocks() below.  We won't want to clear the
-        * unmount record if there is one, so we pass the lsn of the
-        * unmount record rather than the block after it.
+        * Note that we can only run CRC verification when the log is dirty
+        * because there's no guarantee that the log data behind an unmount
+        * record is compatible with the current architecture.
         */
-       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-               int     h_size = be32_to_cpu(rhead->h_size);
-               int     h_version = be32_to_cpu(rhead->h_version);
+       if (!clean) {
+               xfs_daddr_t     orig_head = *head_blk;
 
-               if ((h_version & XLOG_VERSION_2) &&
-                   (h_size > XLOG_HEADER_CYCLE_SIZE)) {
-                       hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
-                       if (h_size % XLOG_HEADER_CYCLE_SIZE)
-                               hblks++;
-               } else {
-                       hblks = 1;
-               }
-       } else {
-               hblks = 1;
-       }
-       after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
-       after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
-       tail_lsn = atomic64_read(&log->l_tail_lsn);
-       if (*head_blk == after_umount_blk &&
-           be32_to_cpu(rhead->h_num_logops) == 1) {
-               umount_data_blk = rhead_blk + hblks;
-               umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
-               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               error = xlog_verify_head(log, head_blk, tail_blk, bp,
+                                        &rhead_blk, &rhead, &wrapped);
                if (error)
                        goto done;
 
-               op_head = (xlog_op_header_t *)offset;
-               if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
-                       /*
-                        * Set tail and last sync so that newly written
-                        * log records will point recovery to after the
-                        * current unmount record.
-                        */
-                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
-                                       log->l_curr_cycle, after_umount_blk);
-                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
-                                       log->l_curr_cycle, after_umount_blk);
-                       *tail_blk = after_umount_blk;
-
-                       /*
-                        * Note that the unmount was clean. If the unmount
-                        * was not clean, we need to know this to rebuild the
-                        * superblock counters from the perag headers if we
-                        * have a filesystem using non-persistent counters.
-                        */
-                       log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+               /* update in-core state again if the head changed */
+               if (*head_blk != orig_head) {
+                       xlog_set_state(log, *head_blk, rhead, rhead_blk,
+                                      wrapped);
+                       tail_lsn = atomic64_read(&log->l_tail_lsn);
+                       error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+                                                      rhead, rhead_blk, bp,
+                                                      &clean);
+                       if (error)
+                               goto done;
                }
        }
 
+       /*
+        * Note that the unmount was clean. If the unmount was not clean, we
+        * need to know this to rebuild the superblock counters from the perag
+        * headers if we have a filesystem using non-persistent counters.
+        */
+       if (clean)
+               log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
        /*
         * Make sure that there are no blocks in front of the head
         * with the same cycle number as the head.  This can happen
index 39e1cb201b8eaa00ffe759ce7147cc8b3647fb5d..35a52a880b2f812b382c905bf44df50290aa4f0c 100644 (file)
@@ -119,11 +119,6 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 }
 #endif
 
-/*
- * Initializier
- */
-#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
-
 /*
  * Remapping spinlock architecture specific functions to the corresponding
  * queued spinlock functions.
index 85f888e86761e1b71bd0de2a891a160194e256d7..034acd0c4956b59932f650e85612a9397236dc66 100644 (file)
@@ -32,6 +32,11 @@ typedef struct qspinlock {
        atomic_t        val;
 } arch_spinlock_t;
 
+/*
+ * Initializer
+ */
+#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
+
 /*
  * Bitfields in the atomic value:
  *
index c4bd0e2c173c011e37f6eef7acfb9bc3920fcca6..772c784ba76301dbceca9f3c0991c626a893e076 100644 (file)
        .rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {           \
                VMLINUX_SYMBOL(__start_rodata) = .;                     \
                *(.rodata) *(.rodata.*)                                 \
+               *(.data..ro_after_init) /* Read only after init */      \
                *(__vermagic)           /* Kernel version magic */      \
                . = ALIGN(8);                                           \
                VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .;         \
index 301de78d65f75c194720cc5168359dbce6eab59f..6c502cb13c95aa2f93466687c16d23b950b231d6 100644 (file)
@@ -548,6 +548,27 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 }
 #endif
 
+/**
+ * fetch_or - perform *ptr |= mask and return old value of *ptr
+ * @ptr: pointer to value
+ * @mask: mask to OR on the value
+ *
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#ifndef fetch_or
+#define fetch_or(ptr, mask)                                            \
+({     typeof(*(ptr)) __old, __val = *(ptr);                           \
+       for (;;) {                                                      \
+               __old = cmpxchg((ptr), __val, __val | (mask));          \
+               if (__old == __val)                                     \
+                       break;                                          \
+               __val = __old;                                          \
+       }                                                               \
+       __old;                                                          \
+})
+#endif
+
+
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
 #endif
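
fetch_or() is the classic compare-and-swap retry loop: sample the current value, try to install value | mask, and retry until the cmpxchg() succeeds, returning the value observed before the OR. A minimal userspace sketch of the same loop, assuming GCC's __sync builtin in place of the kernel's cmpxchg():

    #include <stdio.h>

    /* Illustrative only: same retry loop as fetch_or(), returning the old
     * value so the caller can tell whether the bits were already set. */
    static unsigned long fetch_or_ulong(unsigned long *ptr, unsigned long mask)
    {
            unsigned long old, val = *ptr;

            for (;;) {
                    old = __sync_val_compare_and_swap(ptr, val, val | mask);
                    if (old == val)
                            break;
                    val = old;
            }
            return old;
    }

    int main(void)
    {
            unsigned long flags = 0x1;
            unsigned long old = fetch_or_ulong(&flags, 0x2);

            printf("old=%#lx new=%#lx\n", old, flags);  /* old=0x1 new=0x3 */
            return 0;
    }
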
index cb68888241084e33af2d7e11753e048431cf7837..88bc64f00bb53cb01fe60ca87dfafd7c9c0f0f4a 100644 (file)
@@ -320,11 +320,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
        struct bvec_iter iter = bio->bi_iter;
        int idx;
 
-       if (!bio_flagged(bio, BIO_CLONED)) {
-               *bv = bio->bi_io_vec[bio->bi_vcnt - 1];
-               return;
-       }
-
        if (unlikely(!bio_multiple_segments(bio))) {
                *bv = bio_iovec(bio);
                return;
index 17e7e82d2aa758f9888419a9c03aa4059e16b247..1be04f8c563a0c60bdfca72a36c120ec96ef327c 100644 (file)
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 #endif
 
+/*
+ * __read_mostly is used to keep rarely changing variables out of frequently
+ * updated cachelines. If an architecture doesn't support it, ignore the
+ * hint.
+ */
 #ifndef __read_mostly
 #define __read_mostly
 #endif
 
+/*
+ * __ro_after_init is used to mark things that are read-only after init (i.e.
+ * after mark_rodata_ro() has been called). These are effectively read-only,
+ * but may get written to during init, so can't live in .rodata (via "const").
+ */
+#ifndef __ro_after_init
+#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))
+#endif
+
 #ifndef ____cacheline_aligned
 #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
 #endif
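
The new annotation places variables that are written only during boot into a section that mark_rodata_ro() can later write-protect, closing the gap between plain .data and full const/.rodata. A hedged sketch of how a user might apply it (the variable and the __setup parameter are hypothetical):

    #include <linux/cache.h>
    #include <linux/init.h>

    /* Written exactly once while parsing boot parameters, read-only after
     * mark_rodata_ro() has run. */
    static unsigned long boot_feature_mask __ro_after_init;

    static int __init boot_feature_setup(char *str)
    {
            boot_feature_mask = 1UL;        /* a parsed value would go here */
            return 1;
    }
    __setup("boot_feature=", boot_feature_setup);
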
index 48f5aab117ae12625d041cd18555031e87c178fc..a27f4f17c382b1cbf8884ac33b8134dafe624b2d 100644 (file)
@@ -263,8 +263,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * In contrast to ACCESS_ONCE these two macros will also work on aggregate
  * data types like structs or unions. If the size of the accessed data
  * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
- * READ_ONCE() and WRITE_ONCE()  will fall back to memcpy and print a
- * compile-time warning.
+ * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There are at
+ * least two memcpy()s: one for the __builtin_memcpy() and then one for
+ * the macro copying the variable '__u' allocated on the stack.
  *
  * Their two major use cases are: (1) Mediating communication between
  * process-level code and irq/NMI handlers, all running on the same CPU,
index 75857cda38e989e5a44150c5abb9bd1bd4872957..5e45cf930a3f53fc7071f42a5759fe3dd5fd6a31 100644 (file)
@@ -386,7 +386,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
        if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
                return;
 
-       if (!ops->free)
+       if (!ops->free || !cpu_addr)
                return;
 
        debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
@@ -641,31 +641,40 @@ static inline void dmam_release_declared_memory(struct device *dev)
 }
 #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
 
-static inline void *dma_alloc_writecombine(struct device *dev, size_t size,
-                                          dma_addr_t *dma_addr, gfp_t gfp)
+static inline void *dma_alloc_wc(struct device *dev, size_t size,
+                                dma_addr_t *dma_addr, gfp_t gfp)
 {
        DEFINE_DMA_ATTRS(attrs);
        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
        return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs);
 }
+#ifndef dma_alloc_writecombine
+#define dma_alloc_writecombine dma_alloc_wc
+#endif
 
-static inline void dma_free_writecombine(struct device *dev, size_t size,
-                                        void *cpu_addr, dma_addr_t dma_addr)
+static inline void dma_free_wc(struct device *dev, size_t size,
+                              void *cpu_addr, dma_addr_t dma_addr)
 {
        DEFINE_DMA_ATTRS(attrs);
        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
        return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs);
 }
+#ifndef dma_free_writecombine
+#define dma_free_writecombine dma_free_wc
+#endif
 
-static inline int dma_mmap_writecombine(struct device *dev,
-                                       struct vm_area_struct *vma,
-                                       void *cpu_addr, dma_addr_t dma_addr,
-                                       size_t size)
+static inline int dma_mmap_wc(struct device *dev,
+                             struct vm_area_struct *vma,
+                             void *cpu_addr, dma_addr_t dma_addr,
+                             size_t size)
 {
        DEFINE_DMA_ATTRS(attrs);
        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
        return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs);
 }
+#ifndef dma_mmap_writecombine
+#define dma_mmap_writecombine dma_mmap_wc
+#endif
 
 #ifdef CONFIG_NEED_DMA_MAP_STATE
 #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME)        dma_addr_t ADDR_NAME
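
The rename keeps the old dma_*_writecombine() spellings as fallback defines, so existing callers continue to build while new code (and the framebuffer conversions earlier in this diff) uses the shorter names. A hedged kernel-style sketch of the new spelling in an alloc/free pair (the device, size and storage are hypothetical):

    #include <linux/dma-mapping.h>

    static void *buf_cpu;
    static dma_addr_t buf_dma;

    /* Illustrative only: allocate and release a write-combined buffer. */
    static int example_alloc(struct device *dev, size_t size)
    {
            buf_cpu = dma_alloc_wc(dev, size, &buf_dma, GFP_KERNEL);
            return buf_cpu ? 0 : -ENOMEM;
    }

    static void example_free(struct device *dev, size_t size)
    {
            dma_free_wc(dev, size, buf_cpu, buf_dma);
    }
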
index c2b340e23f62d3240abc6360ab0412a82a12f21a..6d9df3f7e334c3a942728237dca471f7bb602f98 100644 (file)
@@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled)
 #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
 #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
 
+static inline unsigned long get_lock_parent_ip(void)
+{
+       unsigned long addr = CALLER_ADDR0;
+
+       if (!in_lock_functions(addr))
+               return addr;
+       addr = CALLER_ADDR1;
+       if (!in_lock_functions(addr))
+               return addr;
+       return CALLER_ADDR2;
+}
+
 #ifdef CONFIG_IRQSOFF_TRACER
   extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
   extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
index b449f378f995ae647077f521d9f7af3af9480a70..aedb254abc37204a091b790eedfde857dc22155f 100644 (file)
@@ -142,6 +142,10 @@ void prepare_namespace(void);
 void __init load_default_modules(void);
 int __init init_rootfs(void);
 
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
 extern void (*late_time_init)(void);
 
 extern bool initcall_debug;
index 24bea087e7af8083b3c81596a289d70530c64285..afb45597fb5f0d3aa19f6bb9fd0818995ae6623e 100644 (file)
@@ -20,6 +20,7 @@ struct resource {
        resource_size_t end;
        const char *name;
        unsigned long flags;
+       unsigned long desc;
        struct resource *parent, *sibling, *child;
 };
 
@@ -49,12 +50,19 @@ struct resource {
 #define IORESOURCE_WINDOW      0x00200000      /* forwarded by bridge */
 #define IORESOURCE_MUXED       0x00400000      /* Resource is software muxed */
 
+#define IORESOURCE_EXT_TYPE_BITS 0x01000000    /* Resource extended types */
+#define IORESOURCE_SYSRAM      0x01000000      /* System RAM (modifier) */
+
 #define IORESOURCE_EXCLUSIVE   0x08000000      /* Userland may not map this resource */
+
 #define IORESOURCE_DISABLED    0x10000000
 #define IORESOURCE_UNSET       0x20000000      /* No address assigned yet */
 #define IORESOURCE_AUTO                0x40000000
 #define IORESOURCE_BUSY                0x80000000      /* Driver has marked this resource busy */
 
+/* I/O resource extended types */
+#define IORESOURCE_SYSTEM_RAM          (IORESOURCE_MEM|IORESOURCE_SYSRAM)
+
 /* PnP IRQ specific bits (IORESOURCE_BITS) */
 #define IORESOURCE_IRQ_HIGHEDGE                (1<<0)
 #define IORESOURCE_IRQ_LOWEDGE         (1<<1)
@@ -105,6 +113,22 @@ struct resource {
 /* PCI control bits.  Shares IORESOURCE_BITS with above PCI ROM.  */
 #define IORESOURCE_PCI_FIXED           (1<<4)  /* Do not move resource */
 
+/*
+ * I/O Resource Descriptors
+ *
+ * Descriptors are used by walk_iomem_res_desc() and region_intersects()
+ * for searching a specific resource range in the iomem table.  Assign
+ * a new descriptor when a resource range supports the search interfaces.
+ * Otherwise, resource.desc must be set to IORES_DESC_NONE (0).
+ */
+enum {
+       IORES_DESC_NONE                         = 0,
+       IORES_DESC_CRASH_KERNEL                 = 1,
+       IORES_DESC_ACPI_TABLES                  = 2,
+       IORES_DESC_ACPI_NV_STORAGE              = 3,
+       IORES_DESC_PERSISTENT_MEMORY            = 4,
+       IORES_DESC_PERSISTENT_MEMORY_LEGACY     = 5,
+};
 
 /* helpers to define resources */
 #define DEFINE_RES_NAMED(_start, _size, _name, _flags)                 \
@@ -113,6 +137,7 @@ struct resource {
                .end = (_start) + (_size) - 1,                          \
                .name = (_name),                                        \
                .flags = (_flags),                                      \
+               .desc = IORES_DESC_NONE,                                \
        }
 
 #define DEFINE_RES_IO_NAMED(_start, _size, _name)                      \
@@ -170,6 +195,10 @@ static inline unsigned long resource_type(const struct resource *res)
 {
        return res->flags & IORESOURCE_TYPE_BITS;
 }
+static inline unsigned long resource_ext_type(const struct resource *res)
+{
+       return res->flags & IORESOURCE_EXT_TYPE_BITS;
+}
 /* True iff r1 completely contains r2 */
 static inline bool resource_contains(struct resource *r1, struct resource *r2)
 {
@@ -239,8 +268,8 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
                    int (*func)(u64, u64, void *));
 extern int
-walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg,
-              int (*func)(u64, u64, void *));
+walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
+                   void *arg, int (*func)(u64, u64, void *));
 
 /* True if any part of r1 overlaps r2 */
 static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
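
With the new 'desc' field, code that inserts a resource can tag it with one of the IORES_DESC_* values, and walk_iomem_res_desc() can then search the iomem table by descriptor instead of by name string; the libnvdimm hunk at the top of this section, which matches IORES_DESC_PERSISTENT_MEMORY_LEGACY, depends on exactly that. A hedged sketch of the walker's calling convention (the callback and the full-address-space range are illustrative):

    /* Illustrative only: count every legacy persistent-memory range. */
    static int count_range(u64 start, u64 end, void *arg)
    {
            (*(int *)arg)++;
            return 0;       /* returning non-zero stops the walk */
    }

    static int count_legacy_pmem(void)
    {
            int n = 0;

            walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
                                IORESOURCE_MEM, 0, -1, &n, count_range);
            return n;
    }
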
index 4b9f85c963d0738e20ee7c89e2dcb28ba3cc1aaa..0fdc798e3ff795a9dffd75e36e90ce2b61141562 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _LINUX_KASAN_H
 #define _LINUX_KASAN_H
 
+#include <linux/sched.h>
 #include <linux/types.h>
 
 struct kmem_cache;
@@ -13,7 +14,6 @@ struct vm_struct;
 
 #include <asm/kasan.h>
 #include <asm/pgtable.h>
-#include <linux/sched.h>
 
 extern unsigned char kasan_zero_page[PAGE_SIZE];
 extern pte_t kasan_zero_pte[PTRS_PER_PTE];
@@ -43,6 +43,8 @@ static inline void kasan_disable_current(void)
 
 void kasan_unpoison_shadow(const void *address, size_t size);
 
+void kasan_unpoison_task_stack(struct task_struct *task);
+
 void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
 
@@ -66,6 +68,8 @@ void kasan_free_shadow(const struct vm_struct *vm);
 
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
 
+static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+
 static inline void kasan_enable_current(void) {}
 static inline void kasan_disable_current(void) {}
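A minimal sketch of the intended call pattern for the new hook, assuming a hypothetical resume-style path that returns without unwinding its stack frames:

	#include <linux/kasan.h>
	#include <linux/sched.h>

	/* Hypothetical fixup: clear stale KASAN stack poison left behind by
	 * frames that were never properly unwound before this point. */
	static void example_resume_fixup(void)
	{
		kasan_unpoison_task_stack(current);
	}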
 
index 861f690aa79118a0203d90f896bf860e3374820e..5276fe0916fcc63a8944af0a288b1afeccc100b0 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/irqflags.h>
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
+#include <linux/swait.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -218,7 +219,7 @@ struct kvm_vcpu {
        int fpu_active;
        int guest_fpu_loaded, guest_xcr0_loaded;
        unsigned char fpu_counter;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
        struct pid *pid;
        int sigset_active;
        sigset_t sigset;
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 #endif
 
-static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
        return vcpu->arch.wqp;
index e23121f9d82a042a9b10907fe40e468e5711f99f..59ccab297ae061ba03494ccede8eae2d7a5236a1 100644 (file)
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 
 void clear_all_latency_tracing(struct task_struct *p);
 
+extern int sysctl_latencytop(struct ctl_table *table, int write,
+                       void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #else
 
 static inline void
index 30cf4200ab40ee40fdae694291e36fe869b508bc..5356f4d661a721ba0446b1183e2a834f3bf3b56f 100644 (file)
@@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry);
 extern void list_del(struct list_head *entry);
 #endif
 
-#ifdef CONFIG_DEBUG_LIST
-/*
- * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one
- * of the pages it allocates is ever passed to list_add()
- */
-extern void list_force_poison(struct list_head *entry);
-#else
-/* fallback to the less strict LIST_POISON* definitions */
-#define list_force_poison list_del
-#endif
-
 /**
  * list_replace - replace old entry by new one
  * @old : the element to be replaced
index 4dca42fd32f52d17326e436c4d9fcbd86a0e8d24..d026b190c53066d25753ce98f0d7c66d864a6c0a 100644 (file)
@@ -261,7 +261,6 @@ struct held_lock {
 /*
  * Initialization, self-test and debugging-output methods:
  */
-extern void lockdep_init(void);
 extern void lockdep_info(void);
 extern void lockdep_reset(void);
 extern void lockdep_reset_lock(struct lockdep_map *lock);
@@ -392,7 +391,6 @@ static inline void lockdep_on(void)
 # define lockdep_set_current_reclaim_state(g)  do { } while (0)
 # define lockdep_clear_current_reclaim_state() do { } while (0)
 # define lockdep_trace_alloc(g)                        do { } while (0)
-# define lockdep_init()                                do { } while (0)
 # define lockdep_info()                                do { } while (0)
 # define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
index 51f1e540fc2b83bf9dd143637bad8acdbc44d0fd..58eef02edc7e81ab2dcd14aace9c722b76a89623 100644 (file)
@@ -4245,7 +4245,9 @@ struct mlx5_ifc_modify_tir_bitmask_bits {
 
        u8         reserved_at_20[0x1b];
        u8         self_lb_en[0x1];
-       u8         reserved_at_3c[0x3];
+       u8         reserved_at_3c[0x1];
+       u8         hash[0x1];
+       u8         reserved_at_3e[0x1];
        u8         lro[0x1];
 };
 
index b1d4b8c7f7cdcddb4b2513f11bc4231c546baf25..3579d1e2fe3acd30bcf7733c356a8b018758391d 100644 (file)
@@ -387,7 +387,8 @@ enum {
        REGION_MIXED,
 };
 
-int region_intersects(resource_size_t offset, size_t size, const char *type);
+int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
+                     unsigned long desc);
 
 /* Support for virtually mapped pages */
 struct page *vmalloc_to_page(const void *addr);
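A minimal sketch of a caller adapted to the new prototype; the helper is hypothetical, and IORES_DESC_NONE is passed so that only the flags constrain the match:

	#include <linux/mm.h>
	#include <linux/ioport.h>

	/* Hypothetical check: is [start, start + size) entirely System RAM? */
	static bool example_range_is_ram(resource_size_t start, size_t size)
	{
		return region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
					 IORES_DESC_NONE) == REGION_INTERSECTS;
	}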
index f5c5a3fa2c8101cc37ea917d29dec30713ec0699..79ec7bbf01557e523aee3ed263425b6fc3360741 100644 (file)
@@ -468,6 +468,7 @@ struct perf_event {
        int                             group_flags;
        struct perf_event               *group_leader;
        struct pmu                      *pmu;
+       void                            *pmu_private;
 
        enum perf_event_active_state    state;
        unsigned int                    attach_state;
@@ -1109,12 +1110,6 @@ static inline void perf_event_task_tick(void)                            { }
 static inline int perf_event_release_kernel(struct perf_event *event)  { return 0; }
 #endif
 
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
-extern bool perf_event_can_stop_tick(void);
-#else
-static inline bool perf_event_can_stop_tick(void)                      { return true; }
-#endif
-
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 extern void perf_restore_debug_store(void);
 #else
index 907f3fd191acaa1573445a0988e5161b8e408ec3..62d44c176071833ab644d46a95ec910fb1ee57e7 100644 (file)
@@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer);
 void run_posix_cpu_timers(struct task_struct *task);
 void posix_cpu_timers_exit(struct task_struct *task);
 void posix_cpu_timers_exit_group(struct task_struct *task);
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk);
-
 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
                           cputime_t *newval, cputime_t *oldval);
 
index a10494a94cc30f24d65ab866771833883c6f886d..c617ea12c6b7c365f529b517859b26f8c2678e48 100644 (file)
@@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active);
 static inline void update_cpu_load_nohz(int active) { }
 #endif
 
-extern unsigned long get_parent_ip(unsigned long addr);
-
 extern void dump_cpu_task(int cpu);
 
 struct seq_file;
@@ -719,6 +717,10 @@ struct signal_struct {
        /* Earliest-expiration cache. */
        struct task_cputime cputime_expires;
 
+#ifdef CONFIG_NO_HZ_FULL
+       unsigned long tick_dep_mask;
+#endif
+
        struct list_head cpu_timers[3];
 
        struct pid *tty_old_pgrp;
@@ -920,6 +922,10 @@ static inline int sched_info_on(void)
 #endif
 }
 
+#ifdef CONFIG_SCHEDSTATS
+void force_schedstat_enabled(void);
+#endif
+
 enum cpu_idle_type {
        CPU_IDLE,
        CPU_NOT_IDLE,
@@ -1289,6 +1295,8 @@ struct sched_rt_entity {
        unsigned long timeout;
        unsigned long watchdog_stamp;
        unsigned int time_slice;
+       unsigned short on_rq;
+       unsigned short on_list;
 
        struct sched_rt_entity *back;
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -1329,10 +1337,6 @@ struct sched_dl_entity {
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
-        * @dl_new tells if a new instance arrived. If so we must
-        * start executing it with full runtime and reset its absolute
-        * deadline;
-        *
         * @dl_boosted tells if we are boosted due to DI. If so we are
         * outside bandwidth enforcement mechanism (but only until we
         * exit the critical section);
@@ -1340,7 +1344,7 @@ struct sched_dl_entity {
         * @dl_yielded tells if task gave up the cpu before consuming
         * all its available runtime during the last job.
         */
-       int dl_throttled, dl_new, dl_boosted, dl_yielded;
+       int dl_throttled, dl_boosted, dl_yielded;
 
        /*
         * Bandwidth enforcement timer. Each -deadline task has its
@@ -1542,6 +1546,10 @@ struct task_struct {
                VTIME_SYS,
        } vtime_snap_whence;
 #endif
+
+#ifdef CONFIG_NO_HZ_FULL
+       unsigned long tick_dep_mask;
+#endif
        unsigned long nvcsw, nivcsw; /* context switch counts */
        u64 start_time;         /* monotonic time in nsec */
        u64 real_start_time;    /* boot based time in nsec */
@@ -2356,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
-extern bool sched_can_stop_tick(void);
 extern u64 scheduler_tick_max_deferment(void);
-#else
-static inline bool sched_can_stop_tick(void) { return false; }
 #endif
 
 #ifdef CONFIG_SCHED_AUTOGROUP
index c9e4731cf10b8e97956b160c503e447490991931..4f080ab4f2cd1199f3f5aee15e7b06fdebb61333 100644 (file)
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos);
 
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp,
+                                loff_t *ppos);
+
 #endif /* _SCHED_SYSCTL_H */
index 4ce9ff7086f4897a67f6037d88df21012cf28413..d3fcd4591ce4ac1312a35710c81a9ff51211c140 100644 (file)
@@ -1985,6 +1985,30 @@ static inline void skb_reserve(struct sk_buff *skb, int len)
        skb->tail += len;
 }
 
+/**
+ *     skb_tailroom_reserve - adjust reserved_tailroom
+ *     @skb: buffer to alter
+ *     @mtu: maximum amount of headlen permitted
+ *     @needed_tailroom: minimum amount of reserved_tailroom
+ *
+ *	Set reserved_tailroom so that headlen can be as large as possible,
+ *	but not larger than mtu, and so that tailroom cannot be smaller
+ *	than needed_tailroom.
+ *     The required headroom should already have been reserved before using
+ *     this function.
+ */
+static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
+                                       unsigned int needed_tailroom)
+{
+       SKB_LINEAR_ASSERT(skb);
+       if (mtu < skb_tailroom(skb) - needed_tailroom)
+               /* use at most mtu */
+               skb->reserved_tailroom = skb_tailroom(skb) - mtu;
+       else
+               /* use up to all available space */
+               skb->reserved_tailroom = needed_tailroom;
+}
+
 #define ENCAP_TYPE_ETHER       0
 #define ENCAP_TYPE_IPPROTO     1
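A minimal usage sketch with a hypothetical allocation helper: the required headroom is reserved first, then the new helper splits the remaining linear space between headlen (capped at mtu) and reserved tailroom:

	#include <linux/skbuff.h>

	/* Hypothetical helper: linear skb whose headlen is capped at mtu while
	 * at least needed_tail bytes of tailroom stay reserved. */
	static struct sk_buff *example_alloc_capped(unsigned int size,
						    unsigned int headroom,
						    unsigned int mtu,
						    unsigned int needed_tail)
	{
		struct sk_buff *skb = alloc_skb(size + headroom + needed_tail,
						GFP_KERNEL);

		if (!skb)
			return NULL;

		skb_reserve(skb, headroom);	/* headroom must come first */
		skb_tailroom_reserve(skb, mtu, needed_tail);
		return skb;
	}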
 
index eead8ab93c0a36e402741ee767d3c3bc70128964..881a79d524675d7411b8985fd4a91110f69fd8e5 100644 (file)
@@ -100,6 +100,7 @@ struct plat_stmmacenet_data {
        int interface;
        struct stmmac_mdio_bus_data *mdio_bus_data;
        struct device_node *phy_node;
+       struct device_node *mdio_node;
        struct stmmac_dma_cfg *dma_cfg;
        int clk_csr;
        int has_gmac;
diff --git a/include/linux/swait.h b/include/linux/swait.h
new file mode 100644 (file)
index 0000000..c1f9c62
--- /dev/null
@@ -0,0 +1,172 @@
+#ifndef _LINUX_SWAIT_H
+#define _LINUX_SWAIT_H
+
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <asm/current.h>
+
+/*
+ * Simple wait queues
+ *
+ * While these are very similar to the other/complex wait queues (wait.h) the
+ * most important difference is that the simple waitqueue allows for
+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
+ *
+ * In order to make this so, we had to drop a fair number of features of the
+ * other waitqueue code; notably:
+ *
+ *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
+ *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
+ *    sleeper state.
+ *
+ *  - the exclusive mode; because this requires preserving the list order
+ *    and this is hard.
+ *
+ *  - custom wake functions; because you cannot give any guarantees about
+ *    random code.
+ *
+ * As a side effect of this, the data structures are slimmer.
+ *
+ * Use this wait queue in preference to the full waitqueue where possible.
+ */
+
+struct task_struct;
+
+struct swait_queue_head {
+       raw_spinlock_t          lock;
+       struct list_head        task_list;
+};
+
+struct swait_queue {
+       struct task_struct      *task;
+       struct list_head        task_list;
+};
+
+#define __SWAITQUEUE_INITIALIZER(name) {                               \
+       .task           = current,                                      \
+       .task_list      = LIST_HEAD_INIT((name).task_list),             \
+}
+
+#define DECLARE_SWAITQUEUE(name)                                       \
+       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
+
+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
+       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
+       .task_list      = LIST_HEAD_INIT((name).task_list),             \
+}
+
+#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
+       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
+
+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+                                   struct lock_class_key *key);
+
+#define init_swait_queue_head(q)                               \
+       do {                                                    \
+               static struct lock_class_key __key;             \
+               __init_swait_queue_head((q), #q, &__key);       \
+       } while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
+       ({ init_swait_queue_head(&name); name; })
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
+       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
+       DECLARE_SWAIT_QUEUE_HEAD(name)
+#endif
+
+static inline int swait_active(struct swait_queue_head *q)
+{
+       return !list_empty(&q->task_list);
+}
+
+extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_all(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q);
+
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
+
+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+#define ___swait_event(wq, condition, state, ret, cmd)                 \
+({                                                                     \
+       struct swait_queue __wait;                                      \
+       long __ret = ret;                                               \
+                                                                       \
+       INIT_LIST_HEAD(&__wait.task_list);                              \
+       for (;;) {                                                      \
+               long __int = prepare_to_swait_event(&wq, &__wait, state);\
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+                                                                       \
+               if (___wait_is_interruptible(state) && __int) {         \
+                       __ret = __int;                                  \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               cmd;                                                    \
+       }                                                               \
+       finish_swait(&wq, &__wait);                                     \
+       __ret;                                                          \
+})
+
+#define __swait_event(wq, condition)                                   \
+       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
+                           schedule())
+
+#define swait_event(wq, condition)                                     \
+do {                                                                   \
+       if (condition)                                                  \
+               break;                                                  \
+       __swait_event(wq, condition);                                   \
+} while (0)
+
+#define __swait_event_timeout(wq, condition, timeout)                  \
+       ___swait_event(wq, ___wait_cond_timeout(condition),             \
+                     TASK_UNINTERRUPTIBLE, timeout,                    \
+                     __ret = schedule_timeout(__ret))
+
+#define swait_event_timeout(wq, condition, timeout)                    \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __swait_event_timeout(wq, condition, timeout);  \
+       __ret;                                                          \
+})
+
+#define __swait_event_interruptible(wq, condition)                     \
+       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
+                     schedule())
+
+#define swait_event_interruptible(wq, condition)                       \
+({                                                                     \
+       int __ret = 0;                                                  \
+       if (!(condition))                                               \
+               __ret = __swait_event_interruptible(wq, condition);     \
+       __ret;                                                          \
+})
+
+#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
+       ___swait_event(wq, ___wait_cond_timeout(condition),             \
+                     TASK_INTERRUPTIBLE, timeout,                      \
+                     __ret = schedule_timeout(__ret))
+
+#define swait_event_interruptible_timeout(wq, condition, timeout)      \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __swait_event_interruptible_timeout(wq,         \
+                                               condition, timeout);    \
+       __ret;                                                          \
+})
+
+#endif /* _LINUX_SWAIT_H */
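A minimal producer/consumer sketch of the new API; the names are illustrative, and real users would publish the condition with whatever locking or barriers their context requires:

	#include <linux/swait.h>

	static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
	static bool example_ready;

	/* Consumer: sleep (interruptibly) until the producer signals. */
	static int example_wait(void)
	{
		return swait_event_interruptible(example_wq, example_ready);
	}

	/* Producer: publish the condition, then wake a waiter. */
	static void example_signal(void)
	{
		example_ready = true;
		swake_up(&example_wq);
	}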
index 97fd4e543846b5f4de1dc5e366f250cc26724098..21f73649a4dc501002356a95bd3e68f52a7be717 100644 (file)
@@ -97,6 +97,19 @@ static inline void tick_broadcast_exit(void)
        tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT);
 }
 
+enum tick_dep_bits {
+       TICK_DEP_BIT_POSIX_TIMER        = 0,
+       TICK_DEP_BIT_PERF_EVENTS        = 1,
+       TICK_DEP_BIT_SCHED              = 2,
+       TICK_DEP_BIT_CLOCK_UNSTABLE     = 3
+};
+
+#define TICK_DEP_MASK_NONE             0
+#define TICK_DEP_MASK_POSIX_TIMER      (1 << TICK_DEP_BIT_POSIX_TIMER)
+#define TICK_DEP_MASK_PERF_EVENTS      (1 << TICK_DEP_BIT_PERF_EVENTS)
+#define TICK_DEP_MASK_SCHED            (1 << TICK_DEP_BIT_SCHED)
+#define TICK_DEP_MASK_CLOCK_UNSTABLE   (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
+
 #ifdef CONFIG_NO_HZ_COMMON
 extern int tick_nohz_enabled;
 extern int tick_nohz_tick_stopped(void);
@@ -154,9 +167,73 @@ static inline int housekeeping_any_cpu(void)
        return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 }
 
-extern void tick_nohz_full_kick(void);
+extern void tick_nohz_dep_set(enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_task(struct task_struct *tsk,
+                                  enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+                                    enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit);
+
+/*
+ * The below are tick_nohz_dep_[set,clear]() wrappers that optimize off-cases
+ * on top of static keys.
+ */
+static inline void tick_dep_set(enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set(bit);
+}
+
+static inline void tick_dep_clear(enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear(bit);
+}
+
+static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_cpu(cpu))
+               tick_nohz_dep_set_cpu(cpu, bit);
+}
+
+static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_cpu(cpu))
+               tick_nohz_dep_clear_cpu(cpu, bit);
+}
+
+static inline void tick_dep_set_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set_task(tsk, bit);
+}
+static inline void tick_dep_clear_task(struct task_struct *tsk,
+                                      enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear_task(tsk, bit);
+}
+static inline void tick_dep_set_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set_signal(signal, bit);
+}
+static inline void tick_dep_clear_signal(struct signal_struct *signal,
+                                        enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear_signal(signal, bit);
+}
+
 extern void tick_nohz_full_kick_cpu(int cpu);
-extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(void);
 #else
 static inline int housekeeping_any_cpu(void)
@@ -166,9 +243,21 @@ static inline int housekeeping_any_cpu(void)
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
+
+static inline void tick_dep_set(enum tick_dep_bits bit) { }
+static inline void tick_dep_clear(enum tick_dep_bits bit) { }
+static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline void tick_dep_set_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_task(struct task_struct *tsk,
+                                      enum tick_dep_bits bit) { }
+static inline void tick_dep_set_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_signal(struct signal_struct *signal,
+                                        enum tick_dep_bits bit) { }
+
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
-static inline void tick_nohz_full_kick(void) { }
-static inline void tick_nohz_full_kick_all(void) { }
 static inline void __tick_nohz_task_switch(void) { }
 #endif
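An illustrative arm/disarm pair for the new dependency API that replaces the old full-kick scheme; the wrapper functions here are hypothetical, while the TICK_DEP_BIT_* values are the ones defined above:

	#include <linux/sched.h>
	#include <linux/tick.h>

	/* Hypothetical: keep the tick alive for this task while it has an
	 * outstanding posix-timer style dependency, then release it. */
	static void example_arm_tick_dep(struct task_struct *tsk)
	{
		tick_dep_set_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
	}

	static void example_disarm_tick_dep(struct task_struct *tsk)
	{
		tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
	}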
 
index acfdbf353a0b5bc7cfeb6ae58fa7e6a6acfa16ba..be586c632a0c04da3887ae6a0cf28357df98ef5c 100644 (file)
@@ -134,9 +134,6 @@ extern void syscall_unregfunc(void);
                void *it_func;                                          \
                void *__data;                                           \
                                                                        \
-               if (!cpu_online(raw_smp_processor_id()))                \
-                       return;                                         \
-                                                                       \
                if (!(cond))                                            \
                        return;                                         \
                prercu;                                                 \
@@ -343,15 +340,19 @@ extern void syscall_unregfunc(void);
  * "void *__data, proto" as the callback prototype.
  */
 #define DECLARE_TRACE_NOARGS(name)                                     \
-               __DECLARE_TRACE(name, void, , 1, void *__data, __data)
+       __DECLARE_TRACE(name, void, ,                                   \
+                       cpu_online(raw_smp_processor_id()),             \
+                       void *__data, __data)
 
 #define DECLARE_TRACE(name, proto, args)                               \
-               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1,   \
-                               PARAMS(void *__data, proto),            \
-                               PARAMS(__data, args))
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
+                       cpu_online(raw_smp_processor_id()),             \
+                       PARAMS(void *__data, proto),                    \
+                       PARAMS(__data, args))
 
 #define DECLARE_TRACE_CONDITION(name, proto, args, cond)               \
-       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
+                       cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
                        PARAMS(void *__data, proto),                    \
                        PARAMS(__data, args))
 
index ae71a769b89e3d9f706543a16cfd125120a52cb0..27d7a0ab5da3edf217e60d767c10c65d3e3c52a0 100644 (file)
@@ -338,7 +338,7 @@ do {                                                                        \
                            schedule(); try_to_freeze())
 
 /**
- * wait_event - sleep (or freeze) until a condition gets true
+ * wait_event_freezable - sleep (or freeze) until a condition gets true
  * @wq: the waitqueue to wait on
  * @condition: a C expression for the event to wait for
  *
index 8f81bbbc38fc939070a5761e3af90da62faf8d68..e0f4109e64c6fca9ba87d768c2c7b1220a6557f4 100644 (file)
@@ -439,6 +439,12 @@ int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length);
 /* Send a single event to user space */
 void wireless_send_event(struct net_device *dev, unsigned int cmd,
                         union iwreq_data *wrqu, const char *extra);
+#ifdef CONFIG_WEXT_CORE
+/* flush all previous wext events - if work is done from netdev notifiers */
+void wireless_nlevent_flush(void);
+#else
+static inline void wireless_nlevent_flush(void) {}
+#endif
 
 /* We may need a function to send a stream of events to user space.
  * More on that later... */
index 317a1ed2f4acc7745c3e02955c60e8c1dc8eb7c3..9130dd5a184a25e7d6c6cf0b633d7d5c0c4fc0f6 100644 (file)
@@ -231,13 +231,13 @@ TRACE_EVENT(snd_soc_jack_report,
        TP_ARGS(jack, mask, val),
 
        TP_STRUCT__entry(
-               __string(       name,           jack->jack->name        )
+               __string(       name,           jack->jack->id          )
                __field(        int,            mask                    )
                __field(        int,            val                     )
        ),
 
        TP_fast_assign(
-               __assign_str(name, jack->jack->name);
+               __assign_str(name, jack->jack->id);
                __entry->mask = mask;
                __entry->val = val;
        ),
@@ -253,12 +253,12 @@ TRACE_EVENT(snd_soc_jack_notify,
        TP_ARGS(jack, val),
 
        TP_STRUCT__entry(
-               __string(       name,           jack->jack->name        )
+               __string(       name,           jack->jack->id          )
                __field(        int,            val                     )
        ),
 
        TP_fast_assign(
-               __assign_str(name, jack->jack->name);
+               __assign_str(name, jack->jack->id);
                __entry->val = val;
        ),
 
index 073b9ac245ba0315f31a51f5df9f21bcdf2e9115..51440131d3372feb1f09b7139362db66119f0e2a 100644 (file)
@@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire,
 );
 
 #ifdef CONFIG_NO_HZ_COMMON
+
+#define TICK_DEP_NAMES                                 \
+               tick_dep_name(NONE)                     \
+               tick_dep_name(POSIX_TIMER)              \
+               tick_dep_name(PERF_EVENTS)              \
+               tick_dep_name(SCHED)                    \
+               tick_dep_name_end(CLOCK_UNSTABLE)
+
+#undef tick_dep_name
+#undef tick_dep_name_end
+
+#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+
+TICK_DEP_NAMES
+
+#undef tick_dep_name
+#undef tick_dep_name_end
+
+#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
+#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }
+
+#define show_tick_dep_name(val)                                \
+       __print_symbolic(val, TICK_DEP_NAMES)
+
 TRACE_EVENT(tick_stop,
 
-       TP_PROTO(int success, char *error_msg),
+       TP_PROTO(int success, int dependency),
 
-       TP_ARGS(success, error_msg),
+       TP_ARGS(success, dependency),
 
        TP_STRUCT__entry(
                __field( int ,          success )
-               __string( msg,          error_msg )
+               __field( int ,          dependency )
        ),
 
        TP_fast_assign(
                __entry->success        = success;
-               __assign_str(msg, error_msg);
+               __entry->dependency     = dependency;
        ),
 
-       TP_printk("success=%s msg=%s",  __entry->success ? "yes" : "no", __get_str(msg))
+       TP_printk("success=%d dependency=%s",  __entry->success, \
+                       show_tick_dep_name(__entry->dependency))
 );
 #endif
 
index aa6f8571de136b74fba93996883bd69b3e28d412..5df4881dea7b5e8e42fc1274f967331a7d455600 100644 (file)
@@ -292,6 +292,9 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6             (1ULL << 0)
 
+/* BPF_FUNC_skb_set_tunnel_key flags. */
+#define BPF_F_ZERO_CSUM_TX             (1ULL << 1)
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
index 625b38f65764b34fe9d942339ef61a08af817564..a8e3a8c0d85a9173c7aa248b84e8e4e9fd01739f 100644 (file)
@@ -120,7 +120,7 @@ struct media_device_info {
 
 #define MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN        MEDIA_ENT_F_OLD_SUBDEV_BASE
 
-#ifndef __KERNEL__
+#if !defined(__KERNEL__) || defined(__NEED_MEDIA_LEGACY_API)
 
 /*
  * Legacy symbols used to avoid userspace compilation breakages
@@ -133,6 +133,10 @@ struct media_device_info {
 #define MEDIA_ENT_TYPE_MASK            0x00ff0000
 #define MEDIA_ENT_SUBTYPE_MASK         0x0000ffff
 
+/* End of the old subdev reserved numberspace */
+#define MEDIA_ENT_T_DEVNODE_UNKNOWN    (MEDIA_ENT_T_DEVNODE | \
+                                        MEDIA_ENT_SUBTYPE_MASK)
+
 #define MEDIA_ENT_T_DEVNODE            MEDIA_ENT_F_OLD_BASE
 #define MEDIA_ENT_T_DEVNODE_V4L                MEDIA_ENT_F_IO_V4L
 #define MEDIA_ENT_T_DEVNODE_FB         (MEDIA_ENT_T_DEVNODE + 2)
index 58c9e374704bb20cfff41fa92afcf7cafe498c32..7c27de4577eda65a15510af0d7f4d5cae59d57b8 100644 (file)
@@ -93,9 +93,6 @@ static int kernel_init(void *);
 extern void init_IRQ(void);
 extern void fork_init(void);
 extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
 
 /*
  * Debug helper: via this flag we know that we are in 'early bootup code'
@@ -499,11 +496,6 @@ asmlinkage __visible void __init start_kernel(void)
        char *command_line;
        char *after_dashes;
 
-       /*
-        * Need to run as early as possible, to initialize the
-        * lockdep hash:
-        */
-       lockdep_init();
        set_task_stack_end_magic(&init_task);
        smp_setup_processor_id();
        debug_objects_early_init();
@@ -929,6 +921,28 @@ static int try_to_run_init_process(const char *init_filename)
 
 static noinline void __init kernel_init_freeable(void);
 
+#ifdef CONFIG_DEBUG_RODATA
+static bool rodata_enabled = true;
+static int __init set_debug_rodata(char *str)
+{
+       return strtobool(str, &rodata_enabled);
+}
+__setup("rodata=", set_debug_rodata);
+
+static void mark_readonly(void)
+{
+       if (rodata_enabled)
+               mark_rodata_ro();
+       else
+               pr_info("Kernel memory protection disabled.\n");
+}
+#else
+static inline void mark_readonly(void)
+{
+       pr_warn("This architecture does not have kernel memory protection.\n");
+}
+#endif
+
 static int __ref kernel_init(void *unused)
 {
        int ret;
@@ -937,7 +951,7 @@ static int __ref kernel_init(void *unused)
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();
        free_initmem();
-       mark_rodata_ro();
+       mark_readonly();
        system_state = SYSTEM_RUNNING;
        numa_default_policy();
 
index e1dbf4a2c69e4ca9721c22184cb9f800325b9194..90ff129c88a27c50e33be234be695650e7210494 100644 (file)
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
        } else {
                kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
                           __func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
                if (!bp->bp_type) {
                        kdb_printf("Software breakpoints are unavailable.\n"
-                                  "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+                                  "  Boot the kernel with rodata=off\n"
                                   "  OR use hw breaks: help bph\n");
                }
-#endif
                return 1;
        }
        return 0;
index 614614821f00a02928439b068903904d81233808..712570dddacde345c481f86b8631df75b798a33b 100644 (file)
@@ -3112,17 +3112,6 @@ done:
        return rotate;
 }
 
-#ifdef CONFIG_NO_HZ_FULL
-bool perf_event_can_stop_tick(void)
-{
-       if (atomic_read(&nr_freq_events) ||
-           __this_cpu_read(perf_throttled_count))
-               return false;
-       else
-               return true;
-}
-#endif
-
 void perf_event_task_tick(void)
 {
        struct list_head *head = this_cpu_ptr(&active_ctx_list);
@@ -3133,6 +3122,7 @@ void perf_event_task_tick(void)
 
        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
+       tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
 
        list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
                perf_adjust_freq_unthr_context(ctx, throttled);
@@ -3564,6 +3554,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 }
 
+#ifdef CONFIG_NO_HZ_FULL
+static DEFINE_SPINLOCK(nr_freq_lock);
+#endif
+
+static void unaccount_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+       spin_lock(&nr_freq_lock);
+       if (atomic_dec_and_test(&nr_freq_events))
+               tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
+       spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void unaccount_freq_event(void)
+{
+       if (tick_nohz_full_enabled())
+               unaccount_freq_event_nohz();
+       else
+               atomic_dec(&nr_freq_events);
+}
+
 static void unaccount_event(struct perf_event *event)
 {
        bool dec = false;
@@ -3580,7 +3592,7 @@ static void unaccount_event(struct perf_event *event)
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
-               atomic_dec(&nr_freq_events);
+               unaccount_freq_event();
        if (event->attr.context_switch) {
                dec = true;
                atomic_dec(&nr_switch_events);
@@ -6424,9 +6436,9 @@ static int __perf_event_overflow(struct perf_event *event,
                if (unlikely(throttle
                             && hwc->interrupts >= max_samples_per_tick)) {
                        __this_cpu_inc(perf_throttled_count);
+                       tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                        hwc->interrupts = MAX_INTERRUPTS;
                        perf_log_throttle(event, 0);
-                       tick_nohz_full_kick();
                        ret = 1;
                }
        }
@@ -6785,7 +6797,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
        kfree_rcu(hlist, rcu_head);
 }
 
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+static void swevent_hlist_put_cpu(int cpu)
 {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -6797,15 +6809,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
        mutex_unlock(&swhash->hlist_mutex);
 }
 
-static void swevent_hlist_put(struct perf_event *event)
+static void swevent_hlist_put(void)
 {
        int cpu;
 
        for_each_possible_cpu(cpu)
-               swevent_hlist_put_cpu(event, cpu);
+               swevent_hlist_put_cpu(cpu);
 }
 
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+static int swevent_hlist_get_cpu(int cpu)
 {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        int err = 0;
@@ -6828,14 +6840,13 @@ exit:
        return err;
 }
 
-static int swevent_hlist_get(struct perf_event *event)
+static int swevent_hlist_get(void)
 {
-       int err;
-       int cpu, failed_cpu;
+       int err, cpu, failed_cpu;
 
        get_online_cpus();
        for_each_possible_cpu(cpu) {
-               err = swevent_hlist_get_cpu(event, cpu);
+               err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        failed_cpu = cpu;
                        goto fail;
@@ -6848,7 +6859,7 @@ fail:
        for_each_possible_cpu(cpu) {
                if (cpu == failed_cpu)
                        break;
-               swevent_hlist_put_cpu(event, cpu);
+               swevent_hlist_put_cpu(cpu);
        }
 
        put_online_cpus();
@@ -6864,7 +6875,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
        WARN_ON(event->parent);
 
        static_key_slow_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
+       swevent_hlist_put();
 }
 
 static int perf_swevent_init(struct perf_event *event)
@@ -6895,7 +6906,7 @@ static int perf_swevent_init(struct perf_event *event)
        if (!event->parent) {
                int err;
 
-               err = swevent_hlist_get(event);
+               err = swevent_hlist_get();
                if (err)
                        return err;
 
@@ -7816,6 +7827,27 @@ static void account_event_cpu(struct perf_event *event, int cpu)
                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 }
 
+/* Freq events need the tick to stay alive (see perf_event_task_tick). */
+static void account_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+       /* Lock so we don't race with concurrent unaccount */
+       spin_lock(&nr_freq_lock);
+       if (atomic_inc_return(&nr_freq_events) == 1)
+               tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
+       spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void account_freq_event(void)
+{
+       if (tick_nohz_full_enabled())
+               account_freq_event_nohz();
+       else
+               atomic_inc(&nr_freq_events);
+}
+
+
 static void account_event(struct perf_event *event)
 {
        bool inc = false;
@@ -7831,10 +7863,8 @@ static void account_event(struct perf_event *event)
                atomic_inc(&nr_comm_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
-       if (event->attr.freq) {
-               if (atomic_inc_return(&nr_freq_events) == 1)
-                       tick_nohz_full_kick_all();
-       }
+       if (event->attr.freq)
+               account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
                inc = true;
@@ -8001,6 +8031,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                }
        }
 
+       /* symmetric to unaccount_event() in _free_event() */
+       account_event(event);
+
        return event;
 
 err_per_task:
@@ -8364,8 +8397,6 @@ SYSCALL_DEFINE5(perf_event_open,
                }
        }
 
-       account_event(event);
-
        /*
         * Special case software events and allow them to be part of
         * any hardware group.
@@ -8662,8 +8693,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
 
-       account_event(event);
-
        ctx = find_get_context(event->pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
@@ -9447,6 +9476,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
 
 static int __init perf_event_sysfs_init(void)
 {
index 5d6ce6413ef1d227b32c99a4330bee1289f9f571..a5d2e74c89e0b217df98326e5febf3caf687687c 100644 (file)
  *   futex_wait(futex, val);
  *
  *   waiters++; (a)
- *   mb(); (A) <-- paired with -.
- *                              |
- *   lock(hash_bucket(futex));  |
- *                              |
- *   uval = *futex;             |
- *                              |        *futex = newval;
- *                              |        sys_futex(WAKE, futex);
- *                              |          futex_wake(futex);
- *                              |
- *                              `------->  mb(); (B)
+ *   smp_mb(); (A) <-- paired with -.
+ *                                  |
+ *   lock(hash_bucket(futex));      |
+ *                                  |
+ *   uval = *futex;                 |
+ *                                  |        *futex = newval;
+ *                                  |        sys_futex(WAKE, futex);
+ *                                  |          futex_wake(futex);
+ *                                  |
+ *                                  `--------> smp_mb(); (B)
  *   if (uval == val)
  *     queue();
  *     unlock(hash_bucket(futex));
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
        /*
         * Ensure futex_get_mm() implies a full barrier such that
         * get_futex_key() implies a full barrier. This is relied upon
-        * as full barrier (B), see the ordering comment above.
+        * as smp_mb(); (B), see the ordering comment above.
         */
        smp_mb__after_atomic();
 }
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
 
        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
        case FUT_OFF_INODE:
-               ihold(key->shared.inode); /* implies MB (B) */
+               ihold(key->shared.inode); /* implies smp_mb(); (B) */
                break;
        case FUT_OFF_MMSHARED:
-               futex_get_mm(key); /* implies MB (B) */
+               futex_get_mm(key); /* implies smp_mb(); (B) */
                break;
        default:
                /*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
                 * mm, therefore the only purpose of calling get_futex_key_refs
                 * is because we need the barrier for the lockless waiter check.
                 */
-               smp_mb(); /* explicit MB (B) */
+               smp_mb(); /* explicit smp_mb(); (B) */
        }
 }
 
@@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
        if (!fshared) {
                key->private.mm = mm;
                key->private.address = address;
-               get_futex_key_refs(key);  /* implies MB (B) */
+               get_futex_key_refs(key);  /* implies smp_mb(); (B) */
                return 0;
        }
 
@@ -520,7 +520,20 @@ again:
        else
                err = 0;
 
-       lock_page(page);
+       /*
+        * The treatment of mapping from this point on is critical. The page
+        * lock protects many things but in this context the page lock
+        * stabilizes mapping, prevents inode freeing in the shared
+        * file-backed region case and guards against movement to swap cache.
+        *
+	 * Strictly speaking the page lock is not needed in all cases being
+	 * considered here and the page lock forces unnecessary serialization.
+	 * From this point on, mapping will be re-verified if necessary and
+	 * the page lock will be acquired only if it is unavoidable.
+        */
+       page = compound_head(page);
+       mapping = READ_ONCE(page->mapping);
+
        /*
         * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
@@ -536,19 +549,31 @@ again:
         * shmem_writepage move it from filecache to swapcache beneath us:
         * an unlikely race, but we do need to retry for page->mapping.
         */
-       mapping = compound_head(page)->mapping;
-       if (!mapping) {
-               int shmem_swizzled = PageSwapCache(page);
+       if (unlikely(!mapping)) {
+               int shmem_swizzled;
+
+               /*
+                * Page lock is required to identify which special case above
+                * applies. If this is really a shmem page then the page lock
+                * will prevent unexpected transitions.
+                */
+               lock_page(page);
+               shmem_swizzled = PageSwapCache(page) || page->mapping;
                unlock_page(page);
                put_page(page);
+
                if (shmem_swizzled)
                        goto again;
+
                return -EFAULT;
        }
 
        /*
         * Private mappings are handled in a simple way.
         *
+        * If the futex key is stored on an anonymous page, then the associated
+        * object is the mm which is implicitly pinned by the calling process.
+        *
         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
@@ -566,16 +591,74 @@ again:
                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
+
+               get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
        } else {
+               struct inode *inode;
+
+               /*
+                * The associated futex object in this case is the inode and
+                * the page->mapping must be traversed. Ordinarily this should
+                * be stabilised under page lock but it's not strictly
+                * necessary in this case as we just want to pin the inode, not
+                * update the radix tree or anything like that.
+                *
+                * The RCU read lock is taken as the inode is finally freed
+                * under RCU. If the mapping still matches expectations then the
+                * mapping->host can be safely accessed as being a valid inode.
+                */
+               rcu_read_lock();
+
+               if (READ_ONCE(page->mapping) != mapping) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               inode = READ_ONCE(mapping->host);
+               if (!inode) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               /*
+                * Take a reference unless it is about to be freed. Previously
+                * this reference was taken by ihold under the page lock
+                * pinning the inode in place so i_lock was unnecessary. The
+                * only way for this check to fail is if the inode was
+                * truncated in parallel so warn for now if this happens.
+                *
+                * We are not calling into get_futex_key_refs() in file-backed
+                * cases, therefore a successful atomic_inc return below will
+                * guarantee that get_futex_key() will still imply smp_mb(); (B).
+                */
+               if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+		/* Should be impossible but let's be paranoid for now */
+               if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+                       err = -EFAULT;
+                       rcu_read_unlock();
+                       iput(inode);
+
+                       goto out;
+               }
+
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = mapping->host;
+               key->shared.inode = inode;
                key->shared.pgoff = basepage_index(page);
+               rcu_read_unlock();
        }
 
-       get_futex_key_refs(key); /* implies MB (B) */
-
 out:
-       unlock_page(page);
        put_page(page);
        return err;
 }
@@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 
        q->lock_ptr = &hb->lock;
 
-       spin_lock(&hb->lock); /* implies MB (A) */
+       spin_lock(&hb->lock); /* implies smp_mb(); (A) */
        return hb;
 }
 
@@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
 
        /* In the common case we don't take the spinlock, which is nice. */
 retry:
-       lock_ptr = q->lock_ptr;
-       barrier();
+       /*
+        * q->lock_ptr can change between this read and the following spin_lock.
+        * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+        * optimizing lock_ptr out of the logic below.
+        */
+       lock_ptr = READ_ONCE(q->lock_ptr);
        if (lock_ptr != NULL) {
                spin_lock(lock_ptr);
                /*
index 8dc65914486999f43caa9276dde914663919fbc6..8d34308ea449ad31bae472f0403c5f1b25324683 100644 (file)
@@ -66,13 +66,15 @@ struct resource crashk_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+       .desc  = IORES_DESC_CRASH_KERNEL
 };
 struct resource crashk_low_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+       .desc  = IORES_DESC_CRASH_KERNEL
 };
 
 int kexec_should_crash(struct task_struct *p)
@@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size)
 
        ram_res->start = end;
        ram_res->end = crashk_res.end;
-       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
        ram_res->name = "System RAM";
 
        crashk_res.end = end - 1;
index 007b791f676d5605fb674b6db333d9dd1d0a23b4..56b18eb1f0013c9bc7e041a3dd2a8ecaeb580b95 100644 (file)
@@ -524,10 +524,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
 
        /* Walk the RAM ranges and allocate a suitable range for the buffer */
        if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res("Crash kernel",
-                                    IORESOURCE_MEM | IORESOURCE_BUSY,
-                                    crashk_res.start, crashk_res.end, kbuf,
-                                    locate_mem_hole_callback);
+               ret = walk_iomem_res_desc(crashk_res.desc,
+                               IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+                               crashk_res.start, crashk_res.end, kbuf,
+                               locate_mem_hole_callback);
        else
                ret = walk_system_ram_res(0, -1, kbuf,
                                          locate_mem_hole_callback);
index a02812743a7e63378a79cc768255f807a7fd469b..b5c30d9f46c5084acddbd34e533f91e9c98e8300 100644 (file)
  * of times)
  */
 
-#include <linux/latencytop.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
+#include <linux/latencytop.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
        proc_create("latency_stats", 0644, NULL, &lstats_fops);
        return 0;
 }
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+                       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int err;
+
+       err = proc_dointvec(table, write, buffer, lenp, ppos);
+       if (latencytop_enabled)
+               force_schedstat_enabled();
+
+       return err;
+}
 device_initcall(init_lstats_procfs);
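For context, a sketch of the kind of ctl_table entry that routes the knob through the new handler; the real entry lives in kernel/sysctl.c, and the table name here is illustrative:

	#include <linux/latencytop.h>
	#include <linux/sysctl.h>

	static struct ctl_table example_latencytop_table[] = {
		{
			.procname	= "latencytop",
			.data		= &latencytop_enabled,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= sysctl_latencytop,
		},
		{ }
	};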
index 716547fdb8731dcc395d70f2664ec6fb22378db6..f894a2cd9b2aa3ad37adfad0d1c15f46fdf93885 100644 (file)
@@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void)
        return ret;
 }
 
-static int lockdep_initialized;
-
 unsigned long nr_list_entries;
 static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 
@@ -433,19 +431,6 @@ unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
 
 #ifdef CONFIG_DEBUG_LOCKDEP
-/*
- * We cannot printk in early bootup code. Not even early_printk()
- * might work. So we mark any initialization errors and printk
- * about it later on, in lockdep_info().
- */
-static int lockdep_init_error;
-static const char *lock_init_error;
-static unsigned long lockdep_init_trace_data[20];
-static struct stack_trace lockdep_init_trace = {
-       .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
-       .entries = lockdep_init_trace_data,
-};
-
 /*
  * Various lockdep statistics:
  */
@@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
        struct hlist_head *hash_head;
        struct lock_class *class;
 
-#ifdef CONFIG_DEBUG_LOCKDEP
-       /*
-        * If the architecture calls into lockdep before initializing
-        * the hashes then we'll warn about it later. (we cannot printk
-        * right now)
-        */
-       if (unlikely(!lockdep_initialized)) {
-               lockdep_init();
-               lockdep_init_error = 1;
-               lock_init_error = lock->name;
-               save_stack_trace(&lockdep_init_trace);
-       }
-#endif
-
        if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
                debug_locks_off();
                printk(KERN_ERR
@@ -2010,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
        return lock_classes + chain_hlocks[chain->base + i];
 }
 
+/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+                                       struct held_lock *hlock)
+{
+       int i;
+       struct held_lock *hlock_curr;
+
+       for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+               hlock_curr = curr->held_locks + i;
+               if (hlock_curr->irq_context != hlock->irq_context)
+                       break;
+
+       }
+
+       return ++i;
+}
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+                       struct held_lock *hlock,
+                       struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+       int i, j, id;
+
+       i = get_first_held_lock(curr, hlock);
+
+       if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+               return 0;
+
+       for (j = 0; j < chain->depth - 1; j++, i++) {
+               id = curr->held_locks[i].class_idx - 1;
+
+               if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+                       return 0;
+       }
+#endif
+       return 1;
+}
+
 /*
  * Look up a dependency chain. If the key is not present yet then
  * add it and return 1 - in this case the new dependency chain is
@@ -2023,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr,
        struct lock_class *class = hlock_class(hlock);
        struct hlist_head *hash_head = chainhashentry(chain_key);
        struct lock_chain *chain;
-       struct held_lock *hlock_curr;
        int i, j;
 
        /*
@@ -2041,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr,
                if (chain->chain_key == chain_key) {
 cache_hit:
                        debug_atomic_inc(chain_lookup_hits);
+                       if (!check_no_collision(curr, hlock, chain))
+                               return 0;
+
                        if (very_verbose(class))
                                printk("\nhash chain already cached, key: "
                                        "%016Lx tail class: [%p] %s\n",
@@ -2078,13 +2098,7 @@ cache_hit:
        chain = lock_chains + nr_lock_chains++;
        chain->chain_key = chain_key;
        chain->irq_context = hlock->irq_context;
-       /* Find the first held_lock of current chain */
-       for (i = curr->lockdep_depth - 1; i >= 0; i--) {
-               hlock_curr = curr->held_locks + i;
-               if (hlock_curr->irq_context != hlock->irq_context)
-                       break;
-       }
-       i++;
+       i = get_first_held_lock(curr, hlock);
        chain->depth = curr->lockdep_depth + 1 - i;
        if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
                chain->base = nr_chain_hlocks;
@@ -2172,7 +2186,7 @@ static void check_chain_key(struct task_struct *curr)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
        struct held_lock *hlock, *prev_hlock = NULL;
-       unsigned int i, id;
+       unsigned int i;
        u64 chain_key = 0;
 
        for (i = 0; i < curr->lockdep_depth; i++) {
@@ -2189,17 +2203,16 @@ static void check_chain_key(struct task_struct *curr)
                                (unsigned long long)hlock->prev_chain_key);
                        return;
                }
-               id = hlock->class_idx - 1;
                /*
                 * Whoops ran out of static storage again?
                 */
-               if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+               if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
                        return;
 
                if (prev_hlock && (prev_hlock->irq_context !=
                                                        hlock->irq_context))
                        chain_key = 0;
-               chain_key = iterate_chain_key(chain_key, id);
+               chain_key = iterate_chain_key(chain_key, hlock->class_idx);
                prev_hlock = hlock;
        }
        if (chain_key != curr->curr_chain_key) {
@@ -3077,7 +3090,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        struct task_struct *curr = current;
        struct lock_class *class = NULL;
        struct held_lock *hlock;
-       unsigned int depth, id;
+       unsigned int depth;
        int chain_head = 0;
        int class_idx;
        u64 chain_key;
@@ -3180,11 +3193,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
          * The 'key ID' is the most compact key value to drive
         * the hash, not class->key.
         */
-       id = class - lock_classes;
        /*
         * Whoops, we did it again.. ran straight out of our static allocation.
         */
-       if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+       if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
                return 0;
 
        chain_key = curr->curr_chain_key;
@@ -3202,7 +3214,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                chain_key = 0;
                chain_head = 1;
        }
-       chain_key = iterate_chain_key(chain_key, id);
+       chain_key = iterate_chain_key(chain_key, class_idx);
 
        if (nest_lock && !__lock_is_held(nest_lock))
                return print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -4013,28 +4025,6 @@ out_restore:
        raw_local_irq_restore(flags);
 }
 
-void lockdep_init(void)
-{
-       int i;
-
-       /*
-        * Some architectures have their own start_kernel()
-        * code which calls lockdep_init(), while we also
-        * call lockdep_init() from the start_kernel() itself,
-        * and we want to initialize the hashes only once:
-        */
-       if (lockdep_initialized)
-               return;
-
-       for (i = 0; i < CLASSHASH_SIZE; i++)
-               INIT_HLIST_HEAD(classhash_table + i);
-
-       for (i = 0; i < CHAINHASH_SIZE; i++)
-               INIT_HLIST_HEAD(chainhash_table + i);
-
-       lockdep_initialized = 1;
-}
-
 void __init lockdep_info(void)
 {
        printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
@@ -4061,14 +4051,6 @@ void __init lockdep_info(void)
 
        printk(" per task-struct memory footprint: %lu bytes\n",
                sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-       if (lockdep_init_error) {
-               printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
-               printk("Call stack leading to lockdep invocation was:\n");
-               print_stack_trace(&lockdep_init_trace, 0);
-       }
-#endif
 }
 
 static void
index 5b9102a47ea5209dd887b6dfbca16215b2bad922..c835270f0c2f93c24f623e9ea2ba2e94eebb006a 100644 (file)
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
        node->locked = 0;
        node->next   = NULL;
 
-       prev = xchg_acquire(lock, node);
+       /*
+        * We rely on the full barrier with global transitivity implied by the
+        * below xchg() to order the initialization stores above against any
+        * below xchg() to order the initialization stores above against any
+        * observation of @node, and to provide the ACQUIRE ordering associated
+        */
+       prev = xchg(lock, node);
        if (likely(prev == NULL)) {
                /*
                 * Lock acquired, don't need to set node->locked to 1. Threads
index 0551c219c40e5bb8f8f87bfb0445100fe1b31cc9..e364b424b019ff51de0bde17fe2611444b5c1efb 100644 (file)
@@ -716,6 +716,7 @@ static inline void
 __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
 {
        unsigned long flags;
+       WAKE_Q(wake_q);
 
        /*
         * As a performance measurement, release the lock before doing other
@@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
                                           struct mutex_waiter, list);
 
                debug_mutex_wake_waiter(lock, waiter);
-
-               wake_up_process(waiter->task);
+               wake_q_add(&wake_q, waiter->task);
        }
 
        spin_unlock_mutex(&lock->wait_lock, flags);
+       wake_up_q(&wake_q);
 }
 
 /*
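
The unlock slowpath above now defers the wakeup: the waiter is queued on an on-stack wake list while wait_lock is held and is only woken after the spinlock is released, keeping the wakeup out of the critical section. A minimal sketch of that pattern, assuming the standard WAKE_Q()/wake_q_add()/wake_up_q() helpers (the demo_* names are made up):

#include <linux/sched.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

/* Sketch of the deferred-wakeup pattern used in the hunk above. */
static void demo_wake_one(struct task_struct *waiter)
{
        WAKE_Q(wake_q);                         /* on-stack wake queue */

        spin_lock(&demo_lock);
        wake_q_add(&wake_q, waiter);            /* queue it, no wakeup yet */
        spin_unlock(&demo_lock);

        wake_up_q(&wake_q);                     /* wake after dropping the lock */
}
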
index 393d1874b9e0c81d44ef9d0447ded22a03bccd81..ce2f75e32ae155b30cfe324c56f9bc526771b66a 100644 (file)
@@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
         * sequentiality; this is because not all clear_pending_set_locked()
         * implementations imply full barriers.
         */
-       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
-               cpu_relax();
+       smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
 
        /*
         * take ownership and clear the pending bit.
@@ -435,7 +434,7 @@ queue:
         *
         * The PV pv_wait_head_or_lock function, if active, will acquire
         * the lock and return a non-zero value. So we have to skip the
-        * smp_load_acquire() call. As the next PV queue head hasn't been
+        * smp_cond_acquire() call. As the next PV queue head hasn't been
         * designated yet, there is no way for the locked value to become
         * _Q_SLOW_VAL. So both the set_locked() and the
         * atomic_cmpxchg_relaxed() calls will be safe.
@@ -466,7 +465,7 @@ locked:
                        break;
                }
                /*
-                * The smp_load_acquire() call above has provided the necessary
+                * The smp_cond_acquire() call above has provided the necessary
                 * acquire semantics required for locking. At most two
                 * iterations of this loop may be run.
                 */
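
The smp_cond_acquire() call referenced in the comments above spins until the condition becomes true and then provides acquire ordering; its real definition lives in <linux/compiler.h> and is not part of these hunks. Roughly, it boils down to something like this sketch:

/* Sketch of what smp_cond_acquire() amounts to; not the authoritative definition. */
#define smp_cond_acquire_sketch(cond)   do {                            \
        while (!(cond))                                                 \
                cpu_relax();                                            \
        smp_rmb();      /* control dependency + rmb acts as acquire */  \
} while (0)
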
index 87bb235c3448054d63923d0f9549cbc7718a49ca..21ede57f68b320594f874e4ff0a1b60974fc67f0 100644 (file)
@@ -54,6 +54,11 @@ struct pv_node {
        u8                      state;
 };
 
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
 /*
  * By replacing the regular queued_spin_trylock() with the function below,
  * it will be called once when a lock waiter enters the PV slowpath before
@@ -65,9 +70,11 @@ struct pv_node {
 static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 {
        struct __qspinlock *l = (void *)lock;
+       int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+                  (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
 
-       return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-               (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+       qstat_inc(qstat_pv_lock_stealing, ret);
+       return ret;
 }
 
 /*
@@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 }
 #endif /* _Q_PENDING_BITS == 8 */
 
-/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
 /*
  * Lock and MCS node addresses hash table for fast lookup
  *
@@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
        if (READ_ONCE(pn->state) == vcpu_hashed)
                lp = (struct qspinlock **)1;
 
+       /*
+        * Tracking # of slowpath locking operations
+        */
+       qstat_inc(qstat_pv_lock_slowpath, true);
+
        for (;; waitcnt++) {
                /*
                 * Set correct vCPU state to be used by queue node wait-early
index 640dcecdd1df7a5ac5c8167fbd9e70e71b2e8f21..eb2a2c9bc3fc15d181c9e5981648974dc7aec65c 100644 (file)
@@ -22,6 +22,7 @@
  *   pv_kick_wake      - # of vCPU kicks used for computing pv_latency_wake
  *   pv_latency_kick   - average latency (ns) of vCPU kick operation
  *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_slowpath  - # of locking operations via the slowpath
  *   pv_lock_stealing  - # of lock stealing operations
  *   pv_spurious_wakeup        - # of spurious wakeups
  *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
@@ -45,6 +46,7 @@ enum qlock_stats {
        qstat_pv_kick_wake,
        qstat_pv_latency_kick,
        qstat_pv_latency_wake,
+       qstat_pv_lock_slowpath,
        qstat_pv_lock_stealing,
        qstat_pv_spurious_wakeup,
        qstat_pv_wait_again,
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
        [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
        [qstat_pv_latency_kick]    = "pv_latency_kick",
        [qstat_pv_latency_wake]    = "pv_latency_wake",
+       [qstat_pv_lock_slowpath]   = "pv_lock_slowpath",
        [qstat_pv_lock_stealing]   = "pv_lock_stealing",
        [qstat_pv_wait_again]      = "pv_wait_again",
        [qstat_pv_wait_early]      = "pv_wait_early",
@@ -279,19 +282,6 @@ static inline void __pv_wait(u8 *ptr, u8 val)
 #define pv_kick(c)     __pv_kick(c)
 #define pv_wait(p, v)  __pv_wait(p, v)
 
-/*
- * PV unfair trylock count tracking function
- */
-static inline int qstat_spin_steal_lock(struct qspinlock *lock)
-{
-       int ret = pv_queued_spin_steal_lock(lock);
-
-       qstat_inc(qstat_pv_lock_stealing, ret);
-       return ret;
-}
-#undef  queued_spin_trylock
-#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
-
 #else /* CONFIG_QUEUED_LOCK_STAT */
 
 static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
index b981a7b023f04c356df5b852f60caeae232fdf45..fb9b88787ebc269fa39a3c08a19c8d0940245bbe 100644 (file)
@@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
 
 static void *try_ram_remap(resource_size_t offset, size_t size)
 {
-       struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+       unsigned long pfn = PHYS_PFN(offset);
 
        /* In the simple case just return the existing linear address */
-       if (!PageHighMem(page))
+       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
                return __va(offset);
        return NULL; /* fallback to ioremap_cache */
 }
@@ -47,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * being mapped does not have i/o side effects and the __iomem
  * annotation is not applicable.
  *
- * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * MEMREMAP_WB - matches the default mapping for System RAM on
  * the architecture.  This is usually a read-allocate write-back cache.
  * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
  * memremap() will bypass establishing a new mapping and instead return
@@ -56,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * MEMREMAP_WT - establish a mapping whereby writes either bypass the
  * cache or are written through to memory and never exist in a
  * cache-dirty state with respect to program visibility.  Attempts to
- * map "System RAM" with this mapping type will fail.
+ * map System RAM with this mapping type will fail.
  */
 void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 {
-       int is_ram = region_intersects(offset, size, "System RAM");
+       int is_ram = region_intersects(offset, size,
+                                      IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
        void *addr = NULL;
 
        if (is_ram == REGION_MIXED) {
@@ -76,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
                 * MEMREMAP_WB is special in that it can be satisfied
                 * from the direct map.  Some archs depend on the
                 * capability of memremap() to autodetect cases where
-                * the requested range is potentially in "System RAM"
+                * the requested range is potentially in System RAM.
                 */
                if (is_ram == REGION_INTERSECTS)
                        addr = try_ram_remap(offset, size);
@@ -88,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
         * If we don't have a mapping yet and more request flags are
         * pending then we will be attempting to establish a new virtual
         * address mapping.  Enforce that this mapping is not aliasing
-        * "System RAM"
+        * System RAM.
         */
        if (!addr && is_ram == REGION_INTERSECTS && flags) {
                WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
@@ -270,13 +271,17 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 void *devm_memremap_pages(struct device *dev, struct resource *res,
                struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
-       int is_ram = region_intersects(res->start, resource_size(res),
-                       "System RAM");
        resource_size_t key, align_start, align_size, align_end;
        struct dev_pagemap *pgmap;
        struct page_map *page_map;
+       int error, nid, is_ram;
        unsigned long pfn;
-       int error, nid;
+
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+               - align_start;
+       is_ram = region_intersects(align_start, align_size,
+               IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
 
        if (is_ram == REGION_MIXED) {
                WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -314,8 +319,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 
        mutex_lock(&pgmap_lock);
        error = 0;
-       align_start = res->start & ~(SECTION_SIZE - 1);
-       align_size = ALIGN(resource_size(res), SECTION_SIZE);
        align_end = align_start + align_size - 1;
        for (key = align_start; key <= align_end; key += SECTION_SIZE) {
                struct dev_pagemap *dup;
@@ -351,8 +354,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
        for_each_device_pfn(pfn, page_map) {
                struct page *page = pfn_to_page(pfn);
 
-               /* ZONE_DEVICE pages must never appear on a slab lru */
-               list_force_poison(&page->lru);
+               /*
+                * ZONE_DEVICE pages union ->lru with a ->pgmap back
+                * pointer.  It is a bug if a ZONE_DEVICE page is ever
+                * freed or placed on a driver-private list.  Seed the
+                * storage with LIST_POISON* values.
+                */
+               list_del(&page->lru);
                page->pgmap = pgmap;
        }
        devres_add(dev, page_map);
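
From a caller's point of view the memremap() semantics documented above are unchanged; only the RAM detection now goes through resource flags and descriptors. A hedged caller-side sketch (the helper name and the physical range are made up):

#include <linux/io.h>

/* Hypothetical driver helper; pair every successful map with memunmap(). */
static void *demo_map_ram_window(resource_size_t phys, size_t len)
{
        /* MEMREMAP_WB may simply hand back the existing linear mapping for RAM. */
        void *va = memremap(phys, len, MEMREMAP_WB);

        if (!va)
                return NULL;    /* caller may fall back to MEMREMAP_WT, etc. */

        return va;
}
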
index 99513e1160e518d322f6d0ce0f346e6da9fcbbf0..51369697466e36ba52af710863376d4847d43e36 100644 (file)
@@ -59,6 +59,7 @@ int profile_setup(char *str)
 
        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
 #ifdef CONFIG_SCHEDSTATS
+               force_schedstat_enabled();
                prof_on = SLEEP_PROFILING;
                if (str[strlen(sleepstr)] == ',')
                        str += strlen(sleepstr) + 1;
index e41dd4131f7a141976e771653e3169f2955f6f33..9fd5b628a88dc0cd1e0790d20ecf212940bd5407 100644 (file)
@@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
        int needmore;
        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
-       rcu_nocb_gp_cleanup(rsp, rnp);
        rnp->need_future_gp[c & 0x1] = 0;
        needmore = rnp->need_future_gp[(c + 1) & 0x1];
        trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
            !READ_ONCE(rsp->gp_flags) ||
            !rsp->gp_kthread)
                return;
-       wake_up(&rsp->gp_wq);
+       swake_up(&rsp->gp_wq);
 }
 
 /*
@@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        int nocb = 0;
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
+       struct swait_queue_head *sq;
 
        WRITE_ONCE(rsp->gp_activity, jiffies);
        raw_spin_lock_irq_rcu_node(rnp);
@@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                        needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
                /* smp_mb() provided by prior unlock-lock pair. */
                nocb += rcu_future_gp_cleanup(rsp, rnp);
+               sq = rcu_nocb_gp_get(rnp);
                raw_spin_unlock_irq(&rnp->lock);
+               rcu_nocb_gp_cleanup(sq);
                cond_resched_rcu_qs();
                WRITE_ONCE(rsp->gp_activity, jiffies);
                rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                               READ_ONCE(rsp->gpnum),
                                               TPS("reqwait"));
                        rsp->gp_state = RCU_GP_WAIT_GPS;
-                       wait_event_interruptible(rsp->gp_wq,
+                       swait_event_interruptible(rsp->gp_wq,
                                                 READ_ONCE(rsp->gp_flags) &
                                                 RCU_GP_FLAG_INIT);
                        rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                               READ_ONCE(rsp->gpnum),
                                               TPS("fqswait"));
                        rsp->gp_state = RCU_GP_WAIT_FQS;
-                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
+                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
                                        rcu_gp_fqs_check_wake(rsp, &gf), j);
                        rsp->gp_state = RCU_GP_DOING_FQS;
                        /* Locking provides needed memory barriers. */
@@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
        WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
        WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
        raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
-       rcu_gp_kthread_wake(rsp);
+       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
 }
 
 /*
@@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
        }
        WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
        raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
-       rcu_gp_kthread_wake(rsp);
+       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
 }
 
 /*
@@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                        if (wake) {
                                smp_mb(); /* EGP done before wake_up(). */
-                               wake_up(&rsp->expedited_wq);
+                               swake_up(&rsp->expedited_wq);
                        }
                        break;
                }
@@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
        jiffies_start = jiffies;
 
        for (;;) {
-               ret = wait_event_interruptible_timeout(
+               ret = swait_event_timeout(
                                rsp->expedited_wq,
                                sync_rcu_preempt_exp_done(rnp_root),
                                jiffies_stall);
@@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
                        return;
                if (ret < 0) {
                        /* Hit a signal, disable CPU stall warnings. */
-                       wait_event(rsp->expedited_wq,
+                       swait_event(rsp->expedited_wq,
                                   sync_rcu_preempt_exp_done(rnp_root));
                        return;
                }
@@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                }
        }
 
-       init_waitqueue_head(&rsp->gp_wq);
-       init_waitqueue_head(&rsp->expedited_wq);
+       init_swait_queue_head(&rsp->gp_wq);
+       init_swait_queue_head(&rsp->expedited_wq);
        rnp = rsp->level[rcu_num_lvls - 1];
        for_each_possible_cpu(i) {
                while (i > rnp->grphi)
index 83360b4f4352786b0129d5ac64046102d73933db..bbd235d0e71f7144b3548c75e33db3ddc97f84a6 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
+#include <linux/swait.h>
 #include <linux/stop_machine.h>
 
 /*
@@ -243,7 +244,7 @@ struct rcu_node {
                                /* Refused to boost: not sure why, though. */
                                /*  This can happen due to race conditions. */
 #ifdef CONFIG_RCU_NOCB_CPU
-       wait_queue_head_t nocb_gp_wq[2];
+       struct swait_queue_head nocb_gp_wq[2];
                                /* Place for rcu_nocb_kthread() to wait GP. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
        int need_future_gp[2];
@@ -399,7 +400,7 @@ struct rcu_data {
        atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
        struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
        struct rcu_head **nocb_follower_tail;
-       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
+       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
        struct task_struct *nocb_kthread;
        int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
 
@@ -478,7 +479,7 @@ struct rcu_state {
        unsigned long gpnum;                    /* Current gp number. */
        unsigned long completed;                /* # of last completed gp. */
        struct task_struct *gp_kthread;         /* Task for grace periods. */
-       wait_queue_head_t gp_wq;                /* Where GP task waits. */
+       struct swait_queue_head gp_wq;          /* Where GP task waits. */
        short gp_flags;                         /* Commands for GP task. */
        short gp_state;                         /* GP kthread sleep state. */
 
@@ -506,7 +507,7 @@ struct rcu_state {
        unsigned long expedited_sequence;       /* Take a ticket. */
        atomic_long_t expedited_normal;         /* # fallbacks to normal. */
        atomic_t expedited_need_qs;             /* # CPUs left to check in. */
-       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
+       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
        int ncpus_snap;                         /* # CPUs seen last time. */
 
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
@@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static void increment_cpu_stall_ticks(void);
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
                            bool lazy, unsigned long flags);
index 9467a8b7e756173bc9a94cc5b9828066a9f50e16..080bd202d360faa62f1a5fc63654239b5cd80144 100644 (file)
@@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  * grace period.
  */
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
-       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+       swake_up_all(sq);
 }
 
 /*
@@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
        rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
 }
 
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
 static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
-       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
-       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
        if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
                /* Prior smp_mb__after_atomic() orders against prior enqueue. */
                WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
-               wake_up(&rdp_leader->nocb_wq);
+               swake_up(&rdp_leader->nocb_wq);
        }
 }
 
@@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
         */
        trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
        for (;;) {
-               wait_event_interruptible(
+               swait_event_interruptible(
                        rnp->nocb_gp_wq[c & 0x1],
                        (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
                if (likely(d))
@@ -2097,7 +2102,7 @@ wait_again:
        /* Wait for callbacks to appear. */
        if (!rcu_nocb_poll) {
                trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
-               wait_event_interruptible(my_rdp->nocb_wq,
+               swait_event_interruptible(my_rdp->nocb_wq,
                                !READ_ONCE(my_rdp->nocb_leader_sleep));
                /* Memory barrier handled by smp_mb() calls below and repoll. */
        } else if (firsttime) {
@@ -2172,7 +2177,7 @@ wait_again:
                         * List was empty, wake up the follower.
                         * Memory barriers supplied by atomic_long_add().
                         */
-                       wake_up(&rdp->nocb_wq);
+                       swake_up(&rdp->nocb_wq);
                }
        }
 
@@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
                if (!rcu_nocb_poll) {
                        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                            "FollowerSleep");
-                       wait_event_interruptible(rdp->nocb_wq,
+                       swait_event_interruptible(rdp->nocb_wq,
                                                 READ_ONCE(rdp->nocb_follower_head));
                } else if (firsttime) {
                        /* Don't drown trace log with "Poll"! */
@@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void)
 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
        rdp->nocb_tail = &rdp->nocb_head;
-       init_waitqueue_head(&rdp->nocb_wq);
+       init_swait_queue_head(&rdp->nocb_wq);
        rdp->nocb_follower_tail = &rdp->nocb_follower_head;
 }
 
@@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
        return false;
 }
 
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
 }
 
@@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
 {
 }
 
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+       return NULL;
+}
+
 static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
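
The conversion above replaces the full wait_queue_head_t machinery with simple wait queues, which have a smaller footprint and can be woken while holding raw spinlocks, which matters for RT. A minimal sketch of the swait pattern the RCU code now follows (the demo_* names are made up):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_cond;

/* Waiter side: sleep until the condition becomes true (or a signal arrives). */
static void demo_waiter(void)
{
        swait_event_interruptible(demo_wq, READ_ONCE(demo_cond));
}

/* Waker side: publish the condition, then wake one waiter. */
static void demo_waker(void)
{
        WRITE_ONCE(demo_cond, true);
        swake_up(&demo_wq);
}
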
index 3669d1bfc4254213e0e42dacab374624d74cdd5c..4d466052426b3e993e7e3c3550e948308e377c51 100644 (file)
@@ -333,13 +333,13 @@ int release_resource(struct resource *old)
 EXPORT_SYMBOL(release_resource);
 
 /*
- * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
- * the caller must specify res->start, res->end, res->flags and "name".
- * If found, returns 0, res is overwritten, if not found, returns -1.
- * This walks through whole tree and not just first level children
- * until and unless first_level_children_only is true.
+ * Finds the lowest iomem resource existing within [res->start.res->end).
+ * The caller must specify res->start, res->end, res->flags, and optionally
+ * desc.  If found, returns 0, res is overwritten, if not found, returns -1.
+ * This function walks the whole tree and not just first level children until
+ * and unless first_level_children_only is true.
  */
-static int find_next_iomem_res(struct resource *res, char *name,
+static int find_next_iomem_res(struct resource *res, unsigned long desc,
                               bool first_level_children_only)
 {
        resource_size_t start, end;
@@ -358,9 +358,9 @@ static int find_next_iomem_res(struct resource *res, char *name,
        read_lock(&resource_lock);
 
        for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
-               if (p->flags != res->flags)
+               if ((p->flags & res->flags) != res->flags)
                        continue;
-               if (name && strcmp(p->name, name))
+               if ((desc != IORES_DESC_NONE) && (desc != p->desc))
                        continue;
                if (p->start > end) {
                        p = NULL;
@@ -385,15 +385,18 @@ static int find_next_iomem_res(struct resource *res, char *name,
  * Walks through iomem resources and calls func() with matching resource
  * ranges. This walks through whole tree and not just first level children.
  * All the memory ranges which overlap start,end and also match flags and
- * name are valid candidates.
+ * desc are valid candidates.
  *
- * @name: name of resource
- * @flags: resource flags
+ * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
+ * @flags: I/O resource flags
  * @start: start addr
  * @end: end addr
+ *
+ * NOTE: For a new descriptor search, define a new IORES_DESC in
+ * <linux/ioport.h> and set it in 'desc' of a target resource entry.
  */
-int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
-               void *arg, int (*func)(u64, u64, void *))
+int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
+               u64 end, void *arg, int (*func)(u64, u64, void *))
 {
        struct resource res;
        u64 orig_end;
@@ -403,23 +406,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
        res.end = end;
        res.flags = flags;
        orig_end = res.end;
+
        while ((res.start < res.end) &&
-               (!find_next_iomem_res(&res, name, false))) {
+               (!find_next_iomem_res(&res, desc, false))) {
+
                ret = (*func)(res.start, res.end, arg);
                if (ret)
                        break;
+
                res.start = res.end + 1;
                res.end = orig_end;
        }
+
        return ret;
 }
 
 /*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM". This function deals with
- * full ranges and not pfn. If resources are not pfn aligned, dealing
- * with pfn can truncate ranges.
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
+ * Now, this function is only for System RAM; it deals with full ranges and
+ * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
+ * ranges.
  */
 int walk_system_ram_res(u64 start, u64 end, void *arg,
                                int (*func)(u64, u64, void *))
@@ -430,10 +437,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 
        res.start = start;
        res.end = end;
-       res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        orig_end = res.end;
        while ((res.start < res.end) &&
-               (!find_next_iomem_res(&res, "System RAM", true))) {
+               (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
                ret = (*func)(res.start, res.end, arg);
                if (ret)
                        break;
@@ -446,9 +453,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
 
 /*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM".
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
+ * It is to be used only for System RAM.
  */
 int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -460,10 +467,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 
        res.start = (u64) start_pfn << PAGE_SHIFT;
        res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
-       res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        orig_end = res.end;
        while ((res.start < res.end) &&
-               (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
+               (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
                pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
                end_pfn = (res.end + 1) >> PAGE_SHIFT;
                if (end_pfn > pfn)
@@ -484,7 +491,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
 }
 /*
  * This generic page_is_ram() returns true if specified address is
- * registered as "System RAM" in iomem_resource list.
+ * registered as System RAM in iomem_resource list.
  */
 int __weak page_is_ram(unsigned long pfn)
 {
@@ -496,30 +503,34 @@ EXPORT_SYMBOL_GPL(page_is_ram);
  * region_intersects() - determine intersection of region with known resources
  * @start: region start address
  * @size: size of region
- * @name: name of resource (in iomem_resource)
+ * @flags: flags of resource (in iomem_resource)
+ * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
  *
  * Check if the specified region partially overlaps or fully eclipses a
- * resource identified by @name.  Return REGION_DISJOINT if the region
- * does not overlap @name, return REGION_MIXED if the region overlaps
- * @type and another resource, and return REGION_INTERSECTS if the
- * region overlaps @type and no other defined resource. Note, that
- * REGION_INTERSECTS is also returned in the case when the specified
- * region overlaps RAM and undefined memory holes.
+ * resource identified by @flags and @desc (optional with IORES_DESC_NONE).
+ * Return REGION_DISJOINT if the region does not overlap @flags/@desc,
+ * return REGION_MIXED if the region overlaps @flags/@desc and another
+ * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
+ * and no other defined resource. Note that REGION_INTERSECTS is also
+ * returned in the case when the specified region overlaps RAM and undefined
+ * memory holes.
  *
  * region_intersects() is used by memory remapping functions to ensure
  * the user is not remapping RAM; it is a vast speedup over walking
  * through the resource table page by page.
  */
-int region_intersects(resource_size_t start, size_t size, const char *name)
+int region_intersects(resource_size_t start, size_t size, unsigned long flags,
+                     unsigned long desc)
 {
-       unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
        resource_size_t end = start + size - 1;
        int type = 0; int other = 0;
        struct resource *p;
 
        read_lock(&resource_lock);
        for (p = iomem_resource.child; p ; p = p->sibling) {
-               bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+               bool is_type = (((p->flags & flags) == flags) &&
+                               ((desc == IORES_DESC_NONE) ||
+                                (desc == p->desc)));
 
                if (start >= p->start && start <= p->end)
                        is_type ? type++ : other++;
@@ -538,6 +549,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name)
 
        return REGION_DISJOINT;
 }
+EXPORT_SYMBOL_GPL(region_intersects);
 
 void __weak arch_remove_reservations(struct resource *avail)
 {
@@ -948,6 +960,7 @@ static void __init __reserve_region_with_split(struct resource *root,
        res->start = start;
        res->end = end;
        res->flags = IORESOURCE_BUSY;
+       res->desc = IORES_DESC_NONE;
 
        while (1) {
 
@@ -982,6 +995,7 @@ static void __init __reserve_region_with_split(struct resource *root,
                                next_res->start = conflict->end + 1;
                                next_res->end = end;
                                next_res->flags = IORESOURCE_BUSY;
+                               next_res->desc = IORES_DESC_NONE;
                        }
                } else {
                        res->start = conflict->end + 1;
@@ -1071,8 +1085,9 @@ struct resource * __request_region(struct resource *parent,
        res->name = name;
        res->start = start;
        res->end = start + n - 1;
-       res->flags = resource_type(parent);
+       res->flags = resource_type(parent) | resource_ext_type(parent);
        res->flags |= IORESOURCE_BUSY | flags;
+       res->desc = IORES_DESC_NONE;
 
        write_lock(&resource_lock);
 
@@ -1238,6 +1253,7 @@ int release_mem_region_adjustable(struct resource *parent,
                        new_res->start = end + 1;
                        new_res->end = res->end;
                        new_res->flags = res->flags;
+                       new_res->desc = res->desc;
                        new_res->parent = res->parent;
                        new_res->sibling = res->sibling;
                        new_res->child = NULL;
@@ -1413,6 +1429,7 @@ static int __init reserve_setup(char *str)
                        res->start = io_start;
                        res->end = io_start + io_num - 1;
                        res->flags = IORESOURCE_BUSY;
+                       res->desc = IORES_DESC_NONE;
                        res->child = NULL;
                        if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
                                reserved = x+1;
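
With the name-string matching gone, callers now identify resources by IORESOURCE_* flags plus an IORES_DESC_* descriptor, or IORES_DESC_NONE to match any descriptor. A hedged sketch of a System RAM check under the new signature (the helper name is made up, and the header providing the region_intersects() prototype is assumed to be visible):

#include <linux/ioport.h>

static bool demo_range_is_system_ram(resource_size_t start, size_t size)
{
        return region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
                                 IORES_DESC_NONE) == REGION_INTERSECTS;
}
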
index 67687973ce80d63d3f52698fb4b738b76964b896..7d4cba227cbd393c3515516926bfeb035b222e0c 100644 (file)
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
index bc54e84675da0d50bd9a60a82f2da5df8590b080..fedb967a98419c14348e8d79becdcb7abf62bfa2 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/static_key.h>
 #include <linux/workqueue.h>
 #include <linux/compiler.h>
+#include <linux/tick.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void)
 {
        if (!sched_clock_stable())
                static_key_slow_inc(&__sched_clock_stable);
+
+       tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
 void set_sched_clock_stable(void)
@@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work)
        /* XXX worry about clock continuity */
        if (sched_clock_stable())
                static_key_slow_dec(&__sched_clock_stable);
+
+       tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
 static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
index 9503d590e5ef5b81537947b9925ecfe689f72a91..e5725b931bee953fb8bcf0e0ae74dea38c404537 100644 (file)
@@ -26,6 +26,7 @@
  *              Thomas Gleixner, Mike Kravetz
  */
 
+#include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
-#include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 
@@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
 
 #undef SCHED_FEAT
 
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled)      \
-       #name ,
-
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
-       int i;
-
-       for (i = 0; i < __SCHED_FEAT_NR; i++) {
-               if (!(sysctl_sched_features & (1UL << i)))
-                       seq_puts(m, "NO_");
-               seq_printf(m, "%s ", sched_feat_names[i]);
-       }
-       seq_puts(m, "\n");
-
-       return 0;
-}
-
-#ifdef HAVE_JUMP_LABEL
-
-#define jump_label_key__true  STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
-
-#define SCHED_FEAT(name, enabled)      \
-       jump_label_key__##enabled ,
-
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static void sched_feat_disable(int i)
-{
-       static_key_disable(&sched_feat_keys[i]);
-}
-
-static void sched_feat_enable(int i)
-{
-       static_key_enable(&sched_feat_keys[i]);
-}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-
-static int sched_feat_set(char *cmp)
-{
-       int i;
-       int neg = 0;
-
-       if (strncmp(cmp, "NO_", 3) == 0) {
-               neg = 1;
-               cmp += 3;
-       }
-
-       for (i = 0; i < __SCHED_FEAT_NR; i++) {
-               if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                       if (neg) {
-                               sysctl_sched_features &= ~(1UL << i);
-                               sched_feat_disable(i);
-                       } else {
-                               sysctl_sched_features |= (1UL << i);
-                               sched_feat_enable(i);
-                       }
-                       break;
-               }
-       }
-
-       return i;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
-               size_t cnt, loff_t *ppos)
-{
-       char buf[64];
-       char *cmp;
-       int i;
-       struct inode *inode;
-
-       if (cnt > 63)
-               cnt = 63;
-
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-
-       buf[cnt] = 0;
-       cmp = strstrip(buf);
-
-       /* Ensure the static_key remains in a consistent state */
-       inode = file_inode(filp);
-       inode_lock(inode);
-       i = sched_feat_set(cmp);
-       inode_unlock(inode);
-       if (i == __SCHED_FEAT_NR)
-               return -EINVAL;
-
-       *ppos += cnt;
-
-       return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-       return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
-       .open           = sched_feat_open,
-       .write          = sched_feat_write,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static __init int sched_init_debug(void)
-{
-       debugfs_create_file("sched_features", 0644, NULL, NULL,
-                       &sched_feat_fops);
-
-       return 0;
-}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
@@ -453,20 +320,6 @@ static inline void init_hrtick(void)
 }
 #endif /* CONFIG_SCHED_HRTICK */
 
-/*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
-#define fetch_or(ptr, val)                                             \
-({     typeof(*(ptr)) __old, __val = *(ptr);                           \
-       for (;;) {                                                      \
-               __old = cmpxchg((ptr), __val, __val | (val));           \
-               if (__old == __val)                                     \
-                       break;                                          \
-               __val = __old;                                          \
-       }                                                               \
-       __old;                                                          \
-})
-
 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 /*
  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -715,31 +568,36 @@ static inline bool got_nohz_idle_kick(void)
 #endif /* CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
-bool sched_can_stop_tick(void)
+bool sched_can_stop_tick(struct rq *rq)
 {
+       int fifo_nr_running;
+
+       /* Deadline tasks, even if single, need the tick */
+       if (rq->dl.dl_nr_running)
+               return false;
+
        /*
-        * FIFO realtime policy runs the highest priority task. Other runnable
-        * tasks are of a lower priority. The scheduler tick does nothing.
+        * FIFO realtime policy runs the highest priority task (after DEADLINE).
+        * Other runnable tasks are of a lower priority. The scheduler tick
+        * isn't needed.
         */
-       if (current->policy == SCHED_FIFO)
+       fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+       if (fifo_nr_running)
                return true;
 
        /*
         * Round-robin realtime tasks time slice with other tasks at the same
-        * realtime priority. Is this task the only one at this priority?
+        * realtime priority.
         */
-       if (current->policy == SCHED_RR) {
-               struct sched_rt_entity *rt_se = &current->rt;
-
-               return list_is_singular(&rt_se->run_list);
+       if (rq->rt.rr_nr_running) {
+               if (rq->rt.rr_nr_running == 1)
+                       return true;
+               else
+                       return false;
        }
 
-       /*
-        * More than one running task need preemption.
-        * nr_running update is assumed to be visible
-        * after IPI is sent from wakers.
-        */
-       if (this_rq()->nr_running > 1)
+       /* Normal multitasking needs periodic preemption checks */
+       if (rq->cfs.nr_running > 1)
                return false;
 
        return true;
@@ -2093,7 +1951,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        ttwu_queue(p, cpu);
 stat:
-       ttwu_stat(p, cpu, wake_flags);
+       if (schedstat_enabled())
+               ttwu_stat(p, cpu, wake_flags);
 out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -2141,7 +2000,8 @@ static void try_to_wake_up_local(struct task_struct *p)
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
        ttwu_do_wakeup(rq, p, 0);
-       ttwu_stat(p, smp_processor_id(), 0);
+       if (schedstat_enabled())
+               ttwu_stat(p, smp_processor_id(), 0);
 out:
        raw_spin_unlock(&p->pi_lock);
 }
@@ -2183,7 +2043,6 @@ void __dl_clear_params(struct task_struct *p)
        dl_se->dl_bw = 0;
 
        dl_se->dl_throttled = 0;
-       dl_se->dl_new = 1;
        dl_se->dl_yielded = 0;
 }
 
@@ -2210,6 +2069,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
+       /* Even if schedstat is disabled, there should not be garbage */
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
@@ -2218,6 +2078,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        __dl_clear_params(p);
 
        INIT_LIST_HEAD(&p->rt.run_list);
+       p->rt.timeout           = 0;
+       p->rt.time_slice        = sched_rr_timeslice;
+       p->rt.on_rq             = 0;
+       p->rt.on_list           = 0;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2281,6 +2145,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #endif
 #endif
 
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+       if (enabled)
+               static_branch_enable(&sched_schedstats);
+       else
+               static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+       if (!schedstat_enabled()) {
+               pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+               static_branch_enable(&sched_schedstats);
+       }
+}
+
+static int __init setup_schedstats(char *str)
+{
+       int ret = 0;
+       if (!str)
+               goto out;
+
+       if (!strcmp(str, "enable")) {
+               set_schedstats(true);
+               ret = 1;
+       } else if (!strcmp(str, "disable")) {
+               set_schedstats(false);
+               ret = 1;
+       }
+out:
+       if (!ret)
+               pr_warn("Unable to parse schedstats=\n");
+
+       return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int err;
+       int state = static_branch_likely(&sched_schedstats);
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       t = *table;
+       t.data = &state;
+       err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+       if (write)
+               set_schedstats(state);
+       return err;
+}
+#endif
+#endif
+
 /*
  * fork()/clone()-time setup:
  */
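
The sched_schedstats static key introduced above keeps schedstat accounting compiled in but disabled by default; it is flipped by the schedstats= boot option, the kernel.sched_schedstats sysctl handler, or force_schedstat_enabled(). A small sketch of the intended usage contract for code that produces or depends on the stats (the demo_* wrappers are made up; ttwu_stat() is the helper used elsewhere in this file):

/* Producer side: guard the expensive accounting with the static branch. */
static void demo_account_wakeup(struct task_struct *p, int cpu, int wake_flags)
{
        if (schedstat_enabled())
                ttwu_stat(p, cpu, wake_flags);
}

/* Consumer side: a feature that cannot work without the stats turns them on. */
static void demo_feature_needs_stats(void)
{
        force_schedstat_enabled();
}
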
@@ -3010,16 +2937,6 @@ u64 scheduler_tick_max_deferment(void)
 }
 #endif
 
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-       if (in_lock_functions(addr)) {
-               addr = CALLER_ADDR2;
-               if (in_lock_functions(addr))
-                       addr = CALLER_ADDR3;
-       }
-       return addr;
-}
-
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_PREEMPT_TRACER))
 
@@ -3041,7 +2958,7 @@ void preempt_count_add(int val)
                                PREEMPT_MASK - 10);
 #endif
        if (preempt_count() == val) {
-               unsigned long ip = get_parent_ip(CALLER_ADDR1);
+               unsigned long ip = get_lock_parent_ip();
 #ifdef CONFIG_DEBUG_PREEMPT
                current->preempt_disable_ip = ip;
 #endif
@@ -3068,7 +2985,7 @@ void preempt_count_sub(int val)
 #endif
 
        if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
        __preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
@@ -3280,7 +3197,6 @@ static void __sched notrace __schedule(bool preempt)
 
                trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
-               cpu = cpu_of(rq);
        } else {
                lockdep_unpin_lock(&rq->lock);
                raw_spin_unlock_irq(&rq->lock);
@@ -3466,7 +3382,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
        struct rq *rq;
        const struct sched_class *prev_class;
 
@@ -3494,11 +3410,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
+
+       if (oldprio == prio)
+               queue_flag &= ~DEQUEUE_MOVE;
+
        prev_class = p->sched_class;
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flag);
        if (running)
                put_prev_task(rq, p);
 
@@ -3516,7 +3436,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag |= ENQUEUE_REPLENISH;
+                       queue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
@@ -3524,7 +3444,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag |= ENQUEUE_HEAD;
+                       queue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@ -3539,7 +3459,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, enqueue_flag);
+               enqueue_task(rq, p, queue_flag);
 
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -3895,6 +3815,7 @@ static int __sched_setscheduler(struct task_struct *p,
        const struct sched_class *prev_class;
        struct rq *rq;
        int reset_on_fork;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
@@ -4077,17 +3998,14 @@ change:
                 * itself.
                 */
                new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-               if (new_effective_prio == oldprio) {
-                       __setscheduler_params(p, attr);
-                       task_rq_unlock(rq, p, &flags);
-                       return 0;
-               }
+               if (new_effective_prio == oldprio)
+                       queue_flags &= ~DEQUEUE_MOVE;
        }
 
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flags);
        if (running)
                put_prev_task(rq, p);
 
@@ -4097,15 +4015,14 @@ change:
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
-               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               if (oldprio <= p->prio)
-                       enqueue_flags |= ENQUEUE_HEAD;
+               if (oldprio < p->prio)
+                       queue_flags |= ENQUEUE_HEAD;
 
-               enqueue_task(rq, p, enqueue_flags);
+               enqueue_task(rq, p, queue_flags);
        }
 
        check_class_changed(rq, p, prev_class, oldprio);
@@ -5096,6 +5013,8 @@ void init_idle(struct task_struct *idle, int cpu)
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
+       kasan_unpoison_task_stack(idle);
+
 #ifdef CONFIG_SMP
        /*
         * Its possible that init_idle() gets called multiple times on a task,
@@ -5405,183 +5324,6 @@ static void migrate_tasks(struct rq *dead_rq)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
-       {
-               .procname       = "sched_domain",
-               .mode           = 0555,
-       },
-       {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-       {
-               .procname       = "kernel",
-               .mode           = 0555,
-               .child          = sd_ctl_dir,
-       },
-       {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-       struct ctl_table *entry =
-               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-       return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-       struct ctl_table *entry;
-
-       /*
-        * In the intermediate directories, both the child directory and
-        * procname are dynamically allocated and could fail but the mode
-        * will always be set. In the lowest directory the names are
-        * static strings and all have proc handlers.
-        */
-       for (entry = *tablep; entry->mode; entry++) {
-               if (entry->child)
-                       sd_free_ctl_entry(&entry->child);
-               if (entry->proc_handler == NULL)
-                       kfree(entry->procname);
-       }
-
-       kfree(*tablep);
-       *tablep = NULL;
-}
-
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
-               const char *procname, void *data, int maxlen,
-               umode_t mode, proc_handler *proc_handler,
-               bool load_idx)
-{
-       entry->procname = procname;
-       entry->data = data;
-       entry->maxlen = maxlen;
-       entry->mode = mode;
-       entry->proc_handler = proc_handler;
-
-       if (load_idx) {
-               entry->extra1 = &min_load_idx;
-               entry->extra2 = &max_load_idx;
-       }
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
-       struct ctl_table *table = sd_alloc_ctl_entry(14);
-
-       if (table == NULL)
-               return NULL;
-
-       set_table_entry(&table[0], "min_interval", &sd->min_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[1], "max_interval", &sd->max_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[9], "cache_nice_tries",
-               &sd->cache_nice_tries,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[10], "flags", &sd->flags,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[11], "max_newidle_lb_cost",
-               &sd->max_newidle_lb_cost,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[12], "name", sd->name,
-               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[13] is terminator */
-
-       return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
-       struct ctl_table *entry, *table;
-       struct sched_domain *sd;
-       int domain_num = 0, i;
-       char buf[32];
-
-       for_each_domain(cpu, sd)
-               domain_num++;
-       entry = table = sd_alloc_ctl_entry(domain_num + 1);
-       if (table == NULL)
-               return NULL;
-
-       i = 0;
-       for_each_domain(cpu, sd) {
-               snprintf(buf, 32, "domain%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_domain_table(sd);
-               entry++;
-               i++;
-       }
-       return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
-       int i, cpu_num = num_possible_cpus();
-       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
-       char buf[32];
-
-       WARN_ON(sd_ctl_dir[0].child);
-       sd_ctl_dir[0].child = entry;
-
-       if (entry == NULL)
-               return;
-
-       for_each_possible_cpu(i) {
-               snprintf(buf, 32, "cpu%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_cpu_table(i);
-               entry++;
-       }
-
-       WARN_ON(sd_sysctl_header);
-       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
-       unregister_sysctl_table(sd_sysctl_header);
-       sd_sysctl_header = NULL;
-       if (sd_ctl_dir[0].child)
-               sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
-
 static void set_rq_online(struct rq *rq)
 {
        if (!rq->online) {
@@ -6173,11 +5915,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
+       int ret;
+
        alloc_bootmem_cpumask_var(&cpu_isolated_map);
-       cpulist_parse(str, cpu_isolated_map);
+       ret = cpulist_parse(str, cpu_isolated_map);
+       if (ret) {
+               pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+               return 0;
+       }
        return 1;
 }
-
 __setup("isolcpus=", isolated_cpu_setup);
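
The return value of cpulist_parse() is now checked, so a malformed or out-of-range isolcpus= string produces an error message instead of being silently accepted. The shape of that validation, as a small self-contained userspace sketch (NR_CPU_IDS, the parser and the test strings are invented for the illustration; the kernel itself simply relies on cpulist_parse() reporting the error):

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        #define NR_CPU_IDS 8    /* hypothetical; the kernel checks against nr_cpu_ids */

        /* Minimal cpulist parser: accepts "a,b-c,...", rejects out-of-range CPUs. */
        static int parse_cpulist(const char *str, unsigned long *mask)
        {
                char buf[64], *tok, *save;

                *mask = 0;
                strncpy(buf, str, sizeof(buf) - 1);
                buf[sizeof(buf) - 1] = '\0';

                for (tok = strtok_r(buf, ",", &save); tok;
                     tok = strtok_r(NULL, ",", &save)) {
                        char *end;
                        long lo, hi;

                        lo = hi = strtol(tok, &end, 10);
                        if (*end == '-')
                                hi = strtol(end + 1, &end, 10);
                        if (*end || lo < 0 || hi < lo || hi >= NR_CPU_IDS)
                                return -1;      /* same effect as cpulist_parse() failing */
                        for (long cpu = lo; cpu <= hi; cpu++)
                                *mask |= 1UL << cpu;
                }
                return 0;
        }

        int main(void)
        {
                unsigned long mask;

                printf("1-3:   %s\n", parse_cpulist("1-3", &mask) ? "rejected" : "ok");
                printf("1-999: %s\n", parse_cpulist("1-999", &mask) ? "rejected" : "ok");
                return 0;
        }
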
 
 struct s_data {
@@ -7860,11 +7607,9 @@ void sched_destroy_group(struct task_group *tg)
 void sched_offline_group(struct task_group *tg)
 {
        unsigned long flags;
-       int i;
 
        /* end participation in shares distribution */
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
+       unregister_fair_sched_group(tg);
 
        spin_lock_irqsave(&task_group_lock, flags);
        list_del_rcu(&tg->list);
@@ -7890,7 +7635,7 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
 
@@ -7914,7 +7659,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
 
        task_rq_unlock(rq, tsk, &flags);
 }
index b2ab2ffb1adc021de289d3e3c857de60ad2428b6..75f98c5498d55d38b0a36bba2719907742166fe1 100644 (file)
@@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal;
-               cputime_t steal_ct;
+               unsigned long steal_jiffies;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
                /*
-                * cputime_t may be less precise than nsecs (eg: if it's
-                * based on jiffies). Lets cast the result to cputime
+                * steal is in nsecs but our caller is expecting steal
+                * time in jiffies. Lets cast the result to jiffies
                 * granularity and account the rest on the next rounds.
                 */
-               steal_ct = nsecs_to_cputime(steal);
-               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+               steal_jiffies = nsecs_to_jiffies(steal);
+               this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
 
-               account_steal_time(steal_ct);
-               return steal_ct;
+               account_steal_time(jiffies_to_cputime(steal_jiffies));
+               return steal_jiffies;
        }
 #endif
        return false;
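
The conversion above deliberately truncates the stolen time to whole jiffies and advances prev_steal_time only by the nanoseconds those jiffies represent, so the sub-jiffy remainder is carried into the next tick rather than lost. A small self-contained illustration of that carry (plain userspace C; the 4 ms jiffy and the 5.5 ms steal increments are made up for the example):

        #include <stdio.h>

        #define NSEC_PER_JIFFY 4000000ULL       /* hypothetical HZ=250 */

        int main(void)
        {
                unsigned long long prev_steal_time = 0, steal_clock = 0;

                /* Three ticks, each observing 5.5 ms of newly stolen time. */
                for (int tick = 0; tick < 3; tick++) {
                        steal_clock += 5500000ULL;

                        unsigned long long steal = steal_clock - prev_steal_time;
                        unsigned long long steal_jiffies = steal / NSEC_PER_JIFFY;

                        /* Account whole jiffies; the remainder waits for the next round. */
                        prev_steal_time += steal_jiffies * NSEC_PER_JIFFY;
                        printf("tick %d: accounted %llu jiffies, %llu ns carried\n",
                               tick, steal_jiffies,
                               steal - steal_jiffies * NSEC_PER_JIFFY);
                }
                return 0;
        }
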
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
 {
-       unsigned long long clock;
+       unsigned long now = READ_ONCE(jiffies);
 
-       clock = local_clock();
-       if (clock < tsk->vtime_snap)
+       if (time_before(now, (unsigned long)tsk->vtime_snap))
                return 0;
 
-       return clock - tsk->vtime_snap;
+       return jiffies_to_cputime(now - tsk->vtime_snap);
 }
 
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
-       unsigned long long delta = vtime_delta(tsk);
+       unsigned long now = READ_ONCE(jiffies);
+       unsigned long delta = now - tsk->vtime_snap;
 
        WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-       tsk->vtime_snap += delta;
+       tsk->vtime_snap = now;
 
-       /* CHECKME: always safe to convert nsecs to cputime? */
-       return nsecs_to_cputime(delta);
+       return jiffies_to_cputime(delta);
 }
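
vtime_delta() now works on jiffies and compares them with time_before() instead of a plain '<', which stays correct when the jiffies counter wraps around. A standalone illustration of that wraparound-safe comparison idiom (the macro below mirrors the usual kernel definition, reproduced only so the example compiles on its own):

        #include <stdio.h>

        /* Wraparound-safe "a is earlier than b", as the kernel's time_before() does. */
        #define time_before(a, b)       ((long)((a) - (b)) < 0)

        int main(void)
        {
                unsigned long snap = (unsigned long)-2; /* just before the counter wraps */
                unsigned long now  = 3;                 /* just after the wrap */

                printf("naive now < snap:       %d\n", now < snap);              /* 1: wrong  */
                printf("time_before(now, snap): %d\n", time_before(now, snap));  /* 0: right */
                return 0;
        }
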
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
+       if (!vtime_delta(tsk))
+               return;
+
        write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
        write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
        write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
        if (context_tracking_in_user())
                tsk->vtime_snap_whence = VTIME_USER;
        write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
        cputime_t delta_cpu;
 
        write_seqcount_begin(&tsk->vtime_seqcount);
-       delta_cpu = get_vtime_delta(tsk);
        tsk->vtime_snap_whence = VTIME_SYS;
-       account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+       if (vtime_delta(tsk)) {
+               delta_cpu = get_vtime_delta(tsk);
+               account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+       }
        write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
        write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
        tsk->vtime_snap_whence = VTIME_USER;
        write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
         * that can thus safely catch up with a tickless delta.
         */
        write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
        current->flags |= PF_VCPU;
        write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
        write_seqcount_begin(&current->vtime_seqcount);
        current->vtime_snap_whence = VTIME_SYS;
-       current->vtime_snap = sched_clock_cpu(smp_processor_id());
+       current->vtime_snap = jiffies;
        write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
        local_irq_save(flags);
        write_seqcount_begin(&t->vtime_seqcount);
        t->vtime_snap_whence = VTIME_SYS;
-       t->vtime_snap = sched_clock_cpu(cpu);
+       t->vtime_snap = jiffies;
        write_seqcount_end(&t->vtime_seqcount);
        local_irq_restore(flags);
 }
index 57b939c81bce5d06fa587df8915f05affbe22b82..c7a036facbe1d311e24a56d470288ecf8d43d0d3 100644 (file)
@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
 
-       WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+       WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+       /*
+        * We are racing with the deadline timer. So, do nothing because
+        * the deadline timer handler will take care of properly recharging
+        * the runtime and postponing the deadline
+        */
+       if (dl_se->dl_throttled)
+               return;
 
        /*
         * We use the regular wall clock time to set deadlines in the
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
         */
        dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
        dl_se->runtime = pi_se->dl_runtime;
-       dl_se->dl_new = 0;
 }
 
 /*
@@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                dl_se->runtime = pi_se->dl_runtime;
        }
 
+       if (dl_se->dl_yielded && dl_se->runtime > 0)
+               dl_se->runtime = 0;
+
        /*
         * We keep moving the deadline away until we get some
         * available runtime for the entity. This ensures correct
@@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
 
-       /*
-        * The arrival of a new instance needs special treatment, i.e.,
-        * the actual scheduling parameters have to be "renewed".
-        */
-       if (dl_se->dl_new) {
-               setup_new_dl_entity(dl_se, pi_se);
-               return;
-       }
-
        if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
            dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
                dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -604,16 +605,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                goto unlock;
        }
 
-       /*
-        * This is possible if switched_from_dl() raced against a running
-        * callback that took the above !dl_task() path and we've since then
-        * switched back into SCHED_DEADLINE.
-        *
-        * There's nothing to do except drop our task reference.
-        */
-       if (dl_se->dl_new)
-               goto unlock;
-
        /*
         * The task might have been boosted by someone else and might be in the
         * boosting/deboosting path, its not throttled.
@@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
         * approach need further study.
         */
        delta_exec = rq_clock_task(rq) - curr->se.exec_start;
-       if (unlikely((s64)delta_exec <= 0))
+       if (unlikely((s64)delta_exec <= 0)) {
+               if (unlikely(dl_se->dl_yielded))
+                       goto throttle;
                return;
+       }
 
        schedstat_set(curr->se.statistics.exec_max,
                      max(curr->se.statistics.exec_max, delta_exec));
@@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
 
        sched_rt_avg_update(rq, delta_exec);
 
-       dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-       if (dl_runtime_exceeded(dl_se)) {
+       dl_se->runtime -= delta_exec;
+
+throttle:
+       if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
                dl_se->dl_throttled = 1;
                __dequeue_task_dl(rq, curr, 0);
                if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
         * parameters of the task might need updating. Otherwise,
         * we want a replenishment of its runtime.
         */
-       if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+       if (flags & ENQUEUE_WAKEUP)
                update_dl_entity(dl_se, pi_se);
        else if (flags & ENQUEUE_REPLENISH)
                replenish_dl_entity(dl_se, pi_se);
@@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
  */
 static void yield_task_dl(struct rq *rq)
 {
-       struct task_struct *p = rq->curr;
-
        /*
         * We make the task go to sleep until its current deadline by
         * forcing its runtime to zero. This way, update_curr_dl() stops
         * it and the bandwidth timer will wake it up and will give it
         * new scheduling parameters (thanks to dl_yielded=1).
         */
-       if (p->dl.runtime > 0) {
-               rq->curr->dl.dl_yielded = 1;
-               p->dl.runtime = 0;
-       }
+       rq->curr->dl.dl_yielded = 1;
+
        update_rq_clock(rq);
        update_curr_dl(rq);
        /*
@@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
+       if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+               setup_new_dl_entity(&p->dl, &p->dl);
+
        if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
                if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
                 */
                resched_curr(rq);
 #endif /* CONFIG_SMP */
-       } else
-               switched_to_dl(rq, p);
+       }
 }
 
 const struct sched_class dl_sched_class = {
index 641511771ae6a696271f77532ac9e40e28175749..4fbc3bd5ff6067dfe184295fc262987c912b669e 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
 #include <linux/mempolicy.h>
+#include <linux/debugfs.h>
 
 #include "sched.h"
 
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#define SCHED_FEAT(name, enabled)      \
+       #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+       int i;
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
+       }
+       seq_puts(m, "\n");
+
+       return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled)      \
+       jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+       static_key_disable(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+       static_key_enable(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+       int i;
+       int neg = 0;
+
+       if (strncmp(cmp, "NO_", 3) == 0) {
+               neg = 1;
+               cmp += 3;
+       }
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
+                       if (neg) {
+                               sysctl_sched_features &= ~(1UL << i);
+                               sched_feat_disable(i);
+                       } else {
+                               sysctl_sched_features |= (1UL << i);
+                               sched_feat_enable(i);
+                       }
+                       break;
+               }
+       }
+
+       return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+               size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       char *cmp;
+       int i;
+       struct inode *inode;
+
+       if (cnt > 63)
+               cnt = 63;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+       cmp = strstrip(buf);
+
+       /* Ensure the static_key remains in a consistent state */
+       inode = file_inode(filp);
+       inode_lock(inode);
+       i = sched_feat_set(cmp);
+       inode_unlock(inode);
+       if (i == __SCHED_FEAT_NR)
+               return -EINVAL;
+
+       *ppos += cnt;
+
+       return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+       debugfs_create_file("sched_features", 0644, NULL, NULL,
+                       &sched_feat_fops);
+
+       return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+       {
+               .procname       = "sched_domain",
+               .mode           = 0555,
+       },
+       {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+       {
+               .procname       = "kernel",
+               .mode           = 0555,
+               .child          = sd_ctl_dir,
+       },
+       {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+       struct ctl_table *entry =
+               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+       return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+       struct ctl_table *entry;
+
+       /*
+        * In the intermediate directories, both the child directory and
+        * procname are dynamically allocated and could fail but the mode
+        * will always be set. In the lowest directory the names are
+        * static strings and all have proc handlers.
+        */
+       for (entry = *tablep; entry->mode; entry++) {
+               if (entry->child)
+                       sd_free_ctl_entry(&entry->child);
+               if (entry->proc_handler == NULL)
+                       kfree(entry->procname);
+       }
+
+       kfree(*tablep);
+       *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+               const char *procname, void *data, int maxlen,
+               umode_t mode, proc_handler *proc_handler,
+               bool load_idx)
+{
+       entry->procname = procname;
+       entry->data = data;
+       entry->maxlen = maxlen;
+       entry->mode = mode;
+       entry->proc_handler = proc_handler;
+
+       if (load_idx) {
+               entry->extra1 = &min_load_idx;
+               entry->extra2 = &max_load_idx;
+       }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+       if (table == NULL)
+               return NULL;
+
+       set_table_entry(&table[0], "min_interval", &sd->min_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[1], "max_interval", &sd->max_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[9], "cache_nice_tries",
+               &sd->cache_nice_tries,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[10], "flags", &sd->flags,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[11], "max_newidle_lb_cost",
+               &sd->max_newidle_lb_cost,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[12], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+       /* &table[13] is terminator */
+
+       return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+       struct ctl_table *entry, *table;
+       struct sched_domain *sd;
+       int domain_num = 0, i;
+       char buf[32];
+
+       for_each_domain(cpu, sd)
+               domain_num++;
+       entry = table = sd_alloc_ctl_entry(domain_num + 1);
+       if (table == NULL)
+               return NULL;
+
+       i = 0;
+       for_each_domain(cpu, sd) {
+               snprintf(buf, 32, "domain%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_domain_table(sd);
+               entry++;
+               i++;
+       }
+       return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+       int i, cpu_num = num_possible_cpus();
+       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+       char buf[32];
+
+       WARN_ON(sd_ctl_dir[0].child);
+       sd_ctl_dir[0].child = entry;
+
+       if (entry == NULL)
+               return;
+
+       for_each_possible_cpu(i) {
+               snprintf(buf, 32, "cpu%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_cpu_table(i);
+               entry++;
+       }
+
+       WARN_ON(sd_sysctl_header);
+       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+       unregister_sysctl_table(sd_sysctl_header);
+       sd_sysctl_header = NULL;
+       if (sd_ctl_dir[0].child)
+               sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
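
The sched_feat code moved in above leans on the usual X-macro trick: features.h is included several times with SCHED_FEAT() redefined, once to build the printable names and once to build the static-key/default tables, so the lists stay in step automatically. A toy, self-contained version of the pattern (the feature list, names and defaults are invented for the illustration):

        #include <stdio.h>

        /* Stand-in for #include "features.h": one SCHED_FEAT() entry per feature. */
        #define FEATURE_LIST                            \
                SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)     \
                SCHED_FEAT(START_DEBIT, 1)              \
                SCHED_FEAT(HRTICK, 0)

        /* First expansion: an enum of feature indices. */
        #define SCHED_FEAT(name, enabled) __SCHED_FEAT_##name,
        enum { FEATURE_LIST __SCHED_FEAT_NR };
        #undef SCHED_FEAT

        /* Second expansion: the printable names, in the same order. */
        #define SCHED_FEAT(name, enabled) #name,
        static const char * const sched_feat_names[] = { FEATURE_LIST };
        #undef SCHED_FEAT

        /* Third expansion: the default enabled bits. */
        #define SCHED_FEAT(name, enabled) ((unsigned long)(enabled) << __SCHED_FEAT_##name) |
        static const unsigned long default_features = FEATURE_LIST 0;
        #undef SCHED_FEAT

        int main(void)
        {
                /* Prints the same "FOO NO_BAR ..." format as sched_feat_show(). */
                for (int i = 0; i < __SCHED_FEAT_NR; i++)
                        printf("%s%s ", (default_features & (1UL << i)) ? "" : "NO_",
                               sched_feat_names[i]);
                printf("\n");
                return 0;
        }
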
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-       PN(se->statistics.wait_start);
-       PN(se->statistics.sleep_start);
-       PN(se->statistics.block_start);
-       PN(se->statistics.sleep_max);
-       PN(se->statistics.block_max);
-       PN(se->statistics.exec_max);
-       PN(se->statistics.slice_max);
-       PN(se->statistics.wait_max);
-       PN(se->statistics.wait_sum);
-       P(se->statistics.wait_count);
+       if (schedstat_enabled()) {
+               PN(se->statistics.wait_start);
+               PN(se->statistics.sleep_start);
+               PN(se->statistics.block_start);
+               PN(se->statistics.sleep_max);
+               PN(se->statistics.block_max);
+               PN(se->statistics.exec_max);
+               PN(se->statistics.slice_max);
+               PN(se->statistics.wait_max);
+               PN(se->statistics.wait_sum);
+               P(se->statistics.wait_count);
+       }
 #endif
        P(se->load.weight);
 #ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                (long long)(p->nvcsw + p->nivcsw),
                p->prio);
 #ifdef CONFIG_SCHEDSTATS
-       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(p->se.statistics.wait_sum),
-               SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       if (schedstat_enabled()) {
+               SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+                       SPLIT_NS(p->se.statistics.wait_sum),
+                       SPLIT_NS(p->se.sum_exec_runtime),
+                       SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       }
 #else
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
                0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 
 void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 {
+       struct dl_bw *dl_bw;
+
        SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
        SEQ_printf(m, "  .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+#ifdef CONFIG_SMP
+       dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+       dl_bw = &dl_rq->dl_bw;
+#endif
+       SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+       SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
 }
 
 extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do {                                                                      \
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 #define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
-       P(yld_count);
-
-       P(sched_count);
-       P(sched_goidle);
 #ifdef CONFIG_SMP
        P64(avg_idle);
        P64(max_idle_balance_cost);
 #endif
 
-       P(ttwu_count);
-       P(ttwu_local);
+       if (schedstat_enabled()) {
+               P(yld_count);
+               P(sched_count);
+               P(sched_goidle);
+               P(ttwu_count);
+               P(ttwu_local);
+       }
 
 #undef P
 #undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-       PN(se.statistics.sum_sleep_runtime);
-       PN(se.statistics.wait_start);
-       PN(se.statistics.sleep_start);
-       PN(se.statistics.block_start);
-       PN(se.statistics.sleep_max);
-       PN(se.statistics.block_max);
-       PN(se.statistics.exec_max);
-       PN(se.statistics.slice_max);
-       PN(se.statistics.wait_max);
-       PN(se.statistics.wait_sum);
-       P(se.statistics.wait_count);
-       PN(se.statistics.iowait_sum);
-       P(se.statistics.iowait_count);
        P(se.nr_migrations);
-       P(se.statistics.nr_migrations_cold);
-       P(se.statistics.nr_failed_migrations_affine);
-       P(se.statistics.nr_failed_migrations_running);
-       P(se.statistics.nr_failed_migrations_hot);
-       P(se.statistics.nr_forced_migrations);
-       P(se.statistics.nr_wakeups);
-       P(se.statistics.nr_wakeups_sync);
-       P(se.statistics.nr_wakeups_migrate);
-       P(se.statistics.nr_wakeups_local);
-       P(se.statistics.nr_wakeups_remote);
-       P(se.statistics.nr_wakeups_affine);
-       P(se.statistics.nr_wakeups_affine_attempts);
-       P(se.statistics.nr_wakeups_passive);
-       P(se.statistics.nr_wakeups_idle);
 
-       {
+       if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
 
+               PN(se.statistics.sum_sleep_runtime);
+               PN(se.statistics.wait_start);
+               PN(se.statistics.sleep_start);
+               PN(se.statistics.block_start);
+               PN(se.statistics.sleep_max);
+               PN(se.statistics.block_max);
+               PN(se.statistics.exec_max);
+               PN(se.statistics.slice_max);
+               PN(se.statistics.wait_max);
+               PN(se.statistics.wait_sum);
+               P(se.statistics.wait_count);
+               PN(se.statistics.iowait_sum);
+               P(se.statistics.iowait_count);
+               P(se.statistics.nr_migrations_cold);
+               P(se.statistics.nr_failed_migrations_affine);
+               P(se.statistics.nr_failed_migrations_running);
+               P(se.statistics.nr_failed_migrations_hot);
+               P(se.statistics.nr_forced_migrations);
+               P(se.statistics.nr_wakeups);
+               P(se.statistics.nr_wakeups_sync);
+               P(se.statistics.nr_wakeups_migrate);
+               P(se.statistics.nr_wakeups_local);
+               P(se.statistics.nr_wakeups_remote);
+               P(se.statistics.nr_wakeups_affine);
+               P(se.statistics.nr_wakeups_affine_attempts);
+               P(se.statistics.nr_wakeups_passive);
+               P(se.statistics.nr_wakeups_idle);
+
                avg_atom = p->se.sum_exec_runtime;
                if (nr_switches)
                        avg_atom = div64_ul(avg_atom, nr_switches);
index 56b7d4b839476b6ed1692e786abe9ed6cda64a5f..33130529e9b5628b53cd7031c4914da0de0e7148 100644 (file)
@@ -20,8 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 
-#include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/latencytop.h>
 #include <linux/cpumask.h>
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct task_struct *p;
-       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+       u64 delta;
+
+       delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 
        if (entity_is_task(se)) {
                p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
        se->statistics.wait_sum += delta;
        se->statistics.wait_start = 0;
 }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        /*
         * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
         * Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         */
        if (se != cfs_rq->curr)
                update_stats_wait_end(cfs_rq, se);
+
+       if (flags & DEQUEUE_SLEEP) {
+               if (entity_is_task(se)) {
+                       struct task_struct *tsk = task_of(se);
+
+                       if (tsk->state & TASK_INTERRUPTIBLE)
+                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+                       if (tsk->state & TASK_UNINTERRUPTIBLE)
+                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+               }
+       }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
 }
 
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
 /*
  * We are picking a new current task - update its stats:
  */
@@ -907,10 +932,11 @@ struct numa_group {
        spinlock_t lock; /* nr_tasks, tasks */
        int nr_tasks;
        pid_t gid;
+       int active_nodes;
 
        struct rcu_head rcu;
-       nodemask_t active_nodes;
        unsigned long total_faults;
+       unsigned long max_faults_cpu;
        /*
         * Faults_cpu is used to decide whether memory should move
         * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+       return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
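
With the nodemask gone, "active" becomes a pure counting question: a node belongs to the group's pseudo-interleaving set when it triggers more than a third as many CPU faults as the busiest node. A quick standalone check of that threshold on made-up per-node fault counts:

        #include <stdio.h>

        #define ACTIVE_NODE_FRACTION 3

        int main(void)
        {
                /* Hypothetical per-node NUMA hinting fault counts for one group. */
                unsigned long faults_cpu[] = { 900, 350, 280, 10 };
                unsigned long max_faults = 0;
                int nid, active_nodes = 0;

                for (nid = 0; nid < 4; nid++)
                        if (faults_cpu[nid] > max_faults)
                                max_faults = faults_cpu[nid];

                for (nid = 0; nid < 4; nid++) {
                        /* Same comparison as numa_is_active_node(): faults * 3 > max. */
                        int active = faults_cpu[nid] * ACTIVE_NODE_FRACTION > max_faults;

                        if (active)
                                active_nodes++;
                        printf("node %d: %4lu faults -> %s\n", nid, faults_cpu[nid],
                               active ? "active" : "inactive");
                }
                printf("active_nodes = %d\n", active_nodes);
                return 0;
        }

With these numbers nodes 0 and 1 count as active (2700 and 1050 both exceed 900) while nodes 2 and 3 do not, so active_nodes ends up as 2, the value the scheduler later compares against 1 when deciding whether to search beyond the preferred node.
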
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                                        int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                return true;
 
        /*
-        * Do not migrate if the destination is not a node that
-        * is actively used by this numa group.
+        * Destination node is much more heavily used than the source
+        * node? Allow migration.
         */
-       if (!node_isset(dst_nid, ng->active_nodes))
-               return false;
-
-       /*
-        * Source is a node that is not actively used by this
-        * numa group, while the destination is. Migrate.
-        */
-       if (!node_isset(src_nid, ng->active_nodes))
+       if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+                                       ACTIVE_NODE_FRACTION)
                return true;
 
        /*
-        * Both source and destination are nodes in active
-        * use by this numa group. Maximize memory bandwidth
-        * by migrating from more heavily used groups, to less
-        * heavily used ones, spreading the load around.
-        * Use a 1/4 hysteresis to avoid spurious page movement.
+        * Distribute memory according to CPU & memory use on each node,
+        * with 3/4 hysteresis to avoid unnecessary memory migrations:
+        *
+        * faults_cpu(dst)   3   faults_cpu(src)
+        * --------------- * - > ---------------
+        * faults_mem(dst)   4   faults_mem(src)
         */
-       return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+       return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+              group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
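
The new rule compares the CPU-fault to memory-fault ratio of the two nodes and only migrates when the destination wins by more than the 3/4 hysteresis factor, and it does so by cross-multiplying so the whole test stays in integer arithmetic with no division. A worked example with invented fault counts (variable names simplified relative to group_faults()/group_faults_cpu() in the diff):

        #include <stdio.h>

        int main(void)
        {
                /* Hypothetical fault counts for the source and destination node. */
                unsigned long cpu_dst = 400, mem_dst = 200;     /* CPU/mem ratio 2.00 */
                unsigned long cpu_src = 300, mem_src = 180;     /* CPU/mem ratio 1.67 */

                /*
                 * Migrate if  (cpu_dst / mem_dst) * 3/4  >  cpu_src / mem_src,
                 * evaluated as cross products:
                 *   cpu_dst * mem_src * 3  >  cpu_src * mem_dst * 4
                 */
                int migrate = cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;

                printf("lhs=%lu rhs=%lu -> %smigrate\n",
                       cpu_dst * mem_src * 3, cpu_src * mem_dst * 4,
                       migrate ? "" : "do not ");
                return 0;
        }

Here the destination's ratio (2.00) is higher than the source's (1.67) but not by the required 4/3 margin, so the cross product comes out 216000 > 240000, which is false, and the page stays put; that is exactly the flip-flop damping the hysteresis is meant to provide.
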
 
 static unsigned long weighted_cpuload(const int cpu);
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
                .best_task = NULL,
                .best_imp = 0,
-               .best_cpu = -1
+               .best_cpu = -1,
        };
        struct sched_domain *sd;
        unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
         *   multiple NUMA nodes; in order to better consolidate the group,
         *   we need to check other locations.
         */
-       if (env.best_cpu == -1 || (p->numa_group &&
-                       nodes_weight(p->numa_group->active_nodes) > 1)) {
+       if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
                for_each_online_node(nid) {
                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
         * trying for a better one later. Do not set the preferred node here.
         */
        if (p->numa_group) {
+               struct numa_group *ng = p->numa_group;
+
                if (env.best_cpu == -1)
                        nid = env.src_nid;
                else
                        nid = env.dst_nid;
 
-               if (node_isset(nid, p->numa_group->active_nodes))
+               if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
                        sched_setnuma(p, env.dst_nid);
        }
 
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
        unsigned long faults, max_faults = 0;
-       int nid;
+       int nid, active_nodes = 0;
 
        for_each_online_node(nid) {
                faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
        for_each_online_node(nid) {
                faults = group_faults_cpu(numa_group, nid);
-               if (!node_isset(nid, numa_group->active_nodes)) {
-                       if (faults > max_faults * 6 / 16)
-                               node_set(nid, numa_group->active_nodes);
-               } else if (faults < max_faults * 3 / 16)
-                       node_clear(nid, numa_group->active_nodes);
+               if (faults * ACTIVE_NODE_FRACTION > max_faults)
+                       active_nodes++;
        }
+
+       numa_group->max_faults_cpu = max_faults;
+       numa_group->active_nodes = active_nodes;
 }
 
 /*
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
        update_task_scan_period(p, fault_types[0], fault_types[1]);
 
        if (p->numa_group) {
-               update_numa_active_node_mask(p->numa_group);
+               numa_group_count_active_nodes(p->numa_group);
                spin_unlock_irq(group_lock);
                max_nid = preferred_group_nid(p, max_group_nid);
        }
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                        return;
 
                atomic_set(&grp->refcount, 1);
+               grp->active_nodes = 1;
+               grp->max_faults_cpu = 0;
                spin_lock_init(&grp->lock);
                grp->gid = p->pid;
                /* Second half of the array tracks nids where faults happen */
                grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
                                                nr_node_ids;
 
-               node_set(task_node(current), grp->active_nodes);
-
                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
                        grp->faults[i] = p->numa_faults[i];
 
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
        bool migrated = flags & TNF_MIGRATED;
        int cpu_node = task_node(current);
        int local = !!(flags & TNF_FAULT_LOCAL);
+       struct numa_group *ng;
        int priv;
 
        if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         * actively using should be counted as local. This allows the
         * scan rate to slow down when a workload has settled down.
         */
-       if (!priv && !local && p->numa_group &&
-                       node_isset(cpu_node, p->numa_group->active_nodes) &&
-                       node_isset(mem_node, p->numa_group->active_nodes))
+       ng = p->numa_group;
+       if (!priv && !local && ng && ng->active_nodes > 1 &&
+                               numa_is_active_node(cpu_node, ng) &&
+                               numa_is_active_node(mem_node, ng))
                local = 1;
 
        task_numa_placement(p);
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+       if (schedstat_enabled())
+               return;
+
+       /* Force schedstat enabled if a dependent tracepoint is active */
+       if (trace_sched_stat_wait_enabled()    ||
+                       trace_sched_stat_sleep_enabled()   ||
+                       trace_sched_stat_iowait_enabled()  ||
+                       trace_sched_stat_blocked_enabled() ||
+                       trace_sched_stat_runtime_enabled())  {
+               pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+                            "stat_blocked and stat_runtime require the "
+                            "kernel parameter schedstats=enabled or "
+                            "kernel.sched_schedstats=1\n");
+       }
+#endif
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
-               enqueue_sleeper(cfs_rq, se);
+               if (schedstat_enabled())
+                       enqueue_sleeper(cfs_rq, se);
        }
 
-       update_stats_enqueue(cfs_rq, se);
-       check_spread(cfs_rq, se);
+       check_schedstat_required();
+       if (schedstat_enabled()) {
+               update_stats_enqueue(cfs_rq, se);
+               check_spread(cfs_rq, se);
+       }
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        update_curr(cfs_rq);
        dequeue_entity_load_avg(cfs_rq, se);
 
-       update_stats_dequeue(cfs_rq, se);
-       if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
-
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-               }
-#endif
-       }
+       if (schedstat_enabled())
+               update_stats_dequeue(cfs_rq, se, flags);
 
        clear_buddies(cfs_rq, se);
 
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 * a CPU. So account for the time it spent waiting on the
                 * runqueue.
                 */
-               update_stats_wait_end(cfs_rq, se);
+               if (schedstat_enabled())
+                       update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
                update_load_avg(se, 1);
        }
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * least twice that of our own weight (i.e. dont track it
         * when there are only lesser-weight tasks around):
         */
-       if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+       if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
                se->statistics.slice_max = max(se->statistics.slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
        /* throttle cfs_rqs exceeding runtime */
        check_cfs_rq_runtime(cfs_rq);
 
-       check_spread(cfs_rq, prev);
+       if (schedstat_enabled()) {
+               check_spread(cfs_rq, prev);
+               if (prev->on_rq)
+                       update_stats_wait_start(cfs_rq, prev);
+       }
+
        if (prev->on_rq) {
-               update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
                /* in !on_rq case, update occurred at dequeue */
@@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 
                /* scale is effectively 1 << i now, and >> i divides by scale */
 
-               old_load = this_rq->cpu_load[i] - tickless_load;
+               old_load = this_rq->cpu_load[i];
                old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               old_load += tickless_load;
+               if (tickless_load) {
+                       old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+                       /*
+                        * old_load can never be a negative value because a
+                        * decayed tickless_load cannot be greater than the
+                        * original tickless_load.
+                        */
+                       old_load += tickless_load;
+               }
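
The reordering above decays old_load and tickless_load separately and then re-adds the undecayed tickless_load; because the per-index decay is (close to) linear, this agrees with the old "decay the difference" form up to integer rounding, and since a decayed tickless_load can never exceed the original one the result can never go negative, as the new comment notes. A toy check with a simple halve-per-missed-tick decay standing in for decay_load_missed() (which really uses a lookup table):

        #include <stdio.h>

        /* Toy stand-in for decay_load_missed(): halve the load per missed tick. */
        static unsigned long decay(unsigned long load, int missed)
        {
                while (missed-- > 0)
                        load /= 2;
                return load;
        }

        int main(void)
        {
                unsigned long old_load = 1000, tickless_load = 300;
                int missed = 2;

                /* Old formulation: decay the difference, then add tickless_load back. */
                unsigned long a = decay(old_load - tickless_load, missed) + tickless_load;

                /* New formulation: decay both terms separately. */
                unsigned long b = decay(old_load, missed)
                                  - decay(tickless_load, missed) + tickless_load;

                printf("decay(diff) + tickless = %lu, decay each term = %lu\n", a, b);
                return 0;
        }

Both print 475 for these inputs; the practical difference, presumably, is that the new ordering never forms old_load - tickless_load before the decay, which is only safe when the stored load is at least the tickless load.
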
                new_load = this_load;
                /*
                 * Round up the averaging division if load is increasing. This
@@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+                                  unsigned long curr_jiffies,
+                                  unsigned long load,
+                                  int active)
+{
+       unsigned long pending_updates;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * In the regular NOHZ case, we were idle, this means load 0.
+                * In the NOHZ_FULL case, we were non-idle, we should consider
+                * its weighted load.
+                */
+               __update_cpu_load(this_rq, load, pending_updates, active);
+       }
+}
+
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
 {
-       unsigned long curr_jiffies = READ_ONCE(jiffies);
-       unsigned long load = weighted_cpuload(cpu_of(this_rq));
-       unsigned long pending_updates;
-
        /*
         * bail if there's load or we're actually up-to-date.
         */
-       if (load || curr_jiffies == this_rq->last_load_update_tick)
+       if (weighted_cpuload(cpu_of(this_rq)))
                return;
 
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       __update_cpu_load(this_rq, load, pending_updates, 0);
+       __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
 }
 
 /*
@@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active)
        struct rq *this_rq = this_rq();
        unsigned long curr_jiffies = READ_ONCE(jiffies);
        unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
-       unsigned long pending_updates;
 
        if (curr_jiffies == this_rq->last_load_update_tick)
                return;
 
        raw_spin_lock(&this_rq->lock);
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       if (pending_updates) {
-               this_rq->last_load_update_tick = curr_jiffies;
-               /*
-                * In the regular NOHZ case, we were idle, this means load 0.
-                * In the NOHZ_FULL case, we were non-idle, we should consider
-                * its weighted load.
-                */
-               __update_cpu_load(this_rq, load, pending_updates, active);
-       }
+       __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
        raw_spin_unlock(&this_rq->lock);
 }
 #endif /* CONFIG_NO_HZ */
@@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
 {
        unsigned long load = weighted_cpuload(cpu_of(this_rq));
        /*
-        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
         */
        this_rq->last_load_update_tick = jiffies;
        __update_cpu_load(this_rq, load, 1, 1);
@@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                if (time_after_eq(jiffies, rq->next_balance)) {
                        raw_spin_lock_irq(&rq->lock);
                        update_rq_clock(rq);
-                       update_idle_cpu_load(rq);
+                       update_cpu_load_idle(rq);
                        raw_spin_unlock_irq(&rq->lock);
                        rebalance_domains(rq, CPU_IDLE);
                }
@@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg)
        for_each_possible_cpu(i) {
                if (tg->cfs_rq)
                        kfree(tg->cfs_rq[i]);
-               if (tg->se) {
-                       if (tg->se[i])
-                               remove_entity_load_avg(tg->se[i]);
+               if (tg->se)
                        kfree(tg->se[i]);
-               }
        }
 
        kfree(tg->cfs_rq);
@@ -8286,21 +8343,29 @@ err:
        return 0;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
 {
-       struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
+       struct rq *rq;
+       int cpu;
 
-       /*
-       * Only empty task groups can be destroyed; so we can speculatively
-       * check on_list without danger of it being re-added.
-       */
-       if (!tg->cfs_rq[cpu]->on_list)
-               return;
+       for_each_possible_cpu(cpu) {
+               if (tg->se[cpu])
+                       remove_entity_load_avg(tg->se[cpu]);
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+               /*
+                * Only empty task groups can be destroyed; so we can speculatively
+                * check on_list without danger of it being re-added.
+                */
+               if (!tg->cfs_rq[cpu]->on_list)
+                       continue;
+
+               rq = cpu_rq(cpu);
+
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
 }
 
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
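Editor's note: the new __update_cpu_load_nohz() helper above folds every tick missed while the CPU was tickless into a single update by passing the count of pending jiffies down to __update_cpu_load(). The following is only a toy userspace sketch of that "decay per missed tick, then fold in the new sample" idea; the real kernel uses precomputed decay tables and different weights, and the names here are illustrative.

    #include <stdio.h>

    /* Hypothetical stand-ins for rq state; names are illustrative only. */
    static unsigned long cpu_load;             /* smoothed load */
    static unsigned long last_update_tick;     /* jiffies at last update */

    /* Decay the smoothed load once per missed tick, then fold in the sample. */
    static void update_cpu_load_sketch(unsigned long now, unsigned long sample)
    {
            unsigned long pending = now - last_update_tick;

            if (!pending)
                    return;

            last_update_tick = now;

            /* Toy decay: each missed tick halves the old contribution. */
            while (pending--)
                    cpu_load /= 2;

            /* Blend in the current sample. */
            cpu_load = (cpu_load + sample) / 2;
    }

    int main(void)
    {
            update_cpu_load_sketch(1, 1024);   /* busy tick */
            update_cpu_load_sketch(5, 0);      /* woke up after 4 idle ticks */
            printf("load after idle period: %lu\n", cpu_load);
            return 0;
    }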
 
index 8ec86abe0ea188369ee4e7efd21789d8cb127c14..56247132948799036fc4cbcc0c3d7bfcb778bcc5 100644 (file)
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        raw_spin_lock(&rt_b->rt_runtime_lock);
        if (!rt_b->rt_period_active) {
                rt_b->rt_period_active = 1;
-               hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+               /*
+                * SCHED_DEADLINE updates the bandwidth, as a runaway
+                * RT task alongside a DL task could hog a CPU. But DL does
+                * not reset the period. If a deadline task was running
+                * without an RT task running, it can cause RT tasks to
+                * throttle when they start up. Kick the timer right away
+                * to update the period.
+                */
+               hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
                hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
        }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
-       return !list_empty(&rt_se->run_list);
+       return rt_se->on_rq;
 }
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
        return rt_se->my_q;
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
                if (!rt_se)
                        enqueue_top_rt_rq(rt_rq);
                else if (!on_rt_rq(rt_se))
-                       enqueue_rt_entity(rt_se, false);
+                       enqueue_rt_entity(rt_se, 0);
 
                if (rt_rq->highest_prio.curr < curr->prio)
                        resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
        if (!rt_se)
                dequeue_top_rt_rq(rt_rq);
        else if (on_rt_rq(rt_se))
-               dequeue_rt_entity(rt_se);
+               dequeue_rt_entity(rt_se, 0);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1141,6 +1149,20 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
                return 1;
 }
 
+static inline
+unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *group_rq = group_rt_rq(rt_se);
+       struct task_struct *tsk;
+
+       if (group_rq)
+               return group_rq->rr_nr_running;
+
+       tsk = rt_task_of(rt_se);
+
+       return (tsk->policy == SCHED_RR) ? 1 : 0;
+}
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
@@ -1148,6 +1170,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
        WARN_ON(!rt_prio(prio));
        rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+       rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
        inc_rt_prio(rt_rq, prio);
        inc_rt_migration(rt_se, rt_rq);
@@ -1160,13 +1183,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+       rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
        dec_rt_migration(rt_se, rt_rq);
        dec_rt_group(rt_se, rt_rq);
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+               return false;
+
+       return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+       list_del_init(&rt_se->run_list);
+
+       if (list_empty(array->queue + rt_se_prio(rt_se)))
+               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+       rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1226,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
         * get throttled and the current group doesn't have any other
         * active members.
         */
-       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+               if (rt_se->on_list)
+                       __delist_rt_entity(rt_se, array);
                return;
+       }
 
-       if (head)
-               list_add(&rt_se->run_list, queue);
-       else
-               list_add_tail(&rt_se->run_list, queue);
-       __set_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(rt_se->on_list);
+               if (flags & ENQUEUE_HEAD)
+                       list_add(&rt_se->run_list, queue);
+               else
+                       list_add_tail(&rt_se->run_list, queue);
+
+               __set_bit(rt_se_prio(rt_se), array->bitmap);
+               rt_se->on_list = 1;
+       }
+       rt_se->on_rq = 1;
 
        inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
 
-       list_del_init(&rt_se->run_list);
-       if (list_empty(array->queue + rt_se_prio(rt_se)))
-               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(!rt_se->on_list);
+               __delist_rt_entity(rt_se, array);
+       }
+       rt_se->on_rq = 0;
 
        dec_rt_tasks(rt_se, rt_rq);
 }
@@ -1207,7 +1265,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Because the prio of an upper entry depends on the lower
  * entries, we must remove entries top - down.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct sched_rt_entity *back = NULL;
 
@@ -1220,31 +1278,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 
        for (rt_se = back; rt_se; rt_se = rt_se->back) {
                if (on_rt_rq(rt_se))
-                       __dequeue_rt_entity(rt_se);
+                       __dequeue_rt_entity(rt_se, flags);
        }
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rq *rq = rq_of_rt_se(rt_se);
 
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
        for_each_sched_rt_entity(rt_se)
-               __enqueue_rt_entity(rt_se, head);
+               __enqueue_rt_entity(rt_se, flags);
        enqueue_top_rt_rq(&rq->rt);
 }
 
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rq *rq = rq_of_rt_se(rt_se);
 
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
 
        for_each_sched_rt_entity(rt_se) {
                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
                if (rt_rq && rt_rq->rt_nr_running)
-                       __enqueue_rt_entity(rt_se, false);
+                       __enqueue_rt_entity(rt_se, flags);
        }
        enqueue_top_rt_rq(&rq->rt);
 }
@@ -1260,7 +1318,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        if (flags & ENQUEUE_WAKEUP)
                rt_se->timeout = 0;
 
-       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+       enqueue_rt_entity(rt_se, flags);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1271,7 +1329,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        struct sched_rt_entity *rt_se = &p->rt;
 
        update_curr_rt(rq);
-       dequeue_rt_entity(rt_se);
+       dequeue_rt_entity(rt_se, flags);
 
        dequeue_pushable_task(rq, p);
 }
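Editor's note: move_entity() is the heart of the new flag handling above: a dequeue/enqueue pair tagged SAVE/RESTORE without MOVE must leave the entity's position in its priority list untouched. A standalone sketch of that predicate and its decision table, with the flag values copied from the hunk above:

    #include <stdbool.h>
    #include <stdio.h>

    #define DEQUEUE_SLEEP  0x01
    #define DEQUEUE_SAVE   0x02    /* matches ENQUEUE_RESTORE */
    #define DEQUEUE_MOVE   0x04    /* matches ENQUEUE_MOVE */

    /* Move the entity on its run list unless this is SAVE without MOVE. */
    static bool move_entity(unsigned int flags)
    {
            if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
                    return false;
            return true;
    }

    int main(void)
    {
            printf("plain dequeue -> move=%d\n", move_entity(0));
            printf("SAVE only     -> move=%d\n", move_entity(DEQUEUE_SAVE));
            printf("SAVE|MOVE     -> move=%d\n",
                   move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));
            return 0;
    }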
index 10f16374df7f3a3f0f3dc0eb281559cacd223311..b2ff5a2bd6df20b89bbd8ed838712ac83e905d75 100644 (file)
@@ -3,6 +3,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
+#include <linux/binfmts.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
@@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                        struct sched_entity *se, int cpu,
                        struct sched_entity *parent);
 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void)
 struct rt_rq {
        struct rt_prio_array active;
        unsigned int rt_nr_running;
+       unsigned int rr_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        struct {
                int curr; /* highest queued rt task prio */
@@ -909,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
 
 extern int group_balance_cpu(struct sched_group *sg);
 
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
 #else
 
 static inline void sched_ttwu_pending(void) { }
@@ -1022,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
 
 static inline u64 global_rt_period(void)
 {
@@ -1130,18 +1144,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 extern const int sched_prio_to_weight[40];
 extern const u32 sched_prio_to_wmult[40];
 
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING    - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP          0x01
+#define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+
 #define ENQUEUE_WAKEUP         0x01
-#define ENQUEUE_HEAD           0x02
+#define ENQUEUE_RESTORE                0x02
+#define ENQUEUE_MOVE           0x04
+
+#define ENQUEUE_HEAD           0x08
+#define ENQUEUE_REPLENISH      0x10
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING         0x04    /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING         0x20
 #else
 #define ENQUEUE_WAKING         0x00
 #endif
-#define ENQUEUE_REPLENISH      0x08
-#define ENQUEUE_RESTORE        0x10
-
-#define DEQUEUE_SLEEP          0x01
-#define DEQUEUE_SAVE           0x02
 
 #define RETRY_TASK             ((void *)-1UL)
 
@@ -1278,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
 
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+
+/*
+ * The tick may be needed by tasks in the runqueue depending on their policy
+ * and requirements. If the tick is needed, send the target CPU an IPI to kick
+ * it out of nohz mode.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+       int cpu;
+
+       if (!tick_nohz_full_enabled())
+               return;
+
+       cpu = cpu_of(rq);
+
+       if (!tick_nohz_full_cpu(cpu))
+               return;
+
+       if (sched_can_stop_tick(rq))
+               tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+       else
+               tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
 static inline void add_nr_running(struct rq *rq, unsigned count)
 {
        unsigned prev_nr = rq->nr_running;
@@ -1289,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
                if (!rq->rd->overload)
                        rq->rd->overload = true;
 #endif
-
-#ifdef CONFIG_NO_HZ_FULL
-               if (tick_nohz_full_cpu(rq->cpu)) {
-                       /*
-                        * Tick is needed if more than one task runs on a CPU.
-                        * Send the target an IPI to kick it out of nohz mode.
-                        *
-                        * We assume that IPI implies full memory barrier and the
-                        * new value of rq->nr_running is visible on reception
-                        * from the target.
-                        */
-                       tick_nohz_full_kick_cpu(rq->cpu);
-               }
-#endif
        }
+
+       sched_update_tick_dependency(rq);
 }
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
        rq->nr_running -= count;
+       /* Re-evaluate whether the tick is still needed */
+       sched_update_tick_dependency(rq);
 }
 
 static inline void rq_last_tick_reset(struct rq *rq)
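Editor's note: the flag renumbering above exists so that DEQUEUE_SAVE and DEQUEUE_MOVE line up bit-for-bit with ENQUEUE_RESTORE and ENQUEUE_MOVE, which is what lets move_entity() accept either flag set. A compile-time check of that invariant, as a sketch one could drop into a standalone test file (values copied from the hunk):

    #define DEQUEUE_SLEEP          0x01
    #define DEQUEUE_SAVE           0x02
    #define DEQUEUE_MOVE           0x04

    #define ENQUEUE_WAKEUP         0x01
    #define ENQUEUE_RESTORE        0x02
    #define ENQUEUE_MOVE           0x04
    #define ENQUEUE_HEAD           0x08
    #define ENQUEUE_REPLENISH      0x10

    /* The SAVE/RESTORE and MOVE pairs must share bit positions. */
    _Static_assert(DEQUEUE_SAVE == ENQUEUE_RESTORE,
                   "DEQUEUE_SAVE must match ENQUEUE_RESTORE");
    _Static_assert(DEQUEUE_MOVE == ENQUEUE_MOVE,
                   "DEQUEUE_MOVE must match ENQUEUE_MOVE");

    int main(void) { return 0; }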
index b0fbc7632de5f9b13d8ccd2c42d73560c347669a..70b3b6a20fb0e362f4c816fe0f069c7c8576c7f4 100644 (file)
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
        if (rq)
                rq->rq_sched_info.run_delay += delta;
 }
-# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)       do { var = (val); } while (0)
+# define schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field)      do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
+# define schedstat_enabled()           0
 # define schedstat_inc(rq, field)      do { } while (0)
 # define schedstat_add(rq, field, amt) do { } while (0)
 # define schedstat_set(var, val)       do { } while (0)
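Editor's note: the reworked schedstat macros above only touch the counters when the new sched_schedstats static branch is enabled, so the bookkeeping costs nothing when statistics are off. A userspace sketch with a plain bool standing in for the static branch (the struct and field names here are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for static_branch_unlikely(&sched_schedstats). */
    static bool schedstats_enabled;

    #define schedstat_enabled()        (schedstats_enabled)
    #define schedstat_inc(ptr, field)  \
            do { if (schedstat_enabled()) (ptr)->field++; } while (0)

    struct rq_stats { unsigned long yld_count; };

    int main(void)
    {
            struct rq_stats stats = { 0 };

            schedstat_inc(&stats, yld_count);   /* disabled: no effect */
            schedstats_enabled = true;
            schedstat_inc(&stats, yld_count);   /* enabled: counts */

            printf("yld_count = %lu\n", stats.yld_count);
            return 0;
    }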
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644 (file)
index 0000000..82f0dff
--- /dev/null
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+                            struct lock_class_key *key)
+{
+       raw_spin_lock_init(&q->lock);
+       lockdep_set_class_and_name(&q->lock, key, name);
+       INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The wake_up_state() return value can be ignored here: if it returns 0,
+ * the previously waiting task is already running, so it will observe the
+ * condition as true (or has already done so).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+       struct swait_queue *curr;
+
+       if (list_empty(&q->task_list))
+               return;
+
+       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+       wake_up_process(curr->task);
+       list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+       unsigned long flags;
+
+       if (!swait_active(q))
+               return;
+
+       raw_spin_lock_irqsave(&q->lock, flags);
+       swake_up_locked(q);
+       raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Must not be called with IRQs disabled, since we must be able to
+ * release IRQs to guarantee a bounded lock hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+       struct swait_queue *curr;
+       LIST_HEAD(tmp);
+
+       if (!swait_active(q))
+               return;
+
+       raw_spin_lock_irq(&q->lock);
+       list_splice_init(&q->task_list, &tmp);
+       while (!list_empty(&tmp)) {
+               curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+               wake_up_state(curr->task, TASK_NORMAL);
+               list_del_init(&curr->task_list);
+
+               if (list_empty(&tmp))
+                       break;
+
+               raw_spin_unlock_irq(&q->lock);
+               raw_spin_lock_irq(&q->lock);
+       }
+       raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       wait->task = current;
+       if (list_empty(&wait->task_list))
+               list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&q->lock, flags);
+       __prepare_to_swait(q, wait);
+       set_current_state(state);
+       raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+       if (signal_pending_state(state, current))
+               return -ERESTARTSYS;
+
+       prepare_to_swait(q, wait, state);
+
+       return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       __set_current_state(TASK_RUNNING);
+       if (!list_empty(&wait->task_list))
+               list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       unsigned long flags;
+
+       __set_current_state(TASK_RUNNING);
+
+       if (!list_empty_careful(&wait->task_list)) {
+               raw_spin_lock_irqsave(&q->lock, flags);
+               list_del_init(&wait->task_list);
+               raw_spin_unlock_irqrestore(&q->lock, flags);
+       }
+}
+EXPORT_SYMBOL(finish_swait);
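Editor's note: the key design point in swake_up_all() above is that it splices the waiter list onto a private list and then wakes entries one at a time, dropping and re-taking the lock between iterations so the lock hold time stays bounded. A rough userspace analogue of that loop structure, using a pthread mutex and printf in place of the raw spinlock and wake_up_state(); the types and names are illustrative only.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct waiter {
            int id;
            struct waiter *next;
    };

    static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct waiter *q_head;

    /* Analogue of swake_up_all(): detach the whole list, then wake one
     * entry at a time, releasing the lock between iterations so other
     * lockers (e.g. finish_swait callers) can get in. */
    static void wake_all(void)
    {
            struct waiter *tmp;

            pthread_mutex_lock(&q_lock);
            tmp = q_head;
            q_head = NULL;
            while (tmp) {
                    struct waiter *curr = tmp;

                    tmp = tmp->next;
                    printf("waking waiter %d\n", curr->id); /* wake_up_state() */
                    free(curr);

                    pthread_mutex_unlock(&q_lock);
                    pthread_mutex_lock(&q_lock);
            }
            pthread_mutex_unlock(&q_lock);
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++) {
                    struct waiter *w = malloc(sizeof(*w));

                    w->id = i;
                    w->next = q_head;
                    q_head = w;
            }
            wake_all();
            return 0;
    }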
index d903c02223afbaa2776b2610f00ae3def7de442e..300d29391e07416d73177c752ecca4afee21d3dd 100644 (file)
@@ -105,13 +105,12 @@ void __init call_function_init(void)
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(struct call_single_data *csd)
 {
-       while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
-               cpu_relax();
+       smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
 }
 
-static void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(struct call_single_data *csd)
 {
        csd_lock_wait(csd);
        csd->flags |= CSD_FLAG_LOCK;
@@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd)
        smp_wmb();
 }
 
-static void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(struct call_single_data *csd)
 {
        WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
 
index 479e4436f787646c92c42e0dbe2d940b366fbf60..8aae49dd7da862955f9ec49c65b0d80ad8effcd6 100644 (file)
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 
        if (preempt_count() == cnt) {
 #ifdef CONFIG_DEBUG_PREEMPT
-               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+               current->preempt_disable_ip = get_lock_parent_ip();
 #endif
-               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
        }
 }
 EXPORT_SYMBOL(__local_bh_disable_ip);
index 97715fd9e790ade5d7cd7731107de2dafb8272e3..f5102fabef7f525f6c79d66756336c262e465caa 100644 (file)
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_SCHEDSTATS
+       {
+               .procname       = "sched_schedstats",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_schedstats,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
        {
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
                .data           = &latencytop_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = sysctl_latencytop,
        },
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
index f5e86d282d520a7881557de76096c1cf46f58fa7..1cafba860b08ceb6030fa2a1f237e3950a4e0aa7 100644 (file)
@@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
        return err;
 }
 
-
 /*
  * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
  * This is called from sys_timer_create() and do_cpu_nanosleep() with the
@@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer)
                                cputime_expires->sched_exp = exp;
                        break;
                }
+               if (CPUCLOCK_PERTHREAD(timer->it_clock))
+                       tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
+               else
+                       tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
        }
 }
 
@@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
        return 0;
 }
 
-#ifdef CONFIG_NO_HZ_FULL
-static void nohz_kick_work_fn(struct work_struct *work)
-{
-       tick_nohz_full_kick_all();
-}
-
-static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
-
-/*
- * We need the IPIs to be sent from sane process context.
- * The posix cpu timers are always set with irqs disabled.
- */
-static void posix_cpu_timer_kick_nohz(void)
-{
-       if (context_tracking_is_enabled())
-               schedule_work(&nohz_kick_work);
-}
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
-{
-       if (!task_cputime_zero(&tsk->cputime_expires))
-               return false;
-
-       /* Check if cputimer is running. This is accessed without locking. */
-       if (READ_ONCE(tsk->signal->cputimer.running))
-               return false;
-
-       return true;
-}
-#else
-static inline void posix_cpu_timer_kick_nohz(void) { }
-#endif
-
 /*
  * Guts of sys_timer_settime for CPU timers.
  * This is called with the timer locked and interrupts disabled.
@@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
                sample_to_timespec(timer->it_clock,
                                   old_incr, &old->it_interval);
        }
-       if (!ret)
-               posix_cpu_timer_kick_nohz();
+
        return ret;
 }
 
@@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk,
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                }
        }
+       if (task_cputime_zero(tsk_expires))
+               tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static inline void stop_process_timers(struct signal_struct *sig)
@@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
 
        /* Turn off cputimer->running. This is done without locking. */
        WRITE_ONCE(cputimer->running, false);
+       tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static u32 onecputick;
@@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
        arm_timer(timer);
        unlock_task_sighand(p, &flags);
 
-       /* Kick full dynticks CPUs in case they need to tick on the new timer */
-       posix_cpu_timer_kick_nohz();
 out:
        timer->it_overrun_last = timer->it_overrun;
        timer->it_overrun = -1;
@@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                }
 
                if (!*newval)
-                       goto out;
+                       return;
                *newval += now;
        }
 
@@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                        tsk->signal->cputime_expires.virt_exp = *newval;
                break;
        }
-out:
-       posix_cpu_timer_kick_nohz();
+
+       tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
index 0b17424349eb4dafd76a2cf588748988ce51a295..969e6704c3c9a24754d0da31e0dd028257425ddd 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/module.h>
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
-#include <linux/perf_event.h>
 #include <linux/context_tracking.h>
 
 #include <asm/irq_regs.h>
@@ -158,54 +157,63 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 cpumask_var_t tick_nohz_full_mask;
 cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
+static unsigned long tick_dep_mask;
 
-static bool can_stop_full_tick(void)
+static void trace_tick_dependency(unsigned long dep)
+{
+       if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+               trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+               trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_SCHED) {
+               trace_tick_stop(0, TICK_DEP_MASK_SCHED);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+               trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+}
+
+static bool can_stop_full_tick(struct tick_sched *ts)
 {
        WARN_ON_ONCE(!irqs_disabled());
 
-       if (!sched_can_stop_tick()) {
-               trace_tick_stop(0, "more than 1 task in runqueue\n");
+       if (tick_dep_mask) {
+               trace_tick_dependency(tick_dep_mask);
                return false;
        }
 
-       if (!posix_cpu_timers_can_stop_tick(current)) {
-               trace_tick_stop(0, "posix timers running\n");
+       if (ts->tick_dep_mask) {
+               trace_tick_dependency(ts->tick_dep_mask);
                return false;
        }
 
-       if (!perf_event_can_stop_tick()) {
-               trace_tick_stop(0, "perf events running\n");
+       if (current->tick_dep_mask) {
+               trace_tick_dependency(current->tick_dep_mask);
                return false;
        }
 
-       /* sched_clock_tick() needs us? */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-       /*
-        * TODO: kick full dynticks CPUs when
-        * sched_clock_stable is set.
-        */
-       if (!sched_clock_stable()) {
-               trace_tick_stop(0, "unstable sched clock\n");
-               /*
-                * Don't allow the user to think they can get
-                * full NO_HZ with this machine.
-                */
-               WARN_ONCE(tick_nohz_full_running,
-                         "NO_HZ FULL will not work with unstable sched clock");
+       if (current->signal->tick_dep_mask) {
+               trace_tick_dependency(current->signal->tick_dep_mask);
                return false;
        }
-#endif
 
        return true;
 }
 
-static void nohz_full_kick_work_func(struct irq_work *work)
+static void nohz_full_kick_func(struct irq_work *work)
 {
        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
 }
 
 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
-       .func = nohz_full_kick_work_func,
+       .func = nohz_full_kick_func,
 };
 
 /*
@@ -214,7 +222,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
  * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
  * is NMI safe.
  */
-void tick_nohz_full_kick(void)
+static void tick_nohz_full_kick(void)
 {
        if (!tick_nohz_full_cpu(smp_processor_id()))
                return;
@@ -234,27 +242,112 @@ void tick_nohz_full_kick_cpu(int cpu)
        irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
-static void nohz_full_kick_ipi(void *info)
-{
-       /* Empty, the tick restart happens on tick_nohz_irq_exit() */
-}
-
 /*
  * Kick all full dynticks CPUs in order to force these to re-evaluate
  * their dependency on the tick and restart it if necessary.
  */
-void tick_nohz_full_kick_all(void)
+static void tick_nohz_full_kick_all(void)
 {
+       int cpu;
+
        if (!tick_nohz_full_running)
                return;
 
        preempt_disable();
-       smp_call_function_many(tick_nohz_full_mask,
-                              nohz_full_kick_ipi, NULL, false);
-       tick_nohz_full_kick();
+       for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
+               tick_nohz_full_kick_cpu(cpu);
        preempt_enable();
 }
 
+static void tick_nohz_dep_set_all(unsigned long *dep,
+                                 enum tick_dep_bits bit)
+{
+       unsigned long prev;
+
+       prev = fetch_or(dep, BIT_MASK(bit));
+       if (!prev)
+               tick_nohz_full_kick_all();
+}
+
+/*
+ * Set a global tick dependency. Used by perf events that rely on frequency
+ * and by the unstable sched clock.
+ */
+void tick_nohz_dep_set(enum tick_dep_bits bit)
+{
+       tick_nohz_dep_set_all(&tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear(enum tick_dep_bits bit)
+{
+       clear_bit(bit, &tick_dep_mask);
+}
+
+/*
+ * Set a per-CPU tick dependency. Used by the scheduler and perf events in
+ * order to manage event throttling.
+ */
+void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+       unsigned long prev;
+       struct tick_sched *ts;
+
+       ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+       prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+       if (!prev) {
+               preempt_disable();
+               /* Perf needs local kick that is NMI safe */
+               if (cpu == smp_processor_id()) {
+                       tick_nohz_full_kick();
+               } else {
+                       /* Remote irq work not NMI-safe */
+                       if (!WARN_ON_ONCE(in_nmi()))
+                               tick_nohz_full_kick_cpu(cpu);
+               }
+               preempt_enable();
+       }
+}
+
+void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+       struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+       clear_bit(bit, &ts->tick_dep_mask);
+}
+
+/*
+ * Set a per-task tick dependency. Posix CPU timers need this so that
+ * per-task timers can elapse.
+ */
+void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+       /*
+        * We could optimize this by kicking only the CPU currently running
+        * the task, if that noise matters for nohz_full users.
+        */
+       tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+       clear_bit(bit, &tsk->tick_dep_mask);
+}
+
+/*
+ * Set a per-process tick dependency. Posix CPU timers need this so that
+ * per-process timers can elapse.
+ */
+void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+       tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+       clear_bit(bit, &sig->tick_dep_mask);
+}
+
 /*
  * Re-evaluate the need for the tick as we switch the current task.
  * It might need the tick due to per task/process properties:
@@ -263,15 +356,19 @@ void tick_nohz_full_kick_all(void)
 void __tick_nohz_task_switch(void)
 {
        unsigned long flags;
+       struct tick_sched *ts;
 
        local_irq_save(flags);
 
        if (!tick_nohz_full_cpu(smp_processor_id()))
                goto out;
 
-       if (tick_nohz_tick_stopped() && !can_stop_full_tick())
-               tick_nohz_full_kick();
+       ts = this_cpu_ptr(&tick_cpu_sched);
 
+       if (ts->tick_stopped) {
+               if (current->tick_dep_mask || current->signal->tick_dep_mask)
+                       tick_nohz_full_kick();
+       }
 out:
        local_irq_restore(flags);
 }
@@ -689,7 +786,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 
                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                ts->tick_stopped = 1;
-               trace_tick_stop(1, " ");
+               trace_tick_stop(1, TICK_DEP_MASK_NONE);
        }
 
        /*
@@ -740,7 +837,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
                return;
 
-       if (can_stop_full_tick())
+       if (can_stop_full_tick(ts))
                tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
        else if (ts->tick_stopped)
                tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
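Editor's note: all of the tick_nohz_dep_set*() variants above share one pattern: atomically OR the dependency bit into a mask and only kick CPUs when the mask transitions from empty to non-empty, so repeated setters stay cheap. A standalone sketch of that transition test using C11 atomics, with atomic_fetch_or/atomic_fetch_and standing in for the kernel's fetch_or()/clear_bit():

    #include <stdatomic.h>
    #include <stdio.h>

    enum { DEP_BIT_POSIX_TIMER, DEP_BIT_PERF, DEP_BIT_SCHED };

    static atomic_ulong tick_dep_mask;

    static void kick_all_cpus(void)
    {
            printf("kicking nohz_full CPUs to restart the tick\n");
    }

    /* Set a dependency bit; kick only on the 0 -> non-zero transition. */
    static void tick_dep_set(int bit)
    {
            unsigned long prev;

            prev = atomic_fetch_or(&tick_dep_mask, 1UL << bit);
            if (!prev)
                    kick_all_cpus();
    }

    static void tick_dep_clear(int bit)
    {
            atomic_fetch_and(&tick_dep_mask, ~(1UL << bit));
    }

    int main(void)
    {
            tick_dep_set(DEP_BIT_POSIX_TIMER);  /* kicks */
            tick_dep_set(DEP_BIT_SCHED);        /* mask already set: no kick */
            tick_dep_clear(DEP_BIT_POSIX_TIMER);
            tick_dep_clear(DEP_BIT_SCHED);
            printf("mask now %lu\n", (unsigned long)atomic_load(&tick_dep_mask));
            return 0;
    }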
index a4a8d4e9baa13f37cbdceff5ffedc3bfc5fd8e4c..eb4e32566a832c27afc6f388121521abab5c5d9b 100644 (file)
@@ -60,6 +60,7 @@ struct tick_sched {
        u64                             next_timer;
        ktime_t                         idle_expires;
        int                             do_timer_last;
+       unsigned long                   tick_dep_mask;
 };
 
 extern struct tick_sched *tick_get_tick_sched(int cpu);
index c9956440d0e609c51e2032f6f625c4061ba764a4..21b81a41dae572a3263c7b731efa97a2d25872e9 100644 (file)
@@ -30,7 +30,7 @@
 struct trace_kprobe {
        struct list_head        list;
        struct kretprobe        rp;     /* Use rp.kp for kprobe use */
-       unsigned long           nhit;
+       unsigned long __percpu *nhit;
        const char              *symbol;        /* symbol name */
        struct trace_probe      tp;
 };
@@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
        if (!tk)
                return ERR_PTR(ret);
 
+       tk->nhit = alloc_percpu(unsigned long);
+       if (!tk->nhit)
+               goto error;
+
        if (symbol) {
                tk->symbol = kstrdup(symbol, GFP_KERNEL);
                if (!tk->symbol)
@@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
 error:
        kfree(tk->tp.call.name);
        kfree(tk->symbol);
+       free_percpu(tk->nhit);
        kfree(tk);
        return ERR_PTR(ret);
 }
@@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
        kfree(tk->tp.call.class->system);
        kfree(tk->tp.call.name);
        kfree(tk->symbol);
+       free_percpu(tk->nhit);
        kfree(tk);
 }
 
@@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = {
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
        struct trace_kprobe *tk = v;
+       unsigned long nhit = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               nhit += *per_cpu_ptr(tk->nhit, cpu);
 
        seq_printf(m, "  %-44s %15lu %15lu\n",
-                  trace_event_name(&tk->tp.call), tk->nhit,
+                  trace_event_name(&tk->tp.call), nhit,
                   tk->rp.kp.nmissed);
 
        return 0;
@@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
        struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
 
-       tk->nhit++;
+       raw_cpu_inc(*tk->nhit);
 
        if (tk->tp.flags & TP_FLAG_TRACE)
                kprobe_trace_func(tk, regs);
@@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
        struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
 
-       tk->nhit++;
+       raw_cpu_inc(*tk->nhit);
 
        if (tk->tp.flags & TP_FLAG_TRACE)
                kretprobe_trace_func(tk, ri, regs);
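Editor's note: turning nhit into a per-CPU counter means the probe handler does a raw, unlocked increment on its own CPU while the read side sums over all CPUs. A minimal userspace sketch of the same idea, assuming a fixed CPU count and an explicit cpu argument in place of raw_cpu_inc()/per_cpu_ptr():

    #include <stdio.h>

    #define NR_CPUS_SKETCH 4

    /* One counter slot per CPU; no locking needed on the increment path. */
    static unsigned long nhit[NR_CPUS_SKETCH];

    static void probe_hit(int cpu)
    {
            nhit[cpu]++;            /* analogue of raw_cpu_inc(*tk->nhit) */
    }

    static unsigned long total_hits(void)
    {
            unsigned long sum = 0;

            for (int cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
                    sum += nhit[cpu];   /* analogue of the seq_show loop */
            return sum;
    }

    int main(void)
    {
            probe_hit(0);
            probe_hit(2);
            probe_hit(2);
            printf("total hits: %lu\n", total_hits());
            return 0;
    }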
index 0655afbea83f596ded953d6017e0f25a640bc073..d1663083d903aaad3ce4b359cfdb20068004b237 100644 (file)
@@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
 
 extern char *__bad_type_size(void);
 
-#define SYSCALL_FIELD(type, name)                                      \
-       sizeof(type) != sizeof(trace.name) ?                            \
+#define SYSCALL_FIELD(type, field, name)                               \
+       sizeof(type) != sizeof(trace.field) ?                           \
                __bad_type_size() :                                     \
-               #type, #name, offsetof(typeof(trace), name),            \
-               sizeof(trace.name), is_signed_type(type)
+               #type, #name, offsetof(typeof(trace), field),           \
+               sizeof(trace.field), is_signed_type(type)
 
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
        int i;
        int offset = offsetof(typeof(trace), args);
 
-       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+                                FILTER_OTHER);
        if (ret)
                return ret;
 
@@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call)
        struct syscall_trace_exit trace;
        int ret;
 
-       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+                                FILTER_OTHER);
        if (ret)
                return ret;
 
-       ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+       ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
                                 FILTER_OTHER);
 
        return ret;
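Editor's note: the reworked SYSCALL_FIELD macro separates the struct member used for sizeof/offsetof from the name exported to userspace, which is what lets the nr member be exposed as __syscall_nr. A compilable sketch of the same stringify/offsetof trick on a toy struct; the struct and helper names here are illustrative, and the kernel's __bad_type_size() size check is omitted.

    #include <stdio.h>
    #include <stddef.h>

    struct trace_entry_sketch {
            int nr;
            long ret;
    };

    /* Emit the C type, the user-visible name, and the member's offset/size. */
    #define FIELD(type, member, name)                                   \
            #type, #name, offsetof(struct trace_entry_sketch, member),  \
            sizeof(((struct trace_entry_sketch *)0)->member)

    static void define_field(const char *type, const char *name,
                             size_t offset, size_t size)
    {
            printf("%-6s %-14s offset=%zu size=%zu\n", type, name, offset, size);
    }

    int main(void)
    {
            /* member is "nr", but the exported name differs */
            define_field(FIELD(int, nr, __syscall_nr));
            define_field(FIELD(long, ret, ret));
            return 0;
    }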
index 975cb49e32bf8b1e5dc3085d27c37c312d4ce6bd..f8e26ab963ed2fc057db395bee59d6f25307b5d5 100644 (file)
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
        struct mm_struct *mm;
 
-       /* convert pages-usec to Mbyte-usec */
-       stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-       stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+       /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+       stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+       do_div(stats->coremem, 1000 * KB);
+       stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+       do_div(stats->virtmem, 1000 * KB);
        mm = get_task_mm(p);
        if (mm) {
                /* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 static void __acct_update_integrals(struct task_struct *tsk,
                                    cputime_t utime, cputime_t stime)
 {
-       if (likely(tsk->mm)) {
-               cputime_t time, dtime;
-               struct timeval value;
-               unsigned long flags;
-               u64 delta;
-
-               local_irq_save(flags);
-               time = stime + utime;
-               dtime = time - tsk->acct_timexpd;
-               jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
-               delta = value.tv_sec;
-               delta = delta * USEC_PER_SEC + value.tv_usec;
-
-               if (delta == 0)
-                       goto out;
-               tsk->acct_timexpd = time;
-               tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-               tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-       out:
-               local_irq_restore(flags);
-       }
+       cputime_t time, dtime;
+       u64 delta;
+
+       if (!likely(tsk->mm))
+               return;
+
+       time = stime + utime;
+       dtime = time - tsk->acct_timexpd;
+       /* Avoid division: cputime_t is often in nanoseconds already. */
+       delta = cputime_to_nsecs(dtime);
+
+       if (delta < TICK_NSEC)
+               return;
+
+       tsk->acct_timexpd = time;
+       /*
+        * Divide by 1024 (a cheap shift) to keep the accumulator from
+        * overflowing. The final unit reported to userspace is Mbyte-usecs;
+        * the rest of the conversion is done in xacct_add_tsk().
+        */
+       tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+       tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
 }
 
 /**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
 void acct_update_integrals(struct task_struct *tsk)
 {
        cputime_t utime, stime;
+       unsigned long flags;
 
+       local_irq_save(flags);
        task_cputime(tsk, &utime, &stime);
        __acct_update_integrals(tsk, utime, stime);
+       local_irq_restore(flags);
 }
 
 /**
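Editor's note: after this change the accumulators hold pages * nanoseconds / 1024, and xacct_add_tsk() converts that to Mbyte-usecs by multiplying with PAGE_SIZE and dividing by 1000 * KB. A worked example of the conversion in plain C, assuming 4 KiB pages (PAGE_SIZE_SKETCH is a stand-in, not a kernel symbol):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE_SKETCH 4096ULL
    #define KB 1024ULL

    int main(void)
    {
            /* Example: 100 pages resident for 2 ms (2,000,000 ns). */
            uint64_t rss_pages = 100;
            uint64_t delta_ns = 2000000;

            /* __acct_update_integrals(): accumulate pages * ns / 1024. */
            uint64_t acct_rss_mem1 = delta_ns * rss_pages >> 10;

            /* xacct_add_tsk(): convert to Mbyte-usec (MiB * microseconds). */
            uint64_t coremem = acct_rss_mem1 * PAGE_SIZE_SKETCH / (1000 * KB);

            printf("accumulator: %llu (pages*ns/1024)\n",
                   (unsigned long long)acct_rss_mem1);
            printf("coremem:     %llu Mbyte-usec\n",
                   (unsigned long long)coremem);
            return 0;
    }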
index 5a70f6196f577a071ae0a31e9da7fa0e1dd1bc68..81dedaab36ccfed3150281a0f915a4070f667bfd 100644 (file)
@@ -41,6 +41,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
                        break;
        return i;
 }
+EXPORT_SYMBOL(cpumask_any_but);
 
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
index 3345a089ef7b954d475d67965b8ce891c1b16ab0..3859bf63561c63936947b007fe3ee20e822509a1 100644 (file)
 #include <linux/kernel.h>
 #include <linux/rculist.h>
 
-static struct list_head force_poison;
-void list_force_poison(struct list_head *entry)
-{
-       entry->next = &force_poison;
-       entry->prev = &force_poison;
-}
-
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -30,8 +23,6 @@ void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
 {
-       WARN(new->next == &force_poison || new->prev == &force_poison,
-               "list_add attempted on force-poisoned entry\n");
        WARN(next->prev != prev,
                "list_add corruption. next->prev should be "
                "prev (%p), but was %p. (next=%p).\n",
index c61b299e367ffac86450ac59c123d0925285b96c..915d75df20864d07428376750c1bd148c216a585 100644 (file)
@@ -46,8 +46,11 @@ struct test_key {
        bool                    (*test_key)(void);
 };
 
-#define test_key_func(key, branch) \
-       ({bool func(void) { return branch(key); } func; })
+#define test_key_func(key, branch)     \
+static bool key ## _ ## branch(void)   \
+{                                      \
+       return branch(&key);            \
+}
 
 static void invert_key(struct static_key *key)
 {
@@ -92,6 +95,25 @@ static int verify_keys(struct test_key *keys, int size, bool invert)
        return 0;
 }
 
+test_key_func(old_true_key, static_key_true)
+test_key_func(old_false_key, static_key_false)
+test_key_func(true_key, static_branch_likely)
+test_key_func(true_key, static_branch_unlikely)
+test_key_func(false_key, static_branch_likely)
+test_key_func(false_key, static_branch_unlikely)
+test_key_func(base_old_true_key, static_key_true)
+test_key_func(base_inv_old_true_key, static_key_true)
+test_key_func(base_old_false_key, static_key_false)
+test_key_func(base_inv_old_false_key, static_key_false)
+test_key_func(base_true_key, static_branch_likely)
+test_key_func(base_true_key, static_branch_unlikely)
+test_key_func(base_inv_true_key, static_branch_likely)
+test_key_func(base_inv_true_key, static_branch_unlikely)
+test_key_func(base_false_key, static_branch_likely)
+test_key_func(base_false_key, static_branch_unlikely)
+test_key_func(base_inv_false_key, static_branch_likely)
+test_key_func(base_inv_false_key, static_branch_unlikely)
+
 static int __init test_static_key_init(void)
 {
        int ret;
@@ -102,95 +124,95 @@ static int __init test_static_key_init(void)
                {
                        .init_state     = true,
                        .key            = &old_true_key,
-                       .test_key       = test_key_func(&old_true_key, static_key_true),
+                       .test_key       = &old_true_key_static_key_true,
                },
                {
                        .init_state     = false,
                        .key            = &old_false_key,
-                       .test_key       = test_key_func(&old_false_key, static_key_false),
+                       .test_key       = &old_false_key_static_key_false,
                },
                /* internal keys - new keys */
                {
                        .init_state     = true,
                        .key            = &true_key.key,
-                       .test_key       = test_key_func(&true_key, static_branch_likely),
+                       .test_key       = &true_key_static_branch_likely,
                },
                {
                        .init_state     = true,
                        .key            = &true_key.key,
-                       .test_key       = test_key_func(&true_key, static_branch_unlikely),
+                       .test_key       = &true_key_static_branch_unlikely,
                },
                {
                        .init_state     = false,
                        .key            = &false_key.key,
-                       .test_key       = test_key_func(&false_key, static_branch_likely),
+                       .test_key       = &false_key_static_branch_likely,
                },
                {
                        .init_state     = false,
                        .key            = &false_key.key,
-                       .test_key       = test_key_func(&false_key, static_branch_unlikely),
+                       .test_key       = &false_key_static_branch_unlikely,
                },
                /* external keys - old keys */
                {
                        .init_state     = true,
                        .key            = &base_old_true_key,
-                       .test_key       = test_key_func(&base_old_true_key, static_key_true),
+                       .test_key       = &base_old_true_key_static_key_true,
                },
                {
                        .init_state     = false,
                        .key            = &base_inv_old_true_key,
-                       .test_key       = test_key_func(&base_inv_old_true_key, static_key_true),
+                       .test_key       = &base_inv_old_true_key_static_key_true,
                },
                {
                        .init_state     = false,
                        .key            = &base_old_false_key,
-                       .test_key       = test_key_func(&base_old_false_key, static_key_false),
+                       .test_key       = &base_old_false_key_static_key_false,
                },
                {
                        .init_state     = true,
                        .key            = &base_inv_old_false_key,
-                       .test_key       = test_key_func(&base_inv_old_false_key, static_key_false),
+                       .test_key       = &base_inv_old_false_key_static_key_false,
                },
                /* external keys - new keys */
                {
                        .init_state     = true,
                        .key            = &base_true_key.key,
-                       .test_key       = test_key_func(&base_true_key, static_branch_likely),
+                       .test_key       = &base_true_key_static_branch_likely,
                },
                {
                        .init_state     = true,
                        .key            = &base_true_key.key,
-                       .test_key       = test_key_func(&base_true_key, static_branch_unlikely),
+                       .test_key       = &base_true_key_static_branch_unlikely,
                },
                {
                        .init_state     = false,
                        .key            = &base_inv_true_key.key,
-                       .test_key       = test_key_func(&base_inv_true_key, static_branch_likely),
+                       .test_key       = &base_inv_true_key_static_branch_likely,
                },
                {
                        .init_state     = false,
                        .key            = &base_inv_true_key.key,
-                       .test_key       = test_key_func(&base_inv_true_key, static_branch_unlikely),
+                       .test_key       = &base_inv_true_key_static_branch_unlikely,
                },
                {
                        .init_state     = false,
                        .key            = &base_false_key.key,
-                       .test_key       = test_key_func(&base_false_key, static_branch_likely),
+                       .test_key       = &base_false_key_static_branch_likely,
                },
                {
                        .init_state     = false,
                        .key            = &base_false_key.key,
-                       .test_key       = test_key_func(&base_false_key, static_branch_unlikely),
+                       .test_key       = &base_false_key_static_branch_unlikely,
                },
                {
                        .init_state     = true,
                        .key            = &base_inv_false_key.key,
-                       .test_key       = test_key_func(&base_inv_false_key, static_branch_likely),
+                       .test_key       = &base_inv_false_key_static_branch_likely,
                },
                {
                        .init_state     = true,
                        .key            = &base_inv_false_key.key,
-                       .test_key       = test_key_func(&base_inv_false_key, static_branch_unlikely),
+                       .test_key       = &base_inv_false_key_static_branch_unlikely,
                },
        };
 
index 3461d97ecb30bfe18d1ed50729e5147aa2ad7e6d..da7a35d83de7edc4ed562caa5e1925194f2c7229 100644 (file)
@@ -195,6 +195,30 @@ void __delete_from_page_cache(struct page *page, void *shadow,
        else
                cleancache_invalidate_page(mapping, page);
 
+       VM_BUG_ON_PAGE(page_mapped(page), page);
+       if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+               int mapcount;
+
+               pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
+                        current->comm, page_to_pfn(page));
+               dump_page(page, "still mapped when deleted");
+               dump_stack();
+               add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+               mapcount = page_mapcount(page);
+               if (mapping_exiting(mapping) &&
+                   page_count(page) >= mapcount + 2) {
+                       /*
+                        * All vmas have already been torn down, so it's
+                        * a good bet that the page is actually unmapped,
+                        * and we'd prefer not to leak it: if we're wrong,
+                        * some other bad page check should catch it later.
+                        */
+                       page_mapcount_reset(page);
+                       atomic_sub(mapcount, &page->_count);
+               }
+       }
+
        page_cache_tree_delete(mapping, page, shadow);
 
        page->mapping = NULL;
@@ -205,7 +229,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
                __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
-       VM_BUG_ON_PAGE(page_mapped(page), page);
 
        /*
         * At this point page must be either written or cleaned by truncate.
index 01f2b48c8618a9f973eeb11f2162b75bb8cf67d4..aefba5a9cc47f74880dbbb3491a8927fcf999b06 100644 (file)
@@ -2751,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
        int ret;
 
        if (!hugepages_supported())
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
 
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
@@ -2792,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
        int ret;
 
        if (!hugepages_supported())
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
 
        tmp = h->nr_overcommit_huge_pages;
 
@@ -3502,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * COW. Warn that such a situation has occurred as it may not be obvious
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
-               pr_warning("PID %d killed due to inadequate hugepage pool\n",
+               pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
                return ret;
        }
index bc0a8d8b8f42faf7bca01501425ba73e156ee293..1ad20ade8c91328d2f47e8c116f9aca6bc2fb57f 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kmemleak.h>
+#include <linux/linkage.h>
 #include <linux/memblock.h>
 #include <linux/memory.h>
 #include <linux/mm.h>
@@ -60,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size)
        }
 }
 
+static void __kasan_unpoison_stack(struct task_struct *task, void *sp)
+{
+       void *base = task_stack_page(task);
+       size_t size = sp - base;
+
+       kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+       __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_remaining_stack(void *sp)
+{
+       __kasan_unpoison_stack(current, sp);
+}
 
 /*
  * All functions below always inlined so compiler could
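
For illustration: the helpers added above clear stale KASAN shadow poison on the unused lower part of a task stack, i.e. the range [task_stack_page(task), sp). After a low-level switch back to a shallow frame (for example a resume path entered from assembly), redzones left behind by frames that no longer exist could otherwise trigger false positives on the next deep call. A minimal sketch of a call site, where sp is a hypothetical variable holding the live stack pointer:

        /* sp is illustrative; real callers pass the stack pointer in from assembly */
        kasan_unpoison_remaining_stack(sp);     /* unpoisons [task_stack_page(current), sp) */
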
index 4af58a3a8ffa345c16fe190be76ba9f9e83113d4..979b18cbd343e3422d5b4ccb86d1fab2bb1090e2 100644 (file)
@@ -138,7 +138,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
        res->name = "System RAM";
        res->start = start;
        res->end = start + size - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        if (request_resource(&iomem_resource, res) < 0) {
                pr_debug("System RAM resource %pR cannot be added\n", res);
                kfree(res);
index 4c4187c0e1deeb25bdbf5eb383132d27feb9be90..9a3f6b90e6283b24bff091213768612b04b6d928 100644 (file)
@@ -532,7 +532,7 @@ retry:
                nid = page_to_nid(page);
                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
                        continue;
-               if (PageTail(page) && PageAnon(page)) {
+               if (PageTransCompound(page) && PageAnon(page)) {
                        get_page(page);
                        pte_unmap_unlock(pte, ptl);
                        lock_page(page);
index 004d42b1dfaf928ab174e057696afa580447f3d9..7924f4f58a6d48ae5335fdedb1c2e2dd93a56d88 100644 (file)
@@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool)
        void *element = pool->elements[--pool->curr_nr];
 
        BUG_ON(pool->curr_nr < 0);
-       check_element(pool, element);
        kasan_unpoison_element(pool, element);
+       check_element(pool, element);
        return element;
 }
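
A note on the reorder above, restated as a sketch with comments (same calls as in the hunk): with slab debugging enabled, check_element() reads the element to verify its poison pattern, so the object has to be unpoisoned for KASAN first or that debug read itself gets reported.

        element = pool->elements[--pool->curr_nr];
        kasan_unpoison_element(pool, element);  /* make the object readable to KASAN again */
        check_element(pool, element);           /* only then verify the slab poison pattern */
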
 
index 82e3e97050173542b5d6094ebc4db81324198b18..dcea4f4c62b3b43d701fb68a74a6aa1a1e0aa8cf 100644 (file)
@@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb,
                struct net_bridge_fdb_entry *f;
 
                hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
+                       int err;
+
                        if (idx < cb->args[0])
                                goto skip;
 
@@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb,
                        if (!filter_dev && f->dst)
                                goto skip;
 
-                       if (fdb_fill_info(skb, br, f,
-                                         NETLINK_CB(cb->skb).portid,
-                                         cb->nlh->nlmsg_seq,
-                                         RTM_NEWNEIGH,
-                                         NLM_F_MULTI) < 0)
+                       err = fdb_fill_info(skb, br, f,
+                                           NETLINK_CB(cb->skb).portid,
+                                           cb->nlh->nlmsg_seq,
+                                           RTM_NEWNEIGH,
+                                           NLM_F_MULTI);
+                       if (err < 0) {
+                               cb->args[1] = err;
                                break;
+                       }
 skip:
                        ++idx;
                }
index 94d26201080d6671080f63865994bb41d7d3d8bc..bba502f7cd575692a9ed795f83d2fdedf2428ed6 100644 (file)
@@ -1752,7 +1752,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
        u8 compat[sizeof(struct bpf_tunnel_key)];
        struct ip_tunnel_info *info;
 
-       if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6)))
+       if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
                return -EINVAL;
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                switch (size) {
@@ -1776,7 +1776,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
        info = &md->u.tun_info;
        info->mode = IP_TUNNEL_INFO_TX;
 
-       info->key.tun_flags = TUNNEL_KEY;
+       info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
        info->key.tun_id = cpu_to_be64(from->tunnel_id);
        info->key.tos = from->tunnel_tos;
        info->key.ttl = from->tunnel_ttl;
@@ -1787,6 +1787,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
                       sizeof(from->remote_ipv6));
        } else {
                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+               if (flags & BPF_F_ZERO_CSUM_TX)
+                       info->key.tun_flags &= ~TUNNEL_CSUM;
        }
 
        return 0;
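
A usage sketch for the flag added above (assumptions: this is a fragment of a tc classifier program with the usual helper declarations available, skb is the program's struct __sk_buff context, and the key values are made up):

        struct bpf_tunnel_key key = {};

        key.tunnel_id   = 42;                   /* example tunnel id */
        key.remote_ipv4 = 0x0a000001;           /* 10.0.0.1, host order as the helper expects */
        key.tunnel_ttl  = 64;

        /* ask for the outer UDP checksum to be skipped on transmit */
        if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX) < 0)
                return TC_ACT_SHOT;
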
index d735e854f916040912fb12930cbc6a7950ace942..8261d95dd846647798c7dba8f1dbfa4cf61a0eef 100644 (file)
@@ -2911,6 +2911,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
        nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
 out:
        netif_addr_unlock_bh(dev);
+       cb->args[1] = err;
        return idx;
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -2944,6 +2945,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
                ops = br_dev->netdev_ops;
        }
 
+       cb->args[1] = 0;
        for_each_netdev(net, dev) {
                if (brport_idx && (dev->ifindex != brport_idx))
                        continue;
@@ -2971,12 +2973,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
                                idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
                                                         idx);
                }
+               if (cb->args[1] == -EMSGSIZE)
+                       break;
 
                if (dev->netdev_ops->ndo_fdb_dump)
                        idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
                                                            idx);
                else
                        idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+               if (cb->args[1] == -EMSGSIZE)
+                       break;
 
                cops = NULL;
        }
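
The hunks above and the bridge/switchdev ones nearby share one convention; a sketch with made-up names: the per-device dump helper stores any netlink fill error in cb->args[1], and rtnl_fdb_dump() stops walking devices when it sees -EMSGSIZE, so the partially filled skb goes back to userspace and the dump resumes from idx on the next round.

        static int example_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                                    struct net_device *dev, int idx)
        {
                int err = 0;

                /* ... fill one entry per iteration; set err and stop when nlmsg_put()
                 * or nla_put() fails because the skb is full ... */

                cb->args[1] = err;      /* typically 0 or -EMSGSIZE */
                return idx;             /* how far this dump got */
        }
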
index 5bf88f58bee7405ff65f80487a64339b92a91bcb..8616d1147c938808da752734b6cecea260b149b9 100644 (file)
@@ -2947,6 +2947,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 }
 EXPORT_SYMBOL_GPL(skb_append_pagefrags);
 
+/**
+ *     skb_push_rcsum - push skb and update receive checksum
+ *     @skb: buffer to update
+ *     @len: length of data pushed
+ *
+ *     This function performs an skb_push on the packet and updates
+ *     the CHECKSUM_COMPLETE checksum.  It should be used on
+ *     receive path processing instead of skb_push unless you know
+ *     that the checksum difference is zero (e.g., a valid IP header)
+ *     or you are setting ip_summed to CHECKSUM_NONE.
+ */
+static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len)
+{
+       skb_push(skb, len);
+       skb_postpush_rcsum(skb, skb->data, len);
+       return skb->data;
+}
+
 /**
  *     skb_pull_rcsum - pull skb and update receive checksum
  *     @skb: buffer to update
@@ -4084,9 +4102,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
        if (!pskb_may_pull(skb_chk, offset))
                goto err;
 
-       __skb_pull(skb_chk, offset);
+       skb_pull_rcsum(skb_chk, offset);
        ret = skb_chkf(skb_chk);
-       __skb_push(skb_chk, offset);
+       skb_push_rcsum(skb_chk, offset);
 
        if (ret)
                goto err;
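
The pairing used just above, as a sketch (hdrlen is illustrative): for CHECKSUM_COMPLETE packets skb->csum covers the bytes starting at skb->data, so a temporary pull must subtract the pulled region from the checksum and the matching push must add it back, which is what the *_rcsum variants do and the bare __skb_pull()/__skb_push() did not.

        skb_pull_rcsum(skb, hdrlen);    /* data += hdrlen, csum -= csum of the pulled bytes */
        /* ... run the validation callback on the inner data ... */
        skb_push_rcsum(skb, hdrlen);    /* data -= hdrlen, csum += csum of those bytes again */
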
index 05e4cba14162f3583ec588657af7e8b68546b111..b3086cf2702759d1077cb6a014afbd130d8206db 100644 (file)
@@ -356,9 +356,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
        skb_dst_set(skb, &rt->dst);
        skb->dev = dev;
 
-       skb->reserved_tailroom = skb_end_offset(skb) -
-                                min(mtu, skb_end_offset(skb));
        skb_reserve(skb, hlen);
+       skb_tailroom_reserve(skb, mtu, tlen);
 
        skb_reset_network_header(skb);
        pip = ip_hdr(skb);
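
The open-coded reserved_tailroom computation removed here (and in mld_newpack() further down) moves into the new skb_tailroom_reserve() helper. As a sketch of its intent rather than a verbatim copy of the header: after the skb_reserve(skb, hlen), it caps usable space at mtu while still guaranteeing the needed tailroom.

        if (mtu < skb_tailroom(skb) - needed)                      /* needed == tlen here */
                skb->reserved_tailroom = skb_tailroom(skb) - mtu;  /* use at most mtu */
        else
                skb->reserved_tailroom = needed;                   /* use all available space */
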
index 64878efa045c132d1ce511c326c2da04fc77a895..565bf64b2b7d6047a29e69df00fb3b85ec84f06a 100644 (file)
@@ -1236,13 +1236,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
        if (!skb)
                return -EINVAL;
 
-       cork->length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
+               if (skb->ip_summed != CHECKSUM_PARTIAL)
+                       return -EOPNOTSUPP;
+
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }
+       cork->length += size;
 
        while (size > 0) {
                if (skb_is_gso(skb)) {
index 89e8861e05fcb1d371c371c1784b89499b69d9df..336e6892a93ce99aabf1794133c9567ee0effde7 100644 (file)
@@ -661,6 +661,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */
@@ -758,7 +760,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;
 
-                       memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
index c8cbc2b4b7921fb4f70681e4ac6d945f5499654c..a726d7853ce53fe03b48b199ca909c02393cabcf 100644 (file)
@@ -550,7 +550,7 @@ reset:
         */
        if (crtt > tp->srtt_us) {
                /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
-               crtt /= 8 * USEC_PER_MSEC;
+               crtt /= 8 * USEC_PER_SEC / HZ;
                inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
        } else if (tp->srtt_us == 0) {
                /* RFC6298: 5.7 We've failed to get a valid RTT sample from
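
A worked example for the new divisor (illustrative numbers): crtt is kept like tp->srtt_us, in microseconds scaled by 8, while icsk_rto is in jiffies. With HZ=250 a jiffy is USEC_PER_SEC/HZ = 4000 us, so a cached RTT of 200 ms is stored as 200000 << 3 = 1600000 and

        1600000 / (8 * USEC_PER_SEC / HZ) = 1600000 / 32000 = 50 jiffies   /* = 200 ms */

The old divisor, 8 * USEC_PER_MSEC, yielded a value in milliseconds, which only equals jiffies when HZ is 1000.
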
index 75632a92582425db63f1078c223cbd54a91fa0c3..9b02af2139d3d3245ea952f643fc52e0ee92a9ab 100644 (file)
@@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
                newtp->rcv_wup = newtp->copied_seq =
                newtp->rcv_nxt = treq->rcv_isn + 1;
-               newtp->segs_in = 0;
+               newtp->segs_in = 1;
 
                newtp->snd_sml = newtp->snd_una =
                newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -815,6 +815,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
        int ret = 0;
        int state = child->sk_state;
 
+       tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
        if (!sock_owned_by_user(child)) {
                ret = tcp_rcv_state_process(child, skb);
                /* Wakeup parent, send SIGIO */
index 0ec08814f37d9e904b5f85bbd86cd2fa607c4307..96599d1a13184d4ef6d34ae9bff7833420d0ad93 100644 (file)
@@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
        uh->source = src_port;
        uh->len = htons(skb->len);
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        udp_set_csum(nocheck, skb, src, dst, skb->len);
 
        iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
index 5c5d23e59da598995ff962d069d1e7b6886e31d6..9508a20fbf61432f561202edbe40b59e63c3489e 100644 (file)
@@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
                                                *fragoff = _frag_off;
                                        return hp->nexthdr;
                                }
-                               return -ENOENT;
+                               if (!found)
+                                       return -ENOENT;
+                               if (fragoff)
+                                       *fragoff = _frag_off;
+                               break;
                        }
                        hdrlen = 8;
                } else if (nexthdr == NEXTHDR_AUTH) {
index a69aad1e29d1ebb2429650cc083f0e4dd2bcaf86..c0d4dc1c5ea4c31d8ebb4512bb55fcd177ce9f4e 100644 (file)
@@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
        __u32 mtu;
        int err;
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
                encap_limit = t->parms.encap_limit;
 
index 137fca42aaa6bb809d46e7809b240d8810d89a04..6c5dfec7a3779601eb760eeeb975ae5e6db00bb7 100644 (file)
@@ -1180,6 +1180,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
        u8 tproto;
        int err;
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        tproto = ACCESS_ONCE(t->parms.proto);
        if (tproto != IPPROTO_IPIP && tproto != 0)
                return -1;
index 5ee56d0a8699e22434d5ad8b4380f890fa8d2126..d64ee7e8366492a439ab70ea7487bd3669800097 100644 (file)
@@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
                return NULL;
 
        skb->priority = TC_PRIO_CONTROL;
-       skb->reserved_tailroom = skb_end_offset(skb) -
-                                min(mtu, skb_end_offset(skb));
        skb_reserve(skb, hlen);
+       skb_tailroom_reserve(skb, mtu, tlen);
 
        if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
                /* <draft-ietf-magma-mld-source-05.txt>:
index 22e28a44e3c88c6e52ff59ea1c7dd8a3e964e701..422dd014aa2ce9b27263eed4612ff156271d982f 100644 (file)
@@ -962,11 +962,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                ret = udpv6_queue_rcv_skb(sk, skb);
                sock_put(sk);
 
-               /* a return value > 0 means to resubmit the input, but
-                * it wants the return to be -protocol, or 0
-                */
+               /* a return value > 0 means to resubmit the input */
                if (ret > 0)
-                       return -ret;
+                       return ret;
 
                return 0;
        }
index 10ad4ac1fa0ba8ebd04e793595fab47d326194ce..367784be5df20f26fd3940c4608f1ea715bfddd6 100644 (file)
@@ -291,7 +291,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
        }
 
        /* prepare A-MPDU MLME for Rx aggregation */
-       tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+       tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL);
        if (!tid_agg_rx)
                goto end;
 
index b84f6aa32c0831ce200f286af2ecd947ef0fefde..f006f4a44c0e6fbcdec025dcaa5450d194403bd9 100644 (file)
@@ -92,7 +92,7 @@ struct ieee80211_fragment_entry {
        u16 extra_len;
        u16 last_frag;
        u8 rx_queue;
-       bool ccmp; /* Whether fragments were encrypted with CCMP */
+       bool check_sequential_pn; /* needed for CCMP/GCMP */
        u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
 };
 
index 3ece7d1034c81ae8749cada074fbebecbe06d57f..b54f398cda5d0e2d561f2383f1d049a5410fcf68 100644 (file)
@@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
         * computing cur_tp
         */
        tmp_mrs = &mi->r[idx].stats;
-       tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+       tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
        tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
 
        return tmp_cur_tp;
index 3928dbd24e257e68627aa977cc54a19aaa996339..370d677b547b2aa2591bea4302e4994bddfe2649 100644 (file)
@@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
            (max_tp_group != MINSTREL_CCK_GROUP))
                return;
 
+       max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
+       max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
+       max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
+
        if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) {
                cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx,
                                                    mrs->prob_ewma);
                if (cur_tp_avg > tmp_tp_avg)
                        mi->max_prob_rate = index;
 
-               max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
-               max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
-               max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
                max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group,
                                                        max_gpr_idx,
                                                        max_gpr_prob);
@@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
        } else {
                if (mrs->prob_ewma > tmp_prob)
                        mi->max_prob_rate = index;
-               if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma)
+               if (mrs->prob_ewma > max_gpr_prob)
                        mg->max_group_prob_rate = index;
        }
 }
@@ -691,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
        if (likely(sta->ampdu_mlme.tid_tx[tid]))
                return;
 
-       ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+       ieee80211_start_tx_ba_session(pubsta, tid, 0);
 }
 
 static void
@@ -871,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
         *  - if station is in dynamic SMPS (and streams > 1)
         *  - for fallback rates, to increase chances of getting through
         */
-       if (offset > 0 &&
+       if (offset > 0 ||
            (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC &&
             group->streams > 1)) {
                ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts;
@@ -1334,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
        prob = mi->groups[i].rates[j].prob_ewma;
 
        /* convert tp_avg from pkt per second in kbps */
-       tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+       tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
+       tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024;
 
        return tp_avg;
 }
index bc081850ac0e5538241bd1cab37b65aeefd2c244..60d093f40f1d16de32b1bb962b25ddb12f9a1b72 100644 (file)
@@ -1753,7 +1753,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
        entry->seq = seq;
        entry->rx_queue = rx_queue;
        entry->last_frag = frag;
-       entry->ccmp = 0;
+       entry->check_sequential_pn = false;
        entry->extra_len = 0;
 
        return entry;
@@ -1849,15 +1849,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
                                                 rx->seqno_idx, &(rx->skb));
                if (rx->key &&
                    (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
-                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 ||
+                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP ||
+                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) &&
                    ieee80211_has_protected(fc)) {
                        int queue = rx->security_idx;
-                       /* Store CCMP PN so that we can verify that the next
-                        * fragment has a sequential PN value. */
-                       entry->ccmp = 1;
+
+                       /* Store CCMP/GCMP PN so that we can verify that the
+                        * next fragment has a sequential PN value.
+                        */
+                       entry->check_sequential_pn = true;
                        memcpy(entry->last_pn,
                               rx->key->u.ccmp.rx_pn[queue],
                               IEEE80211_CCMP_PN_LEN);
+                       BUILD_BUG_ON(offsetof(struct ieee80211_key,
+                                             u.ccmp.rx_pn) !=
+                                    offsetof(struct ieee80211_key,
+                                             u.gcmp.rx_pn));
+                       BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
+                                    sizeof(rx->key->u.gcmp.rx_pn[queue]));
+                       BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
+                                    IEEE80211_GCMP_PN_LEN);
                }
                return RX_QUEUED;
        }
@@ -1872,15 +1884,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
                return RX_DROP_MONITOR;
        }
 
-       /* Verify that MPDUs within one MSDU have sequential PN values.
-        * (IEEE 802.11i, 8.3.3.4.5) */
-       if (entry->ccmp) {
+       /* "The receiver shall discard MSDUs and MMPDUs whose constituent
+        *  MPDU PN values are not incrementing in steps of 1."
+        * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
+        * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
+        */
+       if (entry->check_sequential_pn) {
                int i;
                u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
                int queue;
+
                if (!rx->key ||
                    (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
-                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 &&
+                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP &&
+                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256))
                        return RX_DROP_UNUSABLE;
                memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
                for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -3366,6 +3384,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
                                return false;
                        /* ignore action frames to TDLS-peers */
                        if (ieee80211_is_action(hdr->frame_control) &&
+                           !is_broadcast_ether_addr(bssid) &&
                            !ether_addr_equal(bssid, hdr->addr1))
                                return false;
                }
index d05869646515dbabeb352361c9d58b9ccfc687d5..6b70399ab78121e723efc439b27122f18b52a206 100644 (file)
@@ -62,6 +62,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
        struct xt_tgdtor_param par = {
                .target   = t->u.kernel.target,
                .targinfo = t->data,
+               .family   = NFPROTO_IPV4,
        };
        if (par.target->destroy != NULL)
                par.target->destroy(&par);
@@ -195,6 +196,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
        par.hooknum  = ipt->tcfi_hook;
        par.target   = ipt->tcfi_t->u.kernel.target;
        par.targinfo = ipt->tcfi_t->data;
+       par.family   = NFPROTO_IPV4;
        ret = par.target->target(skb, &par);
 
        switch (ret) {
index ec529121f38a03c4b3e2317249cd0694fd619d3f..ce46f1c7f133ad5b114e4c2cd571d26c2b9ee901 100644 (file)
@@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
                }
                return 0;
        }
+       if (addr1->v6.sin6_port != addr2->v6.sin6_port)
+               return 0;
        if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
                return 0;
        /* If this is a linklocal address, compare the scope_id. */
index ded7d931a6a5b218c7bbd4481acdea1d2bc29325..963dffcc2618b0fa5d186460bf12960c21f9babe 100644 (file)
@@ -482,7 +482,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 {
        struct sctp_association *assoc;
-       struct sctp_transport *tsp;
+       struct sctp_transport *transport, *tsp;
 
        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
@@ -490,10 +490,10 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
                return 0;
        }
 
-       tsp = (struct sctp_transport *)v;
-       if (!sctp_transport_hold(tsp))
+       transport = (struct sctp_transport *)v;
+       if (!sctp_transport_hold(transport))
                return 0;
-       assoc = tsp->asoc;
+       assoc = transport->asoc;
 
        list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
                                transports) {
@@ -546,7 +546,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
                seq_printf(seq, "\n");
        }
 
-       sctp_transport_put(tsp);
+       sctp_transport_put(transport);
 
        return 0;
 }
index 47f7da58a7f0e93e3ffd632396a5a8db65c7f7c3..8b5833c1ff2e8695030587749c8c72d86d334323 100644 (file)
@@ -1093,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                .cb = cb,
                .idx = idx,
        };
+       int err;
 
-       switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb);
+       err = switchdev_port_obj_dump(dev, &dump.fdb.obj,
+                                     switchdev_port_fdb_dump_cb);
+       cb->args[1] = err;
        return dump.idx;
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
index 69c29050f14abe8c926db671dfa05c96f2ddc3bc..4d420bb273960cd6eac206753f25435970f2724d 100644 (file)
@@ -673,7 +673,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
        struct tipc_sock *tsk = tipc_sk(sk);
        struct net *net = sock_net(sk);
        struct tipc_msg *mhdr = &tsk->phdr;
-       struct sk_buff_head *pktchain = &sk->sk_write_queue;
+       struct sk_buff_head pktchain;
        struct iov_iter save = msg->msg_iter;
        uint mtu;
        int rc;
@@ -687,14 +687,16 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
        msg_set_nameupper(mhdr, seq->upper);
        msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
 
+       skb_queue_head_init(&pktchain);
+
 new_mtu:
        mtu = tipc_bcast_get_mtu(net);
-       rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain);
+       rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
        if (unlikely(rc < 0))
                return rc;
 
        do {
-               rc = tipc_bcast_xmit(net, pktchain);
+               rc = tipc_bcast_xmit(net, &pktchain);
                if (likely(!rc))
                        return dsz;
 
@@ -704,7 +706,7 @@ new_mtu:
                        if (!rc)
                                continue;
                }
-               __skb_queue_purge(pktchain);
+               __skb_queue_purge(&pktchain);
                if (rc == -EMSGSIZE) {
                        msg->msg_iter = save;
                        goto new_mtu;
@@ -863,7 +865,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
        struct net *net = sock_net(sk);
        struct tipc_msg *mhdr = &tsk->phdr;
        u32 dnode, dport;
-       struct sk_buff_head *pktchain = &sk->sk_write_queue;
+       struct sk_buff_head pktchain;
        struct sk_buff *skb;
        struct tipc_name_seq *seq;
        struct iov_iter save;
@@ -924,17 +926,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
                msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
        }
 
+       skb_queue_head_init(&pktchain);
        save = m->msg_iter;
 new_mtu:
        mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
-       rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain);
+       rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
        if (rc < 0)
                return rc;
 
        do {
-               skb = skb_peek(pktchain);
+               skb = skb_peek(&pktchain);
                TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
-               rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+               rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
                if (likely(!rc)) {
                        if (sock->state != SS_READY)
                                sock->state = SS_CONNECTING;
@@ -946,7 +949,7 @@ new_mtu:
                        if (!rc)
                                continue;
                }
-               __skb_queue_purge(pktchain);
+               __skb_queue_purge(&pktchain);
                if (rc == -EMSGSIZE) {
                        m->msg_iter = save;
                        goto new_mtu;
@@ -1016,7 +1019,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
        struct net *net = sock_net(sk);
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_msg *mhdr = &tsk->phdr;
-       struct sk_buff_head *pktchain = &sk->sk_write_queue;
+       struct sk_buff_head pktchain;
        DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
        u32 portid = tsk->portid;
        int rc = -EINVAL;
@@ -1044,17 +1047,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
 
        timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
        dnode = tsk_peer_node(tsk);
+       skb_queue_head_init(&pktchain);
 
 next:
        save = m->msg_iter;
        mtu = tsk->max_pkt;
        send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
-       rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain);
+       rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
        if (unlikely(rc < 0))
                return rc;
+
        do {
                if (likely(!tsk_conn_cong(tsk))) {
-                       rc = tipc_node_xmit(net, pktchain, dnode, portid);
+                       rc = tipc_node_xmit(net, &pktchain, dnode, portid);
                        if (likely(!rc)) {
                                tsk->sent_unacked++;
                                sent += send;
@@ -1063,7 +1068,7 @@ next:
                                goto next;
                        }
                        if (rc == -EMSGSIZE) {
-                               __skb_queue_purge(pktchain);
+                               __skb_queue_purge(&pktchain);
                                tsk->max_pkt = tipc_node_get_mtu(net, dnode,
                                                                 portid);
                                m->msg_iter = save;
@@ -1077,7 +1082,7 @@ next:
                rc = tipc_wait_for_sndpkt(sock, &timeo);
        } while (!rc);
 
-       __skb_queue_purge(pktchain);
+       __skb_queue_purge(&pktchain);
        return sent ? sent : rc;
 }
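
All three TIPC send paths above stop chaining packets on sk->sk_write_queue and build them on a queue head local to the call instead, so the chain is private to the sender and always purged locally on error. The idiom, stripped down:

        struct sk_buff_head pktchain;

        skb_queue_head_init(&pktchain);         /* on-stack queue with its own lock */
        /* ... tipc_msg_build() fills the chain, tipc_*_xmit() consumes it ... */
        __skb_queue_purge(&pktchain);           /* drop whatever is left on failure */
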
 
index 69ee2eeef968851192035cb166410f1f37e7722f..f9ff73a8d8154ff0a52f8c33a64077f5d8c57ae5 100644 (file)
@@ -296,7 +296,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
        if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub))
                return tipc_conn_terminate(tn->topsrv, subscrb->conid);
 
-       tipc_nametbl_subscribe(sub);
+       if (sub)
+               tipc_nametbl_subscribe(sub);
 }
 
 /* Handle one request to establish a new subscriber */
index b0915515640efed1ff4795103c0572aba48c38f4..8f0bac7e03c406466e0b5ca2b01b3ca2574ab79f 100644 (file)
@@ -1147,6 +1147,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
                return NOTIFY_DONE;
        }
 
+       wireless_nlevent_flush();
+
        return NOTIFY_OK;
 }
 
index d4786f2802aa3c94f0a9bfcba846c98ec5b2b249..711cb7ad6ae011132b868d928ebe23037040db82 100644 (file)
@@ -7547,7 +7547,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 
                if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
                    no_ht) {
-                       kfree(connkeys);
+                       kzfree(connkeys);
                        return -EINVAL;
                }
        }
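
kzfree() behaves like kfree() except that it clears the buffer before releasing it, which is the point of this one-liner since connkeys holds key material. Roughly:

        kzfree(connkeys);       /* ~ memset(connkeys, 0, ksize(connkeys)); kfree(connkeys); */
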
index 8020b5b094d4c8fba0c0f2431f8af7d8ecc40dca..d49ed7666d4cb3e5a20641ebc0acfd3ccda80ff9 100644 (file)
@@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 
        nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
 
+       /* stop critical protocol if supported */
+       if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) {
+               rdev->crit_proto_nlportid = 0;
+               rdev_crit_proto_stop(rdev, wdev);
+       }
+
        /*
         * Delete all the keys ... pairwise keys can't really
         * exist any more anyway, but default keys might.
index c8717c1d082e702f9b071c480e873b408b400daf..b50ee5d622e14d4fb486ce0c533757e958702f54 100644 (file)
@@ -342,6 +342,40 @@ static const int compat_event_type_size[] = {
 
 /* IW event code */
 
+void wireless_nlevent_flush(void)
+{
+       struct sk_buff *skb;
+       struct net *net;
+
+       ASSERT_RTNL();
+
+       for_each_net(net) {
+               while ((skb = skb_dequeue(&net->wext_nlevents)))
+                       rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+                                   GFP_KERNEL);
+       }
+}
+EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
+
+static int wext_netdev_notifier_call(struct notifier_block *nb,
+                                    unsigned long state, void *ptr)
+{
+       /*
+        * When a netdev changes state in any way, flush all pending messages
+        * to avoid them going out in a strange order, e.g. RTM_NEWLINK after
+        * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close()
+        * or similar - all of which could otherwise happen due to delays from
+        * schedule_work().
+        */
+       wireless_nlevent_flush();
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block wext_netdev_notifier = {
+       .notifier_call = wext_netdev_notifier_call,
+};
+
 static int __net_init wext_pernet_init(struct net *net)
 {
        skb_queue_head_init(&net->wext_nlevents);
@@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = {
 
 static int __init wireless_nlevent_init(void)
 {
-       return register_pernet_subsys(&wext_pernet_ops);
+       int err = register_pernet_subsys(&wext_pernet_ops);
+
+       if (err)
+               return err;
+
+       return register_netdevice_notifier(&wext_netdev_notifier);
 }
 
 subsys_initcall(wireless_nlevent_init);
@@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init);
 /* Process events generated by the wireless layer or the driver. */
 static void wireless_nlevent_process(struct work_struct *work)
 {
-       struct sk_buff *skb;
-       struct net *net;
-
        rtnl_lock();
-
-       for_each_net(net) {
-               while ((skb = skb_dequeue(&net->wext_nlevents)))
-                       rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
-                                   GFP_KERNEL);
-       }
-
+       wireless_nlevent_flush();
        rtnl_unlock();
 }
 
index d154f0877fd806a93d234e6d4288da27d6c5d09f..7bfe9fa1c8dc6db8d6c4415fbf0d26e66477e42e 100755 (executable)
@@ -1,7 +1,7 @@
 #!/usr/bin/awk -f
 # extract linker version number from stdin and turn into single number
        {
-       gsub(".*)", "");
+       gsub(".*\\)", "");
        gsub(".*version ", "");
        gsub("-.*", "");
        split($1,a, ".");
index c2423d913b46bd0e659ea4d4c057a3af6119c2d4..7b29fb14f870283ae4174e34643640150f7e098b 100644 (file)
@@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b)
        return 0;
 }
 
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+       int i;
+
+       i = 0;
+       while (i < image_size) {
+               uint32_t *loc = (uint32_t *)(extab_image + i);
+
+               w(r(loc) + i, loc);
+               w(r(loc + 1) + i + 4, loc + 1);
+               w(r(loc + 2) + i + 8, loc + 2);
+
+               i += sizeof(uint32_t) * 3;
+       }
+
+       qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+       i = 0;
+       while (i < image_size) {
+               uint32_t *loc = (uint32_t *)(extab_image + i);
+
+               w(r(loc) - i, loc);
+               w(r(loc + 1) - (i + 4), loc + 1);
+               w(r(loc + 2) - (i + 8), loc + 2);
+
+               i += sizeof(uint32_t) * 3;
+       }
+}
+
 static void sort_relative_table(char *extab_image, int image_size)
 {
        int i;
@@ -281,6 +310,9 @@ do_file(char const *const fname)
                break;
        case EM_386:
        case EM_X86_64:
+               custom_sort = x86_sort_relative_table;
+               break;
+
        case EM_S390:
                custom_sort = sort_relative_table;
                break;
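
A worked example for x86_sort_relative_table() above (numbers are illustrative): every exception-table entry is three 32-bit fields, each storing an offset relative to that field's own location, so two entries can compare in the "wrong" order even though the locations they reference are ordered:

        entry 0 at table offset  0: stored 0x100  ->  references  0 + 0x100 = 0x100
        entry 1 at table offset 12: stored 0x0f8  ->  references 12 + 0x0f8 = 0x104

Adding each field's own offset (i, i + 4, i + 8) before the qsort makes the comparison track the referenced locations, and subtracting the offsets afterwards, at each entry's new position, restores the relative encoding.
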
index e9b98af6b52c83faecccd4cc1011286a75560849..e8da3b8ee7213352426023029fcff04e87659d50 100644 (file)
@@ -141,10 +141,8 @@ int pxa2xx_pcm_mmap(struct snd_pcm_substream *substream,
        struct vm_area_struct *vma)
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                    runtime->dma_area,
-                                    runtime->dma_addr,
-                                    runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
 }
 EXPORT_SYMBOL(pxa2xx_pcm_mmap);
 
@@ -156,8 +154,7 @@ int pxa2xx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
        buf->dev.dev = pcm->card->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
        if (!buf->area)
                return -ENOMEM;
        buf->bytes = size;
@@ -178,8 +175,7 @@ void pxa2xx_pcm_free_dma_buffers(struct snd_pcm *pcm)
                buf = &substream->dma_buffer;
                if (!buf->area)
                        continue;
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                buf->area = NULL;
        }
 }
index affb192238a403b88dc38f3e76e0c54440779ea0..faae6936bae4f55851e4f0a7062f03fdebf674ee 100644 (file)
@@ -1130,7 +1130,7 @@ static int sid_status_control_get(struct snd_kcontrol *kcontrol,
        struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev);
 
        mutex_lock(&drvdata->ctrl_lock);
-       ucontrol->value.integer.value[0] = drvdata->sid_status;
+       ucontrol->value.enumerated.item[0] = drvdata->sid_status;
        mutex_unlock(&drvdata->ctrl_lock);
 
        return 0;
@@ -1147,7 +1147,7 @@ static int sid_status_control_put(struct snd_kcontrol *kcontrol,
 
        dev_dbg(codec->dev, "%s: Enter\n", __func__);
 
-       if (ucontrol->value.integer.value[0] != SID_APPLY_FIR) {
+       if (ucontrol->value.enumerated.item[0] != SID_APPLY_FIR) {
                dev_err(codec->dev,
                        "%s: ERROR: This control supports '%s' only!\n",
                        __func__, enum_sid_state[SID_APPLY_FIR]);
@@ -1199,7 +1199,7 @@ static int anc_status_control_get(struct snd_kcontrol *kcontrol,
        struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev);
 
        mutex_lock(&drvdata->ctrl_lock);
-       ucontrol->value.integer.value[0] = drvdata->anc_status;
+       ucontrol->value.enumerated.item[0] = drvdata->anc_status;
        mutex_unlock(&drvdata->ctrl_lock);
 
        return 0;
@@ -1220,7 +1220,7 @@ static int anc_status_control_put(struct snd_kcontrol *kcontrol,
 
        mutex_lock(&drvdata->ctrl_lock);
 
-       req = ucontrol->value.integer.value[0];
+       req = ucontrol->value.enumerated.item[0];
        if (req >= ARRAY_SIZE(enum_anc_state)) {
                status = -EINVAL;
                goto cleanup;
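
This and the similar codec hunks below all make the same change: snd_ctl_elem_value is a union, and enum (SOC_ENUM) controls are backed by value.enumerated.item[] (unsigned int), not value.integer.value[] (long), so enum get/put handlers must use the former; reading the wrong member happens to work on some architectures but not on all. A minimal correct pair, with made-up names and a static variable standing in for driver state:

        static unsigned int example_mode;

        static int example_mode_get(struct snd_kcontrol *kcontrol,
                                    struct snd_ctl_elem_value *ucontrol)
        {
                ucontrol->value.enumerated.item[0] = example_mode;
                return 0;
        }

        static int example_mode_put(struct snd_kcontrol *kcontrol,
                                    struct snd_ctl_elem_value *ucontrol)
        {
                struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;

                if (ucontrol->value.enumerated.item[0] >= e->items)
                        return -EINVAL;
                example_mode = ucontrol->value.enumerated.item[0];
                return 1;               /* value changed */
        }
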
index e13583e6ff56aa5d69a163ca7a447acdaa18cbd6..5ae87a084d97e24a6aa243ba6cb1afc45290ac43 100644 (file)
@@ -103,9 +103,9 @@ bool adau17x1_has_dsp(struct adau *adau);
 #define ADAU17X1_CLOCK_CONTROL_CORECLK_SRC_PLL BIT(3)
 #define ADAU17X1_CLOCK_CONTROL_SYSCLK_EN       BIT(0)
 
-#define ADAU17X1_SERIAL_PORT1_BCLK32           (0x0 << 5)
-#define ADAU17X1_SERIAL_PORT1_BCLK48           (0x1 << 5)
-#define ADAU17X1_SERIAL_PORT1_BCLK64           (0x2 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK64           (0x0 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK32           (0x1 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK48           (0x2 << 5)
 #define ADAU17X1_SERIAL_PORT1_BCLK128          (0x3 << 5)
 #define ADAU17X1_SERIAL_PORT1_BCLK256          (0x4 << 5)
 #define ADAU17X1_SERIAL_PORT1_BCLK_MASK                (0x7 << 5)
index b3951524339f90cdf8f90dbc23cbadd46a874af9..35488f14e2378ada271328a97504a9558b746836 100644 (file)
@@ -60,15 +60,15 @@ static int cs42l51_get_chan_mix(struct snd_kcontrol *kcontrol,
        switch (value) {
        default:
        case 0:
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
                break;
        /* same value : (L+R)/2 and (R+L)/2 */
        case 1:
        case 2:
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
                break;
        case 3:
-               ucontrol->value.integer.value[0] = 2;
+               ucontrol->value.enumerated.item[0] = 2;
                break;
        }
 
@@ -85,7 +85,7 @@ static int cs42l51_set_chan_mix(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        unsigned char val;
 
-       switch (ucontrol->value.integer.value[0]) {
+       switch (ucontrol->value.enumerated.item[0]) {
        default:
        case 0:
                val = CHAN_MIX_NORMAL;
index 1d5a89c5164b85686f652dc8f5e13af6c1eb49e6..461506a4ca6a2ddb790b1646f27bce2d7abf25ba 100644 (file)
@@ -334,7 +334,7 @@ static int da732x_hpf_set(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct soc_enum *enum_ctrl = (struct soc_enum *)kcontrol->private_value;
        unsigned int reg = enum_ctrl->reg;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
        unsigned int bits;
 
        switch (sel) {
@@ -368,13 +368,13 @@ static int da732x_hpf_get(struct snd_kcontrol *kcontrol,
 
        switch (val) {
        case DA732X_HPF_VOICE_EN:
-               ucontrol->value.integer.value[0] = DA732X_HPF_VOICE;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_VOICE;
                break;
        case DA732X_HPF_MUSIC_EN:
-               ucontrol->value.integer.value[0] = DA732X_HPF_MUSIC;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_MUSIC;
                break;
        default:
-               ucontrol->value.integer.value[0] = DA732X_HPF_DISABLED;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_DISABLED;
                break;
        }
 
index 20dcc496d39c9d8ac01e4d49004f92823fdd37a6..fc22804cabc5942acca6921c034fa98f234041dc 100644 (file)
@@ -1496,7 +1496,7 @@ static int max98088_put_eq_enum(struct snd_kcontrol *kcontrol,
        struct max98088_pdata *pdata = max98088->pdata;
        int channel = max98088_get_channel(codec, kcontrol->id.name);
        struct max98088_cdata *cdata;
-       int sel = ucontrol->value.integer.value[0];
+       int sel = ucontrol->value.enumerated.item[0];
 
        if (channel < 0)
               return channel;
index 1fedac50355e954790a15b84ed6aab49b2bb98e3..3577003f39cf8feaba0d97bb313e277b3b763ce7 100644 (file)
@@ -1499,7 +1499,7 @@ static int max98095_put_eq_enum(struct snd_kcontrol *kcontrol,
        struct max98095_pdata *pdata = max98095->pdata;
        int channel = max98095_get_eq_channel(kcontrol->id.name);
        struct max98095_cdata *cdata;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
        struct max98095_eq_cfg *coef_set;
        int fs, best, best_val, i;
        int regmask, regsave;
@@ -1653,7 +1653,7 @@ static int max98095_put_bq_enum(struct snd_kcontrol *kcontrol,
        struct max98095_pdata *pdata = max98095->pdata;
        int channel = max98095_get_bq_channel(codec, kcontrol->id.name);
        struct max98095_cdata *cdata;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
        struct max98095_biquad_cfg *coef_set;
        int fs, best, best_val, i;
        int regmask, regsave;
index 781398fb2841565f20d2f7e7a4f893d840e3e7ae..f7a6ce7e5fb12b7ab183428b28d6c69409676d08 100644 (file)
@@ -446,7 +446,7 @@ static int dac33_get_fifo_mode(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = dac33->fifo_mode;
+       ucontrol->value.enumerated.item[0] = dac33->fifo_mode;
 
        return 0;
 }
@@ -458,17 +458,16 @@ static int dac33_set_fifo_mode(struct snd_kcontrol *kcontrol,
        struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec);
        int ret = 0;
 
-       if (dac33->fifo_mode == ucontrol->value.integer.value[0])
+       if (dac33->fifo_mode == ucontrol->value.enumerated.item[0])
                return 0;
        /* Do not allow changes while stream is running*/
        if (snd_soc_codec_is_active(codec))
                return -EPERM;
 
-       if (ucontrol->value.integer.value[0] < 0 ||
-           ucontrol->value.integer.value[0] >= DAC33_FIFO_LAST_MODE)
+       if (ucontrol->value.enumerated.item[0] >= DAC33_FIFO_LAST_MODE)
                ret = -EINVAL;
        else
-               dac33->fifo_mode = ucontrol->value.integer.value[0];
+               dac33->fifo_mode = ucontrol->value.enumerated.item[0];
 
        return ret;
 }
index 7693c1129babf0e42c58b5ca93832052374cb426..1b79778098d29ffbb377efc24576825cdfe8038c 100644 (file)
@@ -175,7 +175,7 @@ static int snd_wl1273_get_audio_route(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = wl1273->mode;
+       ucontrol->value.enumerated.item[0] = wl1273->mode;
 
        return 0;
 }
@@ -193,18 +193,17 @@ static int snd_wl1273_set_audio_route(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec);
 
-       if (wl1273->mode == ucontrol->value.integer.value[0])
+       if (wl1273->mode == ucontrol->value.enumerated.item[0])
                return 0;
 
        /* Do not allow changes while stream is running */
        if (snd_soc_codec_is_active(codec))
                return -EPERM;
 
-       if (ucontrol->value.integer.value[0] < 0 ||
-           ucontrol->value.integer.value[0] >=  ARRAY_SIZE(wl1273_audio_route))
+       if (ucontrol->value.enumerated.item[0] >=  ARRAY_SIZE(wl1273_audio_route))
                return -EINVAL;
 
-       wl1273->mode = ucontrol->value.integer.value[0];
+       wl1273->mode = ucontrol->value.enumerated.item[0];
 
        return 1;
 }
@@ -219,7 +218,7 @@ static int snd_wl1273_fm_audio_get(struct snd_kcontrol *kcontrol,
 
        dev_dbg(codec->dev, "%s: enter.\n", __func__);
 
-       ucontrol->value.integer.value[0] = wl1273->core->audio_mode;
+       ucontrol->value.enumerated.item[0] = wl1273->core->audio_mode;
 
        return 0;
 }
@@ -233,7 +232,7 @@ static int snd_wl1273_fm_audio_put(struct snd_kcontrol *kcontrol,
 
        dev_dbg(codec->dev, "%s: enter.\n", __func__);
 
-       val = ucontrol->value.integer.value[0];
+       val = ucontrol->value.enumerated.item[0];
        if (wl1273->core->audio_mode == val)
                return 0;
 
index 61299ca372ffc1fc3f83dd88fe823151da7b09b8..6f1024f48b193bc7d3127cc8b12d56d14124413b 100644 (file)
@@ -233,7 +233,7 @@ static int wm8753_get_dai(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = wm8753->dai_func;
+       ucontrol->value.enumerated.item[0] = wm8753->dai_func;
        return 0;
 }
 
@@ -244,7 +244,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
        struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
        u16 ioctl;
 
-       if (wm8753->dai_func == ucontrol->value.integer.value[0])
+       if (wm8753->dai_func == ucontrol->value.enumerated.item[0])
                return 0;
 
        if (snd_soc_codec_is_active(codec))
@@ -252,7 +252,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
 
        ioctl = snd_soc_read(codec, WM8753_IOCTL);
 
-       wm8753->dai_func = ucontrol->value.integer.value[0];
+       wm8753->dai_func = ucontrol->value.enumerated.item[0];
 
        if (((ioctl >> 2) & 0x3) == wm8753->dai_func)
                return 1;
index 8172e499e6ed18f33e6f94b2bec1a1647d1c1788..edd7a77091942cd7b873e9c193baed54dd6186c6 100644 (file)
@@ -396,7 +396,7 @@ static int wm8904_put_drc_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
        struct wm8904_pdata *pdata = wm8904->pdata;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (value >= pdata->num_drc_cfgs)
                return -EINVAL;
@@ -467,7 +467,7 @@ static int wm8904_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
        struct wm8904_pdata *pdata = wm8904->pdata;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (value >= pdata->num_retune_mobile_cfgs)
                return -EINVAL;
index c799cca5abeb704645687b863ccedfc6749e9ac2..6b864c0fc2b676cc83f9d03d2218815e7e805919 100644 (file)
@@ -459,7 +459,7 @@ static int wm8958_put_mbc_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
@@ -549,7 +549,7 @@ static int wm8958_put_vss_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
@@ -582,7 +582,7 @@ static int wm8958_put_vss_hpf_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
@@ -749,7 +749,7 @@ static int wm8958_put_enh_eq_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
index 7350ff654bbf37fe660e2b7a75cc5a15b5d63c68..0c002a5712cb4f261cd84454f38e24a89836519a 100644 (file)
@@ -497,9 +497,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol,
 
        reg = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF);
        if (reg & WM8983_EQ3DMODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
        else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
 
        return 0;
 }
@@ -511,18 +511,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        unsigned int regpwr2, regpwr3;
        unsigned int reg_eq;
 
-       if (ucontrol->value.integer.value[0] != 0
-           && ucontrol->value.integer.value[0] != 1)
+       if (ucontrol->value.enumerated.item[0] != 0
+           && ucontrol->value.enumerated.item[0] != 1)
                return -EINVAL;
 
        reg_eq = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF);
        switch ((reg_eq & WM8983_EQ3DMODE) >> WM8983_EQ3DMODE_SHIFT) {
        case 0:
-               if (!ucontrol->value.integer.value[0])
+               if (!ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        case 1:
-               if (ucontrol->value.integer.value[0])
+               if (ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        }
@@ -537,7 +537,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        /* set the desired eqmode */
        snd_soc_update_bits(codec, WM8983_EQ1_LOW_SHELF,
                            WM8983_EQ3DMODE_MASK,
-                           ucontrol->value.integer.value[0]
+                           ucontrol->value.enumerated.item[0]
                            << WM8983_EQ3DMODE_SHIFT);
        /* restore DAC/ADC configuration */
        snd_soc_write(codec, WM8983_POWER_MANAGEMENT_2, regpwr2);
index 9918152a03c7518e5c0c2e299a35145abd808a14..6ac76fe116b028db34c8cdbf3691babeb2a68844 100644 (file)
@@ -531,9 +531,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol,
 
        reg = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF);
        if (reg & WM8985_EQ3DMODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
        else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
 
        return 0;
 }
@@ -545,18 +545,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        unsigned int regpwr2, regpwr3;
        unsigned int reg_eq;
 
-       if (ucontrol->value.integer.value[0] != 0
-                       && ucontrol->value.integer.value[0] != 1)
+       if (ucontrol->value.enumerated.item[0] != 0
+                       && ucontrol->value.enumerated.item[0] != 1)
                return -EINVAL;
 
        reg_eq = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF);
        switch ((reg_eq & WM8985_EQ3DMODE) >> WM8985_EQ3DMODE_SHIFT) {
        case 0:
-               if (!ucontrol->value.integer.value[0])
+               if (!ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        case 1:
-               if (ucontrol->value.integer.value[0])
+               if (ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        }
@@ -573,7 +573,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        /* set the desired eqmode */
        snd_soc_update_bits(codec, WM8985_EQ1_LOW_SHELF,
                            WM8985_EQ3DMODE_MASK,
-                           ucontrol->value.integer.value[0]
+                           ucontrol->value.enumerated.item[0]
                            << WM8985_EQ3DMODE_SHIFT);
        /* restore DAC/ADC configuration */
        snd_soc_write(codec, WM8985_POWER_MANAGEMENT_2, regpwr2);
index 2ccbb322df775b803d40fe765958f439d855d41f..a18aecb4993590e6a7bd936b238fee544462a6bc 100644 (file)
@@ -362,7 +362,7 @@ static int wm8994_put_drc_enum(struct snd_kcontrol *kcontrol,
        struct wm8994 *control = wm8994->wm8994;
        struct wm8994_pdata *pdata = &control->pdata;
        int drc = wm8994_get_drc(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (drc < 0)
                return drc;
@@ -469,7 +469,7 @@ static int wm8994_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
        struct wm8994 *control = wm8994->wm8994;
        struct wm8994_pdata *pdata = &control->pdata;
        int block = wm8994_get_retune_mobile_block(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (block < 0)
                return block;
index 8d7d6c01a2f7264905e7a8e29888a417f44fb515..f99b34f7647b092a657fbbc3a70fd01219837351 100644 (file)
@@ -416,7 +416,7 @@ static int wm8996_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
        struct wm8996_priv *wm8996 = snd_soc_codec_get_drvdata(codec);
        struct wm8996_pdata *pdata = &wm8996->pdata;
        int block = wm8996_get_retune_mobile_block(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (block < 0)
                return block;
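
All of the ucontrol->value conversions in the codec and machine-driver hunks of this pull fix the same bug: enum (SOC_ENUM-style) controls exchange their selection through the enumerated member of the snd_ctl_elem_value union, whose items are unsigned int, while the integer member holds longs, so going through the wrong member can touch the wrong bytes on 64-bit targets. A minimal sketch of the corrected get/put pattern, using hypothetical foo_* names that appear in no driver above:

#include <sound/soc.h>

/* Illustrative sketch only -- foo_priv and the control it backs are made up. */
struct foo_priv {
        unsigned int mode;
};

static int foo_mode_get(struct snd_kcontrol *kcontrol,
                        struct snd_ctl_elem_value *ucontrol)
{
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct foo_priv *foo = snd_soc_codec_get_drvdata(codec);

        /* enum selections travel as unsigned int items, not long integers */
        ucontrol->value.enumerated.item[0] = foo->mode;
        return 0;
}

static int foo_mode_put(struct snd_kcontrol *kcontrol,
                        struct snd_ctl_elem_value *ucontrol)
{
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct foo_priv *foo = snd_soc_codec_get_drvdata(codec);
        struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;

        if (ucontrol->value.enumerated.item[0] >= e->items)
                return -EINVAL;
        if (ucontrol->value.enumerated.item[0] == foo->mode)
                return 0;

        foo->mode = ucontrol->value.enumerated.item[0];
        return 1;       /* value changed */
}
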
index ccb3b15139ad9dacd41ad801201471d1ed696997..363b3b6676163e8c0b110efd165c5eccd7ec9e1a 100644 (file)
@@ -344,9 +344,9 @@ static int speaker_mode_get(struct snd_kcontrol *kcontrol,
 
        reg = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2);
        if (reg & WM9081_SPK_MODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
        else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
 
        return 0;
 }
@@ -365,7 +365,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol,
        unsigned int reg2 = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2);
 
        /* Are we changing anything? */
-       if (ucontrol->value.integer.value[0] ==
+       if (ucontrol->value.enumerated.item[0] ==
            ((reg2 & WM9081_SPK_MODE) != 0))
                return 0;
 
@@ -373,7 +373,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol,
        if (reg_pwr & WM9081_SPK_ENA)
                return -EINVAL;
 
-       if (ucontrol->value.integer.value[0]) {
+       if (ucontrol->value.enumerated.item[0]) {
                /* Class AB */
                reg2 &= ~(WM9081_SPK_INV_MUTE | WM9081_OUT_SPK_CTRL);
                reg2 |= WM9081_SPK_MODE;
index 79e143625ac3d9c14ca72f96dc11acb6a970efdb..9849643ef8099c7f3340a719cf160c8ce633f9c5 100644 (file)
@@ -1212,7 +1212,7 @@ static int wm9713_soc_probe(struct snd_soc_codec *codec)
        if (IS_ERR(wm9713->ac97))
                return PTR_ERR(wm9713->ac97);
 
-       regmap = devm_regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config);
+       regmap = regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config);
        if (IS_ERR(regmap)) {
                snd_soc_free_ac97_codec(wm9713->ac97);
                return PTR_ERR(regmap);
index 33806d487b8ae00711dddd4ec400393dbff59b1f..b9195b9c2b05175278d4e779f7db87aad20d9f63 100644 (file)
@@ -586,7 +586,7 @@ static int wm_adsp_fw_get(struct snd_kcontrol *kcontrol,
        struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
        struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = dsp[e->shift_l].fw;
+       ucontrol->value.enumerated.item[0] = dsp[e->shift_l].fw;
 
        return 0;
 }
@@ -599,10 +599,10 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
        struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec);
        int ret = 0;
 
-       if (ucontrol->value.integer.value[0] == dsp[e->shift_l].fw)
+       if (ucontrol->value.enumerated.item[0] == dsp[e->shift_l].fw)
                return 0;
 
-       if (ucontrol->value.integer.value[0] >= WM_ADSP_NUM_FW)
+       if (ucontrol->value.enumerated.item[0] >= WM_ADSP_NUM_FW)
                return -EINVAL;
 
        mutex_lock(&dsp[e->shift_l].pwr_lock);
@@ -610,7 +610,7 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
        if (dsp[e->shift_l].running || dsp[e->shift_l].compr)
                ret = -EBUSY;
        else
-               dsp[e->shift_l].fw = ucontrol->value.integer.value[0];
+               dsp[e->shift_l].fw = ucontrol->value.enumerated.item[0];
 
        mutex_unlock(&dsp[e->shift_l].pwr_lock);
 
index ed8de1035cda159d0d186f2cded0fb7a97fbceb4..40dfd8a3648408a2cc76bb6e4c4c3872c8e28ac3 100644 (file)
@@ -112,6 +112,20 @@ struct fsl_ssi_rxtx_reg_val {
        struct fsl_ssi_reg_val tx;
 };
 
+static const struct reg_default fsl_ssi_reg_defaults[] = {
+       {CCSR_SSI_SCR,     0x00000000},
+       {CCSR_SSI_SIER,    0x00003003},
+       {CCSR_SSI_STCR,    0x00000200},
+       {CCSR_SSI_SRCR,    0x00000200},
+       {CCSR_SSI_STCCR,   0x00040000},
+       {CCSR_SSI_SRCCR,   0x00040000},
+       {CCSR_SSI_SACNT,   0x00000000},
+       {CCSR_SSI_STMSK,   0x00000000},
+       {CCSR_SSI_SRMSK,   0x00000000},
+       {CCSR_SSI_SACCEN,  0x00000000},
+       {CCSR_SSI_SACCDIS, 0x00000000},
+};
+
 static bool fsl_ssi_readable_reg(struct device *dev, unsigned int reg)
 {
        switch (reg) {
@@ -176,7 +190,8 @@ static const struct regmap_config fsl_ssi_regconfig = {
        .val_bits = 32,
        .reg_stride = 4,
        .val_format_endian = REGMAP_ENDIAN_NATIVE,
-       .num_reg_defaults_raw = CCSR_SSI_SACCDIS / sizeof(uint32_t) + 1,
+       .reg_defaults = fsl_ssi_reg_defaults,
+       .num_reg_defaults = ARRAY_SIZE(fsl_ssi_reg_defaults),
        .readable_reg = fsl_ssi_readable_reg,
        .volatile_reg = fsl_ssi_volatile_reg,
        .precious_reg = fsl_ssi_precious_reg,
@@ -186,7 +201,6 @@ static const struct regmap_config fsl_ssi_regconfig = {
 
 struct fsl_ssi_soc_data {
        bool imx;
-       bool imx21regs; /* imx21-class SSI - no SACC{ST,EN,DIS} regs */
        bool offline_config;
        u32 sisr_write_mask;
 };
@@ -289,7 +303,6 @@ static struct fsl_ssi_soc_data fsl_ssi_mpc8610 = {
 
 static struct fsl_ssi_soc_data fsl_ssi_imx21 = {
        .imx = true,
-       .imx21regs = true,
        .offline_config = true,
        .sisr_write_mask = 0,
 };
@@ -573,12 +586,8 @@ static void fsl_ssi_setup_ac97(struct fsl_ssi_private *ssi_private)
         */
        regmap_write(regs, CCSR_SSI_SACNT,
                        CCSR_SSI_SACNT_AC97EN | CCSR_SSI_SACNT_FV);
-
-       /* no SACC{ST,EN,DIS} regs on imx21-class SSI */
-       if (!ssi_private->soc->imx21regs) {
-               regmap_write(regs, CCSR_SSI_SACCDIS, 0xff);
-               regmap_write(regs, CCSR_SSI_SACCEN, 0x300);
-       }
+       regmap_write(regs, CCSR_SSI_SACCDIS, 0xff);
+       regmap_write(regs, CCSR_SSI_SACCEN, 0x300);
 
        /*
         * Enable SSI, Transmit and Receive. AC97 has to communicate with the
@@ -1388,7 +1397,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
        struct resource *res;
        void __iomem *iomem;
        char name[64];
-       struct regmap_config regconfig = fsl_ssi_regconfig;
 
        of_id = of_match_device(fsl_ssi_ids, &pdev->dev);
        if (!of_id || !of_id->data)
@@ -1436,25 +1444,15 @@ static int fsl_ssi_probe(struct platform_device *pdev)
                return PTR_ERR(iomem);
        ssi_private->ssi_phys = res->start;
 
-       if (ssi_private->soc->imx21regs) {
-               /*
-                * According to datasheet imx21-class SSI
-                * don't have SACC{ST,EN,DIS} regs.
-                */
-               regconfig.max_register = CCSR_SSI_SRMSK;
-               regconfig.num_reg_defaults_raw =
-                       CCSR_SSI_SRMSK / sizeof(uint32_t) + 1;
-       }
-
        ret = of_property_match_string(np, "clock-names", "ipg");
        if (ret < 0) {
                ssi_private->has_ipg_clk_name = false;
                ssi_private->regs = devm_regmap_init_mmio(&pdev->dev, iomem,
-                       &regconfig);
+                       &fsl_ssi_regconfig);
        } else {
                ssi_private->has_ipg_clk_name = true;
                ssi_private->regs = devm_regmap_init_mmio_clk(&pdev->dev,
-                       "ipg", iomem, &regconfig);
+                       "ipg", iomem, &fsl_ssi_regconfig);
        }
        if (IS_ERR(ssi_private->regs)) {
                dev_err(&pdev->dev, "Failed to init register map\n");
index 49d7513f429e5424ee3ea081178e087368e74896..e63cd5ecfd8feacd396f835756af90107fd2e2a6 100644 (file)
@@ -217,8 +217,8 @@ static int snd_imx_pcm_mmap(struct snd_pcm_substream *substream,
        struct snd_pcm_runtime *runtime = substream->runtime;
        int ret;
 
-       ret = dma_mmap_writecombine(substream->pcm->card->dev, vma,
-               runtime->dma_area, runtime->dma_addr, runtime->dma_bytes);
+       ret = dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                         runtime->dma_addr, runtime->dma_bytes);
 
        pr_debug("%s: ret: %d %p %pad 0x%08x\n", __func__, ret,
                        runtime->dma_area,
@@ -247,8 +247,7 @@ static int imx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
        buf->dev.dev = pcm->card->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
        if (!buf->area)
                return -ENOMEM;
        buf->bytes = size;
@@ -330,8 +329,7 @@ static void imx_pcm_free(struct snd_pcm *pcm)
                if (!buf->area)
                        continue;
 
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                buf->area = NULL;
        }
 }
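
The dma_alloc_wc()/dma_mmap_wc()/dma_free_wc() conversions above are a straight rename of the old dma_*_writecombine() helpers with unchanged argument order. A hedged sketch of the usual allocate/map/free sequence, with a made-up caller:

#include <linux/dma-mapping.h>

/* Sketch only: "dev" and "vma" stand in for whatever the driver already holds. */
static int foo_alloc_and_map(struct device *dev, struct vm_area_struct *vma)
{
        size_t size = 64 * 1024;
        dma_addr_t dma_handle;
        void *cpu_addr;
        int ret;

        cpu_addr = dma_alloc_wc(dev, size, &dma_handle, GFP_KERNEL);
        if (!cpu_addr)
                return -ENOMEM;

        ret = dma_mmap_wc(dev, vma, cpu_addr, dma_handle, size);
        if (ret)
                dma_free_wc(dev, size, cpu_addr, dma_handle);

        return ret;
}
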
index 2d3afddb0a2e8c83fef54b36eed42d7ff6b69878..a7b96a9a4e0ecb57876413c848cfa91359d42a92 100644 (file)
@@ -367,8 +367,12 @@ static int snd_cht_mc_probe(struct platform_device *pdev)
        }
        card->dev = &pdev->dev;
        sprintf(codec_name, "i2c-%s:00", drv->acpi_card->codec_id);
+
        /* set correct codec name */
-       strcpy((char *)card->dai_link[2].codec_name, codec_name);
+       for (i = 0; i < ARRAY_SIZE(cht_dailink); i++)
+               if (!strcmp(card->dai_link[i].codec_name, "i2c-10EC5645:00"))
+                       card->dai_link[i].codec_name = kstrdup(codec_name, GFP_KERNEL);
+
        snd_soc_card_set_drvdata(card, drv);
        ret_val = devm_snd_soc_register_card(&pdev->dev, card);
        if (ret_val) {
index 49c09a0add797ff95101bee1923b62254e3928a0..34f46c72a0e27cff8cd37108d27b71029cd088c6 100644 (file)
@@ -94,7 +94,7 @@ static const struct soc_enum lo_enum =
 static int headset_get_switch(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = hs_switch;
+       ucontrol->value.enumerated.item[0] = hs_switch;
        return 0;
 }
 
@@ -104,12 +104,12 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol,
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
        struct snd_soc_dapm_context *dapm = &card->dapm;
 
-       if (ucontrol->value.integer.value[0] == hs_switch)
+       if (ucontrol->value.enumerated.item[0] == hs_switch)
                return 0;
 
        snd_soc_dapm_mutex_lock(dapm);
 
-       if (ucontrol->value.integer.value[0]) {
+       if (ucontrol->value.enumerated.item[0]) {
                pr_debug("hs_set HS path\n");
                snd_soc_dapm_enable_pin_unlocked(dapm, "Headphones");
                snd_soc_dapm_disable_pin_unlocked(dapm, "EPOUT");
@@ -123,7 +123,7 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol,
 
        snd_soc_dapm_mutex_unlock(dapm);
 
-       hs_switch = ucontrol->value.integer.value[0];
+       hs_switch = ucontrol->value.enumerated.item[0];
 
        return 0;
 }
@@ -148,7 +148,7 @@ static void lo_enable_out_pins(struct snd_soc_dapm_context *dapm)
 static int lo_get_switch(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = lo_dac;
+       ucontrol->value.enumerated.item[0] = lo_dac;
        return 0;
 }
 
@@ -158,7 +158,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
        struct snd_soc_dapm_context *dapm = &card->dapm;
 
-       if (ucontrol->value.integer.value[0] == lo_dac)
+       if (ucontrol->value.enumerated.item[0] == lo_dac)
                return 0;
 
        snd_soc_dapm_mutex_lock(dapm);
@@ -168,7 +168,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
         */
        lo_enable_out_pins(dapm);
 
-       switch (ucontrol->value.integer.value[0]) {
+       switch (ucontrol->value.enumerated.item[0]) {
        case 0:
                pr_debug("set vibra path\n");
                snd_soc_dapm_disable_pin_unlocked(dapm, "VIB1OUT");
@@ -202,7 +202,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
 
        snd_soc_dapm_mutex_unlock(dapm);
 
-       lo_dac = ucontrol->value.integer.value[0];
+       lo_dac = ucontrol->value.enumerated.item[0];
        return 0;
 }
 
index a294fee431f07363f965a81b4c9ef42eb3a42f58..5a4837dcfce3e615eedf47049ac3923332c8a9bf 100644 (file)
@@ -978,7 +978,7 @@ static int skl_tplg_tlv_control_set(struct snd_kcontrol *kcontrol,
                                return -EFAULT;
                } else {
                        if (copy_from_user(ac->params,
-                                          data + 2 * sizeof(u32), size))
+                                          data + 2, size))
                                return -EFAULT;
                }
 
index e09326158bc2dd0be61ea69f375f37dc6f0cdbe5..2cca055fd806cdb80e7226557af943533680c2f8 100644 (file)
@@ -267,10 +267,8 @@ static int nuc900_dma_mmap(struct snd_pcm_substream *substream,
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
 
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                       runtime->dma_area,
-                                       runtime->dma_addr,
-                                       runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
 }
 
 static struct snd_pcm_ops nuc900_dma_ops = {
index 190f868e78b24af37d8442d378c06aeb8ec6aaf2..fdecb70431745bf23c7113bf7abcede1cd5306ef 100644 (file)
@@ -133,7 +133,7 @@ static struct snd_soc_ops n810_ops = {
 static int n810_get_spk(struct snd_kcontrol *kcontrol,
                        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = n810_spk_func;
+       ucontrol->value.enumerated.item[0] = n810_spk_func;
 
        return 0;
 }
@@ -143,10 +143,10 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (n810_spk_func == ucontrol->value.integer.value[0])
+       if (n810_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       n810_spk_func = ucontrol->value.integer.value[0];
+       n810_spk_func = ucontrol->value.enumerated.item[0];
        n810_ext_control(&card->dapm);
 
        return 1;
@@ -155,7 +155,7 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol,
 static int n810_get_jack(struct snd_kcontrol *kcontrol,
                         struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = n810_jack_func;
+       ucontrol->value.enumerated.item[0] = n810_jack_func;
 
        return 0;
 }
@@ -165,10 +165,10 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (n810_jack_func == ucontrol->value.integer.value[0])
+       if (n810_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       n810_jack_func = ucontrol->value.integer.value[0];
+       n810_jack_func = ucontrol->value.enumerated.item[0];
        n810_ext_control(&card->dapm);
 
        return 1;
@@ -177,7 +177,7 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol,
 static int n810_get_input(struct snd_kcontrol *kcontrol,
                          struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = n810_dmic_func;
+       ucontrol->value.enumerated.item[0] = n810_dmic_func;
 
        return 0;
 }
@@ -187,10 +187,10 @@ static int n810_set_input(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (n810_dmic_func == ucontrol->value.integer.value[0])
+       if (n810_dmic_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       n810_dmic_func = ucontrol->value.integer.value[0];
+       n810_dmic_func = ucontrol->value.enumerated.item[0];
        n810_ext_control(&card->dapm);
 
        return 1;
index 6bb623a2a4dfcfa3cda744ca8bef18575baec2d9..99381a27295bbbd9452f0d0050c1509651373bff 100644 (file)
@@ -156,10 +156,8 @@ static int omap_pcm_mmap(struct snd_pcm_substream *substream,
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
 
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                    runtime->dma_area,
-                                    runtime->dma_addr,
-                                    runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
 }
 
 static struct snd_pcm_ops omap_pcm_ops = {
@@ -183,8 +181,7 @@ static int omap_pcm_preallocate_dma_buffer(struct snd_pcm *pcm,
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
        buf->dev.dev = pcm->card->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
        if (!buf->area)
                return -ENOMEM;
 
@@ -207,8 +204,7 @@ static void omap_pcm_free_dma_buffers(struct snd_pcm *pcm)
                if (!buf->area)
                        continue;
 
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                buf->area = NULL;
        }
 }
index 5e21f08579d804c5f7895a22e87de6cdff8b79ef..54949242bc7075587e6a73a7235049db2e108734 100644 (file)
@@ -132,7 +132,7 @@ static struct snd_soc_ops rx51_ops = {
 static int rx51_get_spk(struct snd_kcontrol *kcontrol,
                        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = rx51_spk_func;
+       ucontrol->value.enumerated.item[0] = rx51_spk_func;
 
        return 0;
 }
@@ -142,10 +142,10 @@ static int rx51_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (rx51_spk_func == ucontrol->value.integer.value[0])
+       if (rx51_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       rx51_spk_func = ucontrol->value.integer.value[0];
+       rx51_spk_func = ucontrol->value.enumerated.item[0];
        rx51_ext_control(&card->dapm);
 
        return 1;
@@ -180,7 +180,7 @@ static int rx51_hp_event(struct snd_soc_dapm_widget *w,
 static int rx51_get_input(struct snd_kcontrol *kcontrol,
                          struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = rx51_dmic_func;
+       ucontrol->value.enumerated.item[0] = rx51_dmic_func;
 
        return 0;
 }
@@ -190,10 +190,10 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (rx51_dmic_func == ucontrol->value.integer.value[0])
+       if (rx51_dmic_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       rx51_dmic_func = ucontrol->value.integer.value[0];
+       rx51_dmic_func = ucontrol->value.enumerated.item[0];
        rx51_ext_control(&card->dapm);
 
        return 1;
@@ -202,7 +202,7 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol,
 static int rx51_get_jack(struct snd_kcontrol *kcontrol,
                         struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = rx51_jack_func;
+       ucontrol->value.enumerated.item[0] = rx51_jack_func;
 
        return 0;
 }
@@ -212,10 +212,10 @@ static int rx51_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (rx51_jack_func == ucontrol->value.integer.value[0])
+       if (rx51_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       rx51_jack_func = ucontrol->value.integer.value[0];
+       rx51_jack_func = ucontrol->value.enumerated.item[0];
        rx51_ext_control(&card->dapm);
 
        return 1;
index c97dc13d36087628433de516a1b2217df02b492b..dcbb7aa9830c0347af6f7bd2c198639e52d9e24e 100644 (file)
@@ -163,7 +163,7 @@ static struct snd_soc_ops corgi_ops = {
 static int corgi_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = corgi_jack_func;
+       ucontrol->value.enumerated.item[0] = corgi_jack_func;
        return 0;
 }
 
@@ -172,10 +172,10 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (corgi_jack_func == ucontrol->value.integer.value[0])
+       if (corgi_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       corgi_jack_func = ucontrol->value.integer.value[0];
+       corgi_jack_func = ucontrol->value.enumerated.item[0];
        corgi_ext_control(&card->dapm);
        return 1;
 }
@@ -183,7 +183,7 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol,
 static int corgi_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = corgi_spk_func;
+       ucontrol->value.enumerated.item[0] = corgi_spk_func;
        return 0;
 }
 
@@ -192,10 +192,10 @@ static int corgi_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (corgi_spk_func == ucontrol->value.integer.value[0])
+       if (corgi_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       corgi_spk_func = ucontrol->value.integer.value[0];
+       corgi_spk_func = ucontrol->value.enumerated.item[0];
        corgi_ext_control(&card->dapm);
        return 1;
 }
index 241d0be42d7a0c6779bb6459c5165728cbedb302..62b8377a9d2b93c81f06a8df4c0b5ed2cd7a567f 100644 (file)
@@ -308,17 +308,17 @@ static int magician_set_spk(struct snd_kcontrol *kcontrol,
 static int magician_get_input(struct snd_kcontrol *kcontrol,
                              struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = magician_in_sel;
+       ucontrol->value.enumerated.item[0] = magician_in_sel;
        return 0;
 }
 
 static int magician_set_input(struct snd_kcontrol *kcontrol,
                              struct snd_ctl_elem_value *ucontrol)
 {
-       if (magician_in_sel == ucontrol->value.integer.value[0])
+       if (magician_in_sel == ucontrol->value.enumerated.item[0])
                return 0;
 
-       magician_in_sel = ucontrol->value.integer.value[0];
+       magician_in_sel = ucontrol->value.enumerated.item[0];
 
        switch (magician_in_sel) {
        case MAGICIAN_MIC:
index 84d0e2e508088a260b54911bca03236a50046203..4b3b714f5ee7a67e5538b191a9da99bac86e56f8 100644 (file)
@@ -138,7 +138,7 @@ static struct snd_soc_ops poodle_ops = {
 static int poodle_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = poodle_jack_func;
+       ucontrol->value.enumerated.item[0] = poodle_jack_func;
        return 0;
 }
 
@@ -147,10 +147,10 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (poodle_jack_func == ucontrol->value.integer.value[0])
+       if (poodle_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       poodle_jack_func = ucontrol->value.integer.value[0];
+       poodle_jack_func = ucontrol->value.enumerated.item[0];
        poodle_ext_control(&card->dapm);
        return 1;
 }
@@ -158,7 +158,7 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol,
 static int poodle_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = poodle_spk_func;
+       ucontrol->value.enumerated.item[0] = poodle_spk_func;
        return 0;
 }
 
@@ -167,10 +167,10 @@ static int poodle_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (poodle_spk_func == ucontrol->value.integer.value[0])
+       if (poodle_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       poodle_spk_func = ucontrol->value.integer.value[0];
+       poodle_spk_func = ucontrol->value.enumerated.item[0];
        poodle_ext_control(&card->dapm);
        return 1;
 }
index b00222620fd01be5cf00d4118d44f18832ad3bdc..0e02634c8b7f6f41764b75344ae850ff7ce586d7 100644 (file)
@@ -164,7 +164,7 @@ static struct snd_soc_ops spitz_ops = {
 static int spitz_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = spitz_jack_func;
+       ucontrol->value.enumerated.item[0] = spitz_jack_func;
        return 0;
 }
 
@@ -173,10 +173,10 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (spitz_jack_func == ucontrol->value.integer.value[0])
+       if (spitz_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       spitz_jack_func = ucontrol->value.integer.value[0];
+       spitz_jack_func = ucontrol->value.enumerated.item[0];
        spitz_ext_control(&card->dapm);
        return 1;
 }
@@ -184,7 +184,7 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol,
 static int spitz_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = spitz_spk_func;
+       ucontrol->value.enumerated.item[0] = spitz_spk_func;
        return 0;
 }
 
@@ -193,10 +193,10 @@ static int spitz_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (spitz_spk_func == ucontrol->value.integer.value[0])
+       if (spitz_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       spitz_spk_func = ucontrol->value.integer.value[0];
+       spitz_spk_func = ucontrol->value.enumerated.item[0];
        spitz_ext_control(&card->dapm);
        return 1;
 }
index 49518dd642aa18ffd3f0004fe2f425210e820b54..c508f024ecfbc206887aeb9f91943311a279aed7 100644 (file)
@@ -95,7 +95,7 @@ static struct snd_soc_ops tosa_ops = {
 static int tosa_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = tosa_jack_func;
+       ucontrol->value.enumerated.item[0] = tosa_jack_func;
        return 0;
 }
 
@@ -104,10 +104,10 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (tosa_jack_func == ucontrol->value.integer.value[0])
+       if (tosa_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       tosa_jack_func = ucontrol->value.integer.value[0];
+       tosa_jack_func = ucontrol->value.enumerated.item[0];
        tosa_ext_control(&card->dapm);
        return 1;
 }
@@ -115,7 +115,7 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol,
 static int tosa_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = tosa_spk_func;
+       ucontrol->value.enumerated.item[0] = tosa_spk_func;
        return 0;
 }
 
@@ -124,10 +124,10 @@ static int tosa_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (tosa_spk_func == ucontrol->value.integer.value[0])
+       if (tosa_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       tosa_spk_func = ucontrol->value.integer.value[0];
+       tosa_spk_func = ucontrol->value.enumerated.item[0];
        tosa_ext_control(&card->dapm);
        return 1;
 }
index 00b6c9d039cfada651dad6e917be9318dc4ce762..e5101e0d2d372262f8ecf7d0498240cba971b86f 100644 (file)
@@ -355,7 +355,6 @@ static struct regmap_config lpass_cpu_regmap_config = {
        .readable_reg = lpass_cpu_regmap_readable,
        .volatile_reg = lpass_cpu_regmap_volatile,
        .cache_type = REGCACHE_FLAT,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 int asoc_qcom_lpass_cpu_platform_probe(struct platform_device *pdev)
index 84d9e77c0fbe1e8d5e737d26beb492c3ce85883b..70a2559b63f9050bcbb86049717518d022463e2f 100644 (file)
@@ -481,10 +481,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
        unsigned int cdcon_mask = 1 << i2s_regs->cdclkcon_off;
        unsigned int rsrc_mask = 1 << i2s_regs->rclksrc_off;
        u32 mod, mask, val = 0;
+       unsigned long flags;
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        switch (clk_id) {
        case SAMSUNG_I2S_OPCLK:
@@ -575,11 +576,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
                return -EINVAL;
        }
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
        mod = (mod & ~mask) | val;
        writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        return 0;
 }
@@ -590,6 +591,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
        struct i2s_dai *i2s = to_info(dai);
        int lrp_shift, sdf_shift, sdf_mask, lrp_rlow, mod_slave;
        u32 mod, tmp = 0;
+       unsigned long flags;
 
        lrp_shift = i2s->variant_regs->lrp_off;
        sdf_shift = i2s->variant_regs->sdf_off;
@@ -649,7 +651,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
                return -EINVAL;
        }
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
        /*
         * Don't change the I2S mode if any controller is active on this
@@ -657,7 +659,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
         */
        if (any_active(i2s) &&
                ((mod & (sdf_mask | lrp_rlow | mod_slave)) != tmp)) {
-               spin_unlock(i2s->lock);
+               spin_unlock_irqrestore(i2s->lock, flags);
                dev_err(&i2s->pdev->dev,
                                "%s:%d Other DAI busy\n", __func__, __LINE__);
                return -EAGAIN;
@@ -666,7 +668,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
        mod &= ~(sdf_mask | lrp_rlow | mod_slave);
        mod |= tmp;
        writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        return 0;
 }
@@ -676,6 +678,7 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
 {
        struct i2s_dai *i2s = to_info(dai);
        u32 mod, mask = 0, val = 0;
+       unsigned long flags;
 
        if (!is_secondary(i2s))
                mask |= (MOD_DC2_EN | MOD_DC1_EN);
@@ -744,11 +747,11 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
                return -EINVAL;
        }
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
        mod = (mod & ~mask) | val;
        writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        samsung_asoc_init_dma_data(dai, &i2s->dma_playback, &i2s->dma_capture);
 
index 0d37079879002d472c538b69ccc3321c0d7772ad..581175a51ecf730bb8c6a2271d137f049c2a445d 100644 (file)
@@ -3573,7 +3573,7 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol);
 
-       ucontrol->value.integer.value[0] = w->params_select;
+       ucontrol->value.enumerated.item[0] = w->params_select;
 
        return 0;
 }
@@ -3587,13 +3587,13 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol,
        if (w->power)
                return -EBUSY;
 
-       if (ucontrol->value.integer.value[0] == w->params_select)
+       if (ucontrol->value.enumerated.item[0] == w->params_select)
                return 0;
 
-       if (ucontrol->value.integer.value[0] >= w->num_params)
+       if (ucontrol->value.enumerated.item[0] >= w->num_params)
                return -EINVAL;
 
-       w->params_select = ucontrol->value.integer.value[0];
+       w->params_select = ucontrol->value.enumerated.item[0];
 
        return 0;
 }
index 4a96473b180f938b038d5e698f3578876eb7c70c..ee566e8bd1cff56efde9200bf4a76caa840669f1 100644 (file)
@@ -85,7 +85,7 @@ $(OUTPUT)%.i: %.c FORCE
        $(call rule_mkdir)
        $(call if_changed_dep,cc_i_c)
 
-$(OUTPUT)%.i: %.S FORCE
+$(OUTPUT)%.s: %.S FORCE
        $(call rule_mkdir)
        $(call if_changed_dep,cc_i_c)
 
index 02db3cdff20ff7653c8ca0db21d28ef7b508a9eb..6b7707270aa3b19791c8b6248f90ecddeabc1fdd 100644 (file)
@@ -27,7 +27,7 @@ endef
 #   the rule that uses them - an example for that is the 'bionic'
 #   feature check. ]
 #
-FEATURE_TESTS ?=                       \
+FEATURE_TESTS_BASIC :=                 \
        backtrace                       \
        dwarf                           \
        fortify-source                  \
@@ -46,6 +46,7 @@ FEATURE_TESTS ?=                      \
        libpython                       \
        libpython-version               \
        libslang                        \
+       libcrypto                       \
        libunwind                       \
        pthread-attr-setaffinity-np     \
        stackprotector-all              \
@@ -56,6 +57,25 @@ FEATURE_TESTS ?=                     \
        get_cpuid                       \
        bpf
 
+# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
+# of all feature tests
+FEATURE_TESTS_EXTRA :=                 \
+       bionic                          \
+       compile-32                      \
+       compile-x32                     \
+       cplus-demangle                  \
+       hello                           \
+       libbabeltrace                   \
+       liberty                         \
+       liberty-z                       \
+       libunwind-debug-frame
+
+FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC)
+
+ifeq ($(FEATURE_TESTS),all)
+  FEATURE_TESTS := $(FEATURE_TESTS_BASIC) $(FEATURE_TESTS_EXTRA)
+endif
+
 FEATURE_DISPLAY ?=                     \
        dwarf                           \
        glibc                           \
@@ -68,6 +88,7 @@ FEATURE_DISPLAY ?=                    \
        libperl                         \
        libpython                       \
        libslang                        \
+       libcrypto                       \
        libunwind                       \
        libdw-dwarf-unwind              \
        zlib                            \
@@ -100,6 +121,14 @@ ifeq ($(feature-all), 1)
   # test-all.c passed - just set all the core feature flags to 1:
   #
   $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat)))
+  #
+  # test-all.c does not cover these tests, so we need to run
+  # them individually here to get proper feature values
+  #
+  $(call feature_check,compile-32)
+  $(call feature_check,compile-x32)
+  $(call feature_check,bionic)
+  $(call feature_check,libbabeltrace)
 else
   $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat)))
 endif
index bf8f0352264dcc8a5a1286bc4add2ca9de481357..c5f4c417428d7099fbe4f487a179b0663f478611 100644 (file)
@@ -23,6 +23,7 @@ FILES=                                        \
        test-libpython.bin              \
        test-libpython-version.bin      \
        test-libslang.bin               \
+       test-libcrypto.bin              \
        test-libunwind.bin              \
        test-libunwind-debug-frame.bin  \
        test-pthread-attr-setaffinity-np.bin    \
@@ -105,6 +106,9 @@ $(OUTPUT)test-libaudit.bin:
 $(OUTPUT)test-libslang.bin:
        $(BUILD) -I/usr/include/slang -lslang
 
+$(OUTPUT)test-libcrypto.bin:
+       $(BUILD) -lcrypto
+
 $(OUTPUT)test-gtk2.bin:
        $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
 
index 81025cade45fa9a3fcc6f8f0aacc8f403e6d9211..e499a36c1e4a9e21e9c355309b53a7dc5901664a 100644 (file)
 # include "test-bpf.c"
 #undef main
 
+#define main main_test_libcrypto
+# include "test-libcrypto.c"
+#undef main
+
 int main(int argc, char *argv[])
 {
        main_test_libpython();
@@ -158,6 +162,7 @@ int main(int argc, char *argv[])
        main_test_lzma();
        main_test_get_cpuid();
        main_test_bpf();
+       main_test_libcrypto();
 
        return 0;
 }
index 31dbf45bf99c5996ca0ee62cba186d912b08e666..c54e6551ae4c54cd6e9f418e6e8ff97280444efd 100644 (file)
@@ -1,4 +1,6 @@
+#include <stdio.h>
 int main(void)
 {
+       printf("Hello World!\n");
        return 0;
 }
diff --git a/tools/build/feature/test-libcrypto.c b/tools/build/feature/test-libcrypto.c
new file mode 100644 (file)
index 0000000..bd79dc7
--- /dev/null
@@ -0,0 +1,17 @@
+#include <openssl/sha.h>
+#include <openssl/md5.h>
+
+int main(void)
+{
+       MD5_CTX context;
+       unsigned char md[MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH];
+       unsigned char dat[] = "12345";
+
+       MD5_Init(&context);
+       MD5_Update(&context, &dat[0], sizeof(dat));
+       MD5_Final(&md[0], &context);
+
+       SHA1(&dat[0], sizeof(dat), &md[0]);
+
+       return 0;
+}
index e8b8a23b9bf4cdac8ccfcf95690f8dfd93e1d6e6..954c644f7ad9e5ee0f54bdf16cf2b68766f1e2a5 100644 (file)
@@ -1,3 +1,4 @@
 libapi-y += fd/
 libapi-y += fs/
 libapi-y += cpu.o
+libapi-y += debug.o
index d85904dc9b38747dc0feda60f819cf93ceabbdcf..bbc82c614bee62eec8563c5b1dd9fd222eafa5de 100644 (file)
@@ -18,6 +18,7 @@ LIBFILE = $(OUTPUT)libapi.a
 CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
 CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC
 CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+CFLAGS += -I$(srctree)/tools/lib/api
 
 RM = rm -f
 
diff --git a/tools/lib/api/debug-internal.h b/tools/lib/api/debug-internal.h
new file mode 100644 (file)
index 0000000..188f788
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef __API_DEBUG_INTERNAL_H__
+#define __API_DEBUG_INTERNAL_H__
+
+#include "debug.h"
+
+#define __pr(func, fmt, ...)   \
+do {                           \
+       if ((func))             \
+               (func)("libapi: " fmt, ##__VA_ARGS__); \
+} while (0)
+
+extern libapi_print_fn_t __pr_warning;
+extern libapi_print_fn_t __pr_info;
+extern libapi_print_fn_t __pr_debug;
+
+#define pr_warning(fmt, ...)   __pr(__pr_warning, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...)      __pr(__pr_info, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...)     __pr(__pr_debug, fmt, ##__VA_ARGS__)
+
+#endif /* __API_DEBUG_INTERNAL_H__ */
diff --git a/tools/lib/api/debug.c b/tools/lib/api/debug.c
new file mode 100644 (file)
index 0000000..5fa5cf5
--- /dev/null
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include "debug.h"
+#include "debug-internal.h"
+
+static int __base_pr(const char *format, ...)
+{
+       va_list args;
+       int err;
+
+       va_start(args, format);
+       err = vfprintf(stderr, format, args);
+       va_end(args);
+       return err;
+}
+
+libapi_print_fn_t __pr_warning = __base_pr;
+libapi_print_fn_t __pr_info    = __base_pr;
+libapi_print_fn_t __pr_debug;
+
+void libapi_set_print(libapi_print_fn_t warn,
+                     libapi_print_fn_t info,
+                     libapi_print_fn_t debug)
+{
+       __pr_warning = warn;
+       __pr_info    = info;
+       __pr_debug   = debug;
+}
diff --git a/tools/lib/api/debug.h b/tools/lib/api/debug.h
new file mode 100644 (file)
index 0000000..a0872f6
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __API_DEBUG_H__
+#define __API_DEBUG_H__
+
+typedef int (*libapi_print_fn_t)(const char *, ...);
+
+void libapi_set_print(libapi_print_fn_t warn,
+                     libapi_print_fn_t info,
+                     libapi_print_fn_t debug);
+
+#endif /* __API_DEBUG_H__ */
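
The new libapi debug hooks default to printing warnings and info to stderr and discarding debug output; a library user can reroute all three streams with libapi_set_print(). A small usage sketch (my_print() and the include path are illustrative):

#include <stdio.h>
#include <stdarg.h>
#include "debug.h"      /* the tools/lib/api/debug.h added above */

static int my_print(const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = vfprintf(stdout, fmt, ap);
        va_end(ap);
        return ret;
}

int main(void)
{
        /* route libapi warning/info/debug output through my_print() */
        libapi_set_print(my_print, my_print, my_print);
        return 0;
}
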
index 459599d1b6c410b7b41333c13f248ee685c2dd4a..ef78c22ff44d4142f01a1f5aea3bbe4e15fd0101 100644 (file)
@@ -13,6 +13,7 @@
 #include <sys/mount.h>
 
 #include "fs.h"
+#include "debug-internal.h"
 
 #define _STR(x) #x
 #define STR(x) _STR(x)
@@ -300,6 +301,56 @@ int filename__read_ull(const char *filename, unsigned long long *value)
        return err;
 }
 
+#define STRERR_BUFSIZE  128     /* For the buffer size of strerror_r */
+
+int filename__read_str(const char *filename, char **buf, size_t *sizep)
+{
+       size_t size = 0, alloc_size = 0;
+       void *bf = NULL, *nbf;
+       int fd, n, err = 0;
+       char sbuf[STRERR_BUFSIZE];
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -errno;
+
+       do {
+               if (size == alloc_size) {
+                       alloc_size += BUFSIZ;
+                       nbf = realloc(bf, alloc_size);
+                       if (!nbf) {
+                               err = -ENOMEM;
+                               break;
+                       }
+
+                       bf = nbf;
+               }
+
+               n = read(fd, bf + size, alloc_size - size);
+               if (n < 0) {
+                       if (size) {
+                               pr_warning("read failed %d: %s\n", errno,
+                                        strerror_r(errno, sbuf, sizeof(sbuf)));
+                               err = 0;
+                       } else
+                               err = -errno;
+
+                       break;
+               }
+
+               size += n;
+       } while (n > 0);
+
+       if (!err) {
+               *sizep = size;
+               *buf   = bf;
+       } else
+               free(bf);
+
+       close(fd);
+       return err;
+}
+
 int sysfs__read_ull(const char *entry, unsigned long long *value)
 {
        char path[PATH_MAX];
@@ -326,6 +377,19 @@ int sysfs__read_int(const char *entry, int *value)
        return filename__read_int(path, value);
 }
 
+int sysfs__read_str(const char *entry, char **buf, size_t *sizep)
+{
+       char path[PATH_MAX];
+       const char *sysfs = sysfs__mountpoint();
+
+       if (!sysfs)
+               return -1;
+
+       snprintf(path, sizeof(path), "%s/%s", sysfs, entry);
+
+       return filename__read_str(path, buf, sizep);
+}
+
 int sysctl__read_int(const char *sysctl, int *value)
 {
        char path[PATH_MAX];
index d024a7f682f69b27a4bba1aed7b8f1894d07b50f..9f6598098dc5804a5bf660013ad5b07c499b6169 100644 (file)
@@ -2,6 +2,7 @@
 #define __API_FS__
 
 #include <stdbool.h>
+#include <unistd.h>
 
 /*
  * On most systems <limits.h> would have given us this, but  not on some systems
@@ -26,8 +27,10 @@ FS(tracefs)
 
 int filename__read_int(const char *filename, int *value);
 int filename__read_ull(const char *filename, unsigned long long *value);
+int filename__read_str(const char *filename, char **buf, size_t *sizep);
 
 int sysctl__read_int(const char *sysctl, int *value);
 int sysfs__read_int(const char *entry, int *value);
 int sysfs__read_ull(const char *entry, unsigned long long *value);
+int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
 #endif /* __API_FS__ */
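
filename__read_str() and the sysfs__read_str() wrapper added here hand back a malloc'ed, length-counted (not NUL-terminated) buffer that the caller must free. A brief usage sketch, with an illustrative sysfs entry:

#include <stdio.h>
#include <stdlib.h>
#include <api/fs/fs.h>  /* include path as seen from perf; adjust as needed */

int main(void)
{
        char *buf;
        size_t len;

        /* entry is relative to the autodetected sysfs mount point */
        if (sysfs__read_str("devices/system/cpu/online", &buf, &len) < 0)
                return 1;

        printf("%.*s", (int)len, buf);  /* buffer is not NUL-terminated */
        free(buf);
        return 0;
}
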
index 8334a5a9d5d7f55e97144e1d7462c52deec02e02..7e543c3102d4118a09717bb6fb4c0f779532ae91 100644 (file)
@@ -201,6 +201,7 @@ struct bpf_object {
                        Elf_Data *data;
                } *reloc;
                int nr_reloc;
+               int maps_shndx;
        } efile;
        /*
         * All loaded bpf_object is linked in a list, which is
@@ -350,6 +351,7 @@ static struct bpf_object *bpf_object__new(const char *path,
         */
        obj->efile.obj_buf = obj_buf;
        obj->efile.obj_buf_sz = obj_buf_sz;
+       obj->efile.maps_shndx = -1;
 
        obj->loaded = false;
 
@@ -529,12 +531,12 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,
 }
 
 static int
-bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
+bpf_object__init_maps_name(struct bpf_object *obj)
 {
        int i;
        Elf_Data *symbols = obj->efile.symbols;
 
-       if (!symbols || maps_shndx < 0)
+       if (!symbols || obj->efile.maps_shndx < 0)
                return -EINVAL;
 
        for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
@@ -544,7 +546,7 @@ bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
 
                if (!gelf_getsym(symbols, i, &sym))
                        continue;
-               if (sym.st_shndx != maps_shndx)
+               if (sym.st_shndx != obj->efile.maps_shndx)
                        continue;
 
                map_name = elf_strptr(obj->efile.elf,
@@ -572,7 +574,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
        Elf *elf = obj->efile.elf;
        GElf_Ehdr *ep = &obj->efile.ehdr;
        Elf_Scn *scn = NULL;
-       int idx = 0, err = 0, maps_shndx = -1;
+       int idx = 0, err = 0;
 
        /* Elf is corrupted/truncated, avoid calling elf_strptr. */
        if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) {
@@ -625,7 +627,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                else if (strcmp(name, "maps") == 0) {
                        err = bpf_object__init_maps(obj, data->d_buf,
                                                    data->d_size);
-                       maps_shndx = idx;
+                       obj->efile.maps_shndx = idx;
                } else if (sh.sh_type == SHT_SYMTAB) {
                        if (obj->efile.symbols) {
                                pr_warning("bpf: multiple SYMTAB in %s\n",
@@ -674,8 +676,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                pr_warning("Corrupted ELF file: index of strtab invalid\n");
                return LIBBPF_ERRNO__FORMAT;
        }
-       if (maps_shndx >= 0)
-               err = bpf_object__init_maps_name(obj, maps_shndx);
+       if (obj->efile.maps_shndx >= 0)
+               err = bpf_object__init_maps_name(obj);
 out:
        return err;
 }
@@ -697,7 +699,8 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx)
 static int
 bpf_program__collect_reloc(struct bpf_program *prog,
                           size_t nr_maps, GElf_Shdr *shdr,
-                          Elf_Data *data, Elf_Data *symbols)
+                          Elf_Data *data, Elf_Data *symbols,
+                          int maps_shndx)
 {
        int i, nrels;
 
@@ -724,9 +727,6 @@ bpf_program__collect_reloc(struct bpf_program *prog,
                        return -LIBBPF_ERRNO__FORMAT;
                }
 
-               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
-               pr_debug("relocation: insn_idx=%u\n", insn_idx);
-
                if (!gelf_getsym(symbols,
                                 GELF_R_SYM(rel.r_info),
                                 &sym)) {
@@ -735,6 +735,15 @@ bpf_program__collect_reloc(struct bpf_program *prog,
                        return -LIBBPF_ERRNO__FORMAT;
                }
 
+               if (sym.st_shndx != maps_shndx) {
+                       pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n",
+                                  prog->section_name, sym.st_shndx);
+                       return -LIBBPF_ERRNO__RELOC;
+               }
+
+               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+               pr_debug("relocation: insn_idx=%u\n", insn_idx);
+
                if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
                        pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n",
                                   insn_idx, insns[insn_idx].code);
@@ -863,7 +872,8 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 
                err = bpf_program__collect_reloc(prog, nr_maps,
                                                 shdr, data,
-                                                obj->efile.symbols);
+                                                obj->efile.symbols,
+                                                obj->efile.maps_shndx);
                if (err)
                        return err;
        }
index 90d2baeb621a5265f703ecc0e3bc0cbde60ac1f4..1d57af56814b8671c10d9619202f042ad5c031d4 100644 (file)
@@ -100,7 +100,7 @@ include $(srctree)/tools/build/Makefile.include
 
 do_compile_shared_library =                    \
        ($(print_shared_lib_compile)            \
-       $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -s $@ liblockdep.so))
+       $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so))
 
 do_build_static_lib =                          \
        ($(print_static_lib_build)              \
index 9be663340f0a4c48e9ec51c996b13c3a8b596fdc..d1c89cc06f5f6710471c42dc5b2a475a54ee7658 100644 (file)
@@ -11,11 +11,6 @@ static __thread struct task_struct current_obj;
 bool debug_locks = true;
 bool debug_locks_silent;
 
-__attribute__((constructor)) static void liblockdep_init(void)
-{
-       lockdep_init();
-}
-
 __attribute__((destructor)) static void liblockdep_exit(void)
 {
        debug_check_no_locks_held();
index a60c14b9662aefd6f17b47dec534bce2f53d9f03..6e66277ec4375c84693b25756fdf8336faf79fcf 100644 (file)
@@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 void lock_release(struct lockdep_map *lock, int nested,
                        unsigned long ip);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void lockdep_init(void);
 
 #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
        { .name = (_name), .key = (void *)(_key), }
index f42b7e9aa48f2661fb94fced614db8f443eca9c4..a0a2e3a266af8122fed3d66b3f5a537ab3c1c773 100644 (file)
@@ -1,2 +1,8 @@
 #include <linux/lockdep.h>
+
+/* Trivial API wrappers, we don't (yet) have RCU in user-space: */
+#define hlist_for_each_entry_rcu       hlist_for_each_entry
+#define hlist_add_head_rcu             hlist_add_head
+#define hlist_del_rcu                  hlist_del
+
 #include "../../../kernel/locking/lockdep.c"
index 21cdf869a01b8b7facc084c4dc66cf73afa163bb..52844847569c99b878c031da8f25a13d1c0b5838 100644 (file)
@@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void)
        ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock");
 #endif
 
-       lockdep_init();
-
        __init_state = done;
 }
index 0f782ff404ac65999c8689555967eabb9c6f0e52..18211a5f354fe53e2a0650fe1ca9150956fd5b7a 100644 (file)
@@ -1,13 +1,13 @@
 #include <liblockdep/mutex.h>
 
-void main(void)
+int main(void)
 {
-       pthread_mutex_t a, b;
+       pthread_mutex_t a;
 
        pthread_mutex_init(&a, NULL);
-       pthread_mutex_init(&b, NULL);
 
        pthread_mutex_lock(&a);
-       pthread_mutex_lock(&b);
        pthread_mutex_lock(&a);
+
+       return 0;
 }
diff --git a/tools/lib/lockdep/tests/ABA.c b/tools/lib/lockdep/tests/ABA.c
new file mode 100644 (file)
index 0000000..0f782ff
--- /dev/null
@@ -0,0 +1,13 @@
+#include <liblockdep/mutex.h>
+
+void main(void)
+{
+       pthread_mutex_t a, b;
+
+       pthread_mutex_init(&a, NULL);
+       pthread_mutex_init(&b, NULL);
+
+       pthread_mutex_lock(&a);
+       pthread_mutex_lock(&b);
+       pthread_mutex_lock(&a);
+}
diff --git a/tools/lib/lockdep/tests/ABBA_2threads.c b/tools/lib/lockdep/tests/ABBA_2threads.c
new file mode 100644 (file)
index 0000000..cd807d7
--- /dev/null
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <pthread.h>
+
+pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
+pthread_barrier_t bar;
+
+void *ba_lock(void *arg)
+{
+       int ret, i;
+
+       pthread_mutex_lock(&b);
+
+       if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD)
+               pthread_barrier_destroy(&bar);
+
+       pthread_mutex_lock(&a);
+
+       pthread_mutex_unlock(&a);
+       pthread_mutex_unlock(&b);
+}
+
+int main(void)
+{
+       pthread_t t;
+
+       pthread_barrier_init(&bar, NULL, 2);
+
+       if (pthread_create(&t, NULL, ba_lock, NULL)) {
+               fprintf(stderr, "pthread_create() failed\n");
+               return 1;
+       }
+       pthread_mutex_lock(&a);
+
+       if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD)
+               pthread_barrier_destroy(&bar);
+
+       pthread_mutex_lock(&b);
+
+       pthread_mutex_unlock(&b);
+       pthread_mutex_unlock(&a);
+
+       pthread_join(t, NULL);
+
+       return 0;
+}
index 6386dc3182a0985a40551d0af980e53102e04c2a..fd3e56a83fc27025a9016cae2eaf1882c751cc01 100644 (file)
@@ -3,6 +3,7 @@
 
 #define __used         __attribute__((__unused__))
 #define unlikely
+#define READ_ONCE(x) (x)
 #define WRITE_ONCE(x, val) x=(val)
 #define RCU_INIT_POINTER(p, v) p=(v)
 
index c3bd294a63d1f7e4a75b2a950bbe398f33703cd6..190cc886ab9105ea6be58ad58caf8e940274c4a9 100644 (file)
@@ -1951,6 +1951,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok)
                   strcmp(token, "*") == 0 ||
                   strcmp(token, "^") == 0 ||
                   strcmp(token, "/") == 0 ||
+                  strcmp(token, "%") == 0 ||
                   strcmp(token, "<") == 0 ||
                   strcmp(token, ">") == 0 ||
                   strcmp(token, "<=") == 0 ||
@@ -2397,6 +2398,12 @@ static int arg_num_eval(struct print_arg *arg, long long *val)
                                break;
                        *val = left + right;
                        break;
+               case '~':
+                       ret = arg_num_eval(arg->op.right, &right);
+                       if (!ret)
+                               break;
+                       *val = ~right;
+                       break;
                default:
                        do_warning("unknown op '%s'", arg->op.op);
                        ret = 0;
@@ -2634,6 +2641,7 @@ process_hex(struct event_format *event, struct print_arg *arg, char **tok)
 
 free_field:
        free_arg(arg->hex.field);
+       arg->hex.field = NULL;
 out:
        *tok = NULL;
        return EVENT_ERROR;
@@ -2658,8 +2666,10 @@ process_int_array(struct event_format *event, struct print_arg *arg, char **tok)
 
 free_size:
        free_arg(arg->int_array.count);
+       arg->int_array.count = NULL;
 free_field:
        free_arg(arg->int_array.field);
+       arg->int_array.field = NULL;
 out:
        *tok = NULL;
        return EVENT_ERROR;
@@ -3689,6 +3699,9 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg
                case '/':
                        val = left / right;
                        break;
+               case '%':
+                       val = left % right;
+                       break;
                case '*':
                        val = left * right;
                        break;
@@ -4971,7 +4984,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
                                                break;
                                        }
                                }
-                               if (pevent->long_size == 8 && ls &&
+                               if (pevent->long_size == 8 && ls == 1 &&
                                    sizeof(long) != 8) {
                                        char *p;
 
@@ -5335,41 +5348,45 @@ static bool is_timestamp_in_us(char *trace_clock, bool use_trace_clock)
        return false;
 }
 
-void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
-                       struct pevent_record *record, bool use_trace_clock)
+/**
+ * pevent_find_event_by_record - return the event from a given record
+ * @pevent: a handle to the pevent
+ * @record: The record to get the event from
+ *
+ * Returns the associated event for a given record, or NULL if none
+ * is found.
+ */
+struct event_format *
+pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record)
 {
-       static const char *spaces = "                    "; /* 20 spaces */
-       struct event_format *event;
-       unsigned long secs;
-       unsigned long usecs;
-       unsigned long nsecs;
-       const char *comm;
-       void *data = record->data;
        int type;
-       int pid;
-       int len;
-       int p;
-       bool use_usec_format;
-
-       use_usec_format = is_timestamp_in_us(pevent->trace_clock,
-                                                       use_trace_clock);
-       if (use_usec_format) {
-               secs = record->ts / NSECS_PER_SEC;
-               nsecs = record->ts - secs * NSECS_PER_SEC;
-       }
 
        if (record->size < 0) {
                do_warning("ug! negative record size %d", record->size);
-               return;
+               return NULL;
        }
 
-       type = trace_parse_common_type(pevent, data);
+       type = trace_parse_common_type(pevent, record->data);
 
-       event = pevent_find_event(pevent, type);
-       if (!event) {
-               do_warning("ug! no event found for type %d", type);
-               return;
-       }
+       return pevent_find_event(pevent, type);
+}
+
+/**
+ * pevent_print_event_task - Write the event task comm, pid and CPU
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ *
+ * Writes the task's comm, pid and CPU to @s.
+ */
+void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record)
+{
+       void *data = record->data;
+       const char *comm;
+       int pid;
 
        pid = parse_common_pid(pevent, data);
        comm = find_cmdline(pevent, pid);
@@ -5377,9 +5394,43 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
        if (pevent->latency_format) {
                trace_seq_printf(s, "%8.8s-%-5d %3d",
                       comm, pid, record->cpu);
-               pevent_data_lat_fmt(pevent, s, record);
        } else
                trace_seq_printf(s, "%16s-%-5d [%03d]", comm, pid, record->cpu);
+}
+
+/**
+ * pevent_print_event_time - Write the event timestamp
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ * @use_trace_clock: Set to parse according to the @pevent->trace_clock
+ *
+ * Writes the timestamp of the record into @s.
+ */
+void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record,
+                            bool use_trace_clock)
+{
+       unsigned long secs;
+       unsigned long usecs;
+       unsigned long nsecs;
+       int p;
+       bool use_usec_format;
+
+       use_usec_format = is_timestamp_in_us(pevent->trace_clock,
+                                                       use_trace_clock);
+       if (use_usec_format) {
+               secs = record->ts / NSECS_PER_SEC;
+               nsecs = record->ts - secs * NSECS_PER_SEC;
+       }
+
+       if (pevent->latency_format) {
+               trace_seq_printf(s, " %3d", record->cpu);
+               pevent_data_lat_fmt(pevent, s, record);
+       } else
+               trace_seq_printf(s, " [%03d]", record->cpu);
 
        if (use_usec_format) {
                if (pevent->flags & PEVENT_NSEC_OUTPUT) {
@@ -5387,14 +5438,36 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
                        p = 9;
                } else {
                        usecs = (nsecs + 500) / NSECS_PER_USEC;
+                       /* To avoid usecs larger than 1 sec */
+                       if (usecs >= 1000000) {
+                               usecs -= 1000000;
+                               secs++;
+                       }
                        p = 6;
                }
 
-               trace_seq_printf(s, " %5lu.%0*lu: %s: ",
-                                       secs, p, usecs, event->name);
+               trace_seq_printf(s, " %5lu.%0*lu:", secs, p, usecs);
        } else
-               trace_seq_printf(s, " %12llu: %s: ",
-                                       record->ts, event->name);
+               trace_seq_printf(s, " %12llu:", record->ts);
+}
+
+/**
+ * pevent_print_event_data - Write the event data section
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ *
+ * Writes the parsing of the record's data to @s.
+ */
+void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record)
+{
+       static const char *spaces = "                    "; /* 20 spaces */
+       int len;
+
+       trace_seq_printf(s, " %s: ", event->name);
 
        /* Space out the event names evenly. */
        len = strlen(event->name);
@@ -5404,6 +5477,23 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
        pevent_event_info(s, event, record);
 }
 
+void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
+                       struct pevent_record *record, bool use_trace_clock)
+{
+       struct event_format *event;
+
+       event = pevent_find_event_by_record(pevent, record);
+       if (!event) {
+               do_warning("ug! no event found for type %d",
+                          trace_parse_common_type(pevent, record->data));
+               return;
+       }
+
+       pevent_print_event_task(pevent, s, event, record);
+       pevent_print_event_time(pevent, s, event, record, use_trace_clock);
+       pevent_print_event_data(pevent, s, event, record);
+}
+
 static int events_id_cmp(const void *a, const void *b)
 {
        struct event_format * const * ea = a;
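
The split above turns pevent_print_event() into a thin wrapper: look up the event once, then emit the task, time and data portions through separate helpers. A minimal sketch of a caller that wants only the task and data parts, assuming the usual libtraceevent setup (a pevent handle, a trace_seq and a pevent_record); the helper name is hypothetical:

	#include "event-parse.h"

	/* Hypothetical helper: print only the task and data parts of a record. */
	static void print_task_and_data(struct pevent *pevent, struct trace_seq *s,
					struct pevent_record *record)
	{
		struct event_format *event;

		/* Resolve the record's event type via the new lookup helper. */
		event = pevent_find_event_by_record(pevent, record);
		if (!event)
			return;

		pevent_print_event_task(pevent, s, event, record); /* comm, pid, CPU */
		pevent_print_event_data(pevent, s, event, record); /* parsed payload */
	}

Existing callers of pevent_print_event() keep their old output, since it now just chains the three helpers in order.
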
index 706d9bc24066cf308ba17d718eafd95c4ff5550c..9ffde377e89d912ffb2127fe454872346759738b 100644 (file)
@@ -628,6 +628,16 @@ int pevent_register_print_string(struct pevent *pevent, const char *fmt,
                                 unsigned long long addr);
 int pevent_pid_is_registered(struct pevent *pevent, int pid);
 
+void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record);
+void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record,
+                            bool use_trace_clock);
+void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record);
 void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
                        struct pevent_record *record, bool use_trace_clock);
 
@@ -694,6 +704,9 @@ struct event_format *pevent_find_event(struct pevent *pevent, int id);
 struct event_format *
 pevent_find_event_by_name(struct pevent *pevent, const char *sys, const char *name);
 
+struct event_format *
+pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record);
+
 void pevent_data_lat_fmt(struct pevent *pevent,
                         struct trace_seq *s, struct pevent_record *record);
 int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
index b9ca1e304158da86f36dafad33e7de0a52e01567..15949e2a7805cc6e7d848ddbf2f99b249e332f3b 100644 (file)
@@ -8,7 +8,7 @@ perf-config - Get and set variables in a configuration file.
 SYNOPSIS
 --------
 [verse]
-'perf config' -l | --list
+'perf config' [<file-option>] -l | --list
 
 DESCRIPTION
 -----------
@@ -21,6 +21,14 @@ OPTIONS
 --list::
        Show current config variables, name and value, for all sections.
 
+--user::
+       For writing and reading options: write to the user
+       '$HOME/.perfconfig' file or read from it.
+
+--system::
+       For writing and reading options: write to the system-wide
+       '$(sysconfdir)/perfconfig' file or read from it.
+
 CONFIGURATION FILE
 ------------------
 
@@ -30,6 +38,10 @@ The '$HOME/.perfconfig' file is used to store a per-user configuration.
 The file '$(sysconfdir)/perfconfig' can be used to
 store a system-wide default configuration.
 
+When reading or writing, the values are read from the system and user
+configuration files by default, and options '--system' and '--user'
+can be used to tell the command to read from or write to only that location.
+
 Syntax
 ~~~~~~
 
@@ -62,7 +74,7 @@ Given a $HOME/.perfconfig like this:
                medium = green, default
                normal = lightgray, default
                selected = white, lightgray
-               code = blue, default
+               jump_arrows = blue, default
                addr = magenta, default
                root = white, blue
 
@@ -98,6 +110,347 @@ Given a $HOME/.perfconfig like this:
                order = caller
                sort-key = function
 
+Variables
+~~~~~~~~~
+
+colors.*::
+       The variables for customizing the colors used in the output of
+       'report', 'top' and 'annotate' in the TUI. They should specify the
+       foreground and background colors, separated by a comma, for example:
+
+               medium = green, lightgray
+
+       If you want to use the color configured for your terminal, just leave it
+       as 'default', for example:
+
+               medium = default, lightgray
+
+       Available colors:
+       red, yellow, green, cyan, gray, black, blue,
+       white, default, magenta, lightgray
+
+       colors.top::
+               'top' means an overhead percentage which is more than 5%.
+               The values of this variable specify the colors used for such
+               percentages. The defaults are foreground-color 'red' and
+               background-color 'default'.
+       colors.medium::
+               'medium' means an overhead percentage of more than 0.5%.
+               Default values are 'green' and 'default'.
+       colors.normal::
+               'normal' means the rest of the overhead percentages,
+               except 'top', 'medium' and 'selected'.
+               Default values are 'lightgray' and 'default'.
+       colors.selected::
+               This selects the colors for the current entry in a list of entries
+               from sub-commands (top, report, annotate).
+               Default values are 'black' and 'lightgray'.
+       colors.jump_arrows::
+               Colors for jump arrows on assembly code listings
+               such as 'jns', 'jmp', 'jane', etc.
+               Default values are 'blue', 'default'.
+       colors.addr::
+               This selects colors for addresses from 'annotate'.
+               Default values are 'magenta', 'default'.
+       colors.root::
+               Colors for headers in the output of sub-commands (top, report).
+               Default values are 'white', 'blue'.
+
+tui.*, gtk.*::
+       Subcommands that can be configured here are 'top', 'report' and 'annotate'.
+       These values are booleans, for example:
+
+       [tui]
+               top = true
+
+       will make the TUI the default for the 'top' subcommand. These are only
+       available if the required libs were detected at tool build time.
+
+buildid.*::
+       buildid.dir::
+               Each executable and shared library in modern distributions comes with a
+               content-based identifier that, if available, will be inserted in the
+               'perf.data' file header so that, at analysis time, what is needed for
+               symbol resolution, code annotation, etc. can be found.
+
+               The recording tools also store a hard link or copy of the binaries,
+               shared libraries, /proc/kallsyms and /proc/kcore files in a per-user
+               directory, $HOME/.debug/, to be used at analysis time.
+
+               The buildid.dir variable can be used to either change this directory
+               cache location, or to disable it altogether. If you want to disable it,
+               set buildid.dir to /dev/null. The default is $HOME/.debug
+
+annotate.*::
+       These options work only in the TUI.
+       They control how addresses, jump arrows and source code are shown
+       alongside the assembly code lines of a specific program.
+
+       annotate.hide_src_code::
+               If the analyzed program has source code, this option controls
+               whether 'annotate' prints the assembly listing together with it.
+               For example, take the following four lines of a program.
+               If this option is 'true', they are printed
+               without the source code, as below.
+
+               │        push   %rbp
+               │        mov    %rsp,%rbp
+               │        sub    $0x10,%rsp
+               │        mov    (%rdi),%rdx
+
+               But if this option is 'false', the source code of that part
+               is also printed, as below. Default is 'false'.
+
+               │      struct rb_node *rb_next(const struct rb_node *node)
+               │      {
+               │        push   %rbp
+               │        mov    %rsp,%rbp
+               │        sub    $0x10,%rsp
+               │              struct rb_node *parent;
+               │
+               │              if (RB_EMPTY_NODE(node))
+               │        mov    (%rdi),%rdx
+               │              return n;
+
+        annotate.use_offset::
+               Offsets based on the first address of the loaded function can be used.
+               Instead of using the original addresses of the assembly code,
+               addresses relative to that base address can be printed.
+               As an example,
+               if the base address is 0xffffffff81624d50 as below,
+
+               ffffffff81624d50 <load0>
+
+               an address in the assembly code is shown as an absolute address, as below
+
+               ffffffff816250b8:│  mov    0x8(%r14),%rdi
+
+               but if use_offset is 'true', the offset from the base address is printed instead.
+               Default is 'true'. This option only applies to the TUI.
+
+                            368:│  mov    0x8(%r14),%rdi
+
+       annotate.jump_arrows::
+               There can be jump instructions in the assembly code.
+               Depending on the boolean value of jump_arrows,
+               arrows representing where the instructions jump to
+               are printed or not, as below.
+
+               │     ┌──jmp    1333
+               │     │  xchg   %ax,%ax
+               │1330:│  mov    %r15,%r10
+               │1333:└─→cmp    %r15,%r14
+
+               If jump_arrows is 'false', the arrows aren't printed, as below.
+               Default is 'false'.
+
+               │      ↓ jmp    1333
+               │        xchg   %ax,%ax
+               │1330:   mov    %r15,%r10
+               │1333:   cmp    %r15,%r14
+
+        annotate.show_linenr::
+               When showing source code, if this option is 'true',
+               line numbers are printed as below.
+
+               │1628         if (type & PERF_SAMPLE_IDENTIFIER) {
+               │     ↓ jne    508
+               │1628                 data->id = *array;
+               │1629                 array++;
+               │1630         }
+
+               However, if this option is 'false', they aren't printed, as below.
+               Default is 'false'.
+
+               │             if (type & PERF_SAMPLE_IDENTIFIER) {
+               │     ↓ jne    508
+               │                     data->id = *array;
+               │                     array++;
+               │             }
+
+        annotate.show_nr_jumps::
+               Let's see a part of assembly code.
+
+               │1382:   movb   $0x1,-0x270(%rbp)
+
+               If this option is 'true', the number of branches jumping to that address is printed as below.
+               Default is 'false'.
+
+               │1 1382:   movb   $0x1,-0x270(%rbp)
+
+        annotate.show_total_period::
+               To compare two records on an instruction basis, this option
+               displays the total number of samples that belong to a line
+               of assembly code. If this option is 'true', total periods are printed
+               instead of percent values, as below.
+
+                 302 │      mov    %eax,%eax
+
+               But if this option is 'false', percent values for the overhead are
+               printed instead, as below. Default is 'false'.
+
+               99.93 │      mov    %eax,%eax
+
+hist.*::
+       hist.percentage::
+               This option controls the way the overhead of filtered entries is
+               calculated - that means it is effective only if there's a
+               filter (by comm, dso or symbol name). Consider the following example:
+
+                      Overhead  Symbols
+                      ........  .......
+                       33.33%     foo
+                       33.33%     bar
+                       33.33%     baz
+
+              This is the original overhead, and we'll filter out the first 'foo'
+              entry. The value 'relative' would increase the overhead of 'bar'
+              and 'baz' to 50.00% each, while 'absolute' would keep showing their
+              current overhead (33.33%).
+
+ui.*::
+       ui.show-headers::
+               This option controls the display of column headers (like 'Overhead' and 'Symbol')
+               in 'report' and 'top'. If this option is false, they are hidden.
+               This option only applies to the TUI.
+
+call-graph.*::
+       When the sub-commands 'top' and 'report' are used with -g/--children,
+       these options control the call-graph behavior.
+
+       call-graph.record-mode::
+               The record-mode can be 'fp' (frame pointer), 'dwarf' or 'lbr'.
+               The value 'dwarf' is effective only if perf detects the needed library
+               (libunwind or a recent version of libdw).
+               'lbr' only works for CPUs that support it.
+
+       call-graph.dump-size::
+               The size of the stack to dump in order to do post-unwinding. Default is 8192 (bytes).
+               When using dwarf as the record-mode, the default size will be used if omitted.
+
+       call-graph.print-type::
+               The print-type can be graph (graph absolute), fractal (graph relative),
+               flat or folded. This option controls the way overhead is shown for each callchain
+               entry. Consider the following example.
+
+                Overhead  Symbols
+                ........  .......
+                  40.00%  foo
+                          |
+                          ---foo
+                             |
+                             |--50.00%--bar
+                             |          main
+                             |
+                              --50.00%--baz
+                                        main
+
+               This output is in the 'fractal' format. 'foo' came from 'bar' and 'baz' exactly
+               half and half, so 'fractal' shows 50.00% for each
+               (meaning that it treats the total overhead of 'foo' as 100%).
+
+               'graph' uses the absolute overhead value of 'foo' as the total, so each of
+               the 'bar' and 'baz' callchains will have 20.00% of overhead.
+               If 'flat' is used, call chains are shown in a single, linear column.
+               'folded' means call chains are displayed on one line, separated by semicolons.
+
+       call-graph.order::
+               This option controls the print order of callchains. The default is
+               'callee', which means the callee is printed at the top, followed by its
+               caller and so on. 'caller' prints them in reverse order.
+
+               If this option is not set and report.children or top.children is
+               set to true (or the equivalent command line option is given),
+               the default value of this option is changed to 'caller' for the
+               execution of 'perf report' or 'perf top'. Other commands will
+               still default to 'callee'.
+
+       call-graph.sort-key::
+               Callchains are merged if they contain the same information.
+               The sort-key option determines how the callchains are compared.
+               The value of 'sort-key' can be 'function' or 'address'.
+               The default is 'function'.
+
+       call-graph.threshold::
+               When there are many callchains, printing all of them would produce a
+               huge amount of output. So perf omits small callchains under a certain
+               overhead (threshold), and this option controls that threshold.
+               Default is 0.5 (%). How the overhead is calculated depends on
+               call-graph.print-type.
+
+       call-graph.print-limit::
+               This is the maximum number of callchain lines printed for a single
+               histogram entry. Default is 0, which means no limit.
+
+report.*::
+       report.percent-limit::
+               This one is mostly the same as call-graph.threshold but works for
+               histogram entries. Entries having an overhead lower than this
+               percentage will not be printed. Default is '0'. If percent-limit
+               is '10', only entries which have more than 10% of overhead will be
+               printed.
+
+       report.queue-size::
+               This option sets up the maximum allocation size of the internal
+               event queue for ordering events. Default is 0, meaning no limit.
+
+       report.children::
+               'Children' means functions called from another function.
+               If this option is true, 'perf report' accumulates the callchains of children
+               and shows the (accumulated) total overhead as well as the 'Self' overhead.
+               Please refer to the 'perf report' manual. The default is 'true'.
+
+       report.group::
+               This option shows event group information together.
+               Example output with this turned on; notice that there is one column
+               per event in the group, ref-cycles and cycles:
+
+               # group: {ref-cycles,cycles}
+               # ========
+               #
+               # Samples: 7K of event 'anon group { ref-cycles, cycles }'
+               # Event count (approx.): 6876107743
+               #
+               #         Overhead  Command      Shared Object               Symbol
+               # ................  .......  .................  ...................
+               #
+                   99.84%  99.76%  noploop  noploop            [.] main
+                    0.07%   0.00%  noploop  ld-2.15.so         [.] strcmp
+                    0.03%   0.00%  noploop  [kernel.kallsyms]  [k] timerqueue_del
+
+top.*::
+       top.children::
+               Same as 'report.children'. So if it is enabled, the output of the 'top'
+               command will have a 'Children' overhead column as well as a 'Self' overhead
+               column by default.
+               The default is 'true'.
+
+man.*::
+       man.viewer::
+               This option assigns the tool used to view manual pages when the 'help'
+               subcommand is invoked. Supported tools are 'man', 'woman'
+               (with emacs client) and 'konqueror'. Default is 'man'.
+
+               A new man viewer tool can also be added using 'man.<tool>.cmd',
+               or a different path can be used via the 'man.<tool>.path' config option.
+
+pager.*::
+       pager.<subcommand>::
+               When the subcommand is run on stdio, this value determines whether
+               it uses a pager or not. Default is 'unspecified'.
+
+kmem.*::
+       kmem.default::
+               This option decides which allocator is to be analyzed if neither
+               '--slab' nor '--page' option is used. Default is 'slab'.
+
+record.*::
+       record.build-id::
+               This option can be 'cache', 'no-cache' or 'skip'.
+               'cache' is to post-process data and save/update the binaries into
+               the build-id cache (in ~/.debug). This is the default.
+               But if this option is 'no-cache', it will not update the build-id cache.
+               'skip' skips post-processing and does not update the cache.
+
 SEE ALSO
 --------
 linkperf:perf[1]
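
As an illustrative tie-in for the Variables section above, a $HOME/.perfconfig fragment using only variables and values documented there might look like this (a sketch, not a recommended setup):

	[annotate]
		use_offset = true
		jump_arrows = true

	[call-graph]
		record-mode = fp
		print-type = fractal
		order = callee
		threshold = 0.5

	[report]
		children = true
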
index 0b1cedeef895369c9c60aa0a1e6cbb718c65c509..87b2588d1cbdee6060a304be4f79015cb5889171 100644 (file)
@@ -53,6 +53,13 @@ include::itrace.txt[]
 --strip::
        Use with --itrace to strip out non-synthesized events.
 
+-j::
+--jit::
+       Process jitdump files by injecting the mmap records corresponding to jitted
+       functions. This option also generates the ELF images for each jitted function
+       found in the jitdump files captured in the input perf.data file. Use this option
+       if you are monitoring an environment using JIT runtimes, such as Java, DART or V8.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
index fbceb631387c31c44002080130b7ff6e282229d3..19aa17532a16709646dc52487c5f686675c2b658 100644 (file)
@@ -341,6 +341,12 @@ Specify vmlinux path which has debuginfo.
 --buildid-all::
 Record build-id of all DSOs regardless whether it's actually hit or not.
 
+--all-kernel::
+Configure all used events to run in kernel space.
+
+--all-user::
+Configure all used events to run in user space.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
index 8a301f6afb37ad855351fcd5fba1644d12dc2529..12113992ac9d0f5ceca0b003cec568717c4208e5 100644 (file)
@@ -117,6 +117,22 @@ OPTIONS
        And default sort keys are changed to comm, dso_from, symbol_from, dso_to
        and symbol_to, see '--branch-stack'.
 
+       If the --mem-mode option is used, the following sort keys are also available
+       (incompatible with --branch-stack):
+       symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
+
+       - symbol_daddr: name of data symbol being executed on at the time of sample
+       - dso_daddr: name of library or module containing the data being executed
+       on at the time of the sample
+       - locked: whether the bus was locked at the time of the sample
+       - tlb: type of tlb access for the data at the time of the sample
+       - mem: type of memory access for the data at the time of the sample
+       - snoop: type of snoop (if any) for the data at the time of the sample
+       - dcacheline: the cacheline the data address is on at the time of the sample
+
+       And the default sort keys are changed to local_weight, mem, sym, dso,
+       symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
+
        If the data file has tracepoint event(s), following (dynamic) sort keys
        are also available:
        trace, trace_fields, [<event>.]<field>[/raw]
@@ -151,22 +167,6 @@ OPTIONS
        By default, every sort keys not specified in -F will be appended
        automatically.
 
-       If --mem-mode option is used, following sort keys are also available
-       (incompatible with --branch-stack):
-       symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
-
-       - symbol_daddr: name of data symbol being executed on at the time of sample
-       - dso_daddr: name of library or module containing the data being executed
-       on at the time of sample
-       - locked: whether the bus was locked at the time of sample
-       - tlb: type of tlb access for the data at the time of sample
-       - mem: type of memory access for the data at the time of sample
-       - snoop: type of snoop (if any) for the data at the time of sample
-       - dcacheline: the cacheline the data address is on at the time of sample
-
-       And default sort keys are changed to local_weight, mem, sym, dso,
-       symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
-
 -p::
 --parent=<regex>::
         A regex filter to identify parent. The parent is a caller of this
@@ -351,7 +351,10 @@ OPTIONS
 
 --percent-limit::
        Do not show entries which have an overhead under that percent.
-       (Default: 0).
+       (Default: 0).  Note that this option also sets the percent limit (threshold)
+       of callchains.  However, the default value of the callchain threshold is
+       different from the default value for hist entries.  Please see the
+       --call-graph option for details.
 
 --percentage::
        Determine how to display the overhead percentage of filtered entries.
@@ -398,6 +401,9 @@ include::itrace.txt[]
 --raw-trace::
        When displaying traceevent output, do not use print fmt or plugins.
 
+--hierarchy::
+       Enable hierarchical output.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
index 52ef7a9d50aacbc3d3fbb0366852457d39718cdf..04f23b404bbc5a7bd4b5bcdfb7e833d79ba1234e 100644 (file)
@@ -69,6 +69,14 @@ report::
 --scale::
        scale/normalize counter values
 
+-d::
+--detailed::
+       print more detailed statistics, can be specified up to 3 times
+
+            -d:     detailed events, L1 and LLC data cache
+         -d -d:     more detailed events, dTLB and iTLB events
+      -d -d -d:     very detailed events, adding prefetch events
+
 -r::
 --repeat=<n>::
        repeat command and print average + stddev (max: 100). 0 means forever.
@@ -139,6 +147,10 @@ Print count deltas every N milliseconds (minimum: 10ms)
 The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals.  Use with caution.
        example: 'perf stat -I 1000 -e cycles -a sleep 5'
 
+--metric-only::
+Only print computed metrics. Print them in a single line.
+Don't show any raw values. Not supported with --per-thread.
+
 --per-socket::
 Aggregate counts per processor socket for system-wide mode measurements.  This
 is a useful mode to detect imbalance between sockets.  To enable this mode,
@@ -211,6 +223,29 @@ $ perf stat -- make -j
 
  Wall-clock time elapsed:   719.554352 msecs
 
+CSV FORMAT
+----------
+
+With -x, perf stat is able to output a not-quite-CSV format:
+commas in the output are not put into "". To make it easy to parse,
+it is recommended to use a different separator character, like -x \;
+
+The fields are in this order:
+
+       - optional usec time stamp in fractions of second (with -I xxx)
+       - optional CPU, core, or socket identifier
+       - optional number of logical CPUs aggregated
+       - counter value
+       - unit of the counter value or empty
+       - event name
+       - run time of counter
+       - percentage of measurement time the counter was running
+       - optional variance if multiple values are collected with -r
+       - optional metric value
+       - optional unit of metric
+
+Additional metrics may be printed with all earlier fields being empty.
+
 SEE ALSO
 --------
 linkperf:perf-top[1], linkperf:perf-list[1]
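
Tying the CSV notes above to the earlier interval example, an invocation using a semicolon as the separator could look like this (illustrative only):

	perf stat -x \; -I 1000 -e cycles -a sleep 5

Each printed line then follows the field order listed in the CSV FORMAT section, with the optional timestamp field present because of -I.
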
index b0e60e17db389d89efafa7025cd47fd44fe625c3..19f046f027cd81e42c5696ab3172539baaeb745d 100644 (file)
@@ -233,6 +233,9 @@ Default is to monitor all CPUS.
 --raw-trace::
        When displaying traceevent output, do not use print fmt or plugins.
 
+--hierarchy::
+       Enable hierarchy output.
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 
index 767ea2436e1cd841a762ee8cdb039ba80a7120b3..1d8d5bc4cd2de6601f926696d5c172b30fa735d4 100644 (file)
@@ -5,7 +5,7 @@
        medium = green, lightgray
        normal = black, lightgray
        selected = lightgray, magenta
-       code = blue, lightgray
+       jump_arrows = blue, lightgray
        addr = magenta, lightgray
 
 [tui]
index e0ce9573b79bd26ddfe8cb4aee6eade061a09a12..5950b5a24efdf6024ca58a2da084d9cd43295c12 100644 (file)
@@ -27,3 +27,4 @@ Skip collecing build-id when recording: perf record -B
 To change sampling frequency to 100 Hz: perf record -F 100
 See assembly instructions with percentage: perf annotate <symbol>
 If you prefer Intel style assembly, try: perf annotate -M intel
+For hierarchical output, try: perf report --hierarchy
index dcd9a70c7193b43b5791058b01a3773da6e0d870..32a64e6190288e5110cfa6de78138aceb955e7af 100644 (file)
@@ -68,6 +68,20 @@ all tags TAGS:
        $(print_msg)
        $(make)
 
+ifdef MAKECMDGOALS
+has_clean := 0
+ifneq ($(filter clean,$(MAKECMDGOALS)),)
+  has_clean := 1
+endif # clean
+
+ifeq ($(has_clean),1)
+  rest := $(filter-out clean,$(MAKECMDGOALS))
+  ifneq ($(rest),)
+$(rest): clean
+  endif # rest
+endif # has_clean
+endif # MAKECMDGOALS
+
 #
 # The clean target is not really parallel, don't print the jobs info:
 #
@@ -75,10 +89,17 @@ clean:
        $(make)
 
 #
-# The build-test target is not really parallel, don't print the jobs info:
+# The build-test target is not really parallel, don't print the jobs info,
+# it also uses only the tests/make targets that don't pollute the source
+# repository, i.e. that use O= or build the tarpkg outside the source
+# repo directories.
+#
+# For a full test, use:
+#
+# make -C tools/perf -f tests/make
 #
 build-test:
-       @$(MAKE) SHUF=1 -f tests/make --no-print-directory
+       @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out
 
 #
 # All other targets get passed through:
index 5d34815c7ccb8e02ac6d4db0d82d4712af73f343..4a4fad4182f534f23550ca5ac1ed5d42000f9b08 100644 (file)
@@ -58,6 +58,9 @@ include config/utilities.mak
 #
 # Define NO_LIBBIONIC if you do not want bionic support
 #
+# Define NO_LIBCRYPTO if you do not want libcrypto (openssl) support
+# used for generating build-ids for ELFs generated by jitdump.
+#
 # Define NO_LIBDW_DWARF_UNWIND if you do not want libdw support
 # for dwarf backtrace post unwind.
 #
@@ -136,6 +139,8 @@ $(call allow-override,CC,$(CROSS_COMPILE)gcc)
 $(call allow-override,AR,$(CROSS_COMPILE)ar)
 $(call allow-override,LD,$(CROSS_COMPILE)ld)
 
+LD += $(EXTRA_LDFLAGS)
+
 PKG_CONFIG = $(CROSS_COMPILE)pkg-config
 
 RM      = rm -f
@@ -165,7 +170,16 @@ ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
 endif
 endif
 
+# Set FEATURE_TESTS to 'all' so all possible feature checkers are executed.
+# Without this setting the output feature dump file misses some features, for
+# example, liberty. Select all checkers so we won't get an incomplete feature
+# dump file.
 ifeq ($(config),1)
+ifdef MAKECMDGOALS
+ifeq ($(filter feature-dump,$(MAKECMDGOALS)),feature-dump)
+FEATURE_TESTS := all
+endif
+endif
 include config/Makefile
 endif
 
@@ -618,7 +632,7 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean
        $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32
        $(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
                $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
-               $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c
+               $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c
        $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
        $(python-clean)
 
index 7fbca175099ec917ad69b8025c8249ee6c52a6a4..18b13518d8d8701137d489a65f7dd752bdc7f43f 100644 (file)
@@ -1,3 +1,4 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
+PERF_HAVE_JITDUMP := 1
index 7fbca175099ec917ad69b8025c8249ee6c52a6a4..18b13518d8d8701137d489a65f7dd752bdc7f43f 100644 (file)
@@ -1,3 +1,4 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
+PERF_HAVE_JITDUMP := 1
index 7fbca175099ec917ad69b8025c8249ee6c52a6a4..56e05f126ad8793d25bb1a70ce3dac5f100d3b3a 100644 (file)
@@ -1,3 +1,6 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
+
+HAVE_KVM_STAT_SUPPORT := 1
+PERF_HAVE_JITDUMP := 1
index 7b8b0d1a1b626065e0b414f42a7a998723e0e57f..c8fe2074d2177f0709f0e1d54f63cd77f84f2233 100644 (file)
@@ -1,5 +1,6 @@
 libperf-y += header.o
 libperf-y += sym-handling.o
+libperf-y += kvm-stat.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h b/tools/perf/arch/powerpc/util/book3s_hcalls.h
new file mode 100644 (file)
index 0000000..0dd6b7f
--- /dev/null
@@ -0,0 +1,123 @@
+#ifndef ARCH_PERF_BOOK3S_HV_HCALLS_H
+#define ARCH_PERF_BOOK3S_HV_HCALLS_H
+
+/*
+ * PowerPC HCALL codes : hcall code to name mapping
+ */
+#define kvm_trace_symbol_hcall \
+       {0x4, "H_REMOVE"},                                      \
+       {0x8, "H_ENTER"},                                       \
+       {0xc, "H_READ"},                                        \
+       {0x10, "H_CLEAR_MOD"},                                  \
+       {0x14, "H_CLEAR_REF"},                                  \
+       {0x18, "H_PROTECT"},                                    \
+       {0x1c, "H_GET_TCE"},                                    \
+       {0x20, "H_PUT_TCE"},                                    \
+       {0x24, "H_SET_SPRG0"},                                  \
+       {0x28, "H_SET_DABR"},                                   \
+       {0x2c, "H_PAGE_INIT"},                                  \
+       {0x30, "H_SET_ASR"},                                    \
+       {0x34, "H_ASR_ON"},                                     \
+       {0x38, "H_ASR_OFF"},                                    \
+       {0x3c, "H_LOGICAL_CI_LOAD"},                            \
+       {0x40, "H_LOGICAL_CI_STORE"},                           \
+       {0x44, "H_LOGICAL_CACHE_LOAD"},                         \
+       {0x48, "H_LOGICAL_CACHE_STORE"},                        \
+       {0x4c, "H_LOGICAL_ICBI"},                               \
+       {0x50, "H_LOGICAL_DCBF"},                               \
+       {0x54, "H_GET_TERM_CHAR"},                              \
+       {0x58, "H_PUT_TERM_CHAR"},                              \
+       {0x5c, "H_REAL_TO_LOGICAL"},                            \
+       {0x60, "H_HYPERVISOR_DATA"},                            \
+       {0x64, "H_EOI"},                                        \
+       {0x68, "H_CPPR"},                                       \
+       {0x6c, "H_IPI"},                                        \
+       {0x70, "H_IPOLL"},                                      \
+       {0x74, "H_XIRR"},                                       \
+       {0x78, "H_MIGRATE_DMA"},                                \
+       {0x7c, "H_PERFMON"},                                    \
+       {0xdc, "H_REGISTER_VPA"},                               \
+       {0xe0, "H_CEDE"},                                       \
+       {0xe4, "H_CONFER"},                                     \
+       {0xe8, "H_PROD"},                                       \
+       {0xec, "H_GET_PPP"},                                    \
+       {0xf0, "H_SET_PPP"},                                    \
+       {0xf4, "H_PURR"},                                       \
+       {0xf8, "H_PIC"},                                        \
+       {0xfc, "H_REG_CRQ"},                                    \
+       {0x100, "H_FREE_CRQ"},                                  \
+       {0x104, "H_VIO_SIGNAL"},                                \
+       {0x108, "H_SEND_CRQ"},                                  \
+       {0x110, "H_COPY_RDMA"},                                 \
+       {0x114, "H_REGISTER_LOGICAL_LAN"},                      \
+       {0x118, "H_FREE_LOGICAL_LAN"},                          \
+       {0x11c, "H_ADD_LOGICAL_LAN_BUFFER"},                    \
+       {0x120, "H_SEND_LOGICAL_LAN"},                          \
+       {0x124, "H_BULK_REMOVE"},                               \
+       {0x130, "H_MULTICAST_CTRL"},                            \
+       {0x134, "H_SET_XDABR"},                                 \
+       {0x138, "H_STUFF_TCE"},                                 \
+       {0x13c, "H_PUT_TCE_INDIRECT"},                          \
+       {0x14c, "H_CHANGE_LOGICAL_LAN_MAC"},                    \
+       {0x150, "H_VTERM_PARTNER_INFO"},                        \
+       {0x154, "H_REGISTER_VTERM"},                            \
+       {0x158, "H_FREE_VTERM"},                                \
+       {0x15c, "H_RESET_EVENTS"},                              \
+       {0x160, "H_ALLOC_RESOURCE"},                            \
+       {0x164, "H_FREE_RESOURCE"},                             \
+       {0x168, "H_MODIFY_QP"},                                 \
+       {0x16c, "H_QUERY_QP"},                                  \
+       {0x170, "H_REREGISTER_PMR"},                            \
+       {0x174, "H_REGISTER_SMR"},                              \
+       {0x178, "H_QUERY_MR"},                                  \
+       {0x17c, "H_QUERY_MW"},                                  \
+       {0x180, "H_QUERY_HCA"},                                 \
+       {0x184, "H_QUERY_PORT"},                                \
+       {0x188, "H_MODIFY_PORT"},                               \
+       {0x18c, "H_DEFINE_AQP1"},                               \
+       {0x190, "H_GET_TRACE_BUFFER"},                          \
+       {0x194, "H_DEFINE_AQP0"},                               \
+       {0x198, "H_RESIZE_MR"},                                 \
+       {0x19c, "H_ATTACH_MCQP"},                               \
+       {0x1a0, "H_DETACH_MCQP"},                               \
+       {0x1a4, "H_CREATE_RPT"},                                \
+       {0x1a8, "H_REMOVE_RPT"},                                \
+       {0x1ac, "H_REGISTER_RPAGES"},                           \
+       {0x1b0, "H_DISABLE_AND_GETC"},                          \
+       {0x1b4, "H_ERROR_DATA"},                                \
+       {0x1b8, "H_GET_HCA_INFO"},                              \
+       {0x1bc, "H_GET_PERF_COUNT"},                            \
+       {0x1c0, "H_MANAGE_TRACE"},                              \
+       {0x1d4, "H_FREE_LOGICAL_LAN_BUFFER"},                   \
+       {0x1d8, "H_POLL_PENDING"},                              \
+       {0x1e4, "H_QUERY_INT_STATE"},                           \
+       {0x244, "H_ILLAN_ATTRIBUTES"},                          \
+       {0x250, "H_MODIFY_HEA_QP"},                             \
+       {0x254, "H_QUERY_HEA_QP"},                              \
+       {0x258, "H_QUERY_HEA"},                                 \
+       {0x25c, "H_QUERY_HEA_PORT"},                            \
+       {0x260, "H_MODIFY_HEA_PORT"},                           \
+       {0x264, "H_REG_BCMC"},                                  \
+       {0x268, "H_DEREG_BCMC"},                                \
+       {0x26c, "H_REGISTER_HEA_RPAGES"},                       \
+       {0x270, "H_DISABLE_AND_GET_HEA"},                       \
+       {0x274, "H_GET_HEA_INFO"},                              \
+       {0x278, "H_ALLOC_HEA_RESOURCE"},                        \
+       {0x284, "H_ADD_CONN"},                                  \
+       {0x288, "H_DEL_CONN"},                                  \
+       {0x298, "H_JOIN"},                                      \
+       {0x2a4, "H_VASI_STATE"},                                \
+       {0x2b0, "H_ENABLE_CRQ"},                                \
+       {0x2b8, "H_GET_EM_PARMS"},                              \
+       {0x2d0, "H_SET_MPP"},                                   \
+       {0x2d4, "H_GET_MPP"},                                   \
+       {0x2ec, "H_HOME_NODE_ASSOCIATIVITY"},                   \
+       {0x2f4, "H_BEST_ENERGY"},                               \
+       {0x2fc, "H_XIRR_X"},                                    \
+       {0x300, "H_RANDOM"},                                    \
+       {0x304, "H_COP"},                                       \
+       {0x314, "H_GET_MPP_X"},                                 \
+       {0x31c, "H_SET_MODE"},                                  \
+       {0xf000, "H_RTAS"}                                      \
+
+#endif
diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
new file mode 100644 (file)
index 0000000..e68ba2d
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef ARCH_PERF_BOOK3S_HV_EXITS_H
+#define ARCH_PERF_BOOK3S_HV_EXITS_H
+
+/*
+ * PowerPC Interrupt vectors : exit code to name mapping
+ */
+
+#define kvm_trace_symbol_exit \
+       {0x0,   "RETURN_TO_HOST"}, \
+       {0x100, "SYSTEM_RESET"}, \
+       {0x200, "MACHINE_CHECK"}, \
+       {0x300, "DATA_STORAGE"}, \
+       {0x380, "DATA_SEGMENT"}, \
+       {0x400, "INST_STORAGE"}, \
+       {0x480, "INST_SEGMENT"}, \
+       {0x500, "EXTERNAL"}, \
+       {0x501, "EXTERNAL_LEVEL"}, \
+       {0x502, "EXTERNAL_HV"}, \
+       {0x600, "ALIGNMENT"}, \
+       {0x700, "PROGRAM"}, \
+       {0x800, "FP_UNAVAIL"}, \
+       {0x900, "DECREMENTER"}, \
+       {0x980, "HV_DECREMENTER"}, \
+       {0xc00, "SYSCALL"}, \
+       {0xd00, "TRACE"}, \
+       {0xe00, "H_DATA_STORAGE"}, \
+       {0xe20, "H_INST_STORAGE"}, \
+       {0xe40, "H_EMUL_ASSIST"}, \
+       {0xf00, "PERFMON"}, \
+       {0xf20, "ALTIVEC"}, \
+       {0xf40, "VSX"}
+
+#endif
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
new file mode 100644 (file)
index 0000000..74eee30
--- /dev/null
@@ -0,0 +1,170 @@
+#include "util/kvm-stat.h"
+#include "util/parse-events.h"
+#include "util/debug.h"
+
+#include "book3s_hv_exits.h"
+#include "book3s_hcalls.h"
+
+#define NR_TPS 4
+
+const char *vcpu_id_str = "vcpu_id";
+const int decode_str_len = 40;
+const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter";
+const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit";
+
+define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit);
+define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall);
+
+/* Tracepoints specific to ppc_book3s_hv */
+const char *ppc_book3s_hv_kvm_tp[] = {
+       "kvm_hv:kvm_guest_enter",
+       "kvm_hv:kvm_guest_exit",
+       "kvm_hv:kvm_hcall_enter",
+       "kvm_hv:kvm_hcall_exit",
+       NULL,
+};
+
+/* 1 extra placeholder for NULL */
+const char *kvm_events_tp[NR_TPS + 1];
+const char *kvm_exit_reason;
+
+static void hcall_event_get_key(struct perf_evsel *evsel,
+                               struct perf_sample *sample,
+                               struct event_key *key)
+{
+       key->info = 0;
+       key->key = perf_evsel__intval(evsel, sample, "req");
+}
+
+static const char *get_hcall_exit_reason(u64 exit_code)
+{
+       struct exit_reasons_table *tbl = hcall_reasons;
+
+       while (tbl->reason != NULL) {
+               if (tbl->exit_code == exit_code)
+                       return tbl->reason;
+               tbl++;
+       }
+
+       pr_debug("Unknown hcall code: %lld\n",
+              (unsigned long long)exit_code);
+       return "UNKNOWN";
+}
+
+static bool hcall_event_end(struct perf_evsel *evsel,
+                           struct perf_sample *sample __maybe_unused,
+                           struct event_key *key __maybe_unused)
+{
+       return (!strcmp(evsel->name, kvm_events_tp[3]));
+}
+
+static bool hcall_event_begin(struct perf_evsel *evsel,
+                             struct perf_sample *sample, struct event_key *key)
+{
+       if (!strcmp(evsel->name, kvm_events_tp[2])) {
+               hcall_event_get_key(evsel, sample, key);
+               return true;
+       }
+
+       return false;
+}
+static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+                                  struct event_key *key,
+                                  char *decode)
+{
+       const char *hcall_reason = get_hcall_exit_reason(key->key);
+
+       scnprintf(decode, decode_str_len, "%s", hcall_reason);
+}
+
+static struct kvm_events_ops hcall_events = {
+       .is_begin_event = hcall_event_begin,
+       .is_end_event = hcall_event_end,
+       .decode_key = hcall_event_decode_key,
+       .name = "HCALL-EVENT",
+};
+
+static struct kvm_events_ops exit_events = {
+       .is_begin_event = exit_event_begin,
+       .is_end_event = exit_event_end,
+       .decode_key = exit_event_decode_key,
+       .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+       { .name = "vmexit", .ops = &exit_events },
+       { .name = "hcall", .ops = &hcall_events },
+       { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+       NULL,
+};
+
+
+static int is_tracepoint_available(const char *str, struct perf_evlist *evlist)
+{
+       struct parse_events_error err;
+       int ret;
+
+       err.str = NULL;
+       ret = parse_events(evlist, str, &err);
+       if (err.str)
+               pr_err("%s : %s\n", str, err.str);
+       return ret;
+}
+
+static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
+                               struct perf_evlist *evlist)
+{
+       const char **events_ptr;
+       int i, nr_tp = 0, err = -1;
+
+       /* Check for book3s_hv tracepoints */
+       for (events_ptr = ppc_book3s_hv_kvm_tp; *events_ptr; events_ptr++) {
+               err = is_tracepoint_available(*events_ptr, evlist);
+               if (err)
+                       return -1;
+               nr_tp++;
+       }
+
+       for (i = 0; i < nr_tp; i++)
+               kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
+
+       kvm_events_tp[i] = NULL;
+       kvm_exit_reason = "trap";
+       kvm->exit_reasons = hv_exit_reasons;
+       kvm->exit_reasons_isa = "HV";
+
+       return 0;
+}
+
+/* Wrapper to setup kvm tracepoints */
+static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm)
+{
+       struct perf_evlist *evlist = perf_evlist__new();
+
+       if (evlist == NULL)
+               return -ENOMEM;
+
+       /* Right now, only supported on book3s_hv */
+       return ppc__setup_book3s_hv(kvm, evlist);
+}
+
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
+{
+       return ppc__setup_kvm_tp(kvm);
+}
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+       int ret;
+
+       ret = ppc__setup_kvm_tp(kvm);
+       if (ret) {
+               kvm->exit_reasons = NULL;
+               kvm->exit_reasons_isa = NULL;
+       }
+
+       return ret;
+}
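
With these tables and callbacks in place, the expected workflow on a POWER host running book3s_hv guests is the usual two-step sequence (illustrative; 'perf kvm stat' is the generic interface this file plugs into):

	perf kvm stat record -a sleep 10
	perf kvm stat report --event=vmexit

VM exits are decoded through the book3s_hv_exits.h table and hypervisor calls through book3s_hcalls.h; as the code notes, only book3s_hv is supported for now.
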
index a5dbc07ec9dc70900ed4a593f5979d419061fd35..ed57df2e6d687d89ccf95d9aa03c263411de7fe9 100644 (file)
@@ -10,7 +10,7 @@
  */
 
 #include "../../util/kvm-stat.h"
-#include <asm/kvm_perf.h>
+#include <asm/sie.h>
 
 define_exit_reasons_table(sie_exit_reasons, sie_intercept_code);
 define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes);
@@ -18,6 +18,12 @@ define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes);
 define_exit_reasons_table(sie_diagnose_codes, diagnose_codes);
 define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes);
 
+const char *vcpu_id_str = "id";
+const int decode_str_len = 40;
+const char *kvm_exit_reason = "icptcode";
+const char *kvm_entry_trace = "kvm:kvm_s390_sie_enter";
+const char *kvm_exit_trace = "kvm:kvm_s390_sie_exit";
+
 static void event_icpt_insn_get_key(struct perf_evsel *evsel,
                                    struct perf_sample *sample,
                                    struct event_key *key)
@@ -73,7 +79,7 @@ static struct kvm_events_ops exit_events = {
        .name = "VM-EXIT"
 };
 
-const char * const kvm_events_tp[] = {
+const char *kvm_events_tp[] = {
        "kvm:kvm_s390_sie_enter",
        "kvm:kvm_s390_sie_exit",
        "kvm:kvm_s390_intercept_instruction",
index 09ba923debe86810f8380f7df54504dee4232ec8..269af21437353b2fb886383ede1e3f9a2f586f25 100644 (file)
@@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1
 endif
 HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
+PERF_HAVE_JITDUMP := 1
index 7bb0d13c235f70326afc28d726f38f1d95499055..72193f19d6d75d3c32e19b424a0f1910ebbc24a6 100644 (file)
@@ -59,7 +59,7 @@ static u64 mmap_read_self(void *addr)
                u64 quot, rem;
 
                quot = (cyc >> time_shift);
-               rem = cyc & ((1 << time_shift) - 1);
+               rem = cyc & (((u64)1 << time_shift) - 1);
                delta = time_offset + quot * time_mult +
                        ((rem * time_mult) >> time_shift);
 
@@ -103,6 +103,7 @@ static int __test__rdpmc(void)
 
        sigfillset(&sa.sa_mask);
        sa.sa_sigaction = segfault_handler;
+       sa.sa_flags = 0;
        sigaction(SIGSEGV, &sa, NULL);
 
        fd = sys_perf_event_open(&attr, 0, -1, -1,
index 8d8150f1cf9bcf426b4c29a19170221db680a980..d66f9ad4df2ea5da6eca22cae351a842d1143508 100644 (file)
@@ -60,7 +60,9 @@ struct branch {
        u64 misc;
 };
 
-static size_t intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused)
 {
        return INTEL_BTS_AUXTRACE_PRIV_SIZE;
 }
index f05daacc9e7810117cfe25415a75990b3aa89f63..a3395179c9eebd5fcd39aa4796a0dc3d495f4b5d 100644 (file)
@@ -89,7 +89,7 @@ static int intel_pt_parse_terms_with_default(struct list_head *formats,
 
        *config = attr.config;
 out_free:
-       parse_events__free_terms(terms);
+       parse_events_terms__delete(terms);
        return err;
 }
 
@@ -273,7 +273,9 @@ intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu)
        return attr;
 }
 
-static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                       struct perf_evlist *evlist __maybe_unused)
 {
        return INTEL_PT_AUXTRACE_PRIV_SIZE;
 }
index 14e4e668fad733c1802908047ae524318aa586de..b63d4be655a24a678f69d6deea3b4ee87283b938 100644 (file)
@@ -1,5 +1,7 @@
 #include "../../util/kvm-stat.h"
-#include <asm/kvm_perf.h>
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
 
 define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
 define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS);
@@ -11,6 +13,12 @@ static struct kvm_events_ops exit_events = {
        .name = "VM-EXIT"
 };
 
+const char *vcpu_id_str = "vcpu_id";
+const int decode_str_len = 20;
+const char *kvm_exit_reason = "exit_reason";
+const char *kvm_entry_trace = "kvm:kvm_entry";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+
 /*
  * For the mmio events, we treat:
  * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
@@ -65,7 +73,7 @@ static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
                                  struct event_key *key,
                                  char *decode)
 {
-       scnprintf(decode, DECODE_STR_LEN, "%#lx:%s",
+       scnprintf(decode, decode_str_len, "%#lx:%s",
                  (unsigned long)key->key,
                  key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
 }
@@ -109,7 +117,7 @@ static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
                                    struct event_key *key,
                                    char *decode)
 {
-       scnprintf(decode, DECODE_STR_LEN, "%#llx:%s",
+       scnprintf(decode, decode_str_len, "%#llx:%s",
                  (unsigned long long)key->key,
                  key->info ? "POUT" : "PIN");
 }
@@ -121,7 +129,7 @@ static struct kvm_events_ops ioport_events = {
        .name = "IO Port Access"
 };
 
-const char * const kvm_events_tp[] = {
+const char *kvm_events_tp[] = {
        "kvm:kvm_entry",
        "kvm:kvm_exit",
        "kvm:kvm_mmio",
index e4c2c30143b95133913a2bc3569b2871b79aa75c..5c3cce082cb88c5cab984bdb1ae6944274861ad4 100644 (file)
@@ -1,6 +1,11 @@
+
+/* Various wrappers to make the kernel .S file build in user-space: */
+
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
+#define _ASM_EXTABLE_FAULT(x, y)
+
 #include "../../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
index cc5c1267c738da038cfb6b824f79854fab41a629..cfe366375c4b89bb3642f5531d04b418a8d76e56 100644 (file)
@@ -245,7 +245,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
                        hists__collapse_resort(hists, NULL);
                        /* Don't sort callchain */
                        perf_evsel__reset_sample_bit(pos, CALLCHAIN);
-                       hists__output_resort(hists, NULL);
+                       perf_evsel__output_resort(pos, NULL);
 
                        if (symbol_conf.event_group &&
                            !perf_evsel__is_group_leader(pos))
index d93bff7fc0e407d6518a6ce0e2ee0dd4b65bf910..632efc6b79a07e20c4b25f9757bcd9c089050270 100644 (file)
@@ -38,19 +38,7 @@ static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
 
 static int build_id_cache__kcore_dir(char *dir, size_t sz)
 {
-       struct timeval tv;
-       struct tm tm;
-       char dt[32];
-
-       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
-               return -1;
-
-       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
-               return -1;
-
-       scnprintf(dir, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
-
-       return 0;
+       return fetch_current_timestamp(dir, sz);
 }
 
 static bool same_kallsyms_reloc(const char *from_dir, char *to_dir)
index f04e804a9fadc6807a7bc74810fac5ddaad912b2..c42448ed5dfe20a74af9c671669aa23a25cadb5b 100644 (file)
 #include "util/util.h"
 #include "util/debug.h"
 
+static bool use_system_config, use_user_config;
+
 static const char * const config_usage[] = {
-       "perf config [options]",
+       "perf config [<file-option>] [options]",
        NULL
 };
 
@@ -25,6 +27,8 @@ enum actions {
 static struct option config_options[] = {
        OPT_SET_UINT('l', "list", &actions,
                     "show current config variables", ACTION_LIST),
+       OPT_BOOLEAN(0, "system", &use_system_config, "use system config file"),
+       OPT_BOOLEAN(0, "user", &use_user_config, "use user config file"),
        OPT_END()
 };
 
@@ -42,10 +46,23 @@ static int show_config(const char *key, const char *value,
 int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
 {
        int ret = 0;
+       char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
 
        argc = parse_options(argc, argv, config_options, config_usage,
                             PARSE_OPT_STOP_AT_NON_OPTION);
 
+       if (use_system_config && use_user_config) {
+               pr_err("Error: only one config file at a time\n");
+               parse_options_usage(config_usage, config_options, "user", 0);
+               parse_options_usage(NULL, config_options, "system", 0);
+               return -1;
+       }
+
+       if (use_system_config)
+               config_exclusive_filename = perf_etc_perfconfig();
+       else if (use_user_config)
+               config_exclusive_filename = user_config;
+
        switch (actions) {
        case ACTION_LIST:
                if (argc) {
@@ -53,9 +70,13 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
                        parse_options_usage(config_usage, config_options, "l", 1);
                } else {
                        ret = perf_config(show_config, NULL);
-                       if (ret < 0)
+                       if (ret < 0) {
+                               const char *config_filename = config_exclusive_filename;
+                               if (!config_exclusive_filename)
+                                       config_filename = user_config;
                                pr_err("Nothing configured, "
-                                      "please check your ~/.perfconfig file\n");
+                                      "please check your %s\n", config_filename);
+                       }
                }
                break;
        default:
index 36ccc2b8827fdd239b657e79b396ae013c318462..4d72359fd15ad00ca7397bb033e84b4f899546c2 100644 (file)
@@ -1264,8 +1264,6 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
        if (ret < 0)
                return ret;
 
-       perf_config(perf_default_config, NULL);
-
        argc = parse_options(argc, argv, options, diff_usage, 0);
 
        if (symbol__init(NULL) < 0)
index 96c1a4cfbbbf6b1f639d91d562c55b09da5c7e6f..49d55e21b1b06dbecd9c6935a2e0f33550585514 100644 (file)
@@ -86,8 +86,7 @@ static int check_emacsclient_version(void)
                return -1;
        }
 
-       strbuf_remove(&buffer, 0, strlen("emacsclient"));
-       version = atoi(buffer.buf);
+       version = atoi(buffer.buf + strlen("emacsclient"));
 
        if (version < 22) {
                fprintf(stderr,
@@ -273,7 +272,7 @@ static int perf_help_config(const char *var, const char *value, void *cb)
        if (!prefixcmp(var, "man."))
                return add_man_viewer_info(var, value);
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static struct cmdnames main_cmds, other_cmds;
index 0022e02ed31a7034b9286884ab87c4c131e41fa3..7fa68663ed7287f63adad2342db987941ba0c1b4 100644 (file)
@@ -17,6 +17,7 @@
 #include "util/build-id.h"
 #include "util/data.h"
 #include "util/auxtrace.h"
+#include "util/jit.h"
 
 #include <subcmd/parse-options.h>
 
@@ -29,6 +30,7 @@ struct perf_inject {
        bool                    sched_stat;
        bool                    have_auxtrace;
        bool                    strip;
+       bool                    jit_mode;
        const char              *input_name;
        struct perf_data_file   output;
        u64                     bytes_written;
@@ -71,6 +73,15 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool,
        return perf_event__repipe_synth(tool, event);
 }
 
+#ifdef HAVE_JITDUMP
+static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused,
+                              union perf_event *event __maybe_unused,
+                              struct ordered_events *oe __maybe_unused)
+{
+       return 0;
+}
+#endif
+
 static int perf_event__repipe_op2_synth(struct perf_tool *tool,
                                        union perf_event *event,
                                        struct perf_session *session
@@ -234,6 +245,31 @@ static int perf_event__repipe_mmap(struct perf_tool *tool,
        return err;
 }
 
+#ifdef HAVE_JITDUMP
+static int perf_event__jit_repipe_mmap(struct perf_tool *tool,
+                                      union perf_event *event,
+                                      struct perf_sample *sample,
+                                      struct machine *machine)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+       u64 n = 0;
+       int ret;
+
+       /*
+        * if jit marker, then inject jit mmaps and generate ELF images
+        */
+       ret = jit_process(inject->session, &inject->output, machine,
+                         event->mmap.filename, sample->pid, &n);
+       if (ret < 0)
+               return ret;
+       if (ret) {
+               inject->bytes_written += n;
+               return 0;
+       }
+       return perf_event__repipe_mmap(tool, event, sample, machine);
+}
+#endif
+
 static int perf_event__repipe_mmap2(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct perf_sample *sample,
@@ -247,6 +283,31 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool,
        return err;
 }
 
+#ifdef HAVE_JITDUMP
+static int perf_event__jit_repipe_mmap2(struct perf_tool *tool,
+                                       union perf_event *event,
+                                       struct perf_sample *sample,
+                                       struct machine *machine)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+       u64 n = 0;
+       int ret;
+
+       /*
+        * if jit marker, then inject jit mmaps and generate ELF images
+        */
+       ret = jit_process(inject->session, &inject->output, machine,
+                         event->mmap2.filename, sample->pid, &n);
+       if (ret < 0)
+               return ret;
+       if (ret) {
+               inject->bytes_written += n;
+               return 0;
+       }
+       return perf_event__repipe_mmap2(tool, event, sample, machine);
+}
+#endif
+
 static int perf_event__repipe_fork(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct perf_sample *sample,
@@ -626,12 +687,16 @@ static int __cmd_inject(struct perf_inject *inject)
        ret = perf_session__process_events(session);
 
        if (!file_out->is_pipe) {
-               if (inject->build_ids) {
+               if (inject->build_ids)
                        perf_header__set_feat(&session->header,
                                              HEADER_BUILD_ID);
-                       if (inject->have_auxtrace)
-                               dsos__hit_all(session);
-               }
+               /*
+                * Keep all buildids when there is unprocessed AUX data because
+                * it is not known which ones the AUX trace hits.
+                */
+               if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) &&
+                   inject->have_auxtrace && !inject->itrace_synth_opts.set)
+                       dsos__hit_all(session);
                /*
                 * The AUX areas have been removed and replaced with
                 * synthesized hardware events, so clear the feature flag and
@@ -703,7 +768,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
        };
        int ret;
 
-       const struct option options[] = {
+       struct option options[] = {
                OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
                            "Inject build-ids into the output stream"),
                OPT_STRING('i', "input", &inject.input_name, "file",
@@ -713,6 +778,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat,
                            "Merge sched-stat and sched-switch for getting events "
                            "where and how long tasks slept"),
+#ifdef HAVE_JITDUMP
+               OPT_BOOLEAN('j', "jit", &inject.jit_mode, "merge jitdump files into perf.data file"),
+#endif
                OPT_INCR('v', "verbose", &verbose,
                         "be more verbose (show build ids, etc)"),
                OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
@@ -729,7 +797,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                "perf inject [<options>]",
                NULL
        };
-
+#ifndef HAVE_JITDUMP
+       set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true);
+#endif
        argc = parse_options(argc, argv, options, inject_usage, 0);
 
        /*
@@ -755,6 +825,29 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
        if (inject.session == NULL)
                return -1;
 
+       if (inject.build_ids) {
+               /*
+                * Use ordered events to make sure the mmap records are
+                * processed correctly, especially for jitted code mmaps.
+                * We cannot generate the buildid hit list and inject the
+                * jit mmaps at the same time for now.
+                */
+               inject.tool.ordered_events = true;
+               inject.tool.ordering_requires_timestamps = true;
+       }
+#ifdef HAVE_JITDUMP
+       if (inject.jit_mode) {
+               inject.tool.mmap2          = perf_event__jit_repipe_mmap2;
+               inject.tool.mmap           = perf_event__jit_repipe_mmap;
+               inject.tool.ordered_events = true;
+               inject.tool.ordering_requires_timestamps = true;
+               /*
+                * JIT MMAP injection injects all MMAP events in one go, so it
+                * does not obey finished_round semantics.
+                */
+               inject.tool.finished_round = perf_event__drop_oe;
+       }
+#endif
        ret = symbol__init(&inject.session->header.env);
        if (ret < 0)
                goto out_delete;
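
The perf_event__jit_repipe_mmap*() callbacks added above recover their struct perf_inject from the embedded struct perf_tool with container_of(). A standalone sketch of that idiom is below; the macro is written out by hand and the two structs are simplified stand-ins for perf_tool/perf_inject.

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct tool {				/* stands in for struct perf_tool   */
		int (*mmap)(struct tool *t);
	};

	struct inject {				/* stands in for struct perf_inject */
		struct tool	tool;
		int		jit_mode;
	};

	static int jit_repipe_mmap(struct tool *t)
	{
		/* recover the enclosing object from the embedded member */
		struct inject *inj = container_of(t, struct inject, tool);

		return inj->jit_mode;
	}

	int main(void)
	{
		struct inject inj = { .tool = { .mmap = jit_repipe_mmap }, .jit_mode = 1 };

		printf("jit_mode = %d\n", inj.tool.mmap(&inj.tool));
		return 0;
	}
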
index 118010553d0cf0dca9de6f493a5c8e267b0fff48..4d3340cce9a0200b469fd61a563eee4cb7f48830 100644 (file)
@@ -1834,7 +1834,7 @@ static int __cmd_record(int argc, const char **argv)
        return cmd_record(i, rec_argv, NULL);
 }
 
-static int kmem_config(const char *var, const char *value, void *cb)
+static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
 {
        if (!strcmp(var, "kmem.default")) {
                if (!strcmp(value, "slab"))
@@ -1847,7 +1847,7 @@ static int kmem_config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
index 4418d9214872150648a719dd07fc14a29c913e9b..bff666458b28e24dccac682d0f28b6708a1a7c83 100644 (file)
@@ -30,7 +30,6 @@
 #include <math.h>
 
 #ifdef HAVE_KVM_STAT_SUPPORT
-#include <asm/kvm_perf.h>
 #include "util/kvm-stat.h"
 
 void exit_event_get_key(struct perf_evsel *evsel,
@@ -38,12 +37,12 @@ void exit_event_get_key(struct perf_evsel *evsel,
                        struct event_key *key)
 {
        key->info = 0;
-       key->key = perf_evsel__intval(evsel, sample, KVM_EXIT_REASON);
+       key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason);
 }
 
 bool kvm_exit_event(struct perf_evsel *evsel)
 {
-       return !strcmp(evsel->name, KVM_EXIT_TRACE);
+       return !strcmp(evsel->name, kvm_exit_trace);
 }
 
 bool exit_event_begin(struct perf_evsel *evsel,
@@ -59,7 +58,7 @@ bool exit_event_begin(struct perf_evsel *evsel,
 
 bool kvm_entry_event(struct perf_evsel *evsel)
 {
-       return !strcmp(evsel->name, KVM_ENTRY_TRACE);
+       return !strcmp(evsel->name, kvm_entry_trace);
 }
 
 bool exit_event_end(struct perf_evsel *evsel,
@@ -91,7 +90,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
        const char *exit_reason = get_exit_reason(kvm, key->exit_reasons,
                                                  key->key);
 
-       scnprintf(decode, DECODE_STR_LEN, "%s", exit_reason);
+       scnprintf(decode, decode_str_len, "%s", exit_reason);
 }
 
 static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
@@ -357,7 +356,7 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
        time_diff = sample->time - time_begin;
 
        if (kvm->duration && time_diff > kvm->duration) {
-               char decode[DECODE_STR_LEN];
+               char decode[decode_str_len];
 
                kvm->events_ops->decode_key(kvm, &event->key, decode);
                if (!skip_event(decode)) {
@@ -385,7 +384,8 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
                        return NULL;
                }
 
-               vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, VCPU_ID);
+               vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample,
+                                                         vcpu_id_str);
                thread__set_priv(thread, vcpu_record);
        }
 
@@ -574,7 +574,7 @@ static void show_timeofday(void)
 
 static void print_result(struct perf_kvm_stat *kvm)
 {
-       char decode[DECODE_STR_LEN];
+       char decode[decode_str_len];
        struct kvm_event *event;
        int vcpu = kvm->trace_vcpu;
 
@@ -585,7 +585,7 @@ static void print_result(struct perf_kvm_stat *kvm)
 
        pr_info("\n\n");
        print_vcpu_info(kvm);
-       pr_info("%*s ", DECODE_STR_LEN, kvm->events_ops->name);
+       pr_info("%*s ", decode_str_len, kvm->events_ops->name);
        pr_info("%10s ", "Samples");
        pr_info("%9s ", "Samples%");
 
@@ -604,7 +604,7 @@ static void print_result(struct perf_kvm_stat *kvm)
                min = get_event_min(event, vcpu);
 
                kvm->events_ops->decode_key(kvm, &event->key, decode);
-               pr_info("%*s ", DECODE_STR_LEN, decode);
+               pr_info("%*s ", decode_str_len, decode);
                pr_info("%10llu ", (unsigned long long)ecount);
                pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
                pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
@@ -1132,6 +1132,11 @@ exit:
                _p;                     \
        })
 
+int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused)
+{
+       return 0;
+}
+
 static int
 kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 {
@@ -1148,7 +1153,14 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
                NULL
        };
        const char * const *events_tp;
+       int ret;
+
        events_tp_size = 0;
+       ret = setup_kvm_events_tp(kvm);
+       if (ret < 0) {
+               pr_err("Unable to set up the kvm tracepoints\n");
+               return ret;
+       }
 
        for (events_tp = kvm_events_tp; *events_tp; events_tp++)
                events_tp_size++;
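
setup_kvm_events_tp() is defined __weak above so the generic tool links and does nothing on architectures that provide no override, while an arch file can supply a strong definition that wins at link time. A small sketch of that weak-default pattern, assuming GCC/Clang __attribute__((weak)); names are illustrative.

	#include <stdio.h>

	/* generic code: weak no-op default */
	__attribute__((weak)) int setup_events_tp(void)
	{
		return 0;
	}

	/*
	 * An arch file could provide a strong override in another object:
	 *
	 * int setup_events_tp(void)
	 * {
	 *	printf("arch-specific tracepoint setup\n");
	 *	return 0;
	 * }
	 */

	int main(void)
	{
		if (setup_events_tp() < 0)
			fprintf(stderr, "Unable to set up the tracepoints\n");
		return 0;
	}
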
@@ -1377,6 +1389,12 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
        /*
         * generate the event list
         */
+       err = setup_kvm_events_tp(kvm);
+       if (err < 0) {
+               pr_err("Unable to set up the kvm tracepoints\n");
+               return err;
+       }
+
        kvm->evlist = kvm_live_event_list();
        if (kvm->evlist == NULL) {
                err = -1;
index 39017004169665ec4a2ebd59aa0bc43e58412874..88aeac9aa1da12c664ec64ee419d6ce48c5ad8ec 100644 (file)
@@ -6,6 +6,8 @@
 #include "util/tool.h"
 #include "util/session.h"
 #include "util/data.h"
+#include "util/mem-events.h"
+#include "util/debug.h"
 
 #define MEM_OPERATION_LOAD     0x1
 #define MEM_OPERATION_STORE    0x2
@@ -21,11 +23,56 @@ struct perf_mem {
        DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 };
 
+static int parse_record_events(const struct option *opt,
+                              const char *str, int unset __maybe_unused)
+{
+       struct perf_mem *mem = *(struct perf_mem **)opt->value;
+       int j;
+
+       if (strcmp(str, "list")) {
+               if (!perf_mem_events__parse(str)) {
+                       mem->operation = 0;
+                       return 0;
+               }
+               exit(-1);
+       }
+
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               struct perf_mem_event *e = &perf_mem_events[j];
+
+               fprintf(stderr, "%-13s%-*s%s\n",
+                       e->tag,
+                       verbose ? 25 : 0,
+                       verbose ? perf_mem_events__name(j) : "",
+                       e->supported ? ": available" : "");
+       }
+       exit(0);
+}
+
+static const char * const __usage[] = {
+       "perf mem record [<options>] [<command>]",
+       "perf mem record [<options>] -- <command> [<options>]",
+       NULL
+};
+
+static const char * const *record_mem_usage = __usage;
+
 static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 {
        int rec_argc, i = 0, j;
        const char **rec_argv;
        int ret;
+       struct option options[] = {
+       OPT_CALLBACK('e', "event", &mem, "event",
+                    "event selector. use 'perf mem record -e list' to list available events",
+                    parse_record_events),
+       OPT_INCR('v', "verbose", &verbose,
+                "be more verbose (show counter open errors, etc)"),
+       OPT_END()
+       };
+
+       argc = parse_options(argc, argv, options, record_mem_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
 
        rec_argc = argc + 7; /* max number of arguments */
        rec_argv = calloc(rec_argc + 1, sizeof(char *));
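
The "-e list" branch above aligns the event table with printf's dynamic "%-*s" field width, padding the middle column only in verbose mode. A tiny standalone sketch of that formatting trick; the tag/name strings are example values, not the real event tables.

	#include <stdio.h>

	int main(void)
	{
		int verbose = 1;
		const char *tag = "mem-loads";		/* illustrative */
		const char *name = "cpu/mem-loads/P";	/* illustrative */
		int supported = 1;

		/* width of 0 collapses the column when not verbose */
		fprintf(stderr, "%-13s%-*s%s\n",
			tag,
			verbose ? 25 : 0,
			verbose ? name : "",
			supported ? ": available" : "");
		return 0;
	}
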
@@ -35,23 +82,40 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
        rec_argv[i++] = "record";
 
        if (mem->operation & MEM_OPERATION_LOAD)
+               perf_mem_events[PERF_MEM_EVENTS__LOAD].record = true;
+
+       if (perf_mem_events[PERF_MEM_EVENTS__LOAD].record)
                rec_argv[i++] = "-W";
 
        rec_argv[i++] = "-d";
 
-       if (mem->operation & MEM_OPERATION_LOAD) {
-               rec_argv[i++] = "-e";
-               rec_argv[i++] = "cpu/mem-loads/pp";
-       }
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               if (!perf_mem_events[j].record)
+                       continue;
+
+               if (!perf_mem_events[j].supported) {
+                       pr_err("failed: event '%s' not supported\n",
+                              perf_mem_events__name(j));
+                       return -1;
+               }
 
-       if (mem->operation & MEM_OPERATION_STORE) {
                rec_argv[i++] = "-e";
-               rec_argv[i++] = "cpu/mem-stores/pp";
-       }
+               rec_argv[i++] = perf_mem_events__name(j);
+       }
 
-       for (j = 1; j < argc; j++, i++)
+       for (j = 0; j < argc; j++, i++)
                rec_argv[i] = argv[j];
 
+       if (verbose > 0) {
+               pr_debug("calling: record ");
+
+               for (j = 0; rec_argv[j]; j++) {
+                       pr_debug("%s ", rec_argv[j]);
+               }
+
+               pr_debug("\n");
+       }
+
        ret = cmd_record(i, rec_argv, NULL);
        free(rec_argv);
        return ret;
@@ -298,6 +362,10 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
                NULL
        };
 
+       if (perf_mem_events__init()) {
+               pr_err("failed: memory events not supported\n");
+               return -1;
+       }
 
        argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands,
                                        mem_usage, PARSE_OPT_STOP_AT_NON_OPTION);
index 319712a4e02b73de7359fc2ed772bed1d9cd424f..515510ecc76a43391e2ac58f830557b51810b466 100644 (file)
@@ -32,6 +32,8 @@
 #include "util/parse-branch-options.h"
 #include "util/parse-regs-options.h"
 #include "util/llvm-utils.h"
+#include "util/bpf-loader.h"
+#include "asm/bug.h"
 
 #include <unistd.h>
 #include <sched.h>
@@ -49,7 +51,9 @@ struct record {
        const char              *progname;
        int                     realtime_prio;
        bool                    no_buildid;
+       bool                    no_buildid_set;
        bool                    no_buildid_cache;
+       bool                    no_buildid_cache_set;
        bool                    buildid_all;
        unsigned long long      samples;
 };
@@ -320,7 +324,10 @@ try_again:
                } else {
                        pr_err("failed to mmap with %d (%s)\n", errno,
                                strerror_r(errno, msg, sizeof(msg)));
-                       rc = -errno;
+                       if (errno)
+                               rc = -errno;
+                       else
+                               rc = -EINVAL;
                }
                goto out;
        }
@@ -464,6 +471,29 @@ static void record__init_features(struct record *rec)
        perf_header__clear_feat(&session->header, HEADER_STAT);
 }
 
+static void
+record__finish_output(struct record *rec)
+{
+       struct perf_data_file *file = &rec->file;
+       int fd = perf_data_file__fd(file);
+
+       if (file->is_pipe)
+               return;
+
+       rec->session->header.data_size += rec->bytes_written;
+       file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
+
+       if (!rec->no_buildid) {
+               process_buildids(rec);
+
+               if (rec->buildid_all)
+                       dsos__hit_all(rec->session);
+       }
+       perf_session__write_header(rec->session, rec->evlist, fd, true);
+
+       return;
+}
+
 static volatile int workload_exec_errno;
 
 /*
@@ -482,6 +512,74 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
 
 static void snapshot_sig_handler(int sig);
 
+static int record__synthesize(struct record *rec)
+{
+       struct perf_session *session = rec->session;
+       struct machine *machine = &session->machines.host;
+       struct perf_data_file *file = &rec->file;
+       struct record_opts *opts = &rec->opts;
+       struct perf_tool *tool = &rec->tool;
+       int fd = perf_data_file__fd(file);
+       int err = 0;
+
+       if (file->is_pipe) {
+               err = perf_event__synthesize_attrs(tool, session,
+                                                  process_synthesized_event);
+               if (err < 0) {
+                       pr_err("Couldn't synthesize attrs.\n");
+                       goto out;
+               }
+
+               if (have_tracepoints(&rec->evlist->entries)) {
+                       /*
+                        * FIXME err <= 0 here actually means that
+                        * there were no tracepoints so it's not really
+                        * an error, just that we don't need to
+                        * synthesize anything.  We really have to
+                        * return this more properly and also
+                        * propagate errors that now are calling die()
+                        */
+                       err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
+                                                                 process_synthesized_event);
+                       if (err <= 0) {
+                               pr_err("Couldn't record tracing data.\n");
+                               goto out;
+                       }
+                       rec->bytes_written += err;
+               }
+       }
+
+       if (rec->opts.full_auxtrace) {
+               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
+                                       session, process_synthesized_event);
+               if (err)
+                       goto out;
+       }
+
+       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                machine);
+       WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
+                          "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+                          "Check /proc/kallsyms permission or run as root.\n");
+
+       err = perf_event__synthesize_modules(tool, process_synthesized_event,
+                                            machine);
+       WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
+                          "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+                          "Check /proc/modules permission or run as root.\n");
+
+       if (perf_guest) {
+               machines__process_guests(&session->machines,
+                                        perf_event__synthesize_guest_os, tool);
+       }
+
+       err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
+                                           process_synthesized_event, opts->sample_address,
+                                           opts->proc_map_timeout);
+out:
+       return err;
+}
+
 static int __cmd_record(struct record *rec, int argc, const char **argv)
 {
        int err;
@@ -534,6 +632,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                goto out_child;
        }
 
+       err = bpf__apply_obj_config();
+       if (err) {
+               char errbuf[BUFSIZ];
+
+               bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
+               pr_err("ERROR: Apply config to BPF failed: %s\n",
+                        errbuf);
+               goto out_child;
+       }
+
        /*
         * Normally perf_session__new would do this, but it doesn't have the
         * evlist.
@@ -566,63 +674,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 
        machine = &session->machines.host;
 
-       if (file->is_pipe) {
-               err = perf_event__synthesize_attrs(tool, session,
-                                                  process_synthesized_event);
-               if (err < 0) {
-                       pr_err("Couldn't synthesize attrs.\n");
-                       goto out_child;
-               }
-
-               if (have_tracepoints(&rec->evlist->entries)) {
-                       /*
-                        * FIXME err <= 0 here actually means that
-                        * there were no tracepoints so its not really
-                        * an error, just that we don't need to
-                        * synthesize anything.  We really have to
-                        * return this more properly and also
-                        * propagate errors that now are calling die()
-                        */
-                       err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
-                                                                 process_synthesized_event);
-                       if (err <= 0) {
-                               pr_err("Couldn't record tracing data.\n");
-                               goto out_child;
-                       }
-                       rec->bytes_written += err;
-               }
-       }
-
-       if (rec->opts.full_auxtrace) {
-               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
-                                       session, process_synthesized_event);
-               if (err)
-                       goto out_delete_session;
-       }
-
-       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
-                                                machine);
-       if (err < 0)
-               pr_err("Couldn't record kernel reference relocation symbol\n"
-                      "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
-                      "Check /proc/kallsyms permission or run as root.\n");
-
-       err = perf_event__synthesize_modules(tool, process_synthesized_event,
-                                            machine);
+       err = record__synthesize(rec);
        if (err < 0)
-               pr_err("Couldn't record kernel module information.\n"
-                      "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
-                      "Check /proc/modules permission or run as root.\n");
-
-       if (perf_guest) {
-               machines__process_guests(&session->machines,
-                                        perf_event__synthesize_guest_os, tool);
-       }
-
-       err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
-                                           process_synthesized_event, opts->sample_address,
-                                           opts->proc_map_timeout);
-       if (err != 0)
                goto out_child;
 
        if (rec->realtime_prio) {
@@ -758,18 +811,8 @@ out_child:
        /* this will be recalculated during process_buildids() */
        rec->samples = 0;
 
-       if (!err && !file->is_pipe) {
-               rec->session->header.data_size += rec->bytes_written;
-               file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
-
-               if (!rec->no_buildid) {
-                       process_buildids(rec);
-
-                       if (rec->buildid_all)
-                               dsos__hit_all(rec->session);
-               }
-               perf_session__write_header(rec->session, rec->evlist, fd, true);
-       }
+       if (!err)
+               record__finish_output(rec);
 
        if (!err && !quiet) {
                char samples[128];
@@ -1097,10 +1140,12 @@ struct option __record_options[] = {
        OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
        OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                    "don't sample"),
-       OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
-                   "do not update the buildid cache"),
-       OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
-                   "do not collect buildids in perf.data"),
+       OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
+                       &record.no_buildid_cache_set,
+                       "do not update the buildid cache"),
+       OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
+                       &record.no_buildid_set,
+                       "do not collect buildids in perf.data"),
        OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
                     "monitor event in cgroup name only",
                     parse_cgroups),
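
OPT_BOOLEAN_SET above records, next to the boolean itself, whether the user actually passed the option (no_buildid_set / no_buildid_cache_set), so later code can tell "left at the default" apart from "explicitly disabled". A minimal hand-rolled sketch of that value-plus-set-flag pattern, not libsubcmd code:

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	static bool no_buildid;
	static bool no_buildid_set;

	static void parse_arg(const char *arg)
	{
		if (!strcmp(arg, "--no-buildid")) {
			no_buildid = true;
			no_buildid_set = true;	/* remember it was given explicitly */
		}
	}

	int main(int argc, char **argv)
	{
		for (int i = 1; i < argc; i++)
			parse_arg(argv[i]);

		if (!no_buildid_set)
			printf("build-id collection left at its default\n");
		else
			printf("build-id collection explicitly %s\n",
			       no_buildid ? "disabled" : "enabled");
		return 0;
	}
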
@@ -1136,6 +1181,12 @@ struct option __record_options[] = {
                        "per thread proc mmap processing timeout in ms"),
        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
                    "Record context switch events"),
+       OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
+                        "Configure all used events to run in kernel space.",
+                        PARSE_OPT_EXCLUSIVE),
+       OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
+                        "Configure all used events to run in user space.",
+                        PARSE_OPT_EXCLUSIVE),
        OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
                   "clang binary to use for compiling BPF scriptlets"),
        OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
index 2bf537f190a026952bd63126e9c37def827e9914..7eea49f9ed46b520bdce56972379b76c14302c4b 100644 (file)
@@ -75,7 +75,10 @@ static int report__config(const char *var, const char *value, void *cb)
                return 0;
        }
        if (!strcmp(var, "report.percent-limit")) {
-               rep->min_percent = strtof(value, NULL);
+               double pcnt = strtof(value, NULL);
+
+               rep->min_percent = pcnt;
+               callchain_param.min_percent = pcnt;
                return 0;
        }
        if (!strcmp(var, "report.children")) {
@@ -87,7 +90,7 @@ static int report__config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int hist_iter__report_callback(struct hist_entry_iter *iter,
@@ -466,10 +469,11 @@ static int report__browse_hists(struct report *rep)
        return ret;
 }
 
-static void report__collapse_hists(struct report *rep)
+static int report__collapse_hists(struct report *rep)
 {
        struct ui_progress prog;
        struct perf_evsel *pos;
+       int ret = 0;
 
        ui_progress__init(&prog, rep->nr_entries, "Merging related events...");
 
@@ -481,7 +485,9 @@ static void report__collapse_hists(struct report *rep)
 
                hists->socket_filter = rep->socket_filter;
 
-               hists__collapse_resort(hists, &prog);
+               ret = hists__collapse_resort(hists, &prog);
+               if (ret < 0)
+                       break;
 
                /* Non-group events are considered as leader */
                if (symbol_conf.event_group &&
@@ -494,6 +500,7 @@ static void report__collapse_hists(struct report *rep)
        }
 
        ui_progress__finish();
+       return ret;
 }
 
 static void report__output_resort(struct report *rep)
@@ -504,7 +511,7 @@ static void report__output_resort(struct report *rep)
        ui_progress__init(&prog, rep->nr_entries, "Sorting events for output...");
 
        evlist__for_each(rep->session->evlist, pos)
-               hists__output_resort(evsel__hists(pos), &prog);
+               perf_evsel__output_resort(pos, &prog);
 
        ui_progress__finish();
 }
@@ -561,7 +568,11 @@ static int __cmd_report(struct report *rep)
                }
        }
 
-       report__collapse_hists(rep);
+       ret = report__collapse_hists(rep);
+       if (ret) {
+               ui__error("failed to process hist entry\n");
+               return ret;
+       }
 
        if (session_done())
                return 0;
@@ -633,8 +644,10 @@ parse_percent_limit(const struct option *opt, const char *str,
                    int unset __maybe_unused)
 {
        struct report *rep = opt->value;
+       double pcnt = strtof(str, NULL);
 
-       rep->min_percent = strtof(str, NULL);
+       rep->min_percent = pcnt;
+       callchain_param.min_percent = pcnt;
        return 0;
 }
 
@@ -798,6 +811,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                    "only show processor socket that match with this filter"),
        OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
                    "Show raw trace event output (do not use print fmt or plugins)"),
+       OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+                   "Show entries in a hierarchy"),
        OPT_END()
        };
        struct perf_data_file file = {
@@ -907,13 +922,19 @@ repeat:
                symbol_conf.cumulate_callchain = false;
        }
 
-       if (setup_sorting(session->evlist) < 0) {
-               if (sort_order)
-                       parse_options_usage(report_usage, options, "s", 1);
-               if (field_order)
-                       parse_options_usage(sort_order ? NULL : report_usage,
-                                           options, "F", 1);
-               goto error;
+       if (symbol_conf.report_hierarchy) {
+               /* disable incompatible options */
+               symbol_conf.event_group = false;
+               symbol_conf.cumulate_callchain = false;
+
+               if (field_order) {
+                       pr_err("Error: --hierarchy and --fields options cannot be used together\n");
+                       parse_options_usage(report_usage, options, "F", 1);
+                       parse_options_usage(NULL, options, "hierarchy", 0);
+                       goto error;
+               }
+
+               sort__need_collapse = true;
        }
 
        /* Force tty output for header output and per-thread stat. */
@@ -925,6 +946,15 @@ repeat:
        else
                use_browser = 0;
 
+       if (setup_sorting(session->evlist) < 0) {
+               if (sort_order)
+                       parse_options_usage(report_usage, options, "s", 1);
+               if (field_order)
+                       parse_options_usage(sort_order ? NULL : report_usage,
+                                           options, "F", 1);
+               goto error;
+       }
+
        if (report.header || report.header_only) {
                perf_session__fprintf_info(session, stdout,
                                           report.show_full_info);
index c691214d820f0050c39e9d376d69cd5891132a47..57f9a7e7f7d3e948a29a92018140902c7f5c840f 100644 (file)
@@ -23,6 +23,7 @@
 #include "util/stat.h"
 #include <linux/bitmap.h>
 #include "asm/bug.h"
+#include "util/mem-events.h"
 
 static char const              *script_name;
 static char const              *generate_script_lang;
@@ -58,6 +59,9 @@ enum perf_output_field {
        PERF_OUTPUT_IREGS           = 1U << 14,
        PERF_OUTPUT_BRSTACK         = 1U << 15,
        PERF_OUTPUT_BRSTACKSYM      = 1U << 16,
+       PERF_OUTPUT_DATA_SRC        = 1U << 17,
+       PERF_OUTPUT_WEIGHT          = 1U << 18,
+       PERF_OUTPUT_BPF_OUTPUT      = 1U << 19,
 };
 
 struct output_option {
@@ -81,6 +85,9 @@ struct output_option {
        {.str = "iregs", .field = PERF_OUTPUT_IREGS},
        {.str = "brstack", .field = PERF_OUTPUT_BRSTACK},
        {.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM},
+       {.str = "data_src", .field = PERF_OUTPUT_DATA_SRC},
+       {.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
+       {.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
 };
 
 /* default set to maintain compatibility with current format */
@@ -101,7 +108,7 @@ static struct {
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                              PERF_OUTPUT_PERIOD,
 
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
        },
 
        [PERF_TYPE_SOFTWARE] = {
@@ -111,7 +118,7 @@ static struct {
                              PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
                              PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
-                             PERF_OUTPUT_PERIOD,
+                             PERF_OUTPUT_PERIOD | PERF_OUTPUT_BPF_OUTPUT,
 
                .invalid_fields = PERF_OUTPUT_TRACE,
        },
@@ -121,7 +128,7 @@ static struct {
 
                .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
                                  PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
-                                 PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE,
+                                 PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE
        },
 
        [PERF_TYPE_RAW] = {
@@ -131,9 +138,10 @@ static struct {
                              PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
                              PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
-                             PERF_OUTPUT_PERIOD,
+                             PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR |
+                             PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT,
 
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
        },
 
        [PERF_TYPE_BREAKPOINT] = {
@@ -145,7 +153,7 @@ static struct {
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                              PERF_OUTPUT_PERIOD,
 
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
        },
 };
 
@@ -242,6 +250,16 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
                                           PERF_OUTPUT_ADDR, allow_user_set))
                return -EINVAL;
 
+       if (PRINT_FIELD(DATA_SRC) &&
+               perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC",
+                                       PERF_OUTPUT_DATA_SRC))
+               return -EINVAL;
+
+       if (PRINT_FIELD(WEIGHT) &&
+               perf_evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT",
+                                       PERF_OUTPUT_WEIGHT))
+               return -EINVAL;
+
        if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
                pr_err("Display of symbols requested but neither sample IP nor "
                           "sample address\nis selected. Hence, no addresses to convert "
@@ -608,6 +626,84 @@ static void print_sample_flags(u32 flags)
        printf("  %-4s ", str);
 }
 
+struct printer_data {
+       int line_no;
+       bool hit_nul;
+       bool is_printable;
+};
+
+static void
+print_sample_bpf_output_printer(enum binary_printer_ops op,
+                               unsigned int val,
+                               void *extra)
+{
+       unsigned char ch = (unsigned char)val;
+       struct printer_data *printer_data = extra;
+
+       switch (op) {
+       case BINARY_PRINT_DATA_BEGIN:
+               printf("\n");
+               break;
+       case BINARY_PRINT_LINE_BEGIN:
+               printf("%17s", !printer_data->line_no ? "BPF output:" :
+                                                       "           ");
+               break;
+       case BINARY_PRINT_ADDR:
+               printf(" %04x:", val);
+               break;
+       case BINARY_PRINT_NUM_DATA:
+               printf(" %02x", val);
+               break;
+       case BINARY_PRINT_NUM_PAD:
+               printf("   ");
+               break;
+       case BINARY_PRINT_SEP:
+               printf("  ");
+               break;
+       case BINARY_PRINT_CHAR_DATA:
+               if (printer_data->hit_nul && ch)
+                       printer_data->is_printable = false;
+
+               if (!isprint(ch)) {
+                       printf("%c", '.');
+
+                       if (!printer_data->is_printable)
+                               break;
+
+                       if (ch == '\0')
+                               printer_data->hit_nul = true;
+                       else
+                               printer_data->is_printable = false;
+               } else {
+                       printf("%c", ch);
+               }
+               break;
+       case BINARY_PRINT_CHAR_PAD:
+               printf(" ");
+               break;
+       case BINARY_PRINT_LINE_END:
+               printf("\n");
+               printer_data->line_no++;
+               break;
+       case BINARY_PRINT_DATA_END:
+       default:
+               break;
+       }
+}
+
+static void print_sample_bpf_output(struct perf_sample *sample)
+{
+       unsigned int nr_bytes = sample->raw_size;
+       struct printer_data printer_data = {0, false, true};
+
+       print_binary(sample->raw_data, nr_bytes, 8,
+                    print_sample_bpf_output_printer, &printer_data);
+
+       if (printer_data.is_printable && printer_data.hit_nul)
+               printf("%17s \"%s\"\n", "BPF string:",
+                      (char *)(sample->raw_data));
+}
+
 struct perf_script {
        struct perf_tool        tool;
        struct perf_session     *session;
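
print_sample_bpf_output() above feeds the raw sample bytes through a printer callback that receives line-begin, hex, character and line-end events. The following is a condensed, self-contained sketch of that callback-driven hex+ASCII dump; the op names and the print_binary() driver here are simplified stand-ins for perf's utility code.

	#include <ctype.h>
	#include <stdio.h>

	enum print_op { OP_LINE_BEGIN, OP_HEX, OP_CHAR, OP_LINE_END };

	typedef void (*printer_t)(enum print_op op, unsigned int val, void *extra);

	static void print_binary(const unsigned char *data, size_t len,
				 size_t per_line, printer_t print, void *extra)
	{
		for (size_t i = 0; i < len; i += per_line) {
			print(OP_LINE_BEGIN, (unsigned int)i, extra);
			for (size_t j = i; j < i + per_line && j < len; j++)
				print(OP_HEX, data[j], extra);
			print(OP_CHAR, ' ', extra);
			for (size_t j = i; j < i + per_line && j < len; j++)
				print(OP_CHAR, data[j], extra);
			print(OP_LINE_END, 0, extra);
		}
	}

	static void bpf_output_printer(enum print_op op, unsigned int val, void *extra)
	{
		int *line_no = extra;

		switch (op) {
		case OP_LINE_BEGIN:
			/* label only the first line, like "BPF output:" above */
			printf("%17s %04x:", *line_no ? "" : "BPF output:", val);
			break;
		case OP_HEX:
			printf(" %02x", val);
			break;
		case OP_CHAR:
			printf("%c", isprint(val) ? val : '.');
			break;
		case OP_LINE_END:
			printf("\n");
			(*line_no)++;
			break;
		}
	}

	int main(void)
	{
		const unsigned char raw[] = "hello bpf\0\x01\x02";
		int line_no = 0;

		print_binary(raw, sizeof(raw), 8, bpf_output_printer, &line_no);
		return 0;
	}
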
@@ -634,6 +730,23 @@ static int perf_evlist__max_name_len(struct perf_evlist *evlist)
        return max;
 }
 
+static size_t data_src__printf(u64 data_src)
+{
+       struct mem_info mi = { .data_src.val = data_src };
+       char decode[100];
+       char out[100];
+       static int maxlen;
+       int len;
+
+       perf_script__meminfo_scnprintf(decode, 100, &mi);
+
+       len = scnprintf(out, 100, "%16" PRIx64 " %s", data_src, decode);
+       if (maxlen < len)
+               maxlen = len;
+
+       return printf("%-*s", maxlen, out);
+}
+
 static void process_event(struct perf_script *script, union perf_event *event,
                          struct perf_sample *sample, struct perf_evsel *evsel,
                          struct addr_location *al)
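
data_src__printf() above remembers the widest string it has printed so far in a static maxlen and pads every later value to that width, so the data_src column grows once and then stays aligned. A short standalone sketch of that self-widening column; the sample strings are illustrative.

	#include <stdio.h>

	static int print_aligned(const char *s)
	{
		static int maxlen;	/* widest value seen so far */
		char out[100];
		int len;

		len = snprintf(out, sizeof(out), "%s", s);
		if (maxlen < len)
			maxlen = len;

		return printf("%-*s|", maxlen, out);
	}

	int main(void)
	{
		print_aligned("L1 hit");		printf("\n");
		print_aligned("Remote RAM hit");	printf("\n");
		print_aligned("L1 hit");		printf("\n");	/* now padded */
		return 0;
	}
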
@@ -673,6 +786,12 @@ static void process_event(struct perf_script *script, union perf_event *event,
        if (PRINT_FIELD(ADDR))
                print_sample_addr(event, sample, thread, attr);
 
+       if (PRINT_FIELD(DATA_SRC))
+               data_src__printf(sample->data_src);
+
+       if (PRINT_FIELD(WEIGHT))
+               printf("%16" PRIu64, sample->weight);
+
        if (PRINT_FIELD(IP)) {
                if (!symbol_conf.use_callchain)
                        printf(" ");
@@ -692,6 +811,9 @@ static void process_event(struct perf_script *script, union perf_event *event,
        else if (PRINT_FIELD(BRSTACKSYM))
                print_sample_brstacksym(event, sample, thread, attr);
 
+       if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
+               print_sample_bpf_output(sample);
+
        printf("\n");
 }
 
@@ -1090,23 +1212,6 @@ static struct script_spec *script_spec__find(const char *spec)
        return NULL;
 }
 
-static struct script_spec *script_spec__findnew(const char *spec,
-                                               struct scripting_ops *ops)
-{
-       struct script_spec *s = script_spec__find(spec);
-
-       if (s)
-               return s;
-
-       s = script_spec__new(spec, ops);
-       if (!s)
-               return NULL;
-
-       script_spec__add(s);
-
-       return s;
-}
-
 int script_spec_register(const char *spec, struct scripting_ops *ops)
 {
        struct script_spec *s;
@@ -1115,9 +1220,11 @@ int script_spec_register(const char *spec, struct scripting_ops *ops)
        if (s)
                return -1;
 
-       s = script_spec__findnew(spec, ops);
+       s = script_spec__new(spec, ops);
        if (!s)
                return -1;
+       else
+               script_spec__add(s);
 
        return 0;
 }
index 038e877081b682dd8d9ba052a0be66c8ac2786c6..1f19f2f999c841b9da140e10bcaf5e6e0f41ee6b 100644 (file)
@@ -122,6 +122,7 @@ static bool                 sync_run                        = false;
 static unsigned int            initial_delay                   = 0;
 static unsigned int            unit_width                      = 4; /* strlen("unit") */
 static bool                    forever                         = false;
+static bool                    metric_only                     = false;
 static struct timespec         ref_time;
 static struct cpu_map          *aggr_map;
 static aggr_get_id_t           aggr_get_id;
@@ -735,6 +736,191 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
        }
 }
 
+struct outstate {
+       FILE *fh;
+       bool newline;
+       const char *prefix;
+       int  nfields;
+       int  id, nr;
+       struct perf_evsel *evsel;
+};
+
+#define METRIC_LEN  35
+
+static void new_line_std(void *ctx)
+{
+       struct outstate *os = ctx;
+
+       os->newline = true;
+}
+
+static void do_new_line_std(struct outstate *os)
+{
+       fputc('\n', os->fh);
+       fputs(os->prefix, os->fh);
+       aggr_printout(os->evsel, os->id, os->nr);
+       if (stat_config.aggr_mode == AGGR_NONE)
+               fprintf(os->fh, "        ");
+       fprintf(os->fh, "                                                 ");
+}
+
+static void print_metric_std(void *ctx, const char *color, const char *fmt,
+                            const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       int n;
+       bool newline = os->newline;
+
+       os->newline = false;
+
+       if (unit == NULL || fmt == NULL) {
+               fprintf(out, "%-*s", METRIC_LEN, "");
+               return;
+       }
+
+       if (newline)
+               do_new_line_std(os);
+
+       n = fprintf(out, " # ");
+       if (color)
+               n += color_fprintf(out, color, fmt, val);
+       else
+               n += fprintf(out, fmt, val);
+       fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
+}
+
+static void new_line_csv(void *ctx)
+{
+       struct outstate *os = ctx;
+       int i;
+
+       fputc('\n', os->fh);
+       if (os->prefix)
+               fprintf(os->fh, "%s%s", os->prefix, csv_sep);
+       aggr_printout(os->evsel, os->id, os->nr);
+       for (i = 0; i < os->nfields; i++)
+               fputs(csv_sep, os->fh);
+}
+
+static void print_metric_csv(void *ctx,
+                            const char *color __maybe_unused,
+                            const char *fmt, const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       char buf[64], *vals, *ends;
+
+       if (unit == NULL || fmt == NULL) {
+               fprintf(out, "%s%s%s%s", csv_sep, csv_sep, csv_sep, csv_sep);
+               return;
+       }
+       snprintf(buf, sizeof(buf), fmt, val);
+       vals = buf;
+       while (isspace(*vals))
+               vals++;
+       ends = vals;
+       while (isdigit(*ends) || *ends == '.')
+               ends++;
+       *ends = 0;
+       while (isspace(*unit))
+               unit++;
+       fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit);
+}
+
+#define METRIC_ONLY_LEN 20
+
+/* Filter out some columns that don't work well in metrics only mode */
+
+static bool valid_only_metric(const char *unit)
+{
+       if (!unit)
+               return false;
+       if (strstr(unit, "/sec") ||
+           strstr(unit, "hz") ||
+           strstr(unit, "Hz") ||
+           strstr(unit, "CPUs utilized"))
+               return false;
+       return true;
+}
+
+static const char *fixunit(char *buf, struct perf_evsel *evsel,
+                          const char *unit)
+{
+       if (!strncmp(unit, "of all", 6)) {
+               snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel),
+                        unit);
+               return buf;
+       }
+       return unit;
+}
+
+static void print_metric_only(void *ctx, const char *color, const char *fmt,
+                             const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       int n;
+       char buf[1024];
+       unsigned mlen = METRIC_ONLY_LEN;
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(buf, os->evsel, unit);
+       if (color)
+               n = color_fprintf(out, color, fmt, val);
+       else
+               n = fprintf(out, fmt, val);
+       if (n > METRIC_ONLY_LEN)
+               n = METRIC_ONLY_LEN;
+       if (mlen < strlen(unit))
+               mlen = strlen(unit) + 1;
+       fprintf(out, "%*s", mlen - n, "");
+}
+
+static void print_metric_only_csv(void *ctx, const char *color __maybe_unused,
+                                 const char *fmt,
+                                 const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       char buf[64], *vals, *ends;
+       char tbuf[1024];
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(tbuf, os->evsel, unit);
+       snprintf(buf, sizeof buf, fmt, val);
+       vals = buf;
+       while (isspace(*vals))
+               vals++;
+       ends = vals;
+       while (isdigit(*ends) || *ends == '.')
+               ends++;
+       *ends = 0;
+       fprintf(out, "%s%s", vals, csv_sep);
+}
+
+static void new_line_metric(void *ctx __maybe_unused)
+{
+}
+
+static void print_metric_header(void *ctx, const char *color __maybe_unused,
+                               const char *fmt __maybe_unused,
+                               const char *unit, double val __maybe_unused)
+{
+       struct outstate *os = ctx;
+       char tbuf[1024];
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(tbuf, os->evsel, unit);
+       if (csv_output)
+               fprintf(os->fh, "%s%s", unit, csv_sep);
+       else
+               fprintf(os->fh, "%-*s ", METRIC_ONLY_LEN, unit);
+}
+
 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
        FILE *output = stat_config.output;
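
print_metric_csv() and print_metric_only_csv() above split the formatted metric into its numeric value and its unit by skipping leading spaces and cutting at the first character that is neither a digit nor a '.'. A standalone sketch of that splitting step in plain C (not the perf helper itself):

	#include <ctype.h>
	#include <stdio.h>

	static void metric_to_csv(char *out, size_t outsz,
				  const char *fmt, double val, const char *unit,
				  const char *sep)
	{
		char buf[64], *vals, *ends;

		snprintf(buf, sizeof(buf), fmt, val);
		vals = buf;
		while (isspace((unsigned char)*vals))	/* trim leading padding */
			vals++;
		ends = vals;
		while (isdigit((unsigned char)*ends) || *ends == '.')
			ends++;
		*ends = 0;				/* cut at first non-numeric char */
		while (isspace((unsigned char)*unit))
			unit++;
		snprintf(out, outsz, "%s%s%s", vals, sep, unit);
	}

	int main(void)
	{
		char line[128];

		metric_to_csv(line, sizeof(line), "%8.2f", 3.14159,
			      "  insn per cycle", ",");
		printf("%s\n", line);	/* -> "3.14,insn per cycle" */
		return 0;
	}
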
@@ -763,6 +949,28 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 }
 
+static int first_shadow_cpu(struct perf_evsel *evsel, int id)
+{
+       int i;
+
+       if (!aggr_get_id)
+               return 0;
+
+       if (stat_config.aggr_mode == AGGR_NONE)
+               return id;
+
+       if (stat_config.aggr_mode == AGGR_GLOBAL)
+               return 0;
+
+       for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
+               int cpu2 = perf_evsel__cpus(evsel)->map[i];
+
+               if (aggr_get_id(evsel_list->cpus, cpu2) == id)
+                       return cpu2;
+       }
+       return 0;
+}
+
 static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
        FILE *output = stat_config.output;
@@ -793,22 +1001,124 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 }
 
-static void printout(int id, int nr, struct perf_evsel *counter, double uval)
+static void printout(int id, int nr, struct perf_evsel *counter, double uval,
+                    char *prefix, u64 run, u64 ena, double noise)
 {
-       int cpu = cpu_map__id_to_cpu(id);
+       struct perf_stat_output_ctx out;
+       struct outstate os = {
+               .fh = stat_config.output,
+               .prefix = prefix ? prefix : "",
+               .id = id,
+               .nr = nr,
+               .evsel = counter,
+       };
+       print_metric_t pm = print_metric_std;
+       void (*nl)(void *);
 
-       if (stat_config.aggr_mode == AGGR_GLOBAL)
-               cpu = 0;
+       if (metric_only) {
+               nl = new_line_metric;
+               if (csv_output)
+                       pm = print_metric_only_csv;
+               else
+                       pm = print_metric_only;
+       } else
+               nl = new_line_std;
+
+       if (csv_output && !metric_only) {
+               static int aggr_fields[] = {
+                       [AGGR_GLOBAL] = 0,
+                       [AGGR_THREAD] = 1,
+                       [AGGR_NONE] = 1,
+                       [AGGR_SOCKET] = 2,
+                       [AGGR_CORE] = 2,
+               };
+
+               pm = print_metric_csv;
+               nl = new_line_csv;
+               os.nfields = 3;
+               os.nfields += aggr_fields[stat_config.aggr_mode];
+               if (counter->cgrp)
+                       os.nfields++;
+       }
+       if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
+               if (metric_only) {
+                       pm(&os, NULL, "", "", 0);
+                       return;
+               }
+               aggr_printout(counter, id, nr);
+
+               fprintf(stat_config.output, "%*s%s",
+                       csv_output ? 0 : 18,
+                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+                       csv_sep);
+
+               fprintf(stat_config.output, "%-*s%s",
+                       csv_output ? 0 : unit_width,
+                       counter->unit, csv_sep);
+
+               fprintf(stat_config.output, "%*s",
+                       csv_output ? 0 : -25,
+                       perf_evsel__name(counter));
+
+               if (counter->cgrp)
+                       fprintf(stat_config.output, "%s%s",
+                               csv_sep, counter->cgrp->name);
 
-       if (nsec_counter(counter))
+               if (!csv_output)
+                       pm(&os, NULL, NULL, "", 0);
+               print_noise(counter, noise);
+               print_running(run, ena);
+               if (csv_output)
+                       pm(&os, NULL, NULL, "", 0);
+               return;
+       }
+
+       if (metric_only)
+               /* nothing */;
+       else if (nsec_counter(counter))
                nsec_printout(id, nr, counter, uval);
        else
                abs_printout(id, nr, counter, uval);
 
-       if (!csv_output && !stat_config.interval)
-               perf_stat__print_shadow_stats(stat_config.output, counter,
-                                             uval, cpu,
-                                             stat_config.aggr_mode);
+       out.print_metric = pm;
+       out.new_line = nl;
+       out.ctx = &os;
+
+       if (csv_output && !metric_only) {
+               print_noise(counter, noise);
+               print_running(run, ena);
+       }
+
+       perf_stat__print_shadow_stats(counter, uval,
+                               first_shadow_cpu(counter, id),
+                               &out);
+       if (!csv_output && !metric_only) {
+               print_noise(counter, noise);
+               print_running(run, ena);
+       }
+}
+
+static void aggr_update_shadow(void)
+{
+       int cpu, s2, id, s;
+       u64 val;
+       struct perf_evsel *counter;
+
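+       /*
+        * Sum the raw counts of each event per aggregation id and feed them
+        * into the shadow stats so that derived metrics can later be printed
+        * per socket/core in --metric-only mode.
+        */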
+       for (s = 0; s < aggr_map->nr; s++) {
+               id = aggr_map->map[s];
+               evlist__for_each(evsel_list, counter) {
+                       val = 0;
+                       for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+                               s2 = aggr_get_id(evsel_list->cpus, cpu);
+                               if (s2 != id)
+                                       continue;
+                               val += perf_counts(counter->counts, cpu, 0)->val;
+                       }
+                       val = val * counter->scale;
+                       perf_stat__update_shadow_stats(counter, &val,
+                                                      first_shadow_cpu(counter, id));
+               }
+       }
 }
 
 static void print_aggr(char *prefix)
@@ -818,12 +1128,23 @@ static void print_aggr(char *prefix)
        int cpu, s, s2, id, nr;
        double uval;
        u64 ena, run, val;
+       bool first;
 
        if (!(aggr_map || aggr_get_id))
                return;
 
+       aggr_update_shadow();
+
+       /*
+        * With metric_only everything is on a single line.
+        * Without it, each counter gets its own line.
+        */
        for (s = 0; s < aggr_map->nr; s++) {
+               if (prefix && metric_only)
+                       fprintf(output, "%s", prefix);
+
                id = aggr_map->map[s];
+               first = true;
                evlist__for_each(evsel_list, counter) {
                        val = ena = run = 0;
                        nr = 0;
@@ -836,41 +1157,20 @@ static void print_aggr(char *prefix)
                                run += perf_counts(counter->counts, cpu, 0)->run;
                                nr++;
                        }
-                       if (prefix)
-                               fprintf(output, "%s", prefix);
-
-                       if (run == 0 || ena == 0) {
+                       if (first && metric_only) {
+                               first = false;
                                aggr_printout(counter, id, nr);
-
-                               fprintf(output, "%*s%s",
-                                       csv_output ? 0 : 18,
-                                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                                       csv_sep);
-
-                               fprintf(output, "%-*s%s",
-                                       csv_output ? 0 : unit_width,
-                                       counter->unit, csv_sep);
-
-                               fprintf(output, "%*s",
-                                       csv_output ? 0 : -25,
-                                       perf_evsel__name(counter));
-
-                               if (counter->cgrp)
-                                       fprintf(output, "%s%s",
-                                               csv_sep, counter->cgrp->name);
-
-                               print_running(run, ena);
-                               fputc('\n', output);
-                               continue;
                        }
-                       uval = val * counter->scale;
-                       printout(id, nr, counter, uval);
-                       if (!csv_output)
-                               print_noise(counter, 1.0);
+                       if (prefix && !metric_only)
+                               fprintf(output, "%s", prefix);
 
-                       print_running(run, ena);
-                       fputc('\n', output);
+                       uval = val * counter->scale;
+                       printout(id, nr, counter, uval, prefix, run, ena, 1.0);
+                       if (!metric_only)
+                               fputc('\n', output);
                }
+               if (metric_only)
+                       fputc('\n', output);
        }
 }
 
@@ -895,12 +1195,7 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
                        fprintf(output, "%s", prefix);
 
                uval = val * counter->scale;
-               printout(thread, 0, counter, uval);
-
-               if (!csv_output)
-                       print_noise(counter, 1.0);
-
-               print_running(run, ena);
+               printout(thread, 0, counter, uval, prefix, run, ena, 1.0);
                fputc('\n', output);
        }
 }
@@ -914,43 +1209,19 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
        FILE *output = stat_config.output;
        struct perf_stat_evsel *ps = counter->priv;
        double avg = avg_stats(&ps->res_stats[0]);
-       int scaled = counter->counts->scaled;
        double uval;
        double avg_enabled, avg_running;
 
        avg_enabled = avg_stats(&ps->res_stats[1]);
        avg_running = avg_stats(&ps->res_stats[2]);
 
-       if (prefix)
+       if (prefix && !metric_only)
                fprintf(output, "%s", prefix);
 
-       if (scaled == -1 || !counter->supported) {
-               fprintf(output, "%*s%s",
-                       csv_output ? 0 : 18,
-                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                       csv_sep);
-               fprintf(output, "%-*s%s",
-                       csv_output ? 0 : unit_width,
-                       counter->unit, csv_sep);
-               fprintf(output, "%*s",
-                       csv_output ? 0 : -25,
-                       perf_evsel__name(counter));
-
-               if (counter->cgrp)
-                       fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
-
-               print_running(avg_running, avg_enabled);
-               fputc('\n', output);
-               return;
-       }
-
        uval = avg * counter->scale;
-       printout(-1, 0, counter, uval);
-
-       print_noise(counter, avg);
-
-       print_running(avg_running, avg_enabled);
-       fprintf(output, "\n");
+       printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg);
+       if (!metric_only)
+               fprintf(output, "\n");
 }
 
 /*
@@ -972,39 +1243,78 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
                if (prefix)
                        fprintf(output, "%s", prefix);
 
-               if (run == 0 || ena == 0) {
-                       fprintf(output, "CPU%*d%s%*s%s",
-                               csv_output ? 0 : -4,
-                               perf_evsel__cpus(counter)->map[cpu], csv_sep,
-                               csv_output ? 0 : 18,
-                               counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                               csv_sep);
+               uval = val * counter->scale;
+               printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
 
-                               fprintf(output, "%-*s%s",
-                                       csv_output ? 0 : unit_width,
-                                       counter->unit, csv_sep);
+               fputc('\n', output);
+       }
+}
 
-                               fprintf(output, "%*s",
-                                       csv_output ? 0 : -25,
-                                       perf_evsel__name(counter));
+static void print_no_aggr_metric(char *prefix)
+{
+       int cpu;
+       int nrcpus = 0;
+       struct perf_evsel *counter;
+       u64 ena, run, val;
+       double uval;
 
-                       if (counter->cgrp)
-                               fprintf(output, "%s%s",
-                                       csv_sep, counter->cgrp->name);
+       nrcpus = evsel_list->cpus->nr;
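+       /*
+        * AGGR_NONE with --metric-only: one line per CPU, with the CPU id
+        * printed once followed by the metrics of every counter on that CPU.
+        */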
+       for (cpu = 0; cpu < nrcpus; cpu++) {
+               bool first = true;
 
-                       print_running(run, ena);
-                       fputc('\n', output);
-                       continue;
+               if (prefix)
+                       fputs(prefix, stat_config.output);
+               evlist__for_each(evsel_list, counter) {
+                       if (first) {
+                               aggr_printout(counter, cpu, 0);
+                               first = false;
+                       }
+                       val = perf_counts(counter->counts, cpu, 0)->val;
+                       ena = perf_counts(counter->counts, cpu, 0)->ena;
+                       run = perf_counts(counter->counts, cpu, 0)->run;
+
+                       uval = val * counter->scale;
+                       printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
                }
+               fputc('\n', stat_config.output);
+       }
+}
 
-               uval = val * counter->scale;
-               printout(cpu, 0, counter, uval);
-               if (!csv_output)
-                       print_noise(counter, 1.0);
-               print_running(run, ena);
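+/* Per-aggregation-mode width of the leading id column, used to align the metric headers */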
+static int aggr_header_lens[] = {
+       [AGGR_CORE] = 18,
+       [AGGR_SOCKET] = 12,
+       [AGGR_NONE] = 6,
+       [AGGR_THREAD] = 24,
+       [AGGR_GLOBAL] = 0,
+};
 
-               fputc('\n', output);
+static void print_metric_headers(char *prefix)
+{
+       struct perf_stat_output_ctx out;
+       struct perf_evsel *counter;
+       struct outstate os = {
+               .fh = stat_config.output
+       };
+
+       if (prefix)
+               fprintf(stat_config.output, "%s", prefix);
+
+       if (!csv_output)
+               fprintf(stat_config.output, "%*s",
+                       aggr_header_lens[stat_config.aggr_mode], "");
+
+       /* Print metrics headers only */
+       evlist__for_each(evsel_list, counter) {
+               os.evsel = counter;
+               out.ctx = &os;
+               out.print_metric = print_metric_header;
+               out.new_line = new_line_metric;
+               perf_stat__print_shadow_stats(counter, 0,
+                                             0,
+                                             &out);
        }
+       fputc('\n', stat_config.output);
 }
 
 static void print_interval(char *prefix, struct timespec *ts)
@@ -1014,7 +1324,7 @@ static void print_interval(char *prefix, struct timespec *ts)
 
        sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
 
-       if (num_print_interval == 0 && !csv_output) {
+       if (num_print_interval == 0 && !csv_output && !metric_only) {
                switch (stat_config.aggr_mode) {
                case AGGR_SOCKET:
                        fprintf(output, "#           time socket cpus             counts %*s events\n", unit_width, "unit");
@@ -1101,6 +1411,17 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
        else
                print_header(argc, argv);
 
+       if (metric_only) {
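+               /*
+                * Print the metric headers once up front and re-print them
+                * every 25 intervals so they stay visible.
+                */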
+               static int num_print_iv;
+
+               if (num_print_iv == 0)
+                       print_metric_headers(prefix);
+               if (num_print_iv++ == 25)
+                       num_print_iv = 0;
+               if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
+                       fprintf(stat_config.output, "%s", prefix);
+       }
+
        switch (stat_config.aggr_mode) {
        case AGGR_CORE:
        case AGGR_SOCKET:
@@ -1113,10 +1434,16 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
        case AGGR_GLOBAL:
                evlist__for_each(evsel_list, counter)
                        print_counter_aggr(counter, prefix);
+               if (metric_only)
+                       fputc('\n', stat_config.output);
                break;
        case AGGR_NONE:
-               evlist__for_each(evsel_list, counter)
-                       print_counter(counter, prefix);
+               if (metric_only)
+                       print_no_aggr_metric(prefix);
+               else {
+                       evlist__for_each(evsel_list, counter)
+                               print_counter(counter, prefix);
+               }
                break;
        case AGGR_UNSET:
        default:
@@ -1237,6 +1564,8 @@ static const struct option stat_options[] = {
                     "aggregate counts per thread", AGGR_THREAD),
        OPT_UINTEGER('D', "delay", &initial_delay,
                     "ms to wait before starting measurement after program start"),
+       OPT_BOOLEAN(0, "metric-only", &metric_only,
+                       "Only print computed metrics. No raw values"),
        OPT_END()
 };
 
@@ -1435,7 +1764,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
  */
 static int add_default_attributes(void)
 {
-       struct perf_event_attr default_attrs[] = {
+       struct perf_event_attr default_attrs0[] = {
 
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES       },
@@ -1443,8 +1772,14 @@ static int add_default_attributes(void)
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS            },
 
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES             },
+};
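+/*
+ * The stalled-cycles events are kept in separate tables so they are only
+ * requested when the PMU actually exposes them (see the pmu_have_event()
+ * checks below); the other defaults are always added.
+ */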
+       struct perf_event_attr frontend_attrs[] = {
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND        },
+};
+       struct perf_event_attr backend_attrs[] = {
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+       struct perf_event_attr default_attrs1[] = {
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS           },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS    },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES          },
@@ -1561,7 +1896,19 @@ static int add_default_attributes(void)
        }
 
        if (!evsel_list->nr_entries) {
-               if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
+               if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0)
+                       return -1;
+               if (pmu_have_event("cpu", "stalled-cycles-frontend")) {
+                       if (perf_evlist__add_default_attrs(evsel_list,
+                                               frontend_attrs) < 0)
+                               return -1;
+               }
+               if (pmu_have_event("cpu", "stalled-cycles-backend")) {
+                       if (perf_evlist__add_default_attrs(evsel_list,
+                                               backend_attrs) < 0)
+                               return -1;
+               }
+               if (perf_evlist__add_default_attrs(evsel_list, default_attrs1) < 0)
                        return -1;
        }
 
@@ -1825,9 +2172,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        if (evsel_list == NULL)
                return -ENOMEM;
 
+       parse_events__shrink_config_terms();
        argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands,
                                        (const char **) stat_usage,
                                        PARSE_OPT_STOP_AT_NON_OPTION);
+       perf_stat__init_shadow_stats();
 
        if (csv_sep) {
                csv_output = true;
@@ -1858,6 +2207,16 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
                goto out;
        }
 
+       if (metric_only && stat_config.aggr_mode == AGGR_THREAD) {
+               fprintf(stderr, "--metric-only is not supported with --per-thread\n");
+               goto out;
+       }
+
+       if (metric_only && run_count > 1) {
+               fprintf(stderr, "--metric-only is not supported with -r\n");
+               goto out;
+       }
+
        if (output_fd < 0) {
                fprintf(stderr, "argument to --log-fd must be a > 0\n");
                parse_options_usage(stat_usage, stat_options, "log-fd", 0);
index bf01cbb0ef2369f2fc904809b86b70a823bdb604..94af190f6843d2c92e68b2dfef8dc51017c42079 100644 (file)
@@ -252,7 +252,8 @@ static void perf_top__print_sym_table(struct perf_top *top)
        char bf[160];
        int printed = 0;
        const int win_width = top->winsize.ws_col - 1;
-       struct hists *hists = evsel__hists(top->sym_evsel);
+       struct perf_evsel *evsel = top->sym_evsel;
+       struct hists *hists = evsel__hists(evsel);
 
        puts(CONSOLE_CLEAR);
 
@@ -288,7 +289,7 @@ static void perf_top__print_sym_table(struct perf_top *top)
        }
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        hists__output_recalc_col_len(hists, top->print_entries - printed);
        putchar('\n');
@@ -540,6 +541,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c)
 static void perf_top__sort_new_samples(void *arg)
 {
        struct perf_top *t = arg;
+       struct perf_evsel *evsel = t->sym_evsel;
        struct hists *hists;
 
        perf_top__reset_sample_counters(t);
@@ -547,7 +549,7 @@ static void perf_top__sort_new_samples(void *arg)
        if (t->evlist->selected != NULL)
                t->sym_evsel = t->evlist->selected;
 
-       hists = evsel__hists(t->sym_evsel);
+       hists = evsel__hists(evsel);
 
        if (t->evlist->enabled) {
                if (t->zero) {
@@ -559,7 +561,7 @@ static void perf_top__sort_new_samples(void *arg)
        }
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 }
 
 static void *display_thread_tui(void *arg)
@@ -1063,7 +1065,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
        return parse_callchain_top_opt(arg);
 }
 
-static int perf_top_config(const char *var, const char *value, void *cb)
+static int perf_top_config(const char *var, const char *value, void *cb __maybe_unused)
 {
        if (!strcmp(var, "top.call-graph"))
                var = "call-graph.record-mode"; /* fall-through */
@@ -1072,7 +1074,7 @@ static int perf_top_config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int
@@ -1212,6 +1214,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                     parse_branch_stack),
        OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
                    "Show raw trace event output (do not use print fmt or plugins)"),
+       OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+                   "Show entries in a hierarchy"),
        OPT_END()
        };
        const char * const top_usage[] = {
@@ -1239,10 +1243,30 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                goto out_delete_evlist;
        }
 
+       if (symbol_conf.report_hierarchy) {
+               /* disable incompatible options */
+               symbol_conf.event_group = false;
+               symbol_conf.cumulate_callchain = false;
+
+               if (field_order) {
+                       pr_err("Error: --hierarchy and --fields options cannot be used together\n");
+                       parse_options_usage(top_usage, options, "fields", 0);
+                       parse_options_usage(NULL, options, "hierarchy", 0);
+                       goto out_delete_evlist;
+               }
+       }
+
        sort__mode = SORT_MODE__TOP;
        /* display thread wants entries to be collapsed in a different tree */
        sort__need_collapse = 1;
 
+       if (top.use_stdio)
+               use_browser = 0;
+       else if (top.use_tui)
+               use_browser = 1;
+
+       setup_browser(false);
+
        if (setup_sorting(top.evlist) < 0) {
                if (sort_order)
                        parse_options_usage(top_usage, options, "s", 1);
@@ -1252,13 +1276,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                goto out_delete_evlist;
        }
 
-       if (top.use_stdio)
-               use_browser = 0;
-       else if (top.use_tui)
-               use_browser = 1;
-
-       setup_browser(false);
-
        status = target__validate(target);
        if (status) {
                target__strerror(target, status, errbuf, BUFSIZ);
index 20916dd77aac24847bffbaa5d1c0fb56580d6e39..8dc98c598b1aed734ab6a6695c7308f3c16e0910 100644 (file)
@@ -33,6 +33,7 @@
 #include "util/stat.h"
 #include "trace-event.h"
 #include "util/parse-events.h"
+#include "util/bpf-loader.h"
 
 #include <libaudit.h>
 #include <stdlib.h>
@@ -1724,8 +1725,12 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 
        sc->args = sc->tp_format->format.fields;
        sc->nr_args = sc->tp_format->format.nr_fields;
-       /* drop nr field - not relevant here; does not exist on older kernels */
-       if (sc->args && strcmp(sc->args->name, "nr") == 0) {
+       /*
+        * The first field, '__syscall_nr' ('nr' on older kernels), holds the
+        * syscall number; it is not needed here, so discard it when present.
+        */
+       if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }
@@ -2177,6 +2182,37 @@ out_dump:
        return 0;
 }
 
+static void bpf_output__printer(enum binary_printer_ops op,
+                               unsigned int val, void *extra)
+{
+       FILE *output = extra;
+       unsigned char ch = (unsigned char)val;
+
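+       /* Only printable payload bytes are rendered; the other binary_printer callbacks are ignored */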
+       switch (op) {
+       case BINARY_PRINT_CHAR_DATA:
+               fprintf(output, "%c", isprint(ch) ? ch : '.');
+               break;
+       case BINARY_PRINT_DATA_BEGIN:
+       case BINARY_PRINT_LINE_BEGIN:
+       case BINARY_PRINT_ADDR:
+       case BINARY_PRINT_NUM_DATA:
+       case BINARY_PRINT_NUM_PAD:
+       case BINARY_PRINT_SEP:
+       case BINARY_PRINT_CHAR_PAD:
+       case BINARY_PRINT_LINE_END:
+       case BINARY_PRINT_DATA_END:
+       default:
+               break;
+       }
+}
+
+static void bpf_output__fprintf(struct trace *trace,
+                               struct perf_sample *sample)
+{
+       print_binary(sample->raw_data, sample->raw_size, 8,
+                    bpf_output__printer, trace->output);
+}
+
 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
                                union perf_event *event __maybe_unused,
                                struct perf_sample *sample)
@@ -2189,7 +2225,9 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 
        fprintf(trace->output, "%s:", evsel->name);
 
-       if (evsel->tp_format) {
+       if (perf_evsel__is_bpf_output(evsel)) {
+               bpf_output__fprintf(trace, sample);
+       } else if (evsel->tp_format) {
                event_format__fprintf(evsel->tp_format, sample->cpu,
                                      sample->raw_data, sample->raw_size,
                                      trace->output);
@@ -2586,6 +2624,16 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
        if (err < 0)
                goto out_error_open;
 
+       err = bpf__apply_obj_config();
+       if (err) {
+               char errbuf[BUFSIZ];
+
+               bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
+               pr_err("ERROR: Apply config to BPF failed: %s\n",
+                        errbuf);
+               goto out_error_open;
+       }
+
        /*
         * Better not use !target__has_task() here because we need to cover the
         * case where no threads were specified in the command line, but a
index 511141b102e8464fe3e102007f792220306ad3b9..eca6a912e8c22b0df342a7d955c6bad2db5ddf47 100644 (file)
@@ -61,50 +61,45 @@ endif
 
 ifeq ($(LIBUNWIND_LIBS),)
   NO_LIBUNWIND := 1
-else
-  #
-  # For linking with debug library, run like:
-  #
-  #   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
-  #
-  ifdef LIBUNWIND_DIR
-    LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
-    LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
-  endif
-  LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
-
-  # Set per-feature check compilation flags
-  FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
-  FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
 endif
+#
+# For linking with debug library, run like:
+#
+#   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
+#
+ifdef LIBUNWIND_DIR
+  LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
+  LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
+endif
+LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
+
+# Set per-feature check compilation flags
+FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
+FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
 
 ifeq ($(NO_PERF_REGS),0)
   CFLAGS += -DHAVE_PERF_REGS_SUPPORT
 endif
 
-ifndef NO_LIBELF
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBDW_DIR=/opt/libdw/
-  ifdef LIBDW_DIR
-    LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
-    LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
-  endif
-  FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
+# for linking with debug library, run like:
+# make DEBUG=1 LIBDW_DIR=/opt/libdw/
+ifdef LIBDW_DIR
+  LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
+  LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
 endif
+FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
 
-ifdef LIBBABELTRACE
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
-  ifdef LIBBABELTRACE_DIR
-    LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
-    LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
-  endif
-  FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
+# for linking with debug library, run like:
+# make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
+ifdef LIBBABELTRACE_DIR
+  LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
+  LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
 endif
+FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
 
 FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi
 # include ARCH specific config
@@ -145,28 +140,26 @@ ifdef PARSER_DEBUG
   $(call detected_var,PARSER_DEBUG_FLEX)
 endif
 
-ifndef NO_LIBPYTHON
-  # Try different combinations to accommodate systems that only have
-  # python[2][-config] in weird combinations but always preferring
-  # python2 and python2-config as per pep-0394. If we catch a
-  # python[-config] in version 3, the version check will kill it.
-  PYTHON2 := $(if $(call get-executable,python2),python2,python)
-  override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
-  PYTHON2_CONFIG := \
-    $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
-  override PYTHON_CONFIG := \
-    $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
+# Try different combinations to accommodate systems that only have
+# python[2][-config] in weird combinations but always preferring
+# python2 and python2-config as per pep-0394. If we catch a
+# python[-config] in version 3, the version check will kill it.
+PYTHON2 := $(if $(call get-executable,python2),python2,python)
+override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
+PYTHON2_CONFIG := \
+  $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
+override PYTHON_CONFIG := \
+  $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
 
-  PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
 
-  PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-  PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
 
-  FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
-  FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
-  FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
-  FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
-endif
+FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
+FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
+FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
+FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
 
 CFLAGS += -fno-omit-frame-pointer
 CFLAGS += -ggdb3
@@ -335,6 +328,13 @@ ifndef NO_LIBELF
   endif # NO_LIBBPF
 endif # NO_LIBELF
 
+ifdef PERF_HAVE_JITDUMP
+  ifndef NO_DWARF
+    $(call detected,CONFIG_JITDUMP)
+    CFLAGS += -DHAVE_JITDUMP
+  endif
+endif
+
 ifeq ($(ARCH),powerpc)
   ifndef NO_DWARF
     CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX
@@ -411,6 +411,17 @@ ifndef NO_LIBAUDIT
   endif
 endif
 
+ifndef NO_LIBCRYPTO
+  ifneq ($(feature-libcrypto), 1)
+    msg := $(warning No libcrypto.h found, disabling jitted code injection, please install libssl-devel or libssl-dev);
+    NO_LIBCRYPTO := 1
+  else
+    CFLAGS += -DHAVE_LIBCRYPTO_SUPPORT
+    EXTLIBS += -lcrypto
+    $(call detected,CONFIG_CRYPTO)
+  endif
+endif
+
 ifdef NO_NEWT
   NO_SLANG=1
 endif
diff --git a/tools/perf/jvmti/Makefile b/tools/perf/jvmti/Makefile
new file mode 100644 (file)
index 0000000..5ce61a1
--- /dev/null
@@ -0,0 +1,89 @@
+ARCH=$(shell uname -m)
+
+ifeq ($(ARCH), x86_64)
+JARCH=amd64
+endif
+ifeq ($(ARCH), armv7l)
+JARCH=armhf
+endif
+ifeq ($(ARCH), armv6l)
+JARCH=armhf
+endif
+ifeq ($(ARCH), aarch64)
+JARCH=aarch64
+endif
+ifeq ($(ARCH), ppc64)
+JARCH=powerpc
+endif
+ifeq ($(ARCH), ppc64le)
+JARCH=powerpc
+endif
+
+DESTDIR=/usr/local
+
+VERSION=1
+REVISION=0
+AGE=0
+
+LN=ln -sf
+RM=rm
+
+SLIBJVMTI=libjvmti.so.$(VERSION).$(REVISION).$(AGE)
+VLIBJVMTI=libjvmti.so.$(VERSION)
+SLDFLAGS=-shared -Wl,-soname -Wl,$(VLIBJVMTI)
+SOLIBEXT=so
+
+# The following works at least on Fedora 23; you may need the next
+# line for other distros.
+ifneq (,$(wildcard /usr/sbin/update-java-alternatives))
+JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3)
+else
+  ifneq (,$(wildcard /usr/sbin/alternatives))
+    JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g')
+  endif
+endif
+ifndef JDIR
+$(error Could not find the alternatives command, you need to set JDIR= to point to the root of your Java directory)
+else
+  ifeq (,$(wildcard $(JDIR)/include/jvmti.h))
+  $(error The OpenJDK development package appears to be missing, install it and try again)
+  endif
+endif
+$(info Using Java from $(JDIR))
+# -lrt required in 32-bit mode for clock_gettime()
+LIBS=-lelf -lrt
+INCDIR=-I $(JDIR)/include -I $(JDIR)/include/linux
+
+TARGETS=$(SLIBJVMTI)
+
+SRCS=libjvmti.c jvmti_agent.c
+OBJS=$(SRCS:.c=.o)
+SOBJS=$(OBJS:.o=.lo)
+OPT=-O2 -g -Werror -Wall
+
+CFLAGS=$(INCDIR) $(OPT)
+
+all: $(TARGETS)
+
+.c.o:
+       $(CC) $(CFLAGS) -c $*.c
+.c.lo:
+       $(CC) -fPIC -DPIC $(CFLAGS) -c $*.c -o $*.lo
+
+$(OBJS) $(SOBJS): Makefile jvmti_agent.h ../util/jitdump.h
+
+$(SLIBJVMTI):  $(SOBJS)
+       $(CC) $(CFLAGS) $(SLDFLAGS)  -o $@ $(SOBJS) $(LIBS)
+       $(LN) $@ libjvmti.$(SOLIBEXT)
+
+clean:
+       $(RM) -f *.o *.so.* *.so *.lo
+
+install:
+       -mkdir -p $(DESTDIR)/lib
+       install -m 755 $(SLIBJVMTI) $(DESTDIR)/lib/
+       (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) $(VLIBJVMTI))
+       (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) libjvmti.$(SOLIBEXT))
+       ldconfig
+
+.SUFFIXES: .c .S .o .lo
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
new file mode 100644 (file)
index 0000000..6461e02
--- /dev/null
@@ -0,0 +1,465 @@
+/*
+ * jvmti_agent.c: JVMTI agent interface
+ *
+ * Adapted from the Oprofile code in opagent.c:
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Copyright 2007 OProfile authors
+ * Jens Wilke
+ * Daniel Hansel
+ * Copyright IBM Corporation 2007
+ */
+#include <sys/types.h>
+#include <sys/stat.h> /* for mkdir() */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/mman.h>
+#include <syscall.h> /* for gettid() */
+#include <err.h>
+
+#include "jvmti_agent.h"
+#include "../util/jitdump.h"
+
+#define JIT_LANG "java"
+
+static char jit_path[PATH_MAX];
+static void *marker_addr;
+
+/*
+ * padding buffer
+ */
+static const char pad_bytes[7];
+
+static inline pid_t gettid(void)
+{
+       return (pid_t)syscall(__NR_gettid);
+}
+
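+/*
+ * Record the ELF e_machine of the running JVM binary (read from
+ * /proc/self/exe) in the jitdump header so perf knows which
+ * architecture the jitted code targets.
+ */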
+static int get_e_machine(struct jitheader *hdr)
+{
+       ssize_t sret;
+       char id[16];
+       int fd, ret = -1;
+       int m = -1;
+       struct {
+               uint16_t e_type;
+               uint16_t e_machine;
+       } info;
+
+       fd = open("/proc/self/exe", O_RDONLY);
+       if (fd == -1)
+               return -1;
+
+       sret = read(fd, id, sizeof(id));
+       if (sret != sizeof(id))
+               goto error;
+
+       /* check ELF signature */
+       if (id[0] != 0x7f || id[1] != 'E' || id[2] != 'L' || id[3] != 'F')
+               goto error;
+
+       sret = read(fd, &info, sizeof(info));
+       if (sret != sizeof(info))
+               goto error;
+
+       m = info.e_machine;
+       if (m < 0)
+               m = 0; /* ELF EM_NONE */
+
+       hdr->elf_mach = m;
+       ret = 0;
+error:
+       close(fd);
+       return ret;
+}
+
+#define NSEC_PER_SEC   1000000000
+static int perf_clk_id = CLOCK_MONOTONIC;
+
+static inline uint64_t
+timespec_to_ns(const struct timespec *ts)
+{
+        return ((uint64_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+}
+
+static inline uint64_t
+perf_get_timestamp(void)
+{
+       struct timespec ts;
+       int ret;
+
+       ret = clock_gettime(perf_clk_id, &ts);
+       if (ret)
+               return 0;
+
+       return timespec_to_ns(&ts);
+}
+
+static int
+debug_cache_init(void)
+{
+       char str[32];
+       char *base, *p;
+       struct tm tm;
+       time_t t;
+       int ret;
+
+       time(&t);
+       localtime_r(&t, &tm);
+
+       base = getenv("JITDUMPDIR");
+       if (!base)
+               base = getenv("HOME");
+       if (!base)
+               base = ".";
+
+       strftime(str, sizeof(str), JIT_LANG"-jit-%Y%m%d", &tm);
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/", base);
+
+       ret = mkdir(jit_path, 0755);
+       if (ret == -1) {
+               if (errno != EEXIST) {
+                       warn("jvmti: cannot create jit cache dir %s", jit_path);
+                       return -1;
+               }
+       }
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit", base);
+       ret = mkdir(jit_path, 0755);
+       if (ret == -1) {
+               if (errno != EEXIST) {
+                       warn("cannot create jit cache dir %s", jit_path);
+                       return -1;
+               }
+       }
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit/%s.XXXXXXXX", base, str);
+
+       p = mkdtemp(jit_path);
+       if (p != jit_path) {
+               warn("cannot create jit cache dir %s", jit_path);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+perf_open_marker_file(int fd)
+{
+       long pgsz;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+       if (pgsz == -1)
+               return -1;
+
+       /*
+        * We mmap the jitdump file to create an MMAP RECORD in the perf.data
+        * file. The mmap is captured either live (perf record running when we
+        * mmap) or in deferred mode via /proc/PID/maps. The MMAP record acts
+        * as a marker pointing at the jitdump file, which carries the
+        * metadata about the jitted code; perf report/annotate detect this
+        * special filename and process the jitdump file.
+        *
+        * The mapping must be PROT_EXEC to ensure it is captured by perf
+        * record even when not using the -d option.
+        */
+       marker_addr = mmap(NULL, pgsz, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0);
+       return (marker_addr == MAP_FAILED) ? -1 : 0;
+}
+
+static void
+perf_close_marker_file(void)
+{
+       long pgsz;
+
+       if (!marker_addr)
+               return;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+       if (pgsz == -1)
+               return;
+
+       munmap(marker_addr, pgsz);
+}
+
+void *jvmti_open(void)
+{
+       int pad_cnt;
+       char dump_path[PATH_MAX];
+       struct jitheader header;
+       int fd;
+       FILE *fp;
+
+       /*
+        * check if clockid is supported
+        */
+       if (!perf_get_timestamp())
+               warnx("jvmti: kernel does not support clock id %d", perf_clk_id);
+
+       memset(&header, 0, sizeof(header));
+
+       debug_cache_init();
+
+       /*
+        * jitdump file name
+        */
+       snprintf(dump_path, PATH_MAX, "%s/jit-%i.dump", jit_path, getpid());
+
+       fd = open(dump_path, O_CREAT|O_TRUNC|O_RDWR, 0666);
+       if (fd == -1)
+               return NULL;
+
+       /*
+        * create the perf.data marker for the jitdump file
+        */
+       if (perf_open_marker_file(fd)) {
+               warnx("jvmti: failed to create marker file");
+               return NULL;
+       }
+
+       fp = fdopen(fd, "w+");
+       if (!fp) {
+               warn("jvmti: cannot create %s", dump_path);
+               close(fd);
+               goto error;
+       }
+
+       warnx("jvmti: jitdump in %s", dump_path);
+
+       if (get_e_machine(&header)) {
+               warn("get_e_machine failed\n");
+               goto error;
+       }
+
+       header.magic      = JITHEADER_MAGIC;
+       header.version    = JITHEADER_VERSION;
+       header.total_size = sizeof(header);
+       header.pid        = getpid();
+
+       /* calculate amount of padding '\0' */
+       pad_cnt = PADDING_8ALIGNED(header.total_size);
+       header.total_size += pad_cnt;
+
+       header.timestamp = perf_get_timestamp();
+
+       if (!fwrite(&header, sizeof(header), 1, fp)) {
+               warn("jvmti: cannot write dumpfile header");
+               goto error;
+       }
+
+       /* write padding '\0' if necessary */
+       if (pad_cnt && !fwrite(pad_bytes, pad_cnt, 1, fp)) {
+               warn("jvmti: cannot write dumpfile header padding");
+               goto error;
+       }
+
+       return fp;
+error:
+       fclose(fp);
+       return NULL;
+}
+
+int
+jvmti_close(void *agent)
+{
+       struct jr_code_close rec;
+       FILE *fp = agent;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in close_agent");
+               return -1;
+       }
+
+       rec.p.id = JIT_CODE_CLOSE;
+       rec.p.total_size = sizeof(rec);
+
+       rec.p.timestamp = perf_get_timestamp();
+
+       if (!fwrite(&rec, sizeof(rec), 1, fp))
+               return -1;
+
+       fclose(fp);
+
+       fp = NULL;
+
+       perf_close_marker_file();
+
+       return 0;
+}
+
+int
+jvmti_write_code(void *agent, char const *sym,
+       uint64_t vma, void const *code, unsigned int const size)
+{
+       static int code_generation = 1;
+       struct jr_code_load rec;
+       size_t sym_len;
+       size_t padding_count;
+       FILE *fp = agent;
+       int ret = -1;
+
+       /* don't care about 0 length function, no samples */
+       if (size == 0)
+               return 0;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in write_native_code");
+               return -1;
+       }
+
+       sym_len = strlen(sym) + 1;
+
+       rec.p.id           = JIT_CODE_LOAD;
+       rec.p.total_size   = sizeof(rec) + sym_len;
+       padding_count      = PADDING_8ALIGNED(rec.p.total_size);
+       rec.p.total_size  += padding_count;
+       rec.p.timestamp    = perf_get_timestamp();
+
+       rec.code_size  = size;
+       rec.vma        = vma;
+       rec.code_addr  = vma;
+       rec.pid        = getpid();
+       rec.tid        = gettid();
+
+       if (code)
+               rec.p.total_size += size;
+
+       /*
+        * If the JVM is multi-threaded, multiple concurrent calls to the agent
+        * may be possible, so protect file writes
+        */
+       flockfile(fp);
+
+       /*
+        * get code index inside lock to avoid race condition
+        */
+       rec.code_index = code_generation++;
+
+       ret = fwrite_unlocked(&rec, sizeof(rec), 1, fp);
+       fwrite_unlocked(sym, sym_len, 1, fp);
+
+       if (padding_count)
+               fwrite_unlocked(pad_bytes, padding_count, 1, fp);
+
+       if (code)
+               fwrite_unlocked(code, size, 1, fp);
+
+       funlockfile(fp);
+
+       ret = 0;
+
+       return ret;
+}
+
+int
+jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
+                      jvmti_line_info_t *li, int nr_lines)
+{
+       struct jr_code_debug_info rec;
+       size_t sret, len, size, flen;
+       size_t padding_count;
+       uint64_t addr;
+       const char *fn = file;
+       FILE *fp = agent;
+       int i;
+
+       /*
+        * no entry to write
+        */
+       if (!nr_lines)
+               return 0;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in write_debug_info");
+               return -1;
+       }
+
+       flen = strlen(file) + 1;
+
+       rec.p.id        = JIT_CODE_DEBUG_INFO;
+       size            = sizeof(rec);
+       rec.p.timestamp = perf_get_timestamp();
+       rec.code_addr   = (uint64_t)(uintptr_t)code;
+       rec.nr_entry    = nr_lines;
+
+       /*
+        * on disk source line info layout:
+        * uint64_t : addr
+        * int      : line number
+        * int      : column discriminator
+        * file[]   : source file name
+        * padding  : pad to multiple of 8 bytes
+        */
+       size += nr_lines * sizeof(struct debug_entry);
+       size += flen * nr_lines;
+       /*
+        * pad to 8 bytes
+        */
+       padding_count = PADDING_8ALIGNED(size);
+
+       rec.p.total_size = size + padding_count;
+
+       /*
+        * If the JVM is multi-threaded, multiple concurrent calls to the agent
+        * may be possible, so protect file writes
+        */
+       flockfile(fp);
+
+       sret = fwrite_unlocked(&rec, sizeof(rec), 1, fp);
+       if (sret != 1)
+               goto error;
+
+       for (i = 0; i < nr_lines; i++) {
+
+               addr = (uint64_t)li[i].pc;
+               len  = sizeof(addr);
+               sret = fwrite_unlocked(&addr, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               len  = sizeof(li[0].line_number);
+               sret = fwrite_unlocked(&li[i].line_number, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               len  = sizeof(li[0].discrim);
+               sret = fwrite_unlocked(&li[i].discrim, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               sret = fwrite_unlocked(fn, flen, 1, fp);
+               if (sret != 1)
+                       goto error;
+       }
+       if (padding_count) {
+               sret = fwrite_unlocked(pad_bytes, padding_count, 1, fp);
+               if (sret != 1)
+                       goto error;
+       }
+
+       funlockfile(fp);
+       return 0;
+error:
+       funlockfile(fp);
+       return -1;
+}
diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h
new file mode 100644 (file)
index 0000000..bedf5d0
--- /dev/null
@@ -0,0 +1,36 @@
+#ifndef __JVMTI_AGENT_H__
+#define __JVMTI_AGENT_H__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <jvmti.h>
+
+#define __unused __attribute__((unused))
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct {
+       unsigned long   pc;
+       int             line_number;
+       int             discrim; /* discriminator -- 0 for now */
+} jvmti_line_info_t;
+
+void *jvmti_open(void);
+int   jvmti_close(void *agent);
+int   jvmti_write_code(void *agent, char const *symbol_name,
+                      uint64_t vma, void const *code,
+                      const unsigned int code_size);
+
+int   jvmti_write_debug_info(void *agent,
+                            uint64_t code,
+                            const char *file,
+                            jvmti_line_info_t *li,
+                            int nr_lines);
+
+#if defined(__cplusplus)
+}
+
+#endif
+#endif /* __JVMTI_H__ */
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
new file mode 100644 (file)
index 0000000..ac12e4b
--- /dev/null
@@ -0,0 +1,304 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <err.h>
+#include <jvmti.h>
+#include <jvmticmlr.h>
+#include <limits.h>
+
+#include "jvmti_agent.h"
+
+static int has_line_numbers;
+void *jvmti_agent;
+
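+/*
+ * Convert the JVMTI line number table of method 'm' into jvmti_line_info_t
+ * entries: every source line whose start location precedes 'bci' is
+ * recorded against the native address 'pc'.
+ */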
+static jvmtiError
+do_get_line_numbers(jvmtiEnv *jvmti, void *pc, jmethodID m, jint bci,
+                   jvmti_line_info_t *tab, jint *nr)
+{
+       jint i, lines = 0;
+       jint nr_lines = 0;
+       jvmtiLineNumberEntry *loc_tab = NULL;
+       jvmtiError ret;
+
+       ret = (*jvmti)->GetLineNumberTable(jvmti, m, &nr_lines, &loc_tab);
+       if (ret != JVMTI_ERROR_NONE)
+               return ret;
+
+       for (i = 0; i < nr_lines; i++) {
+               if (loc_tab[i].start_location < bci) {
+                       tab[lines].pc = (unsigned long)pc;
+                       tab[lines].line_number = loc_tab[i].line_number;
+                       tab[lines].discrim = 0; /* not yet used */
+                       lines++;
+               } else {
+                       break;
+               }
+       }
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)loc_tab);
+       *nr = lines;
+       return JVMTI_ERROR_NONE;
+}
+
+static jvmtiError
+get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **tab, int *nr_lines)
+{
+       const jvmtiCompiledMethodLoadRecordHeader *hdr;
+       jvmtiCompiledMethodLoadInlineRecord *rec;
+       jvmtiLineNumberEntry *lne = NULL;
+       PCStackInfo *c;
+       jint nr, ret;
+       int nr_total = 0;
+       int i, lines_total = 0;
+
+       if (!(tab && nr_lines))
+               return JVMTI_ERROR_NULL_POINTER;
+
+       /*
+        * Phase 1 -- get the number of lines necessary
+        */
+       for (hdr = compile_info; hdr != NULL; hdr = hdr->next) {
+               if (hdr->kind == JVMTI_CMLR_INLINE_INFO) {
+                       rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr;
+                       for (i = 0; i < rec->numpcs; i++) {
+                               c = rec->pcinfo + i;
+                               nr = 0;
+                               /*
+                                * unfortunately, need a tab to get the number of lines!
+                                */
+                               ret = (*jvmti)->GetLineNumberTable(jvmti, c->methods[0], &nr, &lne);
+                               if (ret == JVMTI_ERROR_NONE) {
+                                       /* free what was allocated for nothing */
+                                       (*jvmti)->Deallocate(jvmti, (unsigned char *)lne);
+                                       nr_total += (int)nr;
+                               }
+                       }
+               }
+       }
+
+       if (nr_total == 0)
+               return JVMTI_ERROR_NOT_FOUND;
+
+       /*
+        * Phase 2 -- allocate big enough line table
+        */
+       *tab = malloc(nr_total * sizeof(**tab));
+       if (!*tab)
+               return JVMTI_ERROR_OUT_OF_MEMORY;
+
+       for (hdr = compile_info; hdr != NULL; hdr = hdr->next) {
+               if (hdr->kind == JVMTI_CMLR_INLINE_INFO) {
+                       rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr;
+                       for (i = 0; i < rec->numpcs; i++) {
+                               c = rec->pcinfo + i;
+                               nr = 0;
+                               ret = do_get_line_numbers(jvmti, c->pc,
+                                                         c->methods[0],
+                                                         c->bcis[0],
+                                                         *tab + lines_total,
+                                                         &nr);
+                               if (ret == JVMTI_ERROR_NONE)
+                                       lines_total += nr;
+                       }
+               }
+       }
+       *nr_lines = lines_total;
+       return JVMTI_ERROR_NONE;
+}
+
+static void JNICALL
+compiled_method_load_cb(jvmtiEnv *jvmti,
+                       jmethodID method,
+                       jint code_size,
+                       void const *code_addr,
+                       jint map_length,
+                       jvmtiAddrLocationMap const *map,
+                       const void *compile_info)
+{
+       jvmti_line_info_t *line_tab = NULL;
+       jclass decl_class;
+       char *class_sign = NULL;
+       char *func_name = NULL;
+       char *func_sign = NULL;
+       char *file_name = NULL;
+       char fn[PATH_MAX];
+       uint64_t addr = (uint64_t)(uintptr_t)code_addr;
+       jvmtiError ret;
+       int nr_lines = 0; /* in line_tab[] */
+       size_t len;
+
+       ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method,
+                                               &decl_class);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot get declaring class");
+               return;
+       }
+
+       if (has_line_numbers && map && map_length) {
+               ret = get_line_numbers(jvmti, compile_info, &line_tab, &nr_lines);
+               if (ret != JVMTI_ERROR_NONE) {
+                       warnx("jvmti: cannot get line table for method");
+                       nr_lines = 0;
+               }
+       }
+
+       ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot get source filename ret=%d", ret);
+               goto error;
+       }
+
+       ret = (*jvmti)->GetClassSignature(jvmti, decl_class,
+                                         &class_sign, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: GetClassSignature failed");
+               goto error;
+       }
+
+       ret = (*jvmti)->GetMethodName(jvmti, method, &func_name,
+                                     &func_sign, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: GetMethodName failed");
+               goto error;
+       }
+
+       /*
+        * Assume the path name mirrors the class hierarchy; this is common practice with Java programs
+        */
+       if (*class_sign == 'L') {
+               int j, i = 0;
+               char *p = strrchr(class_sign, '/');
+               if (p) {
+                       /* drop the 'L' prefix and copy up to the final '/' */
+                       for (i = 0; i < (p - class_sign); i++)
+                               fn[i] = class_sign[i+1];
+               }
+               /*
+                * append the file name; loops are used instead of string ops to
+                * avoid modifying class_sign, which is used later for the symbol name
+                */
+               for (j = 0; i < (PATH_MAX - 1) && file_name && j < strlen(file_name); j++, i++)
+                       fn[i] = file_name[j];
+               fn[i] = '\0';
+       } else {
+               /* fallback case */
+               strcpy(fn, file_name);
+       }
+       /*
+        * write source line info record if we have it
+        */
+       if (jvmti_write_debug_info(jvmti_agent, addr, fn, line_tab, nr_lines))
+               warnx("jvmti: write_debug_info() failed");
+
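+       /*
+        * Emit the code load record; the symbol name is built as
+        * <class signature><method name><method signature>.
+        */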
+       len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2;
+       {
+               char str[len];
+               snprintf(str, len, "%s%s%s", class_sign, func_name, func_sign);
+
+               if (jvmti_write_code(jvmti_agent, str, addr, code_addr, code_size))
+                       warnx("jvmti: write_code() failed");
+       }
+error:
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)func_name);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)func_sign);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
+       free(line_tab);
+}
+
+static void JNICALL
+code_generated_cb(jvmtiEnv *jvmti,
+                 char const *name,
+                 void const *code_addr,
+                 jint code_size)
+{
+       uint64_t addr = (uint64_t)(unsigned long)code_addr;
+       int ret;
+
+       ret = jvmti_write_code(jvmti_agent, name, addr, code_addr, code_size);
+       if (ret)
+               warnx("jvmti: write_code() failed for code_generated");
+}
+
+JNIEXPORT jint JNICALL
+Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused)
+{
+       jvmtiEventCallbacks cb;
+       jvmtiCapabilities caps1;
+       jvmtiJlocationFormat format;
+       jvmtiEnv *jvmti = NULL;
+       jint ret;
+
+       jvmti_agent = jvmti_open();
+       if (!jvmti_agent) {
+               warnx("jvmti: open_agent failed");
+               return -1;
+       }
+
+       /*
+        * Request a JVMTI interface version 1 environment
+        */
+       ret = (*jvm)->GetEnv(jvm, (void *)&jvmti, JVMTI_VERSION_1);
+       if (ret != JNI_OK) {
+               warnx("jvmti: jvmti version 1 not supported");
+               return -1;
+       }
+
+       /*
+        * acquire the compiled_method_load capability (required) and
+        * request line numbers (optional)
+        */
+       memset(&caps1, 0, sizeof(caps1));
+       caps1.can_generate_compiled_method_load_events = 1;
+
+       ret = (*jvmti)->AddCapabilities(jvmti, &caps1);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: acquire compiled_method capability failed");
+               return -1;
+       }
+       ret = (*jvmti)->GetJLocationFormat(jvmti, &format);
+       if (ret == JVMTI_ERROR_NONE && format == JVMTI_JLOCATION_JVMBCI) {
+               memset(&caps1, 0, sizeof(caps1));
+               caps1.can_get_line_numbers = 1;
+               caps1.can_get_source_file_name = 1;
+               ret = (*jvmti)->AddCapabilities(jvmti, &caps1);
+               if (ret == JVMTI_ERROR_NONE)
+                       has_line_numbers = 1;
+       }
+
+       memset(&cb, 0, sizeof(cb));
+
+       cb.CompiledMethodLoad   = compiled_method_load_cb;
+       cb.DynamicCodeGenerated = code_generated_cb;
+
+       ret = (*jvmti)->SetEventCallbacks(jvmti, &cb, sizeof(cb));
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot set event callbacks");
+               return -1;
+       }
+
+       ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE,
+                       JVMTI_EVENT_COMPILED_METHOD_LOAD, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: setnotification failed for method_load");
+               return -1;
+       }
+
+       ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE,
+                       JVMTI_EVENT_DYNAMIC_CODE_GENERATED, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: setnotification failed on code_generated");
+               return -1;
+       }
+       return 0;
+}
+
+JNIEXPORT void JNICALL
+Agent_OnUnload(JavaVM *jvm __unused)
+{
+       int ret;
+
+       ret = jvmti_close(jvmti_agent);
+       if (ret)
+               errx(1, "Error: op_close_agent()");
+}
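
The interesting part of compiled_method_load_cb() above is how a JVMTI class signature plus a source file name become the path written into the debug-info record. A minimal standalone sketch of that mapping, not part of the patch (class_sig_to_path() and main() are illustrative names only):

/*
 * Sketch only: mirrors the signature-to-path logic above.
 * "Ljava/lang/String;" + "String.java" -> "java/lang/String.java"
 */
#include <limits.h>
#include <stdio.h>
#include <string.h>

static void class_sig_to_path(const char *class_sign, const char *file_name,
                              char *fn, size_t sz)
{
        const char *p = strrchr(class_sign, '/');
        size_t i = 0;

        if (class_sign[0] == 'L' && p) {
                /* drop the leading 'L', keep the package dirs and the final '/' */
                for (i = 0; i < (size_t)(p - class_sign) && i < sz - 1; i++)
                        fn[i] = class_sign[i + 1];
        }
        /* append the simple source file name */
        snprintf(fn + i, sz - i, "%s", file_name);
}

int main(void)
{
        char fn[PATH_MAX];

        class_sig_to_path("Ljava/lang/String;", "String.java", fn, sizeof(fn));
        printf("%s\n", fn);        /* prints: java/lang/String.java */
        return 0;
}

A JVMTI agent like this is loaded with the JVM's -agentpath option; the callbacks then record every JIT-compiled method so perf can resolve the symbols later.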
index a929618b8eb616f90c9bb8e26bad5416a0fdbc19..aaee0a7827477810c5c0d2d82753545592d7f22f 100644 (file)
@@ -454,11 +454,12 @@ static void handle_internal_command(int argc, const char **argv)
 
 static void execv_dashed_external(const char **argv)
 {
-       struct strbuf cmd = STRBUF_INIT;
+       char *cmd;
        const char *tmp;
        int status;
 
-       strbuf_addf(&cmd, "perf-%s", argv[0]);
+       if (asprintf(&cmd, "perf-%s", argv[0]) < 0)
+               goto do_die;
 
        /*
         * argv[0] must be the perf command, but the argv array
@@ -467,7 +468,7 @@ static void execv_dashed_external(const char **argv)
         * restore it on error.
         */
        tmp = argv[0];
-       argv[0] = cmd.buf;
+       argv[0] = cmd;
 
        /*
         * if we fail because the command is not found, it is
@@ -475,15 +476,16 @@ static void execv_dashed_external(const char **argv)
         */
        status = run_command_v_opt(argv, 0);
        if (status != -ERR_RUN_COMMAND_EXEC) {
-               if (IS_RUN_COMMAND_ERR(status))
+               if (IS_RUN_COMMAND_ERR(status)) {
+do_die:
                        die("unable to run '%s'", argv[0]);
+               }
                exit(-status);
        }
        errno = ENOENT; /* as if we called execvp */
 
        argv[0] = tmp;
-
-       strbuf_release(&cmd);
+       zfree(&cmd);
 }
 
 static int run_argv(int *argcp, const char ***argv)
@@ -546,6 +548,8 @@ int main(int argc, const char **argv)
 
        srandom(time(NULL));
 
+       perf_config(perf_default_config, NULL);
+
        /* get debugfs/tracefs mount point from /proc/mounts */
        tracing_path_mount();
 
@@ -613,6 +617,8 @@ int main(int argc, const char **argv)
         */
        pthread__block_sigwinch();
 
+       perf_debug_setup();
+
        while (1) {
                static int done_help;
                int was_alias = run_argv(&argc, &argv);
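
The strbuf-to-asprintf conversion above reduces to a small, self-contained pattern. A minimal sketch, assuming GNU asprintf() is available (dashed_name() is an illustrative helper, not from the patch):

#define _GNU_SOURCE
#include <stdio.h>

/* Build "perf-<cmd>" on the heap; NULL signals allocation failure. */
static char *dashed_name(const char *cmd)
{
        char *s = NULL;

        if (asprintf(&s, "perf-%s", cmd) < 0)
                return NULL;
        return s;
}

In the patch, a failed asprintf() jumps to the do_die label, and zfree() (perf's free-and-NULL helper) takes over the cleanup previously done by strbuf_release().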
index 90129accffbe824e37d9831ba3b323011295f78a..5381a01c0610c0e61f079140ed5cdc2df3f87b0d 100644 (file)
@@ -58,6 +58,8 @@ struct record_opts {
        bool         full_auxtrace;
        bool         auxtrace_snapshot_mode;
        bool         record_switch_events;
+       bool         all_kernel;
+       bool         all_user;
        unsigned int freq;
        unsigned int mmap_pages;
        unsigned int auxtrace_mmap_pages;
index 15c8400240fd9029ae34fca077304337d9c75ca6..1d95009592eb3a8a8d053e1340a3e42dd412a68c 100644 (file)
@@ -71,7 +71,10 @@ try:
 except:
        if not audit_package_warned:
                audit_package_warned = True
-               print "Install the audit-libs-python package to get syscall names"
+               print "Install the audit-libs-python package to get syscall names.\n" \
+                    "For example:\n  # apt-get install python-audit (Ubuntu)" \
+                    "\n  # yum install audit-libs-python (Fedora)" \
+                    "\n  etc.\n"
 
 def syscall_name(id):
        try:
index bf016c439fbd10a3cada60253c98d60ce441aa98..8cc30e731c739495f3eb61a0da9a76246ad35600 100644 (file)
@@ -1,3 +1,4 @@
 llvm-src-base.c
 llvm-src-kbuild.c
 llvm-src-prologue.c
+llvm-src-relocation.c
index 614899b88b377e07f9615d618ba7045f66051bea..1ba628ed049adbafc27c7b8900ecb838165a2aa7 100644 (file)
@@ -31,7 +31,7 @@ perf-y += sample-parsing.o
 perf-y += parse-no-sample-id-all.o
 perf-y += kmod-path.o
 perf-y += thread-map.o
-perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o
+perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o llvm-src-relocation.o
 perf-y += bpf.o
 perf-y += topology.o
 perf-y += cpumap.o
@@ -59,6 +59,13 @@ $(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c tests/Build
        $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
        $(Q)echo ';' >> $@
 
+$(OUTPUT)tests/llvm-src-relocation.c: tests/bpf-script-test-relocation.c tests/Build
+       $(call rule_mkdir)
+       $(Q)echo '#include <tests/llvm.h>' > $@
+       $(Q)echo 'const char test_llvm__bpf_test_relocation[] =' >> $@
+       $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
+       $(Q)echo ';' >> $@
+
 ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64))
 perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
 endif
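
The new Build rule turns tests/bpf-script-test-relocation.c into a C string literal so the BPF test can hand the source to the embedded clang/LLVM support at run time. Under that sed rule, the generated $(OUTPUT)tests/llvm-src-relocation.c is expected to look roughly like this (body truncated for illustration):

#include <tests/llvm.h>
const char test_llvm__bpf_test_relocation[] =
"/*\n"
" * bpf-script-test-relocation.c\n"
" * Test BPF loader checking relocation\n"
" */\n"
/* ... one escaped, \n-terminated string per remaining source line ... */
;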
index fb80c9eb6a95b67947b23adaeca69b5e57bac704..e7664fe3bd33739fd92be2579c30102e481e8f03 100644 (file)
 
 static int fd1;
 static int fd2;
+static int fd3;
 static int overflows;
+static int overflows_2;
+
+volatile long the_var;
+
+
+/*
+ * Use asm so that both a watchpoint and a breakpoint can be triggered
+ * by a single instruction.
+ */
+#if defined (__x86_64__)
+extern void __test_function(volatile long *ptr);
+asm (
+       ".globl __test_function\n"
+       "__test_function:\n"
+       "incq (%rdi)\n"
+       "ret\n");
+#elif defined (__aarch64__)
+extern void __test_function(volatile long *ptr);
+asm (
+       ".globl __test_function\n"
+       "__test_function:\n"
+       "str x30, [x0]\n"
+       "ret\n");
+
+#else
+static void __test_function(volatile long *ptr)
+{
+       *ptr = 0x1234;
+}
+#endif
 
 __attribute__ ((noinline))
 static int test_function(void)
 {
+       __test_function(&the_var);
+       the_var++;
        return time(NULL);
 }
 
+static void sig_handler_2(int signum __maybe_unused,
+                         siginfo_t *oh __maybe_unused,
+                         void *uc __maybe_unused)
+{
+       overflows_2++;
+       if (overflows_2 > 10) {
+               ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
+       }
+}
+
 static void sig_handler(int signum __maybe_unused,
                        siginfo_t *oh __maybe_unused,
                        void *uc __maybe_unused)
@@ -54,10 +99,11 @@ static void sig_handler(int signum __maybe_unused,
                 */
                ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
                ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
        }
 }
 
-static int bp_event(void *fn, int setup_signal)
+static int __event(bool is_x, void *addr, int sig)
 {
        struct perf_event_attr pe;
        int fd;
@@ -67,8 +113,8 @@ static int bp_event(void *fn, int setup_signal)
        pe.size = sizeof(struct perf_event_attr);
 
        pe.config = 0;
-       pe.bp_type = HW_BREAKPOINT_X;
-       pe.bp_addr = (unsigned long) fn;
+       pe.bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W;
+       pe.bp_addr = (unsigned long) addr;
        pe.bp_len = sizeof(long);
 
        pe.sample_period = 1;
@@ -86,17 +132,25 @@ static int bp_event(void *fn, int setup_signal)
                return TEST_FAIL;
        }
 
-       if (setup_signal) {
-               fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
-               fcntl(fd, F_SETSIG, SIGIO);
-               fcntl(fd, F_SETOWN, getpid());
-       }
+       fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
+       fcntl(fd, F_SETSIG, sig);
+       fcntl(fd, F_SETOWN, getpid());
 
        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 
        return fd;
 }
 
+static int bp_event(void *addr, int sig)
+{
+       return __event(true, addr, sig);
+}
+
+static int wp_event(void *addr, int sig)
+{
+       return __event(false, addr, sig);
+}
+
 static long long bp_count(int fd)
 {
        long long count;
@@ -114,7 +168,7 @@ static long long bp_count(int fd)
 int test__bp_signal(int subtest __maybe_unused)
 {
        struct sigaction sa;
-       long long count1, count2;
+       long long count1, count2, count3;
 
        /* setup SIGIO signal handler */
        memset(&sa, 0, sizeof(struct sigaction));
@@ -126,21 +180,52 @@ int test__bp_signal(int subtest __maybe_unused)
                return TEST_FAIL;
        }
 
+       sa.sa_sigaction = (void *) sig_handler_2;
+       if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+               pr_debug("failed setting up signal handler 2\n");
+               return TEST_FAIL;
+       }
+
        /*
         * We create following events:
         *
-        * fd1 - breakpoint event on test_function with SIGIO
+        * fd1 - breakpoint event on __test_function with SIGIO
         *       signal configured. We should get signal
         *       notification each time the breakpoint is hit
         *
-        * fd2 - breakpoint event on sig_handler without SIGIO
+        * fd2 - breakpoint event on sig_handler with SIGUSR1
+        *       configured. We should get SIGUSR1 each time when
+        *       breakpoint is hit
+        *
+        * fd3 - watchpoint event on __test_function with SIGIO
         *       configured.
         *
         * Following processing should happen:
-        *   - execute test_function
-        *   - fd1 event breakpoint hit -> count1 == 1
-        *   - SIGIO is delivered       -> overflows == 1
-        *   - fd2 event breakpoint hit -> count2 == 1
+        *   Exec:               Action:                       Result:
+        *   incq (%rdi)       - fd1 event breakpoint hit   -> count1 == 1
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 1
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 1  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows = 1
+        *   sys_rt_sigreturn  - return from sig_handler
+        *   incq (%rdi)       - fd3 event watchpoint hit   -> count3 == 1       (wp and bp in one insn)
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 2
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 2  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows = 2
+        *   sys_rt_sigreturn  - return from sig_handler
+        *   the_var++         - fd3 event watchpoint hit   -> count3 == 2       (standalone watchpoint)
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 3
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 3  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows == 3
+        *   sys_rt_sigreturn  - return from sig_handler
         *
         * The test case checks the following error conditions:
         * - we get stuck in signal handler because of debug
@@ -152,11 +237,13 @@ int test__bp_signal(int subtest __maybe_unused)
         *
         */
 
-       fd1 = bp_event(test_function, 1);
-       fd2 = bp_event(sig_handler, 0);
+       fd1 = bp_event(__test_function, SIGIO);
+       fd2 = bp_event(sig_handler, SIGUSR1);
+       fd3 = wp_event((void *)&the_var, SIGIO);
 
        ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0);
        ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0);
+       ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
 
        /*
         * Kick off the test by triggering 'fd1'
@@ -166,15 +253,18 @@ int test__bp_signal(int subtest __maybe_unused)
 
        ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
        ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+       ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
 
        count1 = bp_count(fd1);
        count2 = bp_count(fd2);
+       count3 = bp_count(fd3);
 
        close(fd1);
        close(fd2);
+       close(fd3);
 
-       pr_debug("count1 %lld, count2 %lld, overflow %d\n",
-                count1, count2, overflows);
+       pr_debug("count1 %lld, count2 %lld, count3 %lld, overflow %d, overflows_2 %d\n",
+                count1, count2, count3, overflows, overflows_2);
 
        if (count1 != 1) {
                if (count1 == 11)
@@ -183,12 +273,18 @@ int test__bp_signal(int subtest __maybe_unused)
                        pr_debug("failed: wrong count for bp1%lld\n", count1);
        }
 
-       if (overflows != 1)
+       if (overflows != 3)
                pr_debug("failed: wrong overflow hit\n");
 
-       if (count2 != 1)
+       if (overflows_2 != 3)
+               pr_debug("failed: wrong overflow_2 hit\n");
+
+       if (count2 != 3)
                pr_debug("failed: wrong count for bp2\n");
 
-       return count1 == 1 && overflows == 1 && count2 == 1 ?
+       if (count3 != 2)
+               pr_debug("failed: wrong count for bp3\n");
+
+       return count1 == 1 && overflows == 3 && count2 == 3 && overflows_2 == 3 && count3 == 2 ?
                TEST_OK : TEST_FAIL;
 }
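
For reference, the breakpoint/watchpoint plumbing used by __event() above follows the usual perf_event_open() pattern. A minimal sketch, assuming a Linux system with hardware breakpoint support (open_bp() is an illustrative name; error handling is reduced to the open itself):

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

static int open_bp(void *addr, int is_exec, int sig)
{
        struct perf_event_attr pe;
        int fd;

        memset(&pe, 0, sizeof(pe));
        pe.type = PERF_TYPE_BREAKPOINT;
        pe.size = sizeof(pe);
        pe.bp_type = is_exec ? HW_BREAKPOINT_X : HW_BREAKPOINT_W;
        pe.bp_addr = (unsigned long)addr;
        pe.bp_len = sizeof(long);
        pe.sample_period = 1;        /* every hit overflows the counter */
        pe.exclude_kernel = 1;
        pe.exclude_hv = 1;

        fd = syscall(__NR_perf_event_open, &pe, 0 /* this thread */,
                     -1 /* any cpu */, -1 /* no group */, 0);
        if (fd < 0)
                return -1;

        /* route counter overflows to this process as signal 'sig' */
        fcntl(fd, F_SETFL, O_RDWR | O_NONBLOCK | O_ASYNC);
        fcntl(fd, F_SETSIG, sig);
        fcntl(fd, F_SETOWN, getpid());

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        return fd;
}

With sample_period set to 1, every hit overflows the counter, so the requested signal fires on each breakpoint or watchpoint hit, which is what the counting in the test above relies on.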
diff --git a/tools/perf/tests/bpf-script-test-relocation.c b/tools/perf/tests/bpf-script-test-relocation.c
new file mode 100644 (file)
index 0000000..93af774
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * bpf-script-test-relocation.c
+ * Test BPF loader checking relocation
+ */
+#ifndef LINUX_VERSION_CODE
+# error Need LINUX_VERSION_CODE
+# error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
+#endif
+#define BPF_ANY 0
+#define BPF_MAP_TYPE_ARRAY 2
+#define BPF_FUNC_map_lookup_elem 1
+#define BPF_FUNC_map_update_elem 2
+
+static void *(*bpf_map_lookup_elem)(void *map, void *key) =
+       (void *) BPF_FUNC_map_lookup_elem;
+static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) =
+       (void *) BPF_FUNC_map_update_elem;
+
+struct bpf_map_def {
+       unsigned int type;
+       unsigned int key_size;
+       unsigned int value_size;
+       unsigned int max_entries;
+};
+
+#define SEC(NAME) __attribute__((section(NAME), used))
+struct bpf_map_def SEC("maps") my_table = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 1,
+};
+
+int this_is_a_global_val;
+
+SEC("func=sys_write")
+int bpf_func__sys_write(void *ctx)
+{
+       int key = 0;
+       int value = 0;
+
+       /*
+        * Incorrect relocation: this program should not be allowed to
+        * load into the kernel.
+        */
+       bpf_map_update_elem(&this_is_a_global_val, &key, &value, 0);
+       return 0;
+}
+char _license[] SEC("license") = "GPL";
+int _version SEC("version") = LINUX_VERSION_CODE;
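
For contrast, a sketch of what a correct use would look like, reusing the definitions from the test program above (the function name below is illustrative): pass the map declared in the "maps" section, so the loader can rewrite the reference into a map fd instead of rejecting the program.

SEC("func=sys_write")
int bpf_func__sys_write_ok(void *ctx)
{
        int key = 0, value = 0;

        bpf_map_update_elem(&my_table, &key, &value, BPF_ANY);
        return 0;
}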
index 33689a0cf821e5b9608e54b981e3280ee580890c..199501c71e272491850065910aae5003603ab10e 100644 (file)
@@ -1,7 +1,11 @@
 #include <stdio.h>
 #include <sys/epoll.h>
+#include <util/util.h>
 #include <util/bpf-loader.h>
 #include <util/evlist.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <bpf/bpf.h>
 #include "tests.h"
 #include "llvm.h"
 #include "debug.h"
@@ -71,6 +75,15 @@ static struct {
                (NR_ITERS + 1) / 4,
        },
 #endif
+       {
+               LLVM_TESTCASE_BPF_RELOCATION,
+               "Test BPF relocation checker",
+               "[bpf_relocation_test]",
+               "fix 'perf test LLVM' first",
+               "libbpf error when dealing with relocation",
+               NULL,
+               0,
+       },
 };
 
 static int do_test(struct bpf_object *obj, int (*func)(void),
@@ -99,7 +112,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
        parse_evlist.error = &parse_error;
        INIT_LIST_HEAD(&parse_evlist.list);
 
-       err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj);
+       err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj, NULL);
        if (err || list_empty(&parse_evlist.list)) {
                pr_debug("Failed to add events selected by BPF\n");
                return TEST_FAIL;
@@ -190,7 +203,7 @@ static int __test__bpf(int idx)
 
        ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz,
                                       bpf_testcase_table[idx].prog_id,
-                                      true);
+                                      true, NULL);
        if (ret != TEST_OK || !obj_buf || !obj_buf_sz) {
                pr_debug("Unable to get BPF object, %s\n",
                         bpf_testcase_table[idx].msg_compile_fail);
@@ -202,14 +215,21 @@ static int __test__bpf(int idx)
 
        obj = prepare_bpf(obj_buf, obj_buf_sz,
                          bpf_testcase_table[idx].name);
-       if (!obj) {
+       if ((!!bpf_testcase_table[idx].target_func) != (!!obj)) {
+               if (!obj)
+                       pr_debug("Fail to load BPF object: %s\n",
+                                bpf_testcase_table[idx].msg_load_fail);
+               else
+                       pr_debug("Success unexpectedly: %s\n",
+                                bpf_testcase_table[idx].msg_load_fail);
                ret = TEST_FAIL;
                goto out;
        }
 
-       ret = do_test(obj,
-                     bpf_testcase_table[idx].target_func,
-                     bpf_testcase_table[idx].expect_result);
+       if (obj)
+               ret = do_test(obj,
+                             bpf_testcase_table[idx].target_func,
+                             bpf_testcase_table[idx].expect_result);
 out:
        bpf__clear();
        return ret;
@@ -227,6 +247,36 @@ const char *test__bpf_subtest_get_desc(int i)
        return bpf_testcase_table[i].desc;
 }
 
+static int check_env(void)
+{
+       int err;
+       unsigned int kver_int;
+       char license[] = "GPL";
+
+       struct bpf_insn insns[] = {
+               BPF_MOV64_IMM(BPF_REG_0, 1),
+               BPF_EXIT_INSN(),
+       };
+
+       err = fetch_kernel_version(&kver_int, NULL, 0);
+       if (err) {
+               pr_debug("Unable to get kernel version\n");
+               return err;
+       }
+
+       err = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns,
+                              sizeof(insns) / sizeof(insns[0]),
+                              license, kver_int, NULL, 0);
+       if (err < 0) {
+               pr_err("Missing basic BPF support, skip this test: %s\n",
+                      strerror(errno));
+               return err;
+       }
+       close(err);
+
+       return 0;
+}
+
 int test__bpf(int i)
 {
        int err;
@@ -239,6 +289,9 @@ int test__bpf(int i)
                return TEST_SKIP;
        }
 
+       if (check_env())
+               return TEST_SKIP;
+
        err = __test__bpf(i);
        return err;
 }
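
check_env() above probes for basic BPF support by loading the smallest program the verifier will accept before running the real test cases. Expanded by hand, the two instructions are simply "r0 = 1; exit"; a sketch of the equivalent raw encoding, assuming the uapi struct bpf_insn layout:

#include <linux/bpf.h>

/* Same program as BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN() above. */
static const struct bpf_insn probe_insns[] = {
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 1 },  /* r0 = 1 */
        { .code = BPF_JMP | BPF_EXIT },                                           /* return r0 */
};

If even this fails to load, bpf(BPF_PROG_LOAD) is effectively unavailable, so the test is skipped rather than failed.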
index 313a48c6b2bc8e111e113e79c4a72fdc2d720ac5..afc9ad0a0515c5db77745d0d0c6964d950fa3bfc 100644 (file)
@@ -439,7 +439,7 @@ static int do_test_code_reading(bool try_kcore)
                .mmap_pages          = UINT_MAX,
                .user_freq           = UINT_MAX,
                .user_interval       = ULLONG_MAX,
-               .freq                = 4000,
+               .freq                = 500,
                .target              = {
                        .uses_mmap   = true,
                },
@@ -559,7 +559,13 @@ static int do_test_code_reading(bool try_kcore)
                                evlist = NULL;
                                continue;
                        }
-                       pr_debug("perf_evlist__open failed\n");
+
+                       if (verbose) {
+                               char errbuf[512];
+                               perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
+                               pr_debug("perf_evlist__open() failed!\n%s\n", errbuf);
+                       }
+
                        goto out_put;
                }
                break;
index 5e6a86e50fb97aae648c16ce627157e8377ebb02..ecf136c385d5ff2f21111e813a9ce1e30c03365a 100644 (file)
@@ -191,7 +191,7 @@ static int do_test(struct hists *hists, struct result *expected, size_t nr_expec
         * function since TEST_ASSERT_VAL() returns in case of failure.
         */
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(hists_to_evsel(hists), NULL);
 
        if (verbose > 2) {
                pr_info("use callchain: %d, cumulate callchain: %d\n",
index 351a42463444a3df9f80daea3848499b8d39659a..34b945a55d4d2864cc561f36275191f57f84210a 100644 (file)
@@ -145,7 +145,7 @@ int test__hists_filter(int subtest __maybe_unused)
                struct hists *hists = evsel__hists(evsel);
 
                hists__collapse_resort(hists, NULL);
-               hists__output_resort(hists, NULL);
+               perf_evsel__output_resort(evsel, NULL);
 
                if (verbose > 2) {
                        pr_info("Normal histogram\n");
index b231265148d89a28e39387d423711c643351eb70..23cce67c7e48902b86c00cc6e56df22ef16386e4 100644 (file)
@@ -156,7 +156,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -256,7 +256,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -310,7 +310,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -388,7 +388,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -491,7 +491,7 @@ static int test5(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
index 06f45c1d42561df030e35d55078d70d910e79998..cff564fb4b66761f7f7fdcd5ee10727ad3b08726 100644 (file)
@@ -6,12 +6,6 @@
 #include "tests.h"
 #include "debug.h"
 
-static int perf_config_cb(const char *var, const char *val,
-                         void *arg __maybe_unused)
-{
-       return perf_default_config(var, val, arg);
-}
-
 #ifdef HAVE_LIBBPF_SUPPORT
 static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz)
 {
@@ -35,6 +29,7 @@ static int test__bpf_parsing(void *obj_buf __maybe_unused,
 static struct {
        const char *source;
        const char *desc;
+       bool should_load_fail;
 } bpf_source_table[__LLVM_TESTCASE_MAX] = {
        [LLVM_TESTCASE_BASE] = {
                .source = test_llvm__bpf_base_prog,
@@ -48,14 +43,19 @@ static struct {
                .source = test_llvm__bpf_test_prologue_prog,
                .desc = "Compile source for BPF prologue generation test",
        },
+       [LLVM_TESTCASE_BPF_RELOCATION] = {
+               .source = test_llvm__bpf_test_relocation,
+               .desc = "Compile source for BPF relocation test",
+               .should_load_fail = true,
+       },
 };
 
-
 int
 test_llvm__fetch_bpf_obj(void **p_obj_buf,
                         size_t *p_obj_buf_sz,
                         enum test_llvm__testcase idx,
-                        bool force)
+                        bool force,
+                        bool *should_load_fail)
 {
        const char *source;
        const char *desc;
@@ -68,8 +68,8 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf,
 
        source = bpf_source_table[idx].source;
        desc = bpf_source_table[idx].desc;
-
-       perf_config(perf_config_cb, NULL);
+       if (should_load_fail)
+               *should_load_fail = bpf_source_table[idx].should_load_fail;
 
        /*
         * Skip this test if user's .perfconfig doesn't set [llvm] section
@@ -136,14 +136,15 @@ int test__llvm(int subtest)
        int ret;
        void *obj_buf = NULL;
        size_t obj_buf_sz = 0;
+       bool should_load_fail = false;
 
        if ((subtest < 0) || (subtest >= __LLVM_TESTCASE_MAX))
                return TEST_FAIL;
 
        ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz,
-                                      subtest, false);
+                                      subtest, false, &should_load_fail);
 
-       if (ret == TEST_OK) {
+       if (ret == TEST_OK && !should_load_fail) {
                ret = test__bpf_parsing(obj_buf, obj_buf_sz);
                if (ret != TEST_OK) {
                        pr_debug("Failed to parse test case '%s'\n",
index 5150b4d6ef50afe357f5f86300f4d39d28bec658..0eaa604be99defec6dcf3b09ee9a75eb348096fb 100644 (file)
@@ -7,14 +7,17 @@
 extern const char test_llvm__bpf_base_prog[];
 extern const char test_llvm__bpf_test_kbuild_prog[];
 extern const char test_llvm__bpf_test_prologue_prog[];
+extern const char test_llvm__bpf_test_relocation[];
 
 enum test_llvm__testcase {
        LLVM_TESTCASE_BASE,
        LLVM_TESTCASE_KBUILD,
        LLVM_TESTCASE_BPF_PROLOGUE,
+       LLVM_TESTCASE_BPF_RELOCATION,
        __LLVM_TESTCASE_MAX,
 };
 
 int test_llvm__fetch_bpf_obj(void **p_obj_buf, size_t *p_obj_buf_sz,
-                            enum test_llvm__testcase index, bool force);
+                            enum test_llvm__testcase index, bool force,
+                            bool *should_load_fail);
 #endif
index f918015512af96d1173891dac2e8f460ff65ca85..cac15d93aea656f96ad449cba1f9529e42afcbec 100644 (file)
@@ -15,6 +15,7 @@ else
 PERF := .
 PERF_O := $(PERF)
 O_OPT :=
+FULL_O := $(shell readlink -f $(PERF_O) || echo $(PERF_O))
 
 ifneq ($(O),)
   FULL_O := $(shell readlink -f $(O) || echo $(O))
@@ -79,6 +80,7 @@ make_no_libaudit    := NO_LIBAUDIT=1
 make_no_libbionic   := NO_LIBBIONIC=1
 make_no_auxtrace    := NO_AUXTRACE=1
 make_no_libbpf     := NO_LIBBPF=1
+make_no_libcrypto   := NO_LIBCRYPTO=1
 make_tags           := tags
 make_cscope         := cscope
 make_help           := help
@@ -102,6 +104,7 @@ make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
 make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
 make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1 NO_LIBBPF=1
+make_minimal        += NO_LIBCRYPTO=1
 
 # $(run) contains all available tests
 run := make_pure
@@ -110,6 +113,9 @@ run := make_pure
 # disable features detection
 ifeq ($(MK),Makefile)
 run += make_clean_all
+MAKE_F := $(MAKE)
+else
+MAKE_F := $(MAKE) -f $(MK)
 endif
 run += make_python_perf_so
 run += make_debug
@@ -260,6 +266,8 @@ run := $(shell shuf -e $(run))
 run_O := $(shell shuf -e $(run_O))
 endif
 
+max_width := $(shell echo $(run_O) | sed 's/ /\n/g' | wc -L)
+
 ifdef DEBUG
 d := $(info run   $(run))
 d := $(info run_O $(run_O))
@@ -267,13 +275,13 @@ endif
 
 MAKEFLAGS := --no-print-directory
 
-clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null)
+clean := @(cd $(PERF); $(MAKE_F) -s $(O_OPT) clean >/dev/null)
 
 $(run):
        $(call clean)
        @TMP_DEST=$$(mktemp -d); \
-       cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \
-       echo "- $@: $$cmd" && echo $$cmd > $@ && \
+       cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
        ( eval $$cmd ) >> $@ 2>&1; \
        echo "  test: $(call test,$@)" >> $@ 2>&1; \
        $(call test,$@) && \
@@ -283,8 +291,8 @@ $(run_O):
        $(call clean)
        @TMP_O=$$(mktemp -d); \
        TMP_DEST=$$(mktemp -d); \
-       cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \
-       echo "- $@: $$cmd" && echo $$cmd > $@ && \
+       cmd="cd $(PERF) && $(MAKE_F) $($(patsubst %_O,%,$@)) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
        ( eval $$cmd ) >> $@ 2>&1 && \
        echo "  test: $(call test_O,$@)" >> $@ 2>&1; \
        $(call test_O,$@) && \
@@ -313,11 +321,43 @@ make_kernelsrc_tools:
        (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \
        test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false)
 
+FEATURES_DUMP_FILE := $(FULL_O)/BUILD_TEST_FEATURE_DUMP
+FEATURES_DUMP_FILE_STATIC := $(FULL_O)/BUILD_TEST_FEATURE_DUMP_STATIC
+
 all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools
        @echo OK
+       @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC)
 
 out: $(run_O)
        @echo OK
+       @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC)
+
+ifeq ($(REUSE_FEATURES_DUMP),1)
+$(FEATURES_DUMP_FILE):
+       $(call clean)
+       @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) feature-dump"; \
+       echo "- $@: $$cmd" && echo $$cmd && \
+       ( eval $$cmd ) > /dev/null 2>&1
+
+$(FEATURES_DUMP_FILE_STATIC):
+       $(call clean)
+       @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) LDFLAGS='-static' feature-dump"; \
+       echo "- $@: $$cmd" && echo $$cmd && \
+       ( eval $$cmd ) > /dev/null 2>&1
+
+# Add feature dump dependency for run/run_O targets
+$(foreach t,$(run) $(run_O),$(eval \
+       $(t): $(if $(findstring make_static,$(t)),\
+               $(FEATURES_DUMP_FILE_STATIC),\
+               $(FEATURES_DUMP_FILE))))
+
+# Append 'FEATURES_DUMP=' option to all test cases. For example:
+# make_no_libbpf: NO_LIBBPF=1  --> NO_LIBBPF=1 FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP
+# make_static: LDFLAGS=-static --> LDFLAGS=-static FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP_STATIC
+$(foreach t,$(run),$(if $(findstring make_static,$(t)),\
+                       $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE_STATIC)),\
+                       $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE))))
+endif
 
 .PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools
 endif # ifndef MK
index abe8849d1d7030bda2ae65f770424e3c9ad84330..7865f68dc0d82bea12c960c61792e9fd28857305 100644 (file)
@@ -1271,6 +1271,38 @@ static int test__checkevent_precise_max_modifier(struct perf_evlist *evlist)
        return 0;
 }
 
+static int test__checkevent_config_symbol(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "insn") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_raw(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "rawpmu") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_num(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "numpmu") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_cache(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "cachepmu") == 0);
+       return 0;
+}
+
 static int count_tracepoints(void)
 {
        struct dirent *events_ent;
@@ -1579,6 +1611,26 @@ static struct evlist_test test__events[] = {
                .check = test__checkevent_precise_max_modifier,
                .id    = 47,
        },
+       {
+               .name  = "instructions/name=insn/",
+               .check = test__checkevent_config_symbol,
+               .id    = 48,
+       },
+       {
+               .name  = "r1234/name=rawpmu/",
+               .check = test__checkevent_config_raw,
+               .id    = 49,
+       },
+       {
+               .name  = "4:0x6530160/name=numpmu/",
+               .check = test__checkevent_config_num,
+               .id    = 50,
+       },
+       {
+               .name  = "L1-dcache-misses/name=cachepmu/",
+               .check = test__checkevent_config_cache,
+               .id    = 51,
+       },
 };
 
 static struct evlist_test test__events_pmu[] = {
@@ -1666,7 +1718,7 @@ static int test_term(struct terms_test *t)
        }
 
        ret = t->check(&terms);
-       parse_events__free_terms(&terms);
+       parse_events_terms__purge(&terms);
 
        return ret;
 }
index f0bfc9e8fd9f617d69c0b3cb36fe6c210ebbd6d5..630b0b409b973f87ba00312e5e4e3d8fdf829f32 100644 (file)
@@ -110,7 +110,6 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
         */
        for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
                struct symbol *pair, *first_pair;
-               bool backwards = true;
 
                sym  = rb_entry(nd, struct symbol, rb_node);
 
@@ -151,27 +150,14 @@ next_pair:
                                continue;
 
                        } else {
-                               struct rb_node *nnd;
-detour:
-                               nnd = backwards ? rb_prev(&pair->rb_node) :
-                                                 rb_next(&pair->rb_node);
-                               if (nnd) {
-                                       struct symbol *next = rb_entry(nnd, struct symbol, rb_node);
-
-                                       if (UM(next->start) == mem_start) {
-                                               pair = next;
+                               pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL, NULL);
+                               if (pair) {
+                                       if (UM(pair->start) == mem_start)
                                                goto next_pair;
-                                       }
-                               }
 
-                               if (backwards) {
-                                       backwards = false;
-                                       pair = first_pair;
-                                       goto detour;
+                                       pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
+                                                mem_start, sym->name, pair->name);
                                }
-
-                               pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
-                                        mem_start, sym->name, pair->name);
                        }
                } else
                        pr_debug("%#" PRIx64 ": %s not on kallsyms\n",
index d37202121689a017b0790d552e088d695716c527..af68a9d488bfce964c84e67cf5394e7e13daab29 100644 (file)
@@ -531,8 +531,8 @@ static struct ui_browser_colorset {
                .bg       = "yellow",
        },
        {
-               .colorset = HE_COLORSET_CODE,
-               .name     = "code",
+               .colorset = HE_COLORSET_JUMP_ARROWS,
+               .name     = "jump_arrows",
                .fg       = "blue",
                .bg       = "default",
        },
index 01781de59532ce9c9fd1ff7f8c7fdf97d45fa105..be3b70eb5fca6e402830570c4111f10258702c04 100644 (file)
@@ -7,7 +7,7 @@
 #define HE_COLORSET_MEDIUM     51
 #define HE_COLORSET_NORMAL     52
 #define HE_COLORSET_SELECTED   53
-#define HE_COLORSET_CODE       54
+#define HE_COLORSET_JUMP_ARROWS        54
 #define HE_COLORSET_ADDR       55
 #define HE_COLORSET_ROOT       56
 
index 718bd46d47fa7bc88674a192dd1a8e8ab1fb7ca9..4fc208e82c6fc7b28d99af147d1c75738bc338e6 100644 (file)
@@ -284,7 +284,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser)
                to = (u64)btarget->idx;
        }
 
-       ui_browser__set_color(browser, HE_COLORSET_CODE);
+       ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS);
        __ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width,
                                 from, to);
 }
index 08c09ad755d2d2f8dd55be06b258e16fb2c8335f..4b98165559462025ded4db4a41fe8b5e66a9f151 100644 (file)
@@ -32,6 +32,7 @@ struct hist_browser {
        bool                 show_headers;
        float                min_pcnt;
        u64                  nr_non_filtered_entries;
+       u64                  nr_hierarchy_entries;
        u64                  nr_callchain_rows;
 };
 
@@ -58,11 +59,11 @@ static int hist_browser__get_folding(struct hist_browser *browser)
 
        for (nd = rb_first(&hists->entries);
             (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
-            nd = rb_next(nd)) {
+            nd = rb_hierarchy_next(nd)) {
                struct hist_entry *he =
                        rb_entry(nd, struct hist_entry, rb_node);
 
-               if (he->unfolded)
+               if (he->leaf && he->unfolded)
                        unfolded_rows += he->nr_rows;
        }
        return unfolded_rows;
@@ -72,7 +73,9 @@ static u32 hist_browser__nr_entries(struct hist_browser *hb)
 {
        u32 nr_entries;
 
-       if (hist_browser__has_filter(hb))
+       if (symbol_conf.report_hierarchy)
+               nr_entries = hb->nr_hierarchy_entries;
+       else if (hist_browser__has_filter(hb))
                nr_entries = hb->nr_non_filtered_entries;
        else
                nr_entries = hb->hists->nr_entries;
@@ -247,6 +250,38 @@ static int callchain__count_rows(struct rb_root *chain)
        return n;
 }
 
+static int hierarchy_count_rows(struct hist_browser *hb, struct hist_entry *he,
+                               bool include_children)
+{
+       int count = 0;
+       struct rb_node *node;
+       struct hist_entry *child;
+
+       if (he->leaf)
+               return callchain__count_rows(&he->sorted_chain);
+
+       if (he->has_no_entry)
+               return 1;
+
+       node = rb_first(&he->hroot_out);
+       while (node) {
+               float percent;
+
+               child = rb_entry(node, struct hist_entry, rb_node);
+               percent = hist_entry__get_percent_limit(child);
+
+               if (!child->filtered && percent >= hb->min_pcnt) {
+                       count++;
+
+                       if (include_children && child->unfolded)
+                               count += hierarchy_count_rows(hb, child, true);
+               }
+
+               node = rb_next(node);
+       }
+       return count;
+}
+
 static bool hist_entry__toggle_fold(struct hist_entry *he)
 {
        if (!he)
@@ -326,11 +361,17 @@ static void callchain__init_have_children(struct rb_root *root)
 
 static void hist_entry__init_have_children(struct hist_entry *he)
 {
-       if (!he->init_have_children) {
+       if (he->init_have_children)
+               return;
+
+       if (he->leaf) {
                he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
                callchain__init_have_children(&he->sorted_chain);
-               he->init_have_children = true;
+       } else {
+               he->has_children = !RB_EMPTY_ROOT(&he->hroot_out);
        }
+
+       he->init_have_children = true;
 }
 
 static bool hist_browser__toggle_fold(struct hist_browser *browser)
@@ -349,17 +390,49 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser)
                has_children = callchain_list__toggle_fold(cl);
 
        if (has_children) {
+               int child_rows = 0;
+
                hist_entry__init_have_children(he);
                browser->b.nr_entries -= he->nr_rows;
-               browser->nr_callchain_rows -= he->nr_rows;
 
-               if (he->unfolded)
-                       he->nr_rows = callchain__count_rows(&he->sorted_chain);
+               if (he->leaf)
+                       browser->nr_callchain_rows -= he->nr_rows;
                else
+                       browser->nr_hierarchy_entries -= he->nr_rows;
+
+               if (symbol_conf.report_hierarchy)
+                       child_rows = hierarchy_count_rows(browser, he, true);
+
+               if (he->unfolded) {
+                       if (he->leaf)
+                               he->nr_rows = callchain__count_rows(&he->sorted_chain);
+                       else
+                               he->nr_rows = hierarchy_count_rows(browser, he, false);
+
+                       /* account grandchildren */
+                       if (symbol_conf.report_hierarchy)
+                               browser->b.nr_entries += child_rows - he->nr_rows;
+
+                       if (!he->leaf && he->nr_rows == 0) {
+                               he->has_no_entry = true;
+                               he->nr_rows = 1;
+                       }
+               } else {
+                       if (symbol_conf.report_hierarchy)
+                               browser->b.nr_entries -= child_rows - he->nr_rows;
+
+                       if (he->has_no_entry)
+                               he->has_no_entry = false;
+
                        he->nr_rows = 0;
+               }
 
                browser->b.nr_entries += he->nr_rows;
-               browser->nr_callchain_rows += he->nr_rows;
+
+               if (he->leaf)
+                       browser->nr_callchain_rows += he->nr_rows;
+               else
+                       browser->nr_hierarchy_entries += he->nr_rows;
 
                return true;
        }
@@ -422,13 +495,38 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold)
        return n;
 }
 
-static void hist_entry__set_folding(struct hist_entry *he, bool unfold)
+static int hierarchy_set_folding(struct hist_browser *hb, struct hist_entry *he,
+                                bool unfold __maybe_unused)
+{
+       float percent;
+       struct rb_node *nd;
+       struct hist_entry *child;
+       int n = 0;
+
+       for (nd = rb_first(&he->hroot_out); nd; nd = rb_next(nd)) {
+               child = rb_entry(nd, struct hist_entry, rb_node);
+               percent = hist_entry__get_percent_limit(child);
+               if (!child->filtered && percent >= hb->min_pcnt)
+                       n++;
+       }
+
+       return n;
+}
+
+static void hist_entry__set_folding(struct hist_entry *he,
+                                   struct hist_browser *hb, bool unfold)
 {
        hist_entry__init_have_children(he);
        he->unfolded = unfold ? he->has_children : false;
 
        if (he->has_children) {
-               int n = callchain__set_folding(&he->sorted_chain, unfold);
+               int n;
+
+               if (he->leaf)
+                       n = callchain__set_folding(&he->sorted_chain, unfold);
+               else
+                       n = hierarchy_set_folding(hb, he, unfold);
+
                he->nr_rows = unfold ? n : 0;
        } else
                he->nr_rows = 0;
@@ -438,19 +536,38 @@ static void
 __hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 {
        struct rb_node *nd;
-       struct hists *hists = browser->hists;
+       struct hist_entry *he;
+       double percent;
 
-       for (nd = rb_first(&hists->entries);
-            (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
-            nd = rb_next(nd)) {
-               struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
-               hist_entry__set_folding(he, unfold);
-               browser->nr_callchain_rows += he->nr_rows;
+       nd = rb_first(&browser->hists->entries);
+       while (nd) {
+               he = rb_entry(nd, struct hist_entry, rb_node);
+
+               /* set folding state even if it's currently folded */
+               nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
+
+               hist_entry__set_folding(he, browser, unfold);
+
+               percent = hist_entry__get_percent_limit(he);
+               if (he->filtered || percent < browser->min_pcnt)
+                       continue;
+
+               if (!he->depth || unfold)
+                       browser->nr_hierarchy_entries++;
+               if (he->leaf)
+                       browser->nr_callchain_rows += he->nr_rows;
+               else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) {
+                       browser->nr_hierarchy_entries++;
+                       he->has_no_entry = true;
+                       he->nr_rows = 1;
+               } else
+                       he->has_no_entry = false;
        }
 }
 
 static void hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 {
+       browser->nr_hierarchy_entries = 0;
        browser->nr_callchain_rows = 0;
        __hist_browser__set_folding(browser, unfold);
 
@@ -657,9 +774,24 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
        return 1;
 }
 
+static bool check_percent_display(struct rb_node *node, u64 parent_total)
+{
+       struct callchain_node *child;
+
+       if (node == NULL)
+               return false;
+
+       if (rb_next(node))
+               return true;
+
+       child = rb_entry(node, struct callchain_node, rb_node);
+       return callchain_cumul_hits(child) != parent_total;
+}
+
 static int hist_browser__show_callchain_flat(struct hist_browser *browser,
                                             struct rb_root *root,
                                             unsigned short row, u64 total,
+                                            u64 parent_total,
                                             print_callchain_entry_fn print,
                                             struct callchain_print_arg *arg,
                                             check_output_full_fn is_output_full)
@@ -669,7 +801,7 @@ static int hist_browser__show_callchain_flat(struct hist_browser *browser,
        bool need_percent;
 
        node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
 
        while (node) {
                struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -763,6 +895,7 @@ static char *hist_browser__folded_callchain_str(struct hist_browser *browser,
 static int hist_browser__show_callchain_folded(struct hist_browser *browser,
                                               struct rb_root *root,
                                               unsigned short row, u64 total,
+                                              u64 parent_total,
                                               print_callchain_entry_fn print,
                                               struct callchain_print_arg *arg,
                                               check_output_full_fn is_output_full)
@@ -772,7 +905,7 @@ static int hist_browser__show_callchain_folded(struct hist_browser *browser,
        bool need_percent;
 
        node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
 
        while (node) {
                struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -844,20 +977,24 @@ next:
        return row - first_row;
 }
 
-static int hist_browser__show_callchain(struct hist_browser *browser,
+static int hist_browser__show_callchain_graph(struct hist_browser *browser,
                                        struct rb_root *root, int level,
                                        unsigned short row, u64 total,
+                                       u64 parent_total,
                                        print_callchain_entry_fn print,
                                        struct callchain_print_arg *arg,
                                        check_output_full_fn is_output_full)
 {
        struct rb_node *node;
        int first_row = row, offset = level * LEVEL_OFFSET_STEP;
-       u64 new_total;
        bool need_percent;
+       u64 percent_total = total;
+
+       if (callchain_param.mode == CHAIN_GRAPH_REL)
+               percent_total = parent_total;
 
        node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
 
        while (node) {
                struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -878,7 +1015,7 @@ static int hist_browser__show_callchain(struct hist_browser *browser,
                        folded_sign = callchain_list__folded(chain);
 
                        row += hist_browser__show_callchain_list(browser, child,
-                                                       chain, row, total,
+                                                       chain, row, percent_total,
                                                        was_first && need_percent,
                                                        offset + extra_offset,
                                                        print, arg);
@@ -893,13 +1030,9 @@ static int hist_browser__show_callchain(struct hist_browser *browser,
                if (folded_sign == '-') {
                        const int new_level = level + (extra_offset ? 2 : 1);
 
-                       if (callchain_param.mode == CHAIN_GRAPH_REL)
-                               new_total = child->children_hit;
-                       else
-                               new_total = total;
-
-                       row += hist_browser__show_callchain(browser, &child->rb_root,
-                                                           new_level, row, new_total,
+                       row += hist_browser__show_callchain_graph(browser, &child->rb_root,
+                                                           new_level, row, total,
+                                                           child->children_hit,
                                                            print, arg, is_output_full);
                }
                if (is_output_full(browser, row))
@@ -910,6 +1043,45 @@ out:
        return row - first_row;
 }
 
+static int hist_browser__show_callchain(struct hist_browser *browser,
+                                       struct hist_entry *entry, int level,
+                                       unsigned short row,
+                                       print_callchain_entry_fn print,
+                                       struct callchain_print_arg *arg,
+                                       check_output_full_fn is_output_full)
+{
+       u64 total = hists__total_period(entry->hists);
+       u64 parent_total;
+       int printed;
+
+       if (symbol_conf.cumulate_callchain)
+               parent_total = entry->stat_acc->period;
+       else
+               parent_total = entry->stat.period;
+
+       if (callchain_param.mode == CHAIN_FLAT) {
+               printed = hist_browser__show_callchain_flat(browser,
+                                               &entry->sorted_chain, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       } else if (callchain_param.mode == CHAIN_FOLDED) {
+               printed = hist_browser__show_callchain_folded(browser,
+                                               &entry->sorted_chain, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       } else {
+               printed = hist_browser__show_callchain_graph(browser,
+                                               &entry->sorted_chain, level, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       }
+
+       if (arg->is_current_entry)
+               browser->he_selection = entry;
+
+       return printed;
+}
+
 struct hpp_arg {
        struct ui_browser *b;
        char folded_sign;
@@ -1006,7 +1178,6 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                                    struct hist_entry *entry,
                                    unsigned short row)
 {
-       char s[256];
        int printed = 0;
        int width = browser->b.width;
        char folded_sign = ' ';
@@ -1031,16 +1202,18 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                        .folded_sign    = folded_sign,
                        .current_entry  = current_entry,
                };
-               struct perf_hpp hpp = {
-                       .buf            = s,
-                       .size           = sizeof(s),
-                       .ptr            = &arg,
-               };
                int column = 0;
 
                hist_browser__gotorc(browser, row, 0);
 
-               perf_hpp__for_each_format(fmt) {
+               hists__for_each_format(browser->hists, fmt) {
+                       char s[2048];
+                       struct perf_hpp hpp = {
+                               .buf    = s,
+                               .size   = sizeof(s),
+                               .ptr    = &arg,
+                       };
+
                        if (perf_hpp__should_skip(fmt, entry->hists) ||
                            column++ < browser->b.horiz_scroll)
                                continue;
@@ -1065,11 +1238,18 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                        }
 
                        if (fmt->color) {
-                               width -= fmt->color(fmt, &hpp, entry);
+                               int ret = fmt->color(fmt, &hpp, entry);
+                               hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                               /*
+                                * fmt->color() already used ui_browser to
+                                * print the non alignment bits, skip it (+ret):
+                                */
+                               ui_browser__printf(&browser->b, "%s", s + ret);
                        } else {
-                               width -= fmt->entry(fmt, &hpp, entry);
+                               hist_entry__snprintf_alignment(entry, &hpp, fmt, fmt->entry(fmt, &hpp, entry));
                                ui_browser__printf(&browser->b, "%s", s);
                        }
+                       width -= hpp.buf - s;
                }
 
                /* The scroll bar isn't being used */
@@ -1084,43 +1264,246 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                --row_offset;
 
        if (folded_sign == '-' && row != browser->b.rows) {
-               u64 total = hists__total_period(entry->hists);
                struct callchain_print_arg arg = {
                        .row_offset = row_offset,
                        .is_current_entry = current_entry,
                };
 
-               if (callchain_param.mode == CHAIN_GRAPH_REL) {
-                       if (symbol_conf.cumulate_callchain)
-                               total = entry->stat_acc->period;
-                       else
-                               total = entry->stat.period;
-               }
-
-               if (callchain_param.mode == CHAIN_FLAT) {
-                       printed += hist_browser__show_callchain_flat(browser,
-                                       &entry->sorted_chain, row, total,
-                                       hist_browser__show_callchain_entry, &arg,
-                                       hist_browser__check_output_full);
-               } else if (callchain_param.mode == CHAIN_FOLDED) {
-                       printed += hist_browser__show_callchain_folded(browser,
-                                       &entry->sorted_chain, row, total,
+               printed += hist_browser__show_callchain(browser, entry, 1, row,
                                        hist_browser__show_callchain_entry, &arg,
                                        hist_browser__check_output_full);
+       }
+
+       return printed;
+}
+
+static int hist_browser__show_hierarchy_entry(struct hist_browser *browser,
+                                             struct hist_entry *entry,
+                                             unsigned short row,
+                                             int level)
+{
+       int printed = 0;
+       int width = browser->b.width;
+       char folded_sign = ' ';
+       bool current_entry = ui_browser__is_current_entry(&browser->b, row);
+       off_t row_offset = entry->row_offset;
+       bool first = true;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       struct hpp_arg arg = {
+               .b              = &browser->b,
+               .current_entry  = current_entry,
+       };
+       int column = 0;
+       int hierarchy_indent = (entry->hists->nr_hpp_node - 2) * HIERARCHY_INDENT;
+
+       if (current_entry) {
+               browser->he_selection = entry;
+               browser->selection = &entry->ms;
+       }
+
+       hist_entry__init_have_children(entry);
+       folded_sign = hist_entry__folded(entry);
+       arg.folded_sign = folded_sign;
+
+       if (entry->leaf && row_offset) {
+               row_offset--;
+               goto show_callchain;
+       }
+
+       hist_browser__gotorc(browser, row, 0);
+
+       if (current_entry && browser->b.navkeypressed)
+               ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED);
+       else
+               ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+
+       ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT);
+       width -= level * HIERARCHY_INDENT;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&entry->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               char s[2048];
+               struct perf_hpp hpp = {
+                       .buf            = s,
+                       .size           = sizeof(s),
+                       .ptr            = &arg,
+               };
+
+               if (perf_hpp__should_skip(fmt, entry->hists) ||
+                   column++ < browser->b.horiz_scroll)
+                       continue;
+
+               if (current_entry && browser->b.navkeypressed) {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_SELECTED);
                } else {
-                       printed += hist_browser__show_callchain(browser,
-                                       &entry->sorted_chain, 1, row, total,
-                                       hist_browser__show_callchain_entry, &arg,
-                                       hist_browser__check_output_full);
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_NORMAL);
+               }
+
+               if (first) {
+                       ui_browser__printf(&browser->b, "%c", folded_sign);
+                       width--;
+                       first = false;
+               } else {
+                       ui_browser__printf(&browser->b, "  ");
+                       width -= 2;
+               }
+
+               if (fmt->color) {
+                       int ret = fmt->color(fmt, &hpp, entry);
+                       hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                       /*
+                        * fmt->color() already used ui_browser to
+                        * print the non-alignment bits, so skip over them (+ret):
+                        */
+                       ui_browser__printf(&browser->b, "%s", s + ret);
+               } else {
+                       int ret = fmt->entry(fmt, &hpp, entry);
+                       hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                       ui_browser__printf(&browser->b, "%s", s);
+               }
+               width -= hpp.buf - s;
+       }
+
+       ui_browser__write_nstring(&browser->b, "", hierarchy_indent);
+       width -= hierarchy_indent;
+
+       if (column >= browser->b.horiz_scroll) {
+               char s[2048];
+               struct perf_hpp hpp = {
+                       .buf            = s,
+                       .size           = sizeof(s),
+                       .ptr            = &arg,
+               };
+
+               if (current_entry && browser->b.navkeypressed) {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_SELECTED);
+               } else {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_NORMAL);
                }
 
-               if (arg.is_current_entry)
-                       browser->he_selection = entry;
+               perf_hpp_list__for_each_format(entry->hpp_list, fmt) {
+                       ui_browser__write_nstring(&browser->b, "", 2);
+                       width -= 2;
+
+                       /*
+                        * No need to call hist_entry__snprintf_alignment()
+                        * since this fmt is always the last column in
+                        * hierarchy mode.
+                        */
+                       if (fmt->color) {
+                               width -= fmt->color(fmt, &hpp, entry);
+                       } else {
+                               int i = 0;
+
+                               width -= fmt->entry(fmt, &hpp, entry);
+                               ui_browser__printf(&browser->b, "%s", ltrim(s));
+
+                               while (isspace(s[i++]))
+                                       width++;
+                       }
+               }
+       }
+
+       /* The scroll bar isn't being used */
+       if (!browser->b.navkeypressed)
+               width += 1;
+
+       ui_browser__write_nstring(&browser->b, "", width);
+
+       ++row;
+       ++printed;
+
+show_callchain:
+       if (entry->leaf && folded_sign == '-' && row != browser->b.rows) {
+               struct callchain_print_arg carg = {
+                       .row_offset = row_offset,
+               };
+
+               printed += hist_browser__show_callchain(browser, entry,
+                                       level + 1, row,
+                                       hist_browser__show_callchain_entry, &carg,
+                                       hist_browser__check_output_full);
        }
 
        return printed;
 }
 
+static int hist_browser__show_no_entry(struct hist_browser *browser,
+                                      unsigned short row, int level)
+{
+       int width = browser->b.width;
+       bool current_entry = ui_browser__is_current_entry(&browser->b, row);
+       bool first = true;
+       int column = 0;
+       int ret;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       int indent = browser->hists->nr_hpp_node - 2;
+
+       if (current_entry) {
+               browser->he_selection = NULL;
+               browser->selection = NULL;
+       }
+
+       hist_browser__gotorc(browser, row, 0);
+
+       if (current_entry && browser->b.navkeypressed)
+               ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED);
+       else
+               ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+
+       ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT);
+       width -= level * HIERARCHY_INDENT;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&browser->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (perf_hpp__should_skip(fmt, browser->hists) ||
+                   column++ < browser->b.horiz_scroll)
+                       continue;
+
+               ret = fmt->width(fmt, NULL, hists_to_evsel(browser->hists));
+
+               if (first) {
+                       /* for folded sign */
+                       first = false;
+                       ret++;
+               } else {
+                       /* space between columns */
+                       ret += 2;
+               }
+
+               ui_browser__write_nstring(&browser->b, "", ret);
+               width -= ret;
+       }
+
+       ui_browser__write_nstring(&browser->b, "", indent * HIERARCHY_INDENT);
+       width -= indent * HIERARCHY_INDENT;
+
+       if (column >= browser->b.horiz_scroll) {
+               char buf[32];
+
+               ret = snprintf(buf, sizeof(buf), "no entry >= %.2f%%", browser->min_pcnt);
+               ui_browser__printf(&browser->b, "  %s", buf);
+               width -= ret + 2;
+       }
+
+       /* The scroll bar isn't being used */
+       if (!browser->b.navkeypressed)
+               width += 1;
+
+       ui_browser__write_nstring(&browser->b, "", width);
+       return 1;
+}
+
 static int advance_hpp_check(struct perf_hpp *hpp, int inc)
 {
        advance_hpp(hpp, inc);
@@ -1144,7 +1527,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
                        return ret;
        }
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists)  || column++ < browser->b.horiz_scroll)
                        continue;
 
@@ -1160,11 +1543,96 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
        return ret;
 }
 
+static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *browser, char *buf, size_t size)
+{
+       struct hists *hists = browser->hists;
+       struct perf_hpp dummy_hpp = {
+               .buf    = buf,
+               .size   = size,
+       };
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       size_t ret = 0;
+       int column = 0;
+       int indent = hists->nr_hpp_node - 2;
+       bool first_node, first_col;
+
+       ret = scnprintf(buf, size, " ");
+       if (advance_hpp_check(&dummy_hpp, ret))
+               return ret;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (column++ < browser->b.horiz_scroll)
+                       continue;
+
+               ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
+               if (advance_hpp_check(&dummy_hpp, ret))
+                       break;
+
+               ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "  ");
+               if (advance_hpp_check(&dummy_hpp, ret))
+                       break;
+       }
+
+       ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "%*s",
+                       indent * HIERARCHY_INDENT, "");
+       if (advance_hpp_check(&dummy_hpp, ret))
+               return ret;
+
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node) {
+                       ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, " / ");
+                       if (advance_hpp_check(&dummy_hpp, ret))
+                               break;
+               }
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       char *start;
+
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col) {
+                               ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "+");
+                               if (advance_hpp_check(&dummy_hpp, ret))
+                                       break;
+                       }
+                       first_col = false;
+
+                       ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
+                       dummy_hpp.buf[ret] = '\0';
+                       rtrim(dummy_hpp.buf);
+
+                       start = ltrim(dummy_hpp.buf);
+                       ret = strlen(start);
+
+                       if (start != dummy_hpp.buf)
+                               memmove(dummy_hpp.buf, start, ret + 1);
+
+                       if (advance_hpp_check(&dummy_hpp, ret))
+                               break;
+               }
+       }
+
+       return ret;
+}
+
 static void hist_browser__show_headers(struct hist_browser *browser)
 {
        char headers[1024];
 
-       hists_browser__scnprintf_headers(browser, headers, sizeof(headers));
+       if (symbol_conf.report_hierarchy)
+               hists_browser__scnprintf_hierarchy_headers(browser, headers,
+                                                          sizeof(headers));
+       else
+               hists_browser__scnprintf_headers(browser, headers,
+                                                sizeof(headers));
        ui_browser__gotorc(&browser->b, 0, 0);
        ui_browser__set_color(&browser->b, HE_COLORSET_ROOT);
        ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1);
@@ -1196,18 +1664,34 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser)
        hb->he_selection = NULL;
        hb->selection = NULL;
 
-       for (nd = browser->top; nd; nd = rb_next(nd)) {
+       for (nd = browser->top; nd; nd = rb_hierarchy_next(nd)) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
                float percent;
 
-               if (h->filtered)
+               if (h->filtered) {
+                       /* let it move to sibling */
+                       h->unfolded = false;
                        continue;
+               }
 
                percent = hist_entry__get_percent_limit(h);
                if (percent < hb->min_pcnt)
                        continue;
 
-               row += hist_browser__show_entry(hb, h, row);
+               if (symbol_conf.report_hierarchy) {
+                       row += hist_browser__show_hierarchy_entry(hb, h, row,
+                                                                 h->depth);
+                       if (row == browser->rows)
+                               break;
+
+                       if (h->has_no_entry) {
+                               hist_browser__show_no_entry(hb, row, h->depth + 1);
+                               row++;
+                       }
+               } else {
+                       row += hist_browser__show_entry(hb, h, row);
+               }
+
                if (row == browser->rows)
                        break;
        }
@@ -1225,7 +1709,14 @@ static struct rb_node *hists__filter_entries(struct rb_node *nd,
                if (!h->filtered && percent >= min_pcnt)
                        return nd;
 
-               nd = rb_next(nd);
+               /*
+                * If an entry is filtered, all of its children were
+                * filtered as well, so move to the sibling node.
+                */
+               if (rb_next(nd))
+                       nd = rb_next(nd);
+               else
+                       nd = rb_hierarchy_next(nd);
        }
 
        return NULL;
@@ -1241,7 +1732,7 @@ static struct rb_node *hists__filter_prev_entries(struct rb_node *nd,
                if (!h->filtered && percent >= min_pcnt)
                        return nd;
 
-               nd = rb_prev(nd);
+               nd = rb_hierarchy_prev(nd);
        }
 
        return NULL;
@@ -1271,8 +1762,8 @@ static void ui_browser__hists_seek(struct ui_browser *browser,
                nd = browser->top;
                goto do_offset;
        case SEEK_END:
-               nd = hists__filter_prev_entries(rb_last(browser->entries),
-                                               hb->min_pcnt);
+               nd = rb_hierarchy_last(rb_last(browser->entries));
+               nd = hists__filter_prev_entries(nd, hb->min_pcnt);
                first = false;
                break;
        default:
@@ -1306,7 +1797,7 @@ do_offset:
        if (offset > 0) {
                do {
                        h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->unfolded) {
+                       if (h->unfolded && h->leaf) {
                                u16 remaining = h->nr_rows - h->row_offset;
                                if (offset > remaining) {
                                        offset -= remaining;
@@ -1318,7 +1809,8 @@ do_offset:
                                        break;
                                }
                        }
-                       nd = hists__filter_entries(rb_next(nd), hb->min_pcnt);
+                       nd = hists__filter_entries(rb_hierarchy_next(nd),
+                                                  hb->min_pcnt);
                        if (nd == NULL)
                                break;
                        --offset;
@@ -1327,7 +1819,7 @@ do_offset:
        } else if (offset < 0) {
                while (1) {
                        h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->unfolded) {
+                       if (h->unfolded && h->leaf) {
                                if (first) {
                                        if (-offset > h->row_offset) {
                                                offset += h->row_offset;
@@ -1351,7 +1843,7 @@ do_offset:
                                }
                        }
 
-                       nd = hists__filter_prev_entries(rb_prev(nd),
+                       nd = hists__filter_prev_entries(rb_hierarchy_prev(nd),
                                                        hb->min_pcnt);
                        if (nd == NULL)
                                break;
@@ -1364,7 +1856,7 @@ do_offset:
                                 * row_offset at its last entry.
                                 */
                                h = rb_entry(nd, struct hist_entry, rb_node);
-                               if (h->unfolded)
+                               if (h->unfolded && h->leaf)
                                        h->row_offset = h->nr_rows;
                                break;
                        }
@@ -1378,17 +1870,14 @@ do_offset:
 }
 
 static int hist_browser__fprintf_callchain(struct hist_browser *browser,
-                                          struct hist_entry *he, FILE *fp)
+                                          struct hist_entry *he, FILE *fp,
+                                          int level)
 {
-       u64 total = hists__total_period(he->hists);
        struct callchain_print_arg arg  = {
                .fp = fp,
        };
 
-       if (symbol_conf.cumulate_callchain)
-               total = he->stat_acc->period;
-
-       hist_browser__show_callchain(browser, &he->sorted_chain, 1, 0, total,
+       hist_browser__show_callchain(browser, he, level, 0,
                                     hist_browser__fprintf_callchain_entry, &arg,
                                     hist_browser__check_dump_full);
        return arg.printed;
@@ -1414,7 +1903,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
        if (symbol_conf.use_callchain)
                printed += fprintf(fp, "%c ", folded_sign);
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                if (perf_hpp__should_skip(fmt, he->hists))
                        continue;
 
@@ -1425,12 +1914,71 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
                        first = false;
 
                ret = fmt->entry(fmt, &hpp, he);
+               ret = hist_entry__snprintf_alignment(he, &hpp, fmt, ret);
                advance_hpp(&hpp, ret);
        }
-       printed += fprintf(fp, "%s\n", rtrim(s));
+       printed += fprintf(fp, "%s\n", s);
 
        if (folded_sign == '-')
-               printed += hist_browser__fprintf_callchain(browser, he, fp);
+               printed += hist_browser__fprintf_callchain(browser, he, fp, 1);
+
+       return printed;
+}
+
+
+static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser,
+                                                struct hist_entry *he,
+                                                FILE *fp, int level)
+{
+       char s[8192];
+       int printed = 0;
+       char folded_sign = ' ';
+       struct perf_hpp hpp = {
+               .buf = s,
+               .size = sizeof(s),
+       };
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       bool first = true;
+       int ret;
+       int hierarchy_indent = (he->hists->nr_hpp_node - 2) * HIERARCHY_INDENT;
+
+       printed = fprintf(fp, "%*s", level * HIERARCHY_INDENT, "");
+
+       folded_sign = hist_entry__folded(he);
+       printed += fprintf(fp, "%c", folded_sign);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&he->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (!first) {
+                       ret = scnprintf(hpp.buf, hpp.size, "  ");
+                       advance_hpp(&hpp, ret);
+               } else
+                       first = false;
+
+               ret = fmt->entry(fmt, &hpp, he);
+               advance_hpp(&hpp, ret);
+       }
+
+       ret = scnprintf(hpp.buf, hpp.size, "%*s", hierarchy_indent, "");
+       advance_hpp(&hpp, ret);
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               ret = scnprintf(hpp.buf, hpp.size, "  ");
+               advance_hpp(&hpp, ret);
+
+               ret = fmt->entry(fmt, &hpp, he);
+               advance_hpp(&hpp, ret);
+       }
+
+       printed += fprintf(fp, "%s\n", rtrim(s));
+
+       if (he->leaf && folded_sign == '-') {
+               printed += hist_browser__fprintf_callchain(browser, he, fp,
+                                                          he->depth + 1);
+       }
 
        return printed;
 }
@@ -1444,8 +1992,16 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp)
        while (nd) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
-               printed += hist_browser__fprintf_entry(browser, h, fp);
-               nd = hists__filter_entries(rb_next(nd), browser->min_pcnt);
+               if (symbol_conf.report_hierarchy) {
+                       printed += hist_browser__fprintf_hierarchy_entry(browser,
+                                                                        h, fp,
+                                                                        h->depth);
+               } else {
+                       printed += hist_browser__fprintf_entry(browser, h, fp);
+               }
+
+               nd = hists__filter_entries(rb_hierarchy_next(nd),
+                                          browser->min_pcnt);
        }
 
        return printed;
@@ -1580,11 +2136,18 @@ static int hists__browser_title(struct hists *hists,
        if (hists->uid_filter_str)
                printed += snprintf(bf + printed, size - printed,
                                    ", UID: %s", hists->uid_filter_str);
-       if (thread)
-               printed += scnprintf(bf + printed, size - printed,
+       if (thread) {
+               if (sort__has_thread) {
+                       printed += scnprintf(bf + printed, size - printed,
                                    ", Thread: %s(%d)",
                                     (thread->comm_set ? thread__comm_str(thread) : ""),
                                    thread->tid);
+               } else {
+                       printed += scnprintf(bf + printed, size - printed,
+                                   ", Thread: %s",
+                                    (thread->comm_set ? thread__comm_str(thread) : ""));
+               }
+       }
        if (dso)
                printed += scnprintf(bf + printed, size - printed,
                                    ", DSO: %s", dso->short_name);
@@ -1759,15 +2322,24 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
 {
        struct thread *thread = act->thread;
 
+       if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
+               return 0;
+
        if (browser->hists->thread_filter) {
                pstack__remove(browser->pstack, &browser->hists->thread_filter);
                perf_hpp__set_elide(HISTC_THREAD, false);
                thread__zput(browser->hists->thread_filter);
                ui_helpline__pop();
        } else {
-               ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
-                                  thread->comm_set ? thread__comm_str(thread) : "",
-                                  thread->tid);
+               if (sort__has_thread) {
+                       ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
+                                          thread->comm_set ? thread__comm_str(thread) : "",
+                                          thread->tid);
+               } else {
+                       ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s thread\"",
+                                          thread->comm_set ? thread__comm_str(thread) : "");
+               }
+
                browser->hists->thread_filter = thread__get(thread);
                perf_hpp__set_elide(HISTC_THREAD, false);
                pstack__push(browser->pstack, &browser->hists->thread_filter);
@@ -1782,13 +2354,22 @@ static int
 add_thread_opt(struct hist_browser *browser, struct popup_action *act,
               char **optstr, struct thread *thread)
 {
-       if (thread == NULL)
+       int ret;
+
+       if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
                return 0;
 
-       if (asprintf(optstr, "Zoom %s %s(%d) thread",
-                    browser->hists->thread_filter ? "out of" : "into",
-                    thread->comm_set ? thread__comm_str(thread) : "",
-                    thread->tid) < 0)
+       if (sort__has_thread) {
+               ret = asprintf(optstr, "Zoom %s %s(%d) thread",
+                              browser->hists->thread_filter ? "out of" : "into",
+                              thread->comm_set ? thread__comm_str(thread) : "",
+                              thread->tid);
+       } else {
+               ret = asprintf(optstr, "Zoom %s %s thread",
+                              browser->hists->thread_filter ? "out of" : "into",
+                              thread->comm_set ? thread__comm_str(thread) : "");
+       }
+       if (ret < 0)
                return 0;
 
        act->thread = thread;
@@ -1801,6 +2382,9 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
 {
        struct map *map = act->ms.map;
 
+       if (!sort__has_dso || map == NULL)
+               return 0;
+
        if (browser->hists->dso_filter) {
                pstack__remove(browser->pstack, &browser->hists->dso_filter);
                perf_hpp__set_elide(HISTC_DSO, false);
@@ -1825,7 +2409,7 @@ static int
 add_dso_opt(struct hist_browser *browser, struct popup_action *act,
            char **optstr, struct map *map)
 {
-       if (map == NULL)
+       if (!sort__has_dso || map == NULL)
                return 0;
 
        if (asprintf(optstr, "Zoom %s %s DSO",
@@ -1850,7 +2434,7 @@ static int
 add_map_opt(struct hist_browser *browser __maybe_unused,
            struct popup_action *act, char **optstr, struct map *map)
 {
-       if (map == NULL)
+       if (!sort__has_dso || map == NULL)
                return 0;
 
        if (asprintf(optstr, "Browse map details") < 0)
@@ -1952,6 +2536,9 @@ add_exit_opt(struct hist_browser *browser __maybe_unused,
 static int
 do_zoom_socket(struct hist_browser *browser, struct popup_action *act)
 {
+       if (!sort__has_socket || act->socket < 0)
+               return 0;
+
        if (browser->hists->socket_filter > -1) {
                pstack__remove(browser->pstack, &browser->hists->socket_filter);
                browser->hists->socket_filter = -1;
@@ -1971,7 +2558,7 @@ static int
 add_socket_opt(struct hist_browser *browser, struct popup_action *act,
               char **optstr, int socket_id)
 {
-       if (socket_id < 0)
+       if (!sort__has_socket || socket_id < 0)
                return 0;
 
        if (asprintf(optstr, "Zoom %s Processor Socket %d",
@@ -1989,17 +2576,60 @@ static void hist_browser__update_nr_entries(struct hist_browser *hb)
        u64 nr_entries = 0;
        struct rb_node *nd = rb_first(&hb->hists->entries);
 
-       if (hb->min_pcnt == 0) {
+       if (hb->min_pcnt == 0 && !symbol_conf.report_hierarchy) {
                hb->nr_non_filtered_entries = hb->hists->nr_non_filtered_entries;
                return;
        }
 
        while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) {
                nr_entries++;
-               nd = rb_next(nd);
+               nd = rb_hierarchy_next(nd);
        }
 
        hb->nr_non_filtered_entries = nr_entries;
+       hb->nr_hierarchy_entries = nr_entries;
+}
+
+static void hist_browser__update_percent_limit(struct hist_browser *hb,
+                                              double percent)
+{
+       struct hist_entry *he;
+       struct rb_node *nd = rb_first(&hb->hists->entries);
+       u64 total = hists__total_period(hb->hists);
+       u64 min_callchain_hits = total * (percent / 100);
+
+       hb->min_pcnt = callchain_param.min_percent = percent;
+
+       while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) {
+               he = rb_entry(nd, struct hist_entry, rb_node);
+
+               if (he->has_no_entry) {
+                       he->has_no_entry = false;
+                       he->nr_rows = 0;
+               }
+
+               if (!he->leaf || !symbol_conf.use_callchain)
+                       goto next;
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (percent / 100);
+               }
+
+               callchain_param.sort(&he->sorted_chain, he->callchain,
+                                    min_callchain_hits, &callchain_param);
+
+next:
+               nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
+
+               /* force re-evaluation of the callchain folding state */
+               he->init_have_children = false;
+               hist_entry__set_folding(he, hb, false);
+       }
 }
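/*
 * Worked example for the recalculation above (the numbers are illustrative,
 * not from the patch): with hists__total_period() == 2,000,000 and a new
 * limit of 0.5%, min_callchain_hits becomes 2,000,000 * (0.5 / 100) = 10,000,
 * and callchain_param.sort() re-sorts each leaf's callchain against that
 * threshold.  In CHAIN_GRAPH_REL mode the base switches to the entry's own
 * period (or stat_acc->period when cumulating) before the same computation
 * is applied per entry.
 */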
 
 static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
@@ -2037,6 +2667,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
        "E             Expand all callchains\n"                         \
        "F             Toggle percentage of filtered entries\n"         \
        "H             Display column headers\n"                        \
+       "L             Change percent limit\n"                          \
        "m             Display context menu\n"                          \
        "S             Zoom into current Processor Socket\n"            \
 
@@ -2077,7 +2708,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
        memset(options, 0, sizeof(options));
        memset(actions, 0, sizeof(actions));
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                perf_hpp__reset_width(fmt, hists);
                /*
                 * This is done just once, and activates the horizontal scrolling
@@ -2192,6 +2823,24 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                top->zero = !top->zero;
                        }
                        continue;
+               case 'L':
+                       if (ui_browser__input_window("Percent Limit",
+                                       "Please enter a percent limit: entries below it will be hidden.",
+                                       buf, "ENTER: OK, ESC: Cancel",
+                                       delay_secs * 2) == K_ENTER) {
+                               char *end;
+                               double new_percent = strtod(buf, &end);
+
+                               if (new_percent < 0 || new_percent > 100) {
+                                       ui_browser__warning(&browser->b, delay_secs * 2,
+                                               "Invalid percent: %.2f", new_percent);
+                                       continue;
+                               }
+
+                               hist_browser__update_percent_limit(browser, new_percent);
+                               hist_browser__reset(browser);
+                       }
+                       continue;
                case K_F1:
                case 'h':
                case '?':
@@ -2263,10 +2912,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                        continue;
                }
 
-               if (!sort__has_sym)
-                       goto add_exit_option;
-
-               if (browser->selection == NULL)
+               if (!sort__has_sym || browser->selection == NULL)
                        goto skip_annotation;
 
                if (sort__mode == SORT_MODE__BRANCH) {
@@ -2306,11 +2952,16 @@ skip_annotation:
                                             &options[nr_options],
                                             socked_id);
                /* perf script support */
+               if (!is_report_browser(hbt))
+                       goto skip_scripting;
+
                if (browser->he_selection) {
-                       nr_options += add_script_opt(browser,
-                                                    &actions[nr_options],
-                                                    &options[nr_options],
-                                                    thread, NULL);
+                       if (sort__has_thread && thread) {
+                               nr_options += add_script_opt(browser,
+                                                            &actions[nr_options],
+                                                            &options[nr_options],
+                                                            thread, NULL);
+                       }
                        /*
                         * Note that browser->selection != NULL
                         * when browser->he_selection is not NULL,
@@ -2320,16 +2971,18 @@ skip_annotation:
                         *
                         * See hist_browser__show_entry.
                         */
-                       nr_options += add_script_opt(browser,
-                                                    &actions[nr_options],
-                                                    &options[nr_options],
-                                                    NULL, browser->selection->sym);
+                       if (sort__has_sym && browser->selection->sym) {
+                               nr_options += add_script_opt(browser,
+                                                            &actions[nr_options],
+                                                            &options[nr_options],
+                                                            NULL, browser->selection->sym);
+                       }
                }
                nr_options += add_script_opt(browser, &actions[nr_options],
                                             &options[nr_options], NULL, NULL);
                nr_options += add_switch_opt(browser, &actions[nr_options],
                                             &options[nr_options]);
-add_exit_option:
+skip_scripting:
                nr_options += add_exit_opt(browser, &actions[nr_options],
                                           &options[nr_options]);
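
A note on the column-rendering pattern that both hist_browser__show_entry() and
hist_browser__show_hierarchy_entry() converge on in the hunks above: every column
is formatted into a per-column scratch buffer, padded via
hist_entry__snprintf_alignment(), and the width consumed is taken from how far the
hpp buffer pointer advanced.  The condensed sketch below is illustrative only --
render_one_entry() and its fixed-size buffer are hypothetical, and folded-sign
printing, horizontal scrolling and color selection are omitted -- but the helpers
it calls (hists__for_each_format(), hist_entry__snprintf_alignment(),
ui_browser__printf(), perf_hpp__should_skip()) are the ones used in the patch:

static void render_one_entry(struct hist_browser *browser,
                             struct hist_entry *entry, int *width)
{
        struct hpp_arg arg = {
                .b = &browser->b,
        };
        struct perf_hpp_fmt *fmt;

        hists__for_each_format(browser->hists, fmt) {
                char s[2048];
                struct perf_hpp hpp = {
                        .buf  = s,
                        .size = sizeof(s),
                        .ptr  = &arg,
                };
                int ret;

                if (perf_hpp__should_skip(fmt, entry->hists))
                        continue;

                if (fmt->color) {
                        /* fmt->color() prints the colored part via the browser */
                        ret = fmt->color(fmt, &hpp, entry);
                        hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
                        /* only the alignment padding after offset 'ret' is left */
                        ui_browser__printf(&browser->b, "%s", s + ret);
                } else {
                        ret = fmt->entry(fmt, &hpp, entry);
                        hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
                        ui_browser__printf(&browser->b, "%s", s);
                }
                /* width consumed == how far the hpp buffer pointer advanced */
                *width -= hpp.buf - s;
        }
}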
 
index 0f8dcfdfb10f3856abefc7f252631e8937bacf6b..bd9bf7e343b1e310d20587d0b0f7b75c6b10a574 100644 (file)
@@ -306,7 +306,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
 
        nr_cols = 0;
 
-       perf_hpp__for_each_format(fmt)
+       hists__for_each_format(hists, fmt)
                col_types[nr_cols++] = G_TYPE_STRING;
 
        store = gtk_tree_store_newv(nr_cols, col_types);
@@ -317,7 +317,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
 
        col_idx = 0;
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
@@ -367,7 +367,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
 
                col_idx = 0;
 
-               perf_hpp__for_each_format(fmt) {
+               hists__for_each_format(hists, fmt) {
                        if (perf_hpp__should_skip(fmt, h->hists))
                                continue;
 
@@ -396,6 +396,194 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
        gtk_container_add(GTK_CONTAINER(window), view);
 }
 
+static void perf_gtk__add_hierarchy_entries(struct hists *hists,
+                                           struct rb_root *root,
+                                           GtkTreeStore *store,
+                                           GtkTreeIter *parent,
+                                           struct perf_hpp *hpp,
+                                           float min_pcnt)
+{
+       int col_idx = 0;
+       struct rb_node *node;
+       struct hist_entry *he;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       u64 total = hists__total_period(hists);
+       int size;
+
+       for (node = rb_first(root); node; node = rb_next(node)) {
+               GtkTreeIter iter;
+               float percent;
+               char *bf;
+
+               he = rb_entry(node, struct hist_entry, rb_node);
+               if (he->filtered)
+                       continue;
+
+               percent = hist_entry__get_percent_limit(he);
+               if (percent < min_pcnt)
+                       continue;
+
+               gtk_tree_store_append(store, &iter, parent);
+
+               col_idx = 0;
+
+               /* the first hpp_list_node is for overhead columns */
+               fmt_node = list_first_entry(&hists->hpp_formats,
+                                           struct perf_hpp_list_node, list);
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (fmt->color)
+                               fmt->color(fmt, hpp, he);
+                       else
+                               fmt->entry(fmt, hpp, he);
+
+                       gtk_tree_store_set(store, &iter, col_idx++, hpp->buf, -1);
+               }
+
+               bf = hpp->buf;
+               size = hpp->size;
+               perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+                       int ret;
+
+                       if (fmt->color)
+                               ret = fmt->color(fmt, hpp, he);
+                       else
+                               ret = fmt->entry(fmt, hpp, he);
+
+                       snprintf(hpp->buf + ret, hpp->size - ret, "  ");
+                       advance_hpp(hpp, ret + 2);
+               }
+
+               gtk_tree_store_set(store, &iter, col_idx, ltrim(rtrim(bf)), -1);
+
+               if (!he->leaf) {
+                       hpp->buf = bf;
+                       hpp->size = size;
+
+                       perf_gtk__add_hierarchy_entries(hists, &he->hroot_out,
+                                                       store, &iter, hpp,
+                                                       min_pcnt);
+
+                       if (!hist_entry__has_hierarchy_children(he, min_pcnt)) {
+                               char buf[32];
+                               GtkTreeIter child;
+
+                               snprintf(buf, sizeof(buf), "no entry >= %.2f%%",
+                                        min_pcnt);
+
+                               gtk_tree_store_append(store, &child, &iter);
+                               gtk_tree_store_set(store, &child, col_idx, buf, -1);
+                       }
+               }
+
+               if (symbol_conf.use_callchain && he->leaf) {
+                       if (callchain_param.mode == CHAIN_GRAPH_REL)
+                               total = symbol_conf.cumulate_callchain ?
+                                       he->stat_acc->period : he->stat.period;
+
+                       perf_gtk__add_callchain(&he->sorted_chain, store, &iter,
+                                               col_idx, total);
+               }
+       }
+
+}
+
+static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists,
+                                    float min_pcnt)
+{
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       GType col_types[MAX_COLUMNS];
+       GtkCellRenderer *renderer;
+       GtkTreeStore *store;
+       GtkWidget *view;
+       int col_idx;
+       int nr_cols = 0;
+       char s[512];
+       char buf[512];
+       bool first_node, first_col;
+       struct perf_hpp hpp = {
+               .buf            = s,
+               .size           = sizeof(s),
+       };
+
+       hists__for_each_format(hists, fmt) {
+               if (perf_hpp__is_sort_entry(fmt) ||
+                   perf_hpp__is_dynamic_entry(fmt))
+                       break;
+
+               col_types[nr_cols++] = G_TYPE_STRING;
+       }
+       col_types[nr_cols++] = G_TYPE_STRING;
+
+       store = gtk_tree_store_newv(nr_cols, col_types);
+       view = gtk_tree_view_new();
+       renderer = gtk_cell_renderer_text_new();
+
+       col_idx = 0;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+                                                           -1, fmt->name,
+                                                           renderer, "markup",
+                                                           col_idx++, NULL);
+       }
+
+       /* construct a merged column header since sort keys share a single column */
+       buf[0] = '\0';
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node)
+                       strcat(buf, " / ");
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               strcat(buf, "+");
+                       first_col = false;
+
+                       fmt->header(fmt, &hpp, hists_to_evsel(hists));
+                       strcat(buf, ltrim(rtrim(hpp.buf)));
+               }
+       }
+
+       gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+                                                   -1, buf,
+                                                   renderer, "markup",
+                                                   col_idx++, NULL);
+
+       for (col_idx = 0; col_idx < nr_cols; col_idx++) {
+               GtkTreeViewColumn *column;
+
+               column = gtk_tree_view_get_column(GTK_TREE_VIEW(view), col_idx);
+               gtk_tree_view_column_set_resizable(column, TRUE);
+
+               if (col_idx == 0) {
+                       gtk_tree_view_set_expander_column(GTK_TREE_VIEW(view),
+                                                         column);
+               }
+       }
+
+       gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
+       g_object_unref(GTK_TREE_MODEL(store));
+
+       perf_gtk__add_hierarchy_entries(hists, &hists->entries, store,
+                                       NULL, &hpp, min_pcnt);
+
+       gtk_tree_view_set_rules_hint(GTK_TREE_VIEW(view), TRUE);
+
+       g_signal_connect(view, "row-activated",
+                        G_CALLBACK(on_row_activated), NULL);
+       gtk_container_add(GTK_CONTAINER(window), view);
+}
+
 int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
                                  const char *help,
                                  struct hist_browser_timer *hbt __maybe_unused,
@@ -463,7 +651,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
                                                        GTK_POLICY_AUTOMATIC,
                                                        GTK_POLICY_AUTOMATIC);
 
-               perf_gtk__show_hists(scrolled_window, hists, min_pcnt);
+               if (symbol_conf.report_hierarchy)
+                       perf_gtk__show_hierarchy(scrolled_window, hists, min_pcnt);
+               else
+                       perf_gtk__show_hists(scrolled_window, hists, min_pcnt);
 
                tab_label = gtk_label_new(evname);
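
To make the merged-header construction in perf_gtk__show_hierarchy() above concrete:
columns within one hierarchy level are joined with "+", and levels are joined with
" / ".  With a hypothetical first level sorted by comm and dso and a second level
sorted by sym, the shared column header comes out as "comm+dso / sym".  A
self-contained sketch of just that joining rule (the key names are made up for the
example; nothing below touches GTK):

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* hypothetical sort keys: level 0 = comm+dso, level 1 = sym */
        const char *levels[2][3] = {
                { "comm", "dso", NULL },
                { "sym",  NULL,  NULL },
        };
        char buf[512] = "";

        for (int i = 0; i < 2; i++) {
                if (i)
                        strcat(buf, " / ");       /* separator between levels */
                for (int j = 0; levels[i][j]; j++) {
                        if (j)
                                strcat(buf, "+"); /* separator within a level */
                        strcat(buf, levels[i][j]);
                }
        }

        printf("%s\n", buf);    /* prints: comm+dso / sym */
        return 0;
}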
 
index bf2a66e254eac35c63a0dd44e83628f34e4dc2f9..3baeaa6e71b5a51e113b8b485df97b7c8ae003a2 100644 (file)
@@ -5,6 +5,7 @@
 #include "../util/util.h"
 #include "../util/sort.h"
 #include "../util/evsel.h"
+#include "../util/evlist.h"
 
 /* hist period print (hpp) functions */
 
@@ -371,7 +372,20 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
        return 0;
 }
 
-#define HPP__COLOR_PRINT_FNS(_name, _fn)               \
+static bool perf_hpp__is_hpp_entry(struct perf_hpp_fmt *a)
+{
+       return a->header == hpp__header_fn;
+}
+
+static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       if (!perf_hpp__is_hpp_entry(a) || !perf_hpp__is_hpp_entry(b))
+               return false;
+
+       return a->idx == b->idx;
+}
+
+#define HPP__COLOR_PRINT_FNS(_name, _fn, _idx)         \
        {                                               \
                .name   = _name,                        \
                .header = hpp__header_fn,               \
@@ -381,9 +395,11 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                .cmp    = hpp__nop_cmp,                 \
                .collapse = hpp__nop_cmp,               \
                .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
        }
 
-#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn)           \
+#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn, _idx)     \
        {                                               \
                .name   = _name,                        \
                .header = hpp__header_fn,               \
@@ -393,9 +409,11 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                .cmp    = hpp__nop_cmp,                 \
                .collapse = hpp__nop_cmp,               \
                .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
        }
 
-#define HPP__PRINT_FNS(_name, _fn)                     \
+#define HPP__PRINT_FNS(_name, _fn, _idx)               \
        {                                               \
                .name   = _name,                        \
                .header = hpp__header_fn,               \
@@ -404,22 +422,25 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                .cmp    = hpp__nop_cmp,                 \
                .collapse = hpp__nop_cmp,               \
                .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
        }
 
 struct perf_hpp_fmt perf_hpp__format[] = {
-       HPP__COLOR_PRINT_FNS("Overhead", overhead),
-       HPP__COLOR_PRINT_FNS("sys", overhead_sys),
-       HPP__COLOR_PRINT_FNS("usr", overhead_us),
-       HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys),
-       HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us),
-       HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc),
-       HPP__PRINT_FNS("Samples", samples),
-       HPP__PRINT_FNS("Period", period)
+       HPP__COLOR_PRINT_FNS("Overhead", overhead, OVERHEAD),
+       HPP__COLOR_PRINT_FNS("sys", overhead_sys, OVERHEAD_SYS),
+       HPP__COLOR_PRINT_FNS("usr", overhead_us, OVERHEAD_US),
+       HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys, OVERHEAD_GUEST_SYS),
+       HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us, OVERHEAD_GUEST_US),
+       HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc, OVERHEAD_ACC),
+       HPP__PRINT_FNS("Samples", samples, SAMPLES),
+       HPP__PRINT_FNS("Period", period, PERIOD)
 };
 
-LIST_HEAD(perf_hpp__list);
-LIST_HEAD(perf_hpp__sort_list);
-
+struct perf_hpp_list perf_hpp_list = {
+       .fields = LIST_HEAD_INIT(perf_hpp_list.fields),
+       .sorts  = LIST_HEAD_INIT(perf_hpp_list.sorts),
+};
 
 #undef HPP__COLOR_PRINT_FNS
 #undef HPP__COLOR_ACC_PRINT_FNS
@@ -485,63 +506,60 @@ void perf_hpp__init(void)
                hpp_dimension__add_output(PERF_HPP__PERIOD);
 }
 
-void perf_hpp__column_register(struct perf_hpp_fmt *format)
+void perf_hpp_list__column_register(struct perf_hpp_list *list,
+                                   struct perf_hpp_fmt *format)
 {
-       list_add_tail(&format->list, &perf_hpp__list);
+       list_add_tail(&format->list, &list->fields);
 }
 
-void perf_hpp__column_unregister(struct perf_hpp_fmt *format)
+void perf_hpp_list__register_sort_field(struct perf_hpp_list *list,
+                                       struct perf_hpp_fmt *format)
 {
-       list_del(&format->list);
+       list_add_tail(&format->sort_list, &list->sorts);
 }
 
-void perf_hpp__register_sort_field(struct perf_hpp_fmt *format)
-{
-       list_add_tail(&format->sort_list, &perf_hpp__sort_list);
-}
-
-void perf_hpp__column_enable(unsigned col)
-{
-       BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       perf_hpp__column_register(&perf_hpp__format[col]);
-}
-
-void perf_hpp__column_disable(unsigned col)
+void perf_hpp__column_unregister(struct perf_hpp_fmt *format)
 {
-       BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       perf_hpp__column_unregister(&perf_hpp__format[col]);
+       list_del(&format->list);
 }
 
 void perf_hpp__cancel_cumulate(void)
 {
+       struct perf_hpp_fmt *fmt, *acc, *ovh, *tmp;
+
        if (is_strict_order(field_order))
                return;
 
-       perf_hpp__column_disable(PERF_HPP__OVERHEAD_ACC);
-       perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead";
+       ovh = &perf_hpp__format[PERF_HPP__OVERHEAD];
+       acc = &perf_hpp__format[PERF_HPP__OVERHEAD_ACC];
+
+       perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
+               if (acc->equal(acc, fmt)) {
+                       perf_hpp__column_unregister(fmt);
+                       continue;
+               }
+
+               if (ovh->equal(ovh, fmt))
+                       fmt->name = "Overhead";
+       }
 }
 
-void perf_hpp__setup_output_field(void)
+static bool fmt_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       return a->equal && a->equal(a, b);
+}
+
+void perf_hpp__setup_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
 
        /* append sort keys to output field */
-       perf_hpp__for_each_sort_list(fmt) {
-               if (!list_empty(&fmt->list))
-                       continue;
-
-               /*
-                * sort entry fields are dynamically created,
-                * so they can share a same sort key even though
-                * the list is empty.
-                */
-               if (perf_hpp__is_sort_entry(fmt)) {
-                       struct perf_hpp_fmt *pos;
+       perf_hpp_list__for_each_sort_list(list, fmt) {
+               struct perf_hpp_fmt *pos;
 
-                       perf_hpp__for_each_format(pos) {
-                               if (perf_hpp__same_sort_entry(pos, fmt))
-                                       goto next;
-                       }
+               perf_hpp_list__for_each_format(list, pos) {
+                       if (fmt_equal(fmt, pos))
+                               goto next;
                }
 
                perf_hpp__column_register(fmt);
@@ -550,27 +568,17 @@ next:
        }
 }
 
-void perf_hpp__append_sort_keys(void)
+void perf_hpp__append_sort_keys(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
 
        /* append output fields to sort keys */
-       perf_hpp__for_each_format(fmt) {
-               if (!list_empty(&fmt->sort_list))
-                       continue;
-
-               /*
-                * sort entry fields are dynamically created,
-                * so they can share a same sort key even though
-                * the list is empty.
-                */
-               if (perf_hpp__is_sort_entry(fmt)) {
-                       struct perf_hpp_fmt *pos;
+       perf_hpp_list__for_each_format(list, fmt) {
+               struct perf_hpp_fmt *pos;
 
-                       perf_hpp__for_each_sort_list(pos) {
-                               if (perf_hpp__same_sort_entry(pos, fmt))
-                                       goto next;
-                       }
+               perf_hpp_list__for_each_sort_list(list, pos) {
+                       if (fmt_equal(fmt, pos))
+                               goto next;
                }
 
                perf_hpp__register_sort_field(fmt);
@@ -579,20 +587,29 @@ next:
        }
 }
 
-void perf_hpp__reset_output_field(void)
+
+static void fmt_free(struct perf_hpp_fmt *fmt)
+{
+       if (fmt->free)
+               fmt->free(fmt);
+}
+
+void perf_hpp__reset_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt, *tmp;
 
        /* reset output fields */
-       perf_hpp__for_each_format_safe(fmt, tmp) {
+       perf_hpp_list__for_each_format_safe(list, fmt, tmp) {
                list_del_init(&fmt->list);
                list_del_init(&fmt->sort_list);
+               fmt_free(fmt);
        }
 
        /* reset sort keys */
-       perf_hpp__for_each_sort_list_safe(fmt, tmp) {
+       perf_hpp_list__for_each_sort_list_safe(list, fmt, tmp) {
                list_del_init(&fmt->list);
                list_del_init(&fmt->sort_list);
+               fmt_free(fmt);
        }
 }
 
@@ -606,7 +623,7 @@ unsigned int hists__sort_list_width(struct hists *hists)
        bool first = true;
        struct perf_hpp dummy_hpp;
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
@@ -624,22 +641,39 @@ unsigned int hists__sort_list_width(struct hists *hists)
        return ret;
 }
 
-void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+unsigned int hists__overhead_width(struct hists *hists)
 {
-       int idx;
-
-       if (perf_hpp__is_sort_entry(fmt))
-               return perf_hpp__reset_sort_width(fmt, hists);
+       struct perf_hpp_fmt *fmt;
+       int ret = 0;
+       bool first = true;
+       struct perf_hpp dummy_hpp;
 
-       for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
-               if (fmt == &perf_hpp__format[idx])
+       hists__for_each_format(hists, fmt) {
+               if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt))
                        break;
+
+               if (first)
+                       first = false;
+               else
+                       ret += 2;
+
+               ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists));
        }
 
-       if (idx == PERF_HPP__MAX_INDEX)
+       return ret;
+}
+
+void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+{
+       if (perf_hpp__is_sort_entry(fmt))
+               return perf_hpp__reset_sort_width(fmt, hists);
+
+       if (perf_hpp__is_dynamic_entry(fmt))
                return;
 
-       switch (idx) {
+       BUG_ON(fmt->idx >= PERF_HPP__MAX_INDEX);
+
+       switch (fmt->idx) {
        case PERF_HPP__OVERHEAD:
        case PERF_HPP__OVERHEAD_SYS:
        case PERF_HPP__OVERHEAD_US:
@@ -667,7 +701,7 @@ void perf_hpp__set_user_width(const char *width_list_str)
        struct perf_hpp_fmt *fmt;
        const char *ptr = width_list_str;
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                char *p;
 
                int len = strtol(ptr, &p, 10);
@@ -679,3 +713,71 @@ void perf_hpp__set_user_width(const char *width_list_str)
                        break;
        }
 }
+
+static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt)
+{
+       struct perf_hpp_list_node *node = NULL;
+       struct perf_hpp_fmt *fmt_copy;
+       bool found = false;
+       bool skip = perf_hpp__should_skip(fmt, hists);
+
+       list_for_each_entry(node, &hists->hpp_formats, list) {
+               if (node->level == fmt->level) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found) {
+               node = malloc(sizeof(*node));
+               if (node == NULL)
+                       return -1;
+
+               node->skip = skip;
+               node->level = fmt->level;
+               perf_hpp_list__init(&node->hpp);
+
+               hists->nr_hpp_node++;
+               list_add_tail(&node->list, &hists->hpp_formats);
+       }
+
+       fmt_copy = perf_hpp_fmt__dup(fmt);
+       if (fmt_copy == NULL)
+               return -1;
+
+       if (!skip)
+               node->skip = false;
+
+       list_add_tail(&fmt_copy->list, &node->hpp.fields);
+       list_add_tail(&fmt_copy->sort_list, &node->hpp.sorts);
+
+       return 0;
+}
+
+int perf_hpp__setup_hists_formats(struct perf_hpp_list *list,
+                                 struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+       struct perf_hpp_fmt *fmt;
+       struct hists *hists;
+       int ret;
+
+       if (!symbol_conf.report_hierarchy)
+               return 0;
+
+       evlist__for_each(evlist, evsel) {
+               hists = evsel__hists(evsel);
+
+               perf_hpp_list__for_each_sort_list(list, fmt) {
+                       if (perf_hpp__is_dynamic_entry(fmt) &&
+                           !perf_hpp__defined_dynamic_entry(fmt, hists))
+                               continue;
+
+                       ret = add_hierarchy_fmt(hists, fmt);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
+
+       return 0;
+}
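
The hierarchy support added above keys one perf_hpp_list_node per sort level: add_hierarchy_fmt() first looks for an existing node with the format's level, allocates and appends a new one only when none is found, and then attaches a duplicated format to that node. Below is a minimal standalone sketch of the same find-or-create-by-level pattern; it uses a plain singly-linked list and made-up names (level_node, get_level_node) rather than the kernel list API, so it is illustrative only.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for perf_hpp_list_node / the duplicated formats. */
struct level_node {
        int level;
        int nr_fmts;
        struct level_node *next;
};

/* Find the node for 'level', or create and append it (like add_hierarchy_fmt). */
static struct level_node *get_level_node(struct level_node **head, int level)
{
        struct level_node **pos = head;

        for (; *pos; pos = &(*pos)->next) {
                if ((*pos)->level == level)
                        return *pos;    /* found: reuse the existing node */
        }

        *pos = calloc(1, sizeof(**pos));        /* not found: create a new one */
        if (*pos)
                (*pos)->level = level;
        return *pos;
}

int main(void)
{
        struct level_node *head = NULL;
        int levels[] = { 0, 0, 1, 2, 1 };       /* one entry per format in the sort list */

        for (unsigned int i = 0; i < sizeof(levels) / sizeof(levels[0]); i++) {
                struct level_node *n = get_level_node(&head, levels[i]);

                if (n)
                        n->nr_fmts++;   /* stands in for list_add_tail(&fmt_copy->list, ...) */
        }

        for (struct level_node *n = head; n; n = n->next)
                printf("level %d: %d format(s)\n", n->level, n->nr_fmts);

        while (head) {
                struct level_node *next = head->next;

                free(head);
                head = next;
        }
        return 0;
}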
index 387110d50b002557d99e723605407c3db990d71a..7aff5acf3265782e03254de2d8bc1dfafb56e03a 100644 (file)
@@ -165,8 +165,28 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root,
        return ret;
 }
 
+/*
+ * If there is a single callchain root, don't bother printing
+ * its percentage (100% in fractal mode, and the same percentage
+ * as the hist in graph mode). This also avoids one level of column.
+ *
+ * However, when a percent limit is applied, it's possible that the single
+ * callchain node has a different (non-100% in fractal mode) percentage.
+ */
+static bool need_percent_display(struct rb_node *node, u64 parent_samples)
+{
+       struct callchain_node *cnode;
+
+       if (rb_next(node))
+               return true;
+
+       cnode = rb_entry(node, struct callchain_node, rb_node);
+       return callchain_cumul_hits(cnode) != parent_samples;
+}
+
 static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
-                                      u64 total_samples, int left_margin)
+                                      u64 total_samples, u64 parent_samples,
+                                      int left_margin)
 {
        struct callchain_node *cnode;
        struct callchain_list *chain;
@@ -177,13 +197,8 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
        int ret = 0;
        char bf[1024];
 
-       /*
-        * If have one single callchain root, don't bother printing
-        * its percentage (100 % in fractal mode and the same percentage
-        * than the hist in graph mode). This also avoid one level of column.
-        */
        node = rb_first(root);
-       if (node && !rb_next(node)) {
+       if (node && !need_percent_display(node, parent_samples)) {
                cnode = rb_entry(node, struct callchain_node, rb_node);
                list_for_each_entry(chain, &cnode->val, list) {
                        /*
@@ -213,9 +228,15 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
                root = &cnode->rb_root;
        }
 
+       if (callchain_param.mode == CHAIN_GRAPH_REL)
+               total_samples = parent_samples;
+
        ret += __callchain__fprintf_graph(fp, root, total_samples,
                                          1, 1, left_margin);
-       ret += fprintf(fp, "\n");
+       if (ret) {
+               /* do not add a blank line if it printed nothing */
+               ret += fprintf(fp, "\n");
+       }
 
        return ret;
 }
@@ -323,16 +344,19 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
                                            u64 total_samples, int left_margin,
                                            FILE *fp)
 {
+       u64 parent_samples = he->stat.period;
+
+       if (symbol_conf.cumulate_callchain)
+               parent_samples = he->stat_acc->period;
+
        switch (callchain_param.mode) {
        case CHAIN_GRAPH_REL:
-               return callchain__fprintf_graph(fp, &he->sorted_chain,
-                                               symbol_conf.cumulate_callchain ?
-                                               he->stat_acc->period : he->stat.period,
-                                               left_margin);
+               return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
+                                               parent_samples, left_margin);
                break;
        case CHAIN_GRAPH_ABS:
                return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
-                                               left_margin);
+                                               parent_samples, left_margin);
                break;
        case CHAIN_FLAT:
                return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples);
@@ -349,45 +373,66 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
        return 0;
 }
 
-static size_t hist_entry__callchain_fprintf(struct hist_entry *he,
-                                           struct hists *hists,
-                                           FILE *fp)
+static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
 {
-       int left_margin = 0;
-       u64 total_period = hists->stats.total_period;
+       const char *sep = symbol_conf.field_sep;
+       struct perf_hpp_fmt *fmt;
+       char *start = hpp->buf;
+       int ret;
+       bool first = true;
 
-       if (field_order == NULL && (sort_order == NULL ||
-                                   !prefixcmp(sort_order, "comm"))) {
-               struct perf_hpp_fmt *fmt;
+       if (symbol_conf.exclude_other && !he->parent)
+               return 0;
 
-               perf_hpp__for_each_format(fmt) {
-                       if (!perf_hpp__is_sort_entry(fmt))
-                               continue;
+       hists__for_each_format(he->hists, fmt) {
+               if (perf_hpp__should_skip(fmt, he->hists))
+                       continue;
 
-                       /* must be 'comm' sort entry */
-                       left_margin = fmt->width(fmt, NULL, hists_to_evsel(hists));
-                       left_margin -= thread__comm_len(he->thread);
-                       break;
-               }
+               /*
+                * If there's no field_sep, we still need
+                * to display initial '  '.
+                */
+               if (!sep || !first) {
+                       ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: "  ");
+                       advance_hpp(hpp, ret);
+               } else
+                       first = false;
+
+               if (perf_hpp__use_color() && fmt->color)
+                       ret = fmt->color(fmt, hpp, he);
+               else
+                       ret = fmt->entry(fmt, hpp, he);
+
+               ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret);
+               advance_hpp(hpp, ret);
        }
-       return hist_entry_callchain__fprintf(he, total_period, left_margin, fp);
+
+       return hpp->buf - start;
 }
 
-static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
+static int hist_entry__hierarchy_fprintf(struct hist_entry *he,
+                                        struct perf_hpp *hpp,
+                                        struct hists *hists,
+                                        FILE *fp)
 {
        const char *sep = symbol_conf.field_sep;
        struct perf_hpp_fmt *fmt;
-       char *start = hpp->buf;
-       int ret;
+       struct perf_hpp_list_node *fmt_node;
+       char *buf = hpp->buf;
+       size_t size = hpp->size;
+       int ret, printed = 0;
        bool first = true;
 
        if (symbol_conf.exclude_other && !he->parent)
                return 0;
 
-       perf_hpp__for_each_format(fmt) {
-               if (perf_hpp__should_skip(fmt, he->hists))
-                       continue;
+       ret = scnprintf(hpp->buf, hpp->size, "%*s", he->depth * HIERARCHY_INDENT, "");
+       advance_hpp(hpp, ret);
 
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
                /*
                 * If there's no field_sep, we still need
                 * to display initial '  '.
@@ -403,10 +448,47 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
                else
                        ret = fmt->entry(fmt, hpp, he);
 
+               ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret);
                advance_hpp(hpp, ret);
        }
 
-       return hpp->buf - start;
+       if (!sep)
+               ret = scnprintf(hpp->buf, hpp->size, "%*s",
+                               (hists->nr_hpp_node - 2) * HIERARCHY_INDENT, "");
+       advance_hpp(hpp, ret);
+
+       printed += fprintf(fp, "%s", buf);
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               hpp->buf  = buf;
+               hpp->size = size;
+
+               /*
+                * No need to call hist_entry__snprintf_alignment() since this
+                * fmt is always the last column in the hierarchy mode.
+                */
+               if (perf_hpp__use_color() && fmt->color)
+                       fmt->color(fmt, hpp, he);
+               else
+                       fmt->entry(fmt, hpp, he);
+
+               /*
+                * dynamic entries are right-aligned, but we want them
+                * left-aligned in hierarchy mode
+                */
+               printed += fprintf(fp, "%s%s", sep ?: "  ", ltrim(buf));
+       }
+       printed += putc('\n', fp);
+
+       if (symbol_conf.use_callchain && he->leaf) {
+               u64 total = hists__total_period(hists);
+
+               printed += hist_entry_callchain__fprintf(he, total, 0, fp);
+               goto out;
+       }
+
+out:
+       return printed;
 }
 
 static int hist_entry__fprintf(struct hist_entry *he, size_t size,
@@ -418,24 +500,134 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
                .buf            = bf,
                .size           = size,
        };
+       u64 total_period = hists->stats.total_period;
 
        if (size == 0 || size > bfsz)
                size = hpp.size = bfsz;
 
+       if (symbol_conf.report_hierarchy)
+               return hist_entry__hierarchy_fprintf(he, &hpp, hists, fp);
+
        hist_entry__snprintf(he, &hpp);
 
        ret = fprintf(fp, "%s\n", bf);
 
        if (symbol_conf.use_callchain)
-               ret += hist_entry__callchain_fprintf(he, hists, fp);
+               ret += hist_entry_callchain__fprintf(he, total_period, 0, fp);
 
        return ret;
 }
 
+static int print_hierarchy_indent(const char *sep, int indent,
+                                 const char *line, FILE *fp)
+{
+       if (sep != NULL || indent < 2)
+               return 0;
+
+       return fprintf(fp, "%-.*s", (indent - 2) * HIERARCHY_INDENT, line);
+}
+
+static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp,
+                                 const char *sep, FILE *fp)
+{
+       bool first_node, first_col;
+       int indent;
+       int depth;
+       unsigned width = 0;
+       unsigned header_width = 0;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+
+       indent = hists->nr_hpp_node;
+
+       /* preserve max indent depth for column headers */
+       print_hierarchy_indent(sep, indent, spaces, fp);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               fmt->header(fmt, hpp, hists_to_evsel(hists));
+               fprintf(fp, "%s%s", hpp->buf, sep ?: "  ");
+       }
+
+       /* combine sort headers with ' / ' */
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node)
+                       header_width += fprintf(fp, " / ");
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               header_width += fprintf(fp, "+");
+                       first_col = false;
+
+                       fmt->header(fmt, hpp, hists_to_evsel(hists));
+                       rtrim(hpp->buf);
+
+                       header_width += fprintf(fp, "%s", ltrim(hpp->buf));
+               }
+       }
+
+       fprintf(fp, "\n# ");
+
+       /* preserve max indent depth for initial dots */
+       print_hierarchy_indent(sep, indent, dots, fp);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+
+       first_col = true;
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (!first_col)
+                       fprintf(fp, "%s", sep ?: "..");
+               first_col = false;
+
+               width = fmt->width(fmt, hpp, hists_to_evsel(hists));
+               fprintf(fp, "%.*s", width, dots);
+       }
+
+       depth = 0;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               first_col = true;
+               width = depth * HIERARCHY_INDENT;
+
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                       width++;  /* for the '+' sign between column headers */
+                       first_col = false;
+
+                       width += fmt->width(fmt, hpp, hists_to_evsel(hists));
+               }
+
+               if (width > header_width)
+                       header_width = width;
+
+               depth++;
+       }
+
+       fprintf(fp, "%s%-.*s", sep ?: "  ", header_width, dots);
+
+       fprintf(fp, "\n#\n");
+
+       return 2;
+}
+
 size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
                      int max_cols, float min_pcnt, FILE *fp)
 {
        struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
        struct rb_node *nd;
        size_t ret = 0;
        unsigned int width;
@@ -449,10 +641,11 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
        bool first = true;
        size_t linesz;
        char *line = NULL;
+       unsigned indent;
 
        init_rem_hits();
 
-       perf_hpp__for_each_format(fmt)
+       hists__for_each_format(hists, fmt)
                perf_hpp__reset_width(fmt, hists);
 
        if (symbol_conf.col_width_list_str)
@@ -463,7 +656,16 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
 
        fprintf(fp, "# ");
 
-       perf_hpp__for_each_format(fmt) {
+       if (symbol_conf.report_hierarchy) {
+               list_for_each_entry(fmt_node, &hists->hpp_formats, list) {
+                       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt)
+                               perf_hpp__reset_width(fmt, hists);
+               }
+               nr_rows += print_hierarchy_header(hists, &dummy_hpp, sep, fp);
+               goto print_entries;
+       }
+
+       hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
@@ -487,7 +689,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
 
        fprintf(fp, "# ");
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                unsigned int i;
 
                if (perf_hpp__should_skip(fmt, hists))
@@ -520,7 +722,9 @@ print_entries:
                goto out;
        }
 
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+       indent = hists__overhead_width(hists) + 4;
+
+       for (nd = rb_first(&hists->entries); nd; nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD)) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
                float percent;
 
@@ -536,6 +740,20 @@ print_entries:
                if (max_rows && ++nr_rows >= max_rows)
                        break;
 
+               /*
+                * If all children are filtered out or percent-limited,
+                * display "no entry >= x.xx%" message.
+                */
+               if (!h->leaf && !hist_entry__has_hierarchy_children(h, min_pcnt)) {
+                       int depth = hists->nr_hpp_node + h->depth + 1;
+
+                       print_hierarchy_indent(sep, depth, spaces, fp);
+                       fprintf(fp, "%*sno entry >= %.2f%%\n", indent, "", min_pcnt);
+
+                       if (max_rows && ++nr_rows >= max_rows)
+                               break;
+               }
+
                if (h->ms.map == NULL && verbose > 1) {
                        __map_groups__fprintf_maps(h->thread->mg,
                                                   MAP__FUNCTION, fp);
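
The stdio changes above introduce need_percent_display() so that a lone callchain root only gets a percentage column when percent-limit pruning has left it with cumulative hits that differ from the parent entry's samples. A small self-contained sketch of that decision follows; the boolean parameter stands in for the rb_next() sibling check, and the signature and sample numbers are illustrative rather than the perf API.

#include <stdbool.h>
#include <stdio.h>

/* Show the percentage only when it actually carries information. */
static bool need_percent_display(bool has_sibling,
                                 unsigned long long cumul_hits,
                                 unsigned long long parent_samples)
{
        if (has_sibling)
                return true;
        return cumul_hits != parent_samples;
}

int main(void)
{
        /* Single root covering all of the parent's samples: skip the column. */
        printf("%d\n", need_percent_display(false, 100, 100)); /* prints 0 */
        /* Single root left after percent-limit pruning: show its share. */
        printf("%d\n", need_percent_display(false, 60, 100));  /* prints 1 */
        /* More than one root: percentages are always printed. */
        printf("%d\n", need_percent_display(true, 100, 100));  /* prints 1 */
        return 0;
}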
index 5eec53a3f4ac7dbedb54776f746705acfa4fa2b4..eea25e2424e99172715c681a05ffc68b3c58e78a 100644 (file)
@@ -82,6 +82,7 @@ libperf-y += parse-branch-options.o
 libperf-y += parse-regs-options.o
 libperf-y += term.o
 libperf-y += help-unknown-cmd.o
+libperf-y += mem-events.o
 
 libperf-$(CONFIG_LIBBPF) += bpf-loader.o
 libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
@@ -105,8 +106,17 @@ libperf-y += scripting-engines/
 
 libperf-$(CONFIG_ZLIB) += zlib.o
 libperf-$(CONFIG_LZMA) += lzma.o
+libperf-y += demangle-java.o
+
+ifdef CONFIG_JITDUMP
+libperf-$(CONFIG_LIBELF) += jitdump.o
+libperf-$(CONFIG_LIBELF) += genelf.o
+libperf-$(CONFIG_LIBELF) += genelf_debug.o
+endif
 
 CFLAGS_config.o   += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+# avoid compiler warnings in 32-bit mode
+CFLAGS_genelf_debug.o  += -Wno-packed
 
 $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
        $(call rule_mkdir)
index 360fda01f3b0d17369254d03fe16daf24f30c698..ec164fe70718df1480b02733d8701c7ab2b74297 100644 (file)
@@ -478,10 +478,11 @@ void auxtrace_heap__pop(struct auxtrace_heap *heap)
                         heap_array[last].ordinal);
 }
 
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist)
 {
        if (itr)
-               return itr->info_priv_size(itr);
+               return itr->info_priv_size(itr, evlist);
        return 0;
 }
 
@@ -852,7 +853,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
        int err;
 
        pr_debug2("Synthesizing auxtrace information\n");
-       priv_size = auxtrace_record__info_priv_size(itr);
+       priv_size = auxtrace_record__info_priv_size(itr, session->evlist);
        ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size);
        if (!ev)
                return -ENOMEM;
index b86f90db1352a6c8635e3ea5d02aa3c21bccc323..e5a8e2d4f2af4e9b717be7ce27c587f569983027 100644 (file)
@@ -293,7 +293,8 @@ struct auxtrace_record {
        int (*recording_options)(struct auxtrace_record *itr,
                                 struct perf_evlist *evlist,
                                 struct record_opts *opts);
-       size_t (*info_priv_size)(struct auxtrace_record *itr);
+       size_t (*info_priv_size)(struct auxtrace_record *itr,
+                                struct perf_evlist *evlist);
        int (*info_fill)(struct auxtrace_record *itr,
                         struct perf_session *session,
                         struct auxtrace_info_event *auxtrace_info,
@@ -429,7 +430,8 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
 int auxtrace_record__options(struct auxtrace_record *itr,
                             struct perf_evlist *evlist,
                             struct record_opts *opts);
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist);
 int auxtrace_record__info_fill(struct auxtrace_record *itr,
                               struct perf_session *session,
                               struct auxtrace_info_event *auxtrace_info,
index 540a7efa657ea8668eeddf1b476bbcb3c0490bbc..0967ce601931685ed294827e8aef7c30c47736c6 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <linux/bpf.h>
 #include <bpf/libbpf.h>
+#include <bpf/bpf.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include "perf.h"
@@ -16,6 +17,7 @@
 #include "llvm-utils.h"
 #include "probe-event.h"
 #include "probe-finder.h" // for MAX_PROBES
+#include "parse-events.h"
 #include "llvm-utils.h"
 
 #define DEFINE_PRINT_FN(name, level) \
@@ -108,8 +110,8 @@ void bpf__clear(void)
 }
 
 static void
-bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused,
-                    void *_priv)
+clear_prog_priv(struct bpf_program *prog __maybe_unused,
+               void *_priv)
 {
        struct bpf_prog_priv *priv = _priv;
 
@@ -337,7 +339,7 @@ config_bpf_program(struct bpf_program *prog)
        }
        pr_debug("bpf: config '%s' is ok\n", config_str);
 
-       err = bpf_program__set_private(prog, priv, bpf_prog_priv__clear);
+       err = bpf_program__set_private(prog, priv, clear_prog_priv);
        if (err) {
                pr_debug("Failed to set priv for program '%s'\n", config_str);
                goto errout;
@@ -739,6 +741,682 @@ int bpf__foreach_tev(struct bpf_object *obj,
        return 0;
 }
 
+enum bpf_map_op_type {
+       BPF_MAP_OP_SET_VALUE,
+       BPF_MAP_OP_SET_EVSEL,
+};
+
+enum bpf_map_key_type {
+       BPF_MAP_KEY_ALL,
+       BPF_MAP_KEY_RANGES,
+};
+
+struct bpf_map_op {
+       struct list_head list;
+       enum bpf_map_op_type op_type;
+       enum bpf_map_key_type key_type;
+       union {
+               struct parse_events_array array;
+       } k;
+       union {
+               u64 value;
+               struct perf_evsel *evsel;
+       } v;
+};
+
+struct bpf_map_priv {
+       struct list_head ops_list;
+};
+
+static void
+bpf_map_op__delete(struct bpf_map_op *op)
+{
+       if (!list_empty(&op->list))
+               list_del(&op->list);
+       if (op->key_type == BPF_MAP_KEY_RANGES)
+               parse_events__clear_array(&op->k.array);
+       free(op);
+}
+
+static void
+bpf_map_priv__purge(struct bpf_map_priv *priv)
+{
+       struct bpf_map_op *pos, *n;
+
+       list_for_each_entry_safe(pos, n, &priv->ops_list, list) {
+               list_del_init(&pos->list);
+               bpf_map_op__delete(pos);
+       }
+}
+
+static void
+bpf_map_priv__clear(struct bpf_map *map __maybe_unused,
+                   void *_priv)
+{
+       struct bpf_map_priv *priv = _priv;
+
+       bpf_map_priv__purge(priv);
+       free(priv);
+}
+
+static int
+bpf_map_op_setkey(struct bpf_map_op *op, struct parse_events_term *term)
+{
+       op->key_type = BPF_MAP_KEY_ALL;
+       if (!term)
+               return 0;
+
+       if (term->array.nr_ranges) {
+               size_t memsz = term->array.nr_ranges *
+                               sizeof(op->k.array.ranges[0]);
+
+               op->k.array.ranges = memdup(term->array.ranges, memsz);
+               if (!op->k.array.ranges) {
+                       pr_debug("Not enough memory to alloc indices for map\n");
+                       return -ENOMEM;
+               }
+               op->key_type = BPF_MAP_KEY_RANGES;
+               op->k.array.nr_ranges = term->array.nr_ranges;
+       }
+       return 0;
+}
+
+static struct bpf_map_op *
+bpf_map_op__new(struct parse_events_term *term)
+{
+       struct bpf_map_op *op;
+       int err;
+
+       op = zalloc(sizeof(*op));
+       if (!op) {
+               pr_debug("Failed to alloc bpf_map_op\n");
+               return ERR_PTR(-ENOMEM);
+       }
+       INIT_LIST_HEAD(&op->list);
+
+       err = bpf_map_op_setkey(op, term);
+       if (err) {
+               free(op);
+               return ERR_PTR(err);
+       }
+       return op;
+}
+
+static int
+bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
+{
+       struct bpf_map_priv *priv;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+       err = bpf_map__get_private(map, (void **)&priv);
+       if (err) {
+               pr_debug("Failed to get private from map %s\n", map_name);
+               return err;
+       }
+
+       if (!priv) {
+               priv = zalloc(sizeof(*priv));
+               if (!priv) {
+                       pr_debug("Not enough memory to alloc map private\n");
+                       return -ENOMEM;
+               }
+               INIT_LIST_HEAD(&priv->ops_list);
+
+               if (bpf_map__set_private(map, priv, bpf_map_priv__clear)) {
+                       free(priv);
+                       return -BPF_LOADER_ERRNO__INTERNAL;
+               }
+       }
+
+       list_add_tail(&op->list, &priv->ops_list);
+       return 0;
+}
+
+static struct bpf_map_op *
+bpf_map__add_newop(struct bpf_map *map, struct parse_events_term *term)
+{
+       struct bpf_map_op *op;
+       int err;
+
+       op = bpf_map_op__new(term);
+       if (IS_ERR(op))
+               return op;
+
+       err = bpf_map__add_op(map, op);
+       if (err) {
+               bpf_map_op__delete(op);
+               return ERR_PTR(err);
+       }
+       return op;
+}
+
+static int
+__bpf_map__config_value(struct bpf_map *map,
+                       struct parse_events_term *term)
+{
+       struct bpf_map_def def;
+       struct bpf_map_op *op;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("Unable to get map definition from '%s'\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       if (def.type != BPF_MAP_TYPE_ARRAY) {
+               pr_debug("Map %s type is not BPF_MAP_TYPE_ARRAY\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+       }
+       if (def.key_size < sizeof(unsigned int)) {
+               pr_debug("Map %s has incorrect key size\n", map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE;
+       }
+       switch (def.value_size) {
+       case 1:
+       case 2:
+       case 4:
+       case 8:
+               break;
+       default:
+               pr_debug("Map %s has incorrect value size\n", map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE;
+       }
+
+       op = bpf_map__add_newop(map, term);
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       op->op_type = BPF_MAP_OP_SET_VALUE;
+       op->v.value = term->val.num;
+       return 0;
+}
+
+static int
+bpf_map__config_value(struct bpf_map *map,
+                     struct parse_events_term *term,
+                     struct perf_evlist *evlist __maybe_unused)
+{
+       if (!term->err_val) {
+               pr_debug("Config value not set\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_CONF;
+       }
+
+       if (term->type_val != PARSE_EVENTS__TERM_TYPE_NUM) {
+               pr_debug("ERROR: wrong value type for 'value'\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE;
+       }
+
+       return __bpf_map__config_value(map, term);
+}
+
+static int
+__bpf_map__config_event(struct bpf_map *map,
+                       struct parse_events_term *term,
+                       struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+       struct bpf_map_def def;
+       struct bpf_map_op *op;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+       evsel = perf_evlist__find_evsel_by_str(evlist, term->val.str);
+       if (!evsel) {
+               pr_debug("Event (for '%s') '%s' doesn't exist\n",
+                        map_name, term->val.str);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("Unable to get map definition from '%s'\n",
+                        map_name);
+               return err;
+       }
+
+       /*
+        * No need to check key_size and value_size:
+        * the kernel has already checked them.
+        */
+       if (def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+               pr_debug("Map %s type is not BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+       }
+
+       op = bpf_map__add_newop(map, term);
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       op->op_type = BPF_MAP_OP_SET_EVSEL;
+       op->v.evsel = evsel;
+       return 0;
+}
+
+static int
+bpf_map__config_event(struct bpf_map *map,
+                     struct parse_events_term *term,
+                     struct perf_evlist *evlist)
+{
+       if (!term->err_val) {
+               pr_debug("Config value not set\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_CONF;
+       }
+
+       if (term->type_val != PARSE_EVENTS__TERM_TYPE_STR) {
+               pr_debug("ERROR: wrong value type for 'event'\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE;
+       }
+
+       return __bpf_map__config_event(map, term, evlist);
+}
+
+struct bpf_obj_config__map_func {
+       const char *config_opt;
+       int (*config_func)(struct bpf_map *, struct parse_events_term *,
+                          struct perf_evlist *);
+};
+
+struct bpf_obj_config__map_func bpf_obj_config__map_funcs[] = {
+       {"value", bpf_map__config_value},
+       {"event", bpf_map__config_event},
+};
+
+static int
+config_map_indices_range_check(struct parse_events_term *term,
+                              struct bpf_map *map,
+                              const char *map_name)
+{
+       struct parse_events_array *array = &term->array;
+       struct bpf_map_def def;
+       unsigned int i;
+       int err;
+
+       if (!array->nr_ranges)
+               return 0;
+       if (!array->ranges) {
+               pr_debug("ERROR: map %s: array->nr_ranges is %d but range array is NULL\n",
+                        map_name, (int)array->nr_ranges);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("ERROR: Unable to get map definition from '%s'\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       for (i = 0; i < array->nr_ranges; i++) {
+               unsigned int start = array->ranges[i].start;
+               size_t length = array->ranges[i].length;
+               unsigned int idx = start + length - 1;
+
+               if (idx >= def.max_entries) {
+                       pr_debug("ERROR: index %d too large\n", idx);
+                       return -BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG;
+               }
+       }
+       return 0;
+}
+
+static int
+bpf__obj_config_map(struct bpf_object *obj,
+                   struct parse_events_term *term,
+                   struct perf_evlist *evlist,
+                   int *key_scan_pos)
+{
+       /* key is "map:<mapname>.<config opt>" */
+       char *map_name = strdup(term->config + sizeof("map:") - 1);
+       struct bpf_map *map;
+       int err = -BPF_LOADER_ERRNO__OBJCONF_OPT;
+       char *map_opt;
+       size_t i;
+
+       if (!map_name)
+               return -ENOMEM;
+
+       map_opt = strchr(map_name, '.');
+       if (!map_opt) {
+               pr_debug("ERROR: Invalid map config: %s\n", map_name);
+               goto out;
+       }
+
+       *map_opt++ = '\0';
+       if (*map_opt == '\0') {
+               pr_debug("ERROR: Invalid map option: %s\n", term->config);
+               goto out;
+       }
+
+       map = bpf_object__get_map_by_name(obj, map_name);
+       if (!map) {
+               pr_debug("ERROR: Map %s doesn't exist\n", map_name);
+               err = -BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST;
+               goto out;
+       }
+
+       *key_scan_pos += strlen(map_opt);
+       err = config_map_indices_range_check(term, map, map_name);
+       if (err)
+               goto out;
+       *key_scan_pos -= strlen(map_opt);
+
+       for (i = 0; i < ARRAY_SIZE(bpf_obj_config__map_funcs); i++) {
+               struct bpf_obj_config__map_func *func =
+                               &bpf_obj_config__map_funcs[i];
+
+               if (strcmp(map_opt, func->config_opt) == 0) {
+                       err = func->config_func(map, term, evlist);
+                       goto out;
+               }
+       }
+
+       pr_debug("ERROR: Invalid map config option '%s'\n", map_opt);
+       err = -BPF_LOADER_ERRNO__OBJCONF_MAP_OPT;
+out:
+       if (!err)
+               *key_scan_pos += strlen(map_opt);
+       free(map_name);
+       return err;
+}
+
+int bpf__config_obj(struct bpf_object *obj,
+                   struct parse_events_term *term,
+                   struct perf_evlist *evlist,
+                   int *error_pos)
+{
+       int key_scan_pos = 0;
+       int err;
+
+       if (!obj || !term || !term->config)
+               return -EINVAL;
+
+       if (!prefixcmp(term->config, "map:")) {
+               key_scan_pos = sizeof("map:") - 1;
+               err = bpf__obj_config_map(obj, term, evlist, &key_scan_pos);
+               goto out;
+       }
+       err = -BPF_LOADER_ERRNO__OBJCONF_OPT;
+out:
+       if (error_pos)
+               *error_pos = key_scan_pos;
+       return err;
+
+}
+
+typedef int (*map_config_func_t)(const char *name, int map_fd,
+                                struct bpf_map_def *pdef,
+                                struct bpf_map_op *op,
+                                void *pkey, void *arg);
+
+static int
+foreach_key_array_all(map_config_func_t func,
+                     void *arg, const char *name,
+                     int map_fd, struct bpf_map_def *pdef,
+                     struct bpf_map_op *op)
+{
+       unsigned int i;
+       int err;
+
+       for (i = 0; i < pdef->max_entries; i++) {
+               err = func(name, map_fd, pdef, op, &i, arg);
+               if (err) {
+                       pr_debug("ERROR: failed to insert value to %s[%u]\n",
+                                name, i);
+                       return err;
+               }
+       }
+       return 0;
+}
+
+static int
+foreach_key_array_ranges(map_config_func_t func, void *arg,
+                        const char *name, int map_fd,
+                        struct bpf_map_def *pdef,
+                        struct bpf_map_op *op)
+{
+       unsigned int i, j;
+       int err;
+
+       for (i = 0; i < op->k.array.nr_ranges; i++) {
+               unsigned int start = op->k.array.ranges[i].start;
+               size_t length = op->k.array.ranges[i].length;
+
+               for (j = 0; j < length; j++) {
+                       unsigned int idx = start + j;
+
+                       err = func(name, map_fd, pdef, op, &idx, arg);
+                       if (err) {
+                               pr_debug("ERROR: failed to insert value to %s[%u]\n",
+                                        name, idx);
+                               return err;
+                       }
+               }
+       }
+       return 0;
+}
+
+static int
+bpf_map_config_foreach_key(struct bpf_map *map,
+                          map_config_func_t func,
+                          void *arg)
+{
+       int err, map_fd;
+       const char *name;
+       struct bpf_map_op *op;
+       struct bpf_map_def def;
+       struct bpf_map_priv *priv;
+
+       name = bpf_map__get_name(map);
+
+       err = bpf_map__get_private(map, (void **)&priv);
+       if (err) {
+               pr_debug("ERROR: failed to get private from map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       if (!priv || list_empty(&priv->ops_list)) {
+               pr_debug("INFO: nothing to config for map %s\n", name);
+               return 0;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("ERROR: failed to get definition from map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       map_fd = bpf_map__get_fd(map);
+       if (map_fd < 0) {
+               pr_debug("ERROR: failed to get fd from map %s\n", name);
+               return map_fd;
+       }
+
+       list_for_each_entry(op, &priv->ops_list, list) {
+               switch (def.type) {
+               case BPF_MAP_TYPE_ARRAY:
+               case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+                       switch (op->key_type) {
+                       case BPF_MAP_KEY_ALL:
+                               err = foreach_key_array_all(func, arg, name,
+                                                           map_fd, &def, op);
+                               break;
+                       case BPF_MAP_KEY_RANGES:
+                               err = foreach_key_array_ranges(func, arg, name,
+                                                              map_fd, &def,
+                                                              op);
+                               break;
+                       default:
+                               pr_debug("ERROR: keytype for map '%s' invalid\n",
+                                        name);
+                               return -BPF_LOADER_ERRNO__INTERNAL;
+                       }
+                       if (err)
+                               return err;
+                       break;
+               default:
+                       pr_debug("ERROR: type of '%s' incorrect\n", name);
+                       return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+               }
+       }
+
+       return 0;
+}
+
+static int
+apply_config_value_for_key(int map_fd, void *pkey,
+                          size_t val_size, u64 val)
+{
+       int err = 0;
+
+       switch (val_size) {
+       case 1: {
+               u8 _val = (u8)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 2: {
+               u16 _val = (u16)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 4: {
+               u32 _val = (u32)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 8: {
+               err = bpf_map_update_elem(map_fd, pkey, &val, BPF_ANY);
+               break;
+       }
+       default:
+               pr_debug("ERROR: invalid value size\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE;
+       }
+       if (err && errno)
+               err = -errno;
+       return err;
+}
+
+static int
+apply_config_evsel_for_key(const char *name, int map_fd, void *pkey,
+                          struct perf_evsel *evsel)
+{
+       struct xyarray *xy = evsel->fd;
+       struct perf_event_attr *attr;
+       unsigned int key, events;
+       bool check_pass = false;
+       int *evt_fd;
+       int err;
+
+       if (!xy) {
+               pr_debug("ERROR: evsel not ready for map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       if (xy->row_size / xy->entry_size != 1) {
+               pr_debug("ERROR: Dimension of target event is incorrect for map %s\n",
+                        name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM;
+       }
+
+       attr = &evsel->attr;
+       if (attr->inherit) {
+               pr_debug("ERROR: Can't put inherit event into map %s\n", name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH;
+       }
+
+       if (perf_evsel__is_bpf_output(evsel))
+               check_pass = true;
+       if (attr->type == PERF_TYPE_RAW)
+               check_pass = true;
+       if (attr->type == PERF_TYPE_HARDWARE)
+               check_pass = true;
+       if (!check_pass) {
+               pr_debug("ERROR: Event type is wrong for map %s\n", name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE;
+       }
+
+       events = xy->entries / (xy->row_size / xy->entry_size);
+       key = *((unsigned int *)pkey);
+       if (key >= events) {
+               pr_debug("ERROR: there is no event %d for map %s\n",
+                        key, name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE;
+       }
+       evt_fd = xyarray__entry(xy, key, 0);
+       err = bpf_map_update_elem(map_fd, pkey, evt_fd, BPF_ANY);
+       if (err && errno)
+               err = -errno;
+       return err;
+}
+
+static int
+apply_obj_config_map_for_key(const char *name, int map_fd,
+                            struct bpf_map_def *pdef __maybe_unused,
+                            struct bpf_map_op *op,
+                            void *pkey, void *arg __maybe_unused)
+{
+       int err;
+
+       switch (op->op_type) {
+       case BPF_MAP_OP_SET_VALUE:
+               err = apply_config_value_for_key(map_fd, pkey,
+                                                pdef->value_size,
+                                                op->v.value);
+               break;
+       case BPF_MAP_OP_SET_EVSEL:
+               err = apply_config_evsel_for_key(name, map_fd, pkey,
+                                                op->v.evsel);
+               break;
+       default:
+               pr_debug("ERROR: unknown value type for '%s'\n", name);
+               err = -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       return err;
+}
+
+static int
+apply_obj_config_map(struct bpf_map *map)
+{
+       return bpf_map_config_foreach_key(map,
+                                         apply_obj_config_map_for_key,
+                                         NULL);
+}
+
+static int
+apply_obj_config_object(struct bpf_object *obj)
+{
+       struct bpf_map *map;
+       int err;
+
+       bpf_map__for_each(map, obj) {
+               err = apply_obj_config_map(map);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
+int bpf__apply_obj_config(void)
+{
+       struct bpf_object *obj, *tmp;
+       int err;
+
+       bpf_object__for_each_safe(obj, tmp) {
+               err = apply_obj_config_object(obj);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
 #define ERRNO_OFFSET(e)                ((e) - __BPF_LOADER_ERRNO__START)
 #define ERRCODE_OFFSET(c)      ERRNO_OFFSET(BPF_LOADER_ERRNO__##c)
 #define NR_ERRNO       (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START)
@@ -753,6 +1431,20 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = {
        [ERRCODE_OFFSET(PROLOGUE)]      = "Failed to generate prologue",
        [ERRCODE_OFFSET(PROLOGUE2BIG)]  = "Prologue too big for program",
        [ERRCODE_OFFSET(PROLOGUEOOB)]   = "Offset out of bound for prologue",
+       [ERRCODE_OFFSET(OBJCONF_OPT)]   = "Invalid object config option",
+       [ERRCODE_OFFSET(OBJCONF_CONF)]  = "Config value not set (missing '=')",
+       [ERRCODE_OFFSET(OBJCONF_MAP_OPT)]       = "Invalid object map config option",
+       [ERRCODE_OFFSET(OBJCONF_MAP_NOTEXIST)]  = "Target map doesn't exist",
+       [ERRCODE_OFFSET(OBJCONF_MAP_VALUE)]     = "Incorrect value type for map",
+       [ERRCODE_OFFSET(OBJCONF_MAP_TYPE)]      = "Incorrect map type",
+       [ERRCODE_OFFSET(OBJCONF_MAP_KEYSIZE)]   = "Incorrect map key size",
+       [ERRCODE_OFFSET(OBJCONF_MAP_VALUESIZE)] = "Incorrect map value size",
+       [ERRCODE_OFFSET(OBJCONF_MAP_NOEVT)]     = "Event not found for map setting",
+       [ERRCODE_OFFSET(OBJCONF_MAP_MAPSIZE)]   = "Invalid map size for event setting",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTDIM)]    = "Event dimension too large",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTINH)]    = "Doesn't support inherit event",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTTYPE)]   = "Wrong event type for map",
+       [ERRCODE_OFFSET(OBJCONF_MAP_IDX2BIG)]   = "Index too large",
 };
 
 static int
@@ -872,3 +1564,29 @@ int bpf__strerror_load(struct bpf_object *obj,
        bpf__strerror_end(buf, size);
        return 0;
 }
+
+int bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused,
+                            struct parse_events_term *term __maybe_unused,
+                            struct perf_evlist *evlist __maybe_unused,
+                            int *error_pos __maybe_unused, int err,
+                            char *buf, size_t size)
+{
+       bpf__strerror_head(err, buf, size);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE,
+                           "Can't use this config term with this map type");
+       bpf__strerror_end(buf, size);
+       return 0;
+}
+
+int bpf__strerror_apply_obj_config(int err, char *buf, size_t size)
+{
+       bpf__strerror_head(err, buf, size);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM,
+                           "Cannot set event to BPF map in multi-thread tracing");
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH,
+                           "%s (Hint: use -i to turn off inherit)", emsg);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE,
+                           "Can only put raw, hardware and BPF output event into a BPF map");
+       bpf__strerror_end(buf, size);
+       return 0;
+}
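
bpf__obj_config_map() above accepts config terms of the form "map:<mapname>.<config opt>" and splits them at the first '.' before dispatching on the option name ("value" or "event"). The standalone sketch below shows just that splitting step, assuming the caller has already matched the "map:" prefix; the map name "channel" and the simplified error handling are illustrative, not the actual perf code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Split "map:<mapname>.<config opt>" into its two parts. */
static int split_map_term(const char *config, char **map_name, char **map_opt)
{
        char *name = strdup(config + strlen("map:"));
        char *opt;

        if (!name)
                return -1;

        opt = strchr(name, '.');
        if (!opt || opt[1] == '\0') {   /* no '.', or nothing after it */
                free(name);
                return -1;
        }

        *opt++ = '\0';  /* terminate the map name, point at the option */
        *map_name = name;
        *map_opt = opt;
        return 0;
}

int main(void)
{
        char *name, *opt;

        if (split_map_term("map:channel.event", &name, &opt) == 0) {
                printf("map '%s', option '%s'\n", name, opt);   /* map 'channel', option 'event' */
                free(name);
        }
        return 0;
}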
index 6fdc0457e2b66ea3fedd8e26c4a0e8f6a4bcc6e2..be4311944e3daa2abc87cdd54be72ed7bce70682 100644 (file)
@@ -10,6 +10,7 @@
 #include <string.h>
 #include <bpf/libbpf.h>
 #include "probe-event.h"
+#include "evlist.h"
 #include "debug.h"
 
 enum bpf_loader_errno {
@@ -24,10 +25,25 @@ enum bpf_loader_errno {
        BPF_LOADER_ERRNO__PROLOGUE,     /* Failed to generate prologue */
        BPF_LOADER_ERRNO__PROLOGUE2BIG, /* Prologue too big for program */
        BPF_LOADER_ERRNO__PROLOGUEOOB,  /* Offset out of bound for prologue */
+       BPF_LOADER_ERRNO__OBJCONF_OPT,  /* Invalid object config option */
+       BPF_LOADER_ERRNO__OBJCONF_CONF, /* Config value not set (missing '=') */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_OPT,      /* Invalid object map config option */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST, /* Target map doesn't exist */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE,    /* Incorrect value type for map */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE,     /* Incorrect map type */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE,  /* Incorrect map key size */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE,/* Incorrect map value size */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT,    /* Event not found for map setting */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE,  /* Invalid map size for event setting */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM,   /* Event dimension too large */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH,   /* Doesn't support inherit event */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE,  /* Wrong event type for map */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG,  /* Index too large */
        __BPF_LOADER_ERRNO__END,
 };
 
 struct bpf_object;
+struct parse_events_term;
 #define PERF_BPF_PROBE_GROUP "perf_bpf_probe"
 
 typedef int (*bpf_prog_iter_callback_t)(struct probe_trace_event *tev,
@@ -53,6 +69,16 @@ int bpf__strerror_load(struct bpf_object *obj, int err,
                       char *buf, size_t size);
 int bpf__foreach_tev(struct bpf_object *obj,
                     bpf_prog_iter_callback_t func, void *arg);
+
+int bpf__config_obj(struct bpf_object *obj, struct parse_events_term *term,
+                   struct perf_evlist *evlist, int *error_pos);
+int bpf__strerror_config_obj(struct bpf_object *obj,
+                            struct parse_events_term *term,
+                            struct perf_evlist *evlist,
+                            int *error_pos, int err, char *buf,
+                            size_t size);
+int bpf__apply_obj_config(void);
+int bpf__strerror_apply_obj_config(int err, char *buf, size_t size);
 #else
 static inline struct bpf_object *
 bpf__prepare_load(const char *filename __maybe_unused,
@@ -83,6 +109,21 @@ bpf__foreach_tev(struct bpf_object *obj __maybe_unused,
        return 0;
 }
 
+static inline int
+bpf__config_obj(struct bpf_object *obj __maybe_unused,
+               struct parse_events_term *term __maybe_unused,
+               struct perf_evlist *evlist __maybe_unused,
+               int *error_pos __maybe_unused)
+{
+       return 0;
+}
+
+static inline int
+bpf__apply_obj_config(void)
+{
+       return 0;
+}
+
 static inline int
 __bpf_strerror(char *buf, size_t size)
 {
@@ -118,5 +159,23 @@ static inline int bpf__strerror_load(struct bpf_object *obj __maybe_unused,
 {
        return __bpf_strerror(buf, size);
 }
+
+static inline int
+bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused,
+                        struct parse_events_term *term __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused,
+                        int *error_pos __maybe_unused,
+                        int err __maybe_unused,
+                        char *buf, size_t size)
+{
+       return __bpf_strerror(buf, size);
+}
+
+static inline int
+bpf__strerror_apply_obj_config(int err __maybe_unused,
+                              char *buf, size_t size)
+{
+       return __bpf_strerror(buf, size);
+}
 #endif
 #endif
index 6a7e273a514a642b30a477c3119696dc7fa09975..f1479eeef7daf9ae2c386abcef7d079a0370c836 100644 (file)
@@ -166,6 +166,50 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
        return build_id__filename(build_id_hex, bf, size);
 }
 
+bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size)
+{
+       char *id_name, *ch;
+       struct stat sb;
+
+       id_name = dso__build_id_filename(dso, bf, size);
+       if (!id_name)
+               goto err;
+       if (access(id_name, F_OK))
+               goto err;
+       if (lstat(id_name, &sb) == -1)
+               goto err;
+       if ((size_t)sb.st_size > size - 1)
+               goto err;
+       if (readlink(id_name, bf, size - 1) < 0)
+               goto err;
+
+       bf[sb.st_size] = '\0';
+
+       /*
+        * link should be:
+        * ../../lib/modules/4.4.0-rc4/kernel/net/ipv4/netfilter/nf_nat_ipv4.ko/a09fe3eb3147dafa4e3b31dbd6257e4d696bdc92
+        */
+       ch = strrchr(bf, '/');
+       if (!ch)
+               goto err;
+       if (ch - 3 < bf)
+               goto err;
+
+       return strncmp(".ko", ch - 3, 3) == 0;
+err:
+       /*
+        * If dso__build_id_filename() worked, regenerate id_name, because
+        * id_name points to bf, which has since been overwritten.
+        */
+       if (id_name)
+               id_name = dso__build_id_filename(dso, bf, size);
+       pr_err("Invalid build id: %s\n", id_name ? :
+                                        dso->long_name ? :
+                                        dso->short_name ? :
+                                        "[unknown]");
+       return false;
+}
+
 #define dsos__for_each_with_build_id(pos, head)        \
        list_for_each_entry(pos, head, node)    \
                if (!pos->has_build_id)         \
@@ -211,6 +255,7 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
        dsos__for_each_with_build_id(pos, &machine->dsos.head) {
                const char *name;
                size_t name_len;
+               bool in_kernel = false;
 
                if (!pos->hit)
                        continue;
@@ -227,8 +272,11 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
                        name_len = pos->long_name_len + 1;
                }
 
+               in_kernel = pos->kernel ||
+                               is_kernel_module(name,
+                                       PERF_RECORD_MISC_CPUMODE_UNKNOWN);
                err = write_buildid(name, name_len, pos->build_id, machine->pid,
-                                   pos->kernel ? kmisc : umisc, fd);
+                                   in_kernel ? kmisc : umisc, fd);
                if (err)
                        break;
        }
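
dso__build_id_is_kmod() above resolves the build-id symlink and treats the DSO as a kernel module when the path component just before the final '/' ends in ".ko" (the final component is the build-id hash itself). The sketch below isolates that string test; the filesystem access is omitted, and the sample paths are illustrative, the first one taken from the comment in the function.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * The build-id link for a module looks like ".../<module>.ko/<build-id hash>",
 * so the three characters just before the final '/' tell us whether this
 * DSO is a kernel module.
 */
static bool path_component_is_kmod(const char *path)
{
        const char *ch = strrchr(path, '/');

        if (!ch || ch - 3 < path)
                return false;

        return strncmp(".ko", ch - 3, 3) == 0;
}

int main(void)
{
        const char *mod = "../../lib/modules/4.4.0-rc4/kernel/net/ipv4/netfilter/nf_nat_ipv4.ko/a09fe3eb3147dafa4e3b31dbd6257e4d696bdc92";
        const char *bin = "../../usr/bin/perf/0123456789abcdef";

        printf("%d\n", path_component_is_kmod(mod));    /* prints 1 */
        printf("%d\n", path_component_is_kmod(bin));    /* prints 0 */
        return 0;
}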
index 27a14a8a945beb8eec9cc389ca7d6545b44a2ac4..64af3e20610d7718ecacfad3810fc4bff4ee882d 100644 (file)
@@ -16,6 +16,7 @@ int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id);
 int filename__sprintf_build_id(const char *pathname, char *sbuild_id);
 
 char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size);
+bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size);
 
 int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event,
                           struct perf_sample *sample, struct perf_evsel *evsel,
index 07b5d63947b11ec5f70029c6e41266e56b2bd8aa..3ca453f0c51f6f5b269f84270e78132e45f9768c 100644 (file)
@@ -23,6 +23,8 @@
 #define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
 #define PERF_PAGER_ENVIRONMENT "PERF_PAGER"
 
+extern const char *config_exclusive_filename;
+
 typedef int (*config_fn_t)(const char *, const char *, void *);
 extern int perf_default_config(const char *, const char *, void *);
 extern int perf_config(config_fn_t fn, void *);
@@ -31,6 +33,7 @@ extern u64 perf_config_u64(const char *, const char *);
 extern int perf_config_bool(const char *, const char *);
 extern int config_error_nonbool(const char *);
 extern const char *perf_config_dirname(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
 
 char *alias_lookup(const char *alias);
 int split_cmdline(char *cmdline, const char ***argv);
index 53c43eb9489e4ba4a4e56a795bc05c2b85fb4cbc..24b4bd0d77545e7bb9f95e83222eb103c92f5151 100644 (file)
@@ -416,7 +416,7 @@ create_child(struct callchain_node *parent, bool inherit_children)
 /*
  * Fill the node with callchain values
  */
-static void
+static int
 fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
 {
        struct callchain_cursor_node *cursor_node;
@@ -433,7 +433,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
                call = zalloc(sizeof(*call));
                if (!call) {
                        perror("not enough memory for the code path tree");
-                       return;
+                       return -1;
                }
                call->ip = cursor_node->ip;
                call->ms.sym = cursor_node->sym;
@@ -443,6 +443,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
                callchain_cursor_advance(cursor);
                cursor_node = callchain_cursor_current(cursor);
        }
+       return 0;
 }
 
 static struct callchain_node *
@@ -453,7 +454,19 @@ add_child(struct callchain_node *parent,
        struct callchain_node *new;
 
        new = create_child(parent, false);
-       fill_node(new, cursor);
+       if (new == NULL)
+               return NULL;
+
+       if (fill_node(new, cursor) < 0) {
+               struct callchain_list *call, *tmp;
+
+               list_for_each_entry_safe(call, tmp, &new->val, list) {
+                       list_del(&call->list);
+                       free(call);
+               }
+               free(new);
+               return NULL;
+       }
 
        new->children_hit = 0;
        new->hit = period;
@@ -462,16 +475,32 @@ add_child(struct callchain_node *parent,
        return new;
 }
 
-static s64 match_chain(struct callchain_cursor_node *node,
-                     struct callchain_list *cnode)
+enum match_result {
+       MATCH_ERROR  = -1,
+       MATCH_EQ,
+       MATCH_LT,
+       MATCH_GT,
+};
+
+static enum match_result match_chain(struct callchain_cursor_node *node,
+                                    struct callchain_list *cnode)
 {
        struct symbol *sym = node->sym;
+       u64 left, right;
 
        if (cnode->ms.sym && sym &&
-           callchain_param.key == CCKEY_FUNCTION)
-               return cnode->ms.sym->start - sym->start;
-       else
-               return cnode->ip - node->ip;
+           callchain_param.key == CCKEY_FUNCTION) {
+               left = cnode->ms.sym->start;
+               right = sym->start;
+       } else {
+               left = cnode->ip;
+               right = node->ip;
+       }
+
+       if (left == right)
+               return MATCH_EQ;
+
+       return left > right ? MATCH_GT : MATCH_LT;
 }
 
 /*
@@ -479,7 +508,7 @@ static s64 match_chain(struct callchain_cursor_node *node,
  * give a part of its callchain to the created child.
  * Then create another child to host the given callchain of new branch
  */
-static void
+static int
 split_add_child(struct callchain_node *parent,
                struct callchain_cursor *cursor,
                struct callchain_list *to_split,
@@ -491,6 +520,8 @@ split_add_child(struct callchain_node *parent,
 
        /* split */
        new = create_child(parent, true);
+       if (new == NULL)
+               return -1;
 
        /* split the callchain and move a part to the new child */
        old_tail = parent->val.prev;
@@ -524,6 +555,8 @@ split_add_child(struct callchain_node *parent,
 
                node = callchain_cursor_current(cursor);
                new = add_child(parent, cursor, period);
+               if (new == NULL)
+                       return -1;
 
                /*
                 * This is second child since we moved parent's children
@@ -534,7 +567,7 @@ split_add_child(struct callchain_node *parent,
                cnode = list_first_entry(&first->val, struct callchain_list,
                                         list);
 
-               if (match_chain(node, cnode) < 0)
+               if (match_chain(node, cnode) == MATCH_LT)
                        pp = &p->rb_left;
                else
                        pp = &p->rb_right;
@@ -545,14 +578,15 @@ split_add_child(struct callchain_node *parent,
                parent->hit = period;
                parent->count = 1;
        }
+       return 0;
 }
 
-static int
+static enum match_result
 append_chain(struct callchain_node *root,
             struct callchain_cursor *cursor,
             u64 period);
 
-static void
+static int
 append_chain_children(struct callchain_node *root,
                      struct callchain_cursor *cursor,
                      u64 period)
@@ -564,36 +598,42 @@ append_chain_children(struct callchain_node *root,
 
        node = callchain_cursor_current(cursor);
        if (!node)
-               return;
+               return -1;
 
        /* lookup in children */
        while (*p) {
-               s64 ret;
+               enum match_result ret;
 
                parent = *p;
                rnode = rb_entry(parent, struct callchain_node, rb_node_in);
 
                /* If at least the first entry matches, relay to the children */
                ret = append_chain(rnode, cursor, period);
-               if (ret == 0)
+               if (ret == MATCH_EQ)
                        goto inc_children_hit;
+               if (ret == MATCH_ERROR)
+                       return -1;
 
-               if (ret < 0)
+               if (ret == MATCH_LT)
                        p = &parent->rb_left;
                else
                        p = &parent->rb_right;
        }
        /* nothing in children, add to the current node */
        rnode = add_child(root, cursor, period);
+       if (rnode == NULL)
+               return -1;
+
        rb_link_node(&rnode->rb_node_in, parent, p);
        rb_insert_color(&rnode->rb_node_in, &root->rb_root_in);
 
 inc_children_hit:
        root->children_hit += period;
        root->children_count++;
+       return 0;
 }
 
-static int
+static enum match_result
 append_chain(struct callchain_node *root,
             struct callchain_cursor *cursor,
             u64 period)
@@ -602,7 +642,7 @@ append_chain(struct callchain_node *root,
        u64 start = cursor->pos;
        bool found = false;
        u64 matches;
-       int cmp = 0;
+       enum match_result cmp = MATCH_ERROR;
 
        /*
         * Lookup in the current node
@@ -618,7 +658,7 @@ append_chain(struct callchain_node *root,
                        break;
 
                cmp = match_chain(node, cnode);
-               if (cmp)
+               if (cmp != MATCH_EQ)
                        break;
 
                found = true;
@@ -628,7 +668,7 @@ append_chain(struct callchain_node *root,
 
        /* no match: relay the comparison result to the parent */
        if (!found) {
-               WARN_ONCE(!cmp, "Chain comparison error\n");
+               WARN_ONCE(cmp == MATCH_ERROR, "Chain comparison error\n");
                return cmp;
        }
 
@@ -636,21 +676,25 @@ append_chain(struct callchain_node *root,
 
        /* we match only a part of the node. Split it and add the new chain */
        if (matches < root->val_nr) {
-               split_add_child(root, cursor, cnode, start, matches, period);
-               return 0;
+               if (split_add_child(root, cursor, cnode, start, matches,
+                                   period) < 0)
+                       return MATCH_ERROR;
+
+               return MATCH_EQ;
        }
 
        /* we match 100% of the path, increment the hit */
        if (matches == root->val_nr && cursor->pos == cursor->nr) {
                root->hit += period;
                root->count++;
-               return 0;
+               return MATCH_EQ;
        }
 
        /* We match the node and still have a part remaining */
-       append_chain_children(root, cursor, period);
+       if (append_chain_children(root, cursor, period) < 0)
+               return MATCH_ERROR;
 
-       return 0;
+       return MATCH_EQ;
 }
 
 int callchain_append(struct callchain_root *root,
@@ -662,7 +706,8 @@ int callchain_append(struct callchain_root *root,
 
        callchain_cursor_commit(cursor);
 
-       append_chain_children(&root->node, cursor, period);
+       if (append_chain_children(&root->node, cursor, period) < 0)
+               return -1;
 
        if (cursor->nr > root->max_depth)
                root->max_depth = cursor->nr;
@@ -690,7 +735,8 @@ merge_chain_branch(struct callchain_cursor *cursor,
 
        if (src->hit) {
                callchain_cursor_commit(cursor);
-               append_chain_children(dst, cursor, src->hit);
+               if (append_chain_children(dst, cursor, src->hit) < 0)
+                       return -1;
        }
 
        n = rb_first(&src->rb_root_in);
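
The callchain.c changes above replace the old signed-delta compare with a three-valued result so that allocation failures can propagate instead of being silently dropped. A minimal, self-contained sketch of the same pattern (hypothetical key type and helper names, not perf's actual structures):

#include <stdio.h>

enum match_result { MATCH_ERROR = -1, MATCH_EQ, MATCH_LT, MATCH_GT };

/* Three-way compare in the style of the new match_chain(): ordering is
 * reported explicitly, leaving the negative value free to mean "error". */
static enum match_result cmp_key(unsigned long long left, unsigned long long right)
{
	if (left == right)
		return MATCH_EQ;
	return left > right ? MATCH_GT : MATCH_LT;
}

int main(void)
{
	/* MATCH_LT steers an rb-tree descent left, MATCH_GT right, MATCH_EQ
	 * ends it, and MATCH_ERROR aborts it - the protocol that
	 * append_chain_children() now follows. */
	printf("%d %d %d\n", cmp_key(1, 2), cmp_key(2, 2), cmp_key(3, 2));
	return 0;
}
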
index e5fb88bab9e1c416a3c53fe5036f8eb57f64a55d..43e84aa27e4a6489eafdebc3636d8bfb33334e3f 100644 (file)
@@ -32,14 +32,15 @@ int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
        return 0;
 }
 
-int perf_color_default_config(const char *var, const char *value, void *cb)
+int perf_color_default_config(const char *var, const char *value,
+                             void *cb __maybe_unused)
 {
        if (!strcmp(var, "color.ui")) {
                perf_use_color_default = perf_config_colorbool(var, value, -1);
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int __color_vsnprintf(char *bf, size_t size, const char *color,
index d3e12e30e1d520f8073d1f01d17e1eaf4ac30254..4e727635476eadf5b105a7be4620f86a4bf46499 100644 (file)
@@ -26,7 +26,7 @@ static const char *config_file_name;
 static int config_linenr;
 static int config_file_eof;
 
-static const char *config_exclusive_filename;
+const char *config_exclusive_filename;
 
 static int get_next_char(void)
 {
@@ -434,7 +434,7 @@ static int perf_config_from_file(config_fn_t fn, const char *filename, void *dat
        return ret;
 }
 
-static const char *perf_etc_perfconfig(void)
+const char *perf_etc_perfconfig(void)
 {
        static const char *system_wide;
        if (!system_wide)
index fa935093a599429011fa214c24645fd0230e3b3a..9bcf2bed3a6d1b7369ee4deee7f38e9c4abab06d 100644 (file)
@@ -8,6 +8,10 @@
 #include <linux/bitmap.h>
 #include "asm/bug.h"
 
+static int max_cpu_num;
+static int max_node_num;
+static int *cpunode_map;
+
 static struct cpu_map *cpu_map__default_new(void)
 {
        struct cpu_map *cpus;
@@ -486,6 +490,32 @@ out:
                pr_err("Failed to read max nodes, using default of %d\n", max_node_num);
 }
 
+int cpu__max_node(void)
+{
+       if (unlikely(!max_node_num))
+               set_max_node_num();
+
+       return max_node_num;
+}
+
+int cpu__max_cpu(void)
+{
+       if (unlikely(!max_cpu_num))
+               set_max_cpu_num();
+
+       return max_cpu_num;
+}
+
+int cpu__get_node(int cpu)
+{
+       if (unlikely(cpunode_map == NULL)) {
+               pr_debug("cpu_map not initialized\n");
+               return -1;
+       }
+
+       return cpunode_map[cpu];
+}
+
 static int init_cpunode_map(void)
 {
        int i;
index 71c41b9efabb3b38dd17a7374413985dfd944f77..81a2562aaa2b02261b88c960997238dc9e0925ab 100644 (file)
@@ -57,37 +57,11 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
        return map ? map->map[0] == -1 : true;
 }
 
-int max_cpu_num;
-int max_node_num;
-int *cpunode_map;
-
 int cpu__setup_cpunode_map(void);
 
-static inline int cpu__max_node(void)
-{
-       if (unlikely(!max_node_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_node_num;
-}
-
-static inline int cpu__max_cpu(void)
-{
-       if (unlikely(!max_cpu_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_cpu_num;
-}
-
-static inline int cpu__get_node(int cpu)
-{
-       if (unlikely(cpunode_map == NULL)) {
-               pr_debug("cpu_map not initialized\n");
-               return -1;
-       }
-
-       return cpunode_map[cpu];
-}
+int cpu__max_node(void);
+int cpu__max_cpu(void);
+int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
                       int (*f)(struct cpu_map *map, int cpu, void *data),
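
With the inline helpers above turned into real functions, callers no longer need to care when the limits get initialised. A usage sketch (illustrative only; dump_cpu_topology() is a hypothetical helper, it assumes perf's internal headers, and cpu__get_node() still returns -1 until cpu__setup_cpunode_map() has run, as the code shows):

#include <stdio.h>
#include "util/cpumap.h"	/* perf-internal header */

static void dump_cpu_topology(void)
{
	int cpu;

	/* cpu__max_cpu() and cpu__max_node() now initialise themselves
	 * lazily on first use instead of reading pre-populated globals. */
	for (cpu = 0; cpu < cpu__max_cpu(); cpu++)
		printf("cpu %d -> node %d (of %d nodes)\n",
		       cpu, cpu__get_node(cpu), cpu__max_node());
}
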
index aada3ac5e891f85be1f4d25791d4ee17dd859077..d4a5a21c2a7e2e47596444665b5f5828874da337 100644 (file)
@@ -31,9 +31,18 @@ unsigned char sane_ctype[256] = {
 };
 
 const char *graph_line =
+       "_____________________________________________________________________"
        "_____________________________________________________________________"
        "_____________________________________________________________________";
 const char *graph_dotted_line =
        "---------------------------------------------------------------------"
        "---------------------------------------------------------------------"
        "---------------------------------------------------------------------";
+const char *spaces =
+       "                                                                     "
+       "                                                                     "
+       "                                                                     ";
+const char *dots =
+       "....................................................................."
+       "....................................................................."
+       ".....................................................................";
index 34cd1e4039d35e05be0460ff0a259e7d8b2bb80a..811af89ce0bb8f37b99898dc8b2a2d5f6e183d26 100644 (file)
@@ -352,6 +352,84 @@ static int add_tracepoint_values(struct ctf_writer *cw,
        return ret;
 }
 
+static int
+add_bpf_output_values(struct bt_ctf_event_class *event_class,
+                     struct bt_ctf_event *event,
+                     struct perf_sample *sample)
+{
+       struct bt_ctf_field_type *len_type, *seq_type;
+       struct bt_ctf_field *len_field, *seq_field;
+       unsigned int raw_size = sample->raw_size;
+       unsigned int nr_elements = raw_size / sizeof(u32);
+       unsigned int i;
+       int ret;
+
+       if (nr_elements * sizeof(u32) != raw_size)
+               pr_warning("Incorrect raw_size (%u) in bpf output event, skip %lu bytes\n",
+                          raw_size, raw_size - nr_elements * sizeof(u32));
+
+       len_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_len");
+       len_field = bt_ctf_field_create(len_type);
+       if (!len_field) {
+               pr_err("failed to create 'raw_len' for bpf output event\n");
+               ret = -1;
+               goto put_len_type;
+       }
+
+       ret = bt_ctf_field_unsigned_integer_set_value(len_field, nr_elements);
+       if (ret) {
+               pr_err("failed to set field value for raw_len\n");
+               goto put_len_field;
+       }
+       ret = bt_ctf_event_set_payload(event, "raw_len", len_field);
+       if (ret) {
+               pr_err("failed to set payload to raw_len\n");
+               goto put_len_field;
+       }
+
+       seq_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_data");
+       seq_field = bt_ctf_field_create(seq_type);
+       if (!seq_field) {
+               pr_err("failed to create 'raw_data' for bpf output event\n");
+               ret = -1;
+               goto put_seq_type;
+       }
+
+       ret = bt_ctf_field_sequence_set_length(seq_field, len_field);
+       if (ret) {
+               pr_err("failed to set length of 'raw_data'\n");
+               goto put_seq_field;
+       }
+
+       for (i = 0; i < nr_elements; i++) {
+               struct bt_ctf_field *elem_field =
+                       bt_ctf_field_sequence_get_field(seq_field, i);
+
+               ret = bt_ctf_field_unsigned_integer_set_value(elem_field,
+                               ((u32 *)(sample->raw_data))[i]);
+
+               bt_ctf_field_put(elem_field);
+               if (ret) {
+                       pr_err("failed to set raw_data[%d]\n", i);
+                       goto put_seq_field;
+               }
+       }
+
+       ret = bt_ctf_event_set_payload(event, "raw_data", seq_field);
+       if (ret)
+               pr_err("failed to set payload for raw_data\n");
+
+put_seq_field:
+       bt_ctf_field_put(seq_field);
+put_seq_type:
+       bt_ctf_field_type_put(seq_type);
+put_len_field:
+       bt_ctf_field_put(len_field);
+put_len_type:
+       bt_ctf_field_type_put(len_type);
+       return ret;
+}
+
 static int add_generic_values(struct ctf_writer *cw,
                              struct bt_ctf_event *event,
                              struct perf_evsel *evsel,
@@ -597,6 +675,12 @@ static int process_sample_event(struct perf_tool *tool,
                        return -1;
        }
 
+       if (perf_evsel__is_bpf_output(evsel)) {
+               ret = add_bpf_output_values(event_class, event, sample);
+               if (ret)
+                       return -1;
+       }
+
        cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel));
        if (cs) {
                if (is_flush_needed(cs))
@@ -744,6 +828,25 @@ static int add_tracepoint_types(struct ctf_writer *cw,
        return ret;
 }
 
+static int add_bpf_output_types(struct ctf_writer *cw,
+                               struct bt_ctf_event_class *class)
+{
+       struct bt_ctf_field_type *len_type = cw->data.u32;
+       struct bt_ctf_field_type *seq_base_type = cw->data.u32_hex;
+       struct bt_ctf_field_type *seq_type;
+       int ret;
+
+       ret = bt_ctf_event_class_add_field(class, len_type, "raw_len");
+       if (ret)
+               return ret;
+
+       seq_type = bt_ctf_field_type_sequence_create(seq_base_type, "raw_len");
+       if (!seq_type)
+               return -1;
+
+       return bt_ctf_event_class_add_field(class, seq_type, "raw_data");
+}
+
 static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
                             struct bt_ctf_event_class *event_class)
 {
@@ -755,7 +858,8 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
         *                              ctf event header
         *   PERF_SAMPLE_READ         - TODO
         *   PERF_SAMPLE_CALLCHAIN    - TODO
-        *   PERF_SAMPLE_RAW          - tracepoint fields are handled separately
+        *   PERF_SAMPLE_RAW          - tracepoint fields and BPF output
+        *                              are handled separately
         *   PERF_SAMPLE_BRANCH_STACK - TODO
         *   PERF_SAMPLE_REGS_USER    - TODO
         *   PERF_SAMPLE_STACK_USER   - TODO
@@ -824,6 +928,12 @@ static int add_event(struct ctf_writer *cw, struct perf_evsel *evsel)
                        goto err;
        }
 
+       if (perf_evsel__is_bpf_output(evsel)) {
+               ret = add_bpf_output_types(cw, event_class);
+               if (ret)
+                       goto err;
+       }
+
        ret = bt_ctf_stream_class_add_event_class(cw->stream_class, event_class);
        if (ret) {
                pr("Failed to add event class into stream.\n");
@@ -858,6 +968,23 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session)
        return 0;
 }
 
+static void cleanup_events(struct perf_session *session)
+{
+       struct perf_evlist *evlist = session->evlist;
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               struct evsel_priv *priv;
+
+               priv = evsel->priv;
+               bt_ctf_event_class_put(priv->event_class);
+               zfree(&evsel->priv);
+       }
+
+       perf_evlist__delete(evlist);
+       session->evlist = NULL;
+}
+
 static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
 {
        struct ctf_stream **stream;
@@ -953,6 +1080,12 @@ static struct bt_ctf_field_type *create_int_type(int size, bool sign, bool hex)
            bt_ctf_field_type_integer_set_base(type, BT_CTF_INTEGER_BASE_HEXADECIMAL))
                goto err;
 
+#if __BYTE_ORDER == __BIG_ENDIAN
+       bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_BIG_ENDIAN);
+#else
+       bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_LITTLE_ENDIAN);
+#endif
+
        pr2("Created type: INTEGER %d-bit %ssigned %s\n",
            size, sign ? "un" : "", hex ? "hex" : "");
        return type;
@@ -1100,7 +1233,7 @@ static int convert__config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 int bt_convert__perf2ctf(const char *input, const char *path, bool force)
@@ -1171,6 +1304,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
                (double) c.events_size / 1024.0 / 1024.0,
                c.events_count);
 
+       cleanup_events(session);
        perf_session__delete(session);
        ctf_writer__cleanup(cw);
 
index 86d9c73025983d0132a4dddc8607161d713bfbf2..8c4212abd19b48b9e84ca1f9978f06561c05576a 100644 (file)
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <api/debug.h>
 
 #include "cache.h"
 #include "color.h"
@@ -22,7 +23,7 @@ int debug_ordered_events;
 static int redirect_to_stderr;
 int debug_data_convert;
 
-static int _eprintf(int level, int var, const char *fmt, va_list args)
+int veprintf(int level, int var, const char *fmt, va_list args)
 {
        int ret = 0;
 
@@ -36,24 +37,19 @@ static int _eprintf(int level, int var, const char *fmt, va_list args)
        return ret;
 }
 
-int veprintf(int level, int var, const char *fmt, va_list args)
-{
-       return _eprintf(level, var, fmt, args);
-}
-
 int eprintf(int level, int var, const char *fmt, ...)
 {
        va_list args;
        int ret;
 
        va_start(args, fmt);
-       ret = _eprintf(level, var, fmt, args);
+       ret = veprintf(level, var, fmt, args);
        va_end(args);
 
        return ret;
 }
 
-static int __eprintf_time(u64 t, const char *fmt, va_list args)
+static int veprintf_time(u64 t, const char *fmt, va_list args)
 {
        int ret = 0;
        u64 secs, usecs, nsecs = t;
@@ -75,7 +71,7 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...)
 
        if (var >= level) {
                va_start(args, fmt);
-               ret = __eprintf_time(t, fmt, args);
+               ret = veprintf_time(t, fmt, args);
                va_end(args);
        }
 
@@ -91,7 +87,7 @@ void pr_stat(const char *fmt, ...)
        va_list args;
 
        va_start(args, fmt);
-       _eprintf(1, verbose, fmt, args);
+       veprintf(1, verbose, fmt, args);
        va_end(args);
        eprintf(1, verbose, "\n");
 }
@@ -110,40 +106,61 @@ int dump_printf(const char *fmt, ...)
        return ret;
 }
 
+static void trace_event_printer(enum binary_printer_ops op,
+                               unsigned int val, void *extra)
+{
+       const char *color = PERF_COLOR_BLUE;
+       union perf_event *event = (union perf_event *)extra;
+       unsigned char ch = (unsigned char)val;
+
+       switch (op) {
+       case BINARY_PRINT_DATA_BEGIN:
+               printf(".");
+               color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+                               event->header.size);
+               break;
+       case BINARY_PRINT_LINE_BEGIN:
+               printf(".");
+               break;
+       case BINARY_PRINT_ADDR:
+               color_fprintf(stdout, color, "  %04x: ", val);
+               break;
+       case BINARY_PRINT_NUM_DATA:
+               color_fprintf(stdout, color, " %02x", val);
+               break;
+       case BINARY_PRINT_NUM_PAD:
+               color_fprintf(stdout, color, "   ");
+               break;
+       case BINARY_PRINT_SEP:
+               color_fprintf(stdout, color, "  ");
+               break;
+       case BINARY_PRINT_CHAR_DATA:
+               color_fprintf(stdout, color, "%c",
+                             isprint(ch) ? ch : '.');
+               break;
+       case BINARY_PRINT_CHAR_PAD:
+               color_fprintf(stdout, color, " ");
+               break;
+       case BINARY_PRINT_LINE_END:
+               color_fprintf(stdout, color, "\n");
+               break;
+       case BINARY_PRINT_DATA_END:
+               printf("\n");
+               break;
+       default:
+               break;
+       }
+}
+
 void trace_event(union perf_event *event)
 {
        unsigned char *raw_event = (void *)event;
-       const char *color = PERF_COLOR_BLUE;
-       int i, j;
 
        if (!dump_trace)
                return;
 
-       printf(".");
-       color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
-                     event->header.size);
-
-       for (i = 0; i < event->header.size; i++) {
-               if ((i & 15) == 0) {
-                       printf(".");
-                       color_fprintf(stdout, color, "  %04x: ", i);
-               }
-
-               color_fprintf(stdout, color, " %02x", raw_event[i]);
-
-               if (((i & 15) == 15) || i == event->header.size-1) {
-                       color_fprintf(stdout, color, "  ");
-                       for (j = 0; j < 15-(i & 15); j++)
-                               color_fprintf(stdout, color, "   ");
-                       for (j = i & ~15; j <= i; j++) {
-                               color_fprintf(stdout, color, "%c",
-                                             isprint(raw_event[j]) ?
-                                             raw_event[j] : '.');
-                       }
-                       color_fprintf(stdout, color, "\n");
-               }
-       }
-       printf(".\n");
+       print_binary(raw_event, event->header.size, 16,
+                    trace_event_printer, event);
 }
 
 static struct debug_variable {
@@ -192,3 +209,23 @@ int perf_debug_option(const char *str)
        free(s);
        return 0;
 }
+
+#define DEBUG_WRAPPER(__n, __l)                                \
+static int pr_ ## __n ## _wrapper(const char *fmt, ...)        \
+{                                                      \
+       va_list args;                                   \
+       int ret;                                        \
+                                                       \
+       va_start(args, fmt);                            \
+       ret = veprintf(__l, verbose, fmt, args);        \
+       va_end(args);                                   \
+       return ret;                                     \
+}
+
+DEBUG_WRAPPER(warning, 0);
+DEBUG_WRAPPER(debug, 1);
+
+void perf_debug_setup(void)
+{
+       libapi_set_print(pr_warning_wrapper, pr_warning_wrapper, pr_debug_wrapper);
+}
index 8b9a088c32ab4e330ece47ae91397bdd87bdd0ee..14bafda79edaeba1bbd3e007b5faa8607d0570bf 100644 (file)
@@ -53,5 +53,6 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__(
 int veprintf(int level, int var, const char *fmt, va_list args);
 
 int perf_debug_option(const char *str);
+void perf_debug_setup(void);
 
 #endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
new file mode 100644 (file)
index 0000000..3e6062a
--- /dev/null
@@ -0,0 +1,199 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include "util.h"
+#include "debug.h"
+#include "symbol.h"
+
+#include "demangle-java.h"
+
+enum {
+       MODE_PREFIX = 0,
+       MODE_CLASS  = 1,
+       MODE_FUNC   = 2,
+       MODE_TYPE   = 3,
+       MODE_CTYPE  = 3, /* class arg */
+};
+
+#define BASE_ENT(c, n) [c - 'A']=n
+static const char *base_types['Z' - 'A' + 1] = {
+       BASE_ENT('B', "byte" ),
+       BASE_ENT('C', "char" ),
+       BASE_ENT('D', "double" ),
+       BASE_ENT('F', "float" ),
+       BASE_ENT('I', "int" ),
+       BASE_ENT('J', "long" ),
+       BASE_ENT('S', "short" ),
+       BASE_ENT('Z', "bool" ),
+};
+
+/*
+ * Demangle the Java symbol between the str and end positions, storing
+ * up to maxlen characters into buf. The parser starts in the given mode.
+ *
+ * Use MODE_PREFIX to process entire prototype till end position
+ * Use MODE_TYPE to process return type if str starts on return type char
+ *
+ *  Return:
+ *     success: buf
+ *     error  : NULL
+ */
+static char *
+__demangle_java_sym(const char *str, const char *end, char *buf, int maxlen, int mode)
+{
+       int rlen = 0;
+       int array = 0;
+       int narg = 0;
+       const char *q;
+
+       if (!end)
+               end = str + strlen(str);
+
+       for (q = str; q != end; q++) {
+
+               if (rlen == (maxlen - 1))
+                       break;
+
+               switch (*q) {
+               case 'L':
+                       if (mode == MODE_PREFIX || mode == MODE_CTYPE) {
+                               if (mode == MODE_CTYPE) {
+                                       if (narg)
+                                               rlen += scnprintf(buf + rlen, maxlen - rlen, ", ");
+                                       narg++;
+                               }
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "class ");
+                               if (mode == MODE_PREFIX)
+                                       mode = MODE_CLASS;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case 'B':
+               case 'C':
+               case 'D':
+               case 'F':
+               case 'I':
+               case 'J':
+               case 'S':
+               case 'Z':
+                       if (mode == MODE_TYPE) {
+                               if (narg)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, ", ");
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "%s", base_types[*q - 'A']);
+                               while (array--)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, "[]");
+                               array = 0;
+                               narg++;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case 'V':
+                       if (mode == MODE_TYPE) {
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "void");
+                               while (array--)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, "[]");
+                               array = 0;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case '[':
+                       if (mode != MODE_TYPE)
+                               goto error;
+                       array++;
+                       break;
+               case '(':
+                       if (mode != MODE_FUNC)
+                               goto error;
+                       buf[rlen++] = *q;
+                       mode = MODE_TYPE;
+                       break;
+               case ')':
+                       if (mode != MODE_TYPE)
+                               goto error;
+                       buf[rlen++] = *q;
+                       narg = 0;
+                       break;
+               case ';':
+                       if (mode != MODE_CLASS && mode != MODE_CTYPE)
+                               goto error;
+                       /* safe because at least one other char to process */
+                       if (isalpha(*(q + 1)))
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, ".");
+                       if (mode == MODE_CLASS)
+                               mode = MODE_FUNC;
+                       else if (mode == MODE_CTYPE)
+                               mode = MODE_TYPE;
+                       break;
+               case '/':
+                       if (mode != MODE_CLASS && mode != MODE_CTYPE)
+                               goto error;
+                       rlen += scnprintf(buf + rlen, maxlen - rlen, ".");
+                       break;
+               default :
+                       buf[rlen++] = *q;
+               }
+       }
+       buf[rlen] = '\0';
+       return buf;
+error:
+       return NULL;
+}
+
+/*
+ * Demangle Java function signature (openJDK, not GCJ)
+ * input:
+ *     str: string to parse. String is not modified
+ *    flags: combination of JAVA_DEMANGLE_* flags to modify demangling
+ * return:
+ *     if input can be demangled, then a newly allocated string is returned.
+ *     if input cannot be demangled, then NULL is returned
+ *
+ * Note: caller is responsible for freeing demangled string
+ */
+char *
+java_demangle_sym(const char *str, int flags)
+{
+       char *buf, *ptr;
+       char *p;
+       size_t len, l1 = 0;
+
+       if (!str)
+               return NULL;
+
+       /* find start of return type */
+       p = strrchr(str, ')');
+       if (!p)
+               return NULL;
+
+       /*
+        * expansion factor estimated to 3x
+        */
+       len = strlen(str) * 3 + 1;
+       buf = malloc(len);
+       if (!buf)
+               return NULL;
+
+       buf[0] = '\0';
+       if (!(flags & JAVA_DEMANGLE_NORET)) {
+               /*
+                * get return type first
+                */
+               ptr = __demangle_java_sym(p + 1, NULL, buf, len, MODE_TYPE);
+               if (!ptr)
+                       goto error;
+
+               /* add space between return type and function prototype */
+               l1 = strlen(buf);
+               buf[l1++] = ' ';
+       }
+
+       /* process function up to return type */
+       ptr = __demangle_java_sym(str, p + 1, buf + l1, len - l1, MODE_PREFIX);
+       if (!ptr)
+               goto error;
+
+       return buf;
+error:
+       free(buf);
+       return NULL;
+}
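
A usage sketch for the new demangler (illustrative only: the mangled string below is a hypothetical openJDK/JVMTI-style signature, and the commented output is what the rules above would be expected to produce):

#include <stdio.h>
#include <stdlib.h>
#include "demangle-java.h"	/* added by this series */

int main(void)
{
	/* Hypothetical mangled name: class path, method name, argument
	 * descriptors inside "()", return type after ")". */
	const char *mangled = "Ljava/util/Arrays;sort([I)V";
	char *demangled = java_demangle_sym(mangled, JAVA_DEMANGLE_NORET);

	if (demangled) {
		/* expected along the lines of "class java.util.Arrays.sort(int[])" */
		printf("%s\n", demangled);
		free(demangled);	/* caller owns the returned buffer */
	}
	return 0;
}
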
diff --git a/tools/perf/util/demangle-java.h b/tools/perf/util/demangle-java.h
new file mode 100644 (file)
index 0000000..a981c1f
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __PERF_DEMANGLE_JAVA
+#define __PERF_DEMANGLE_JAVA 1
+/*
+ * demangle function flags
+ */
+#define JAVA_DEMANGLE_NORET    0x1 /* do not process return type */
+
+char * java_demangle_sym(const char *str, int flags);
+
+#endif /* __PERF_DEMANGLE_JAVA */
index e8e9a9dbf5e395a20d589c80b253c2d869b927c1..8e6395439ca0830cefaaa5b6dbe905ae2af93011 100644 (file)
@@ -52,6 +52,11 @@ int dso__read_binary_type_filename(const struct dso *dso,
                        debuglink--;
                if (*debuglink == '/')
                        debuglink++;
+
+               ret = -1;
+               if (!is_regular_file(filename))
+                       break;
+
                ret = filename__read_debuglink(filename, debuglink,
                                               size - (debuglink - filename));
                }
index 7dd5939dea2e58385bd374a3e59aecf766e799c1..49a11d9d8b8f050efa48715cfba9e54731f63eef 100644 (file)
@@ -6,6 +6,8 @@ struct perf_env perf_env;
 
 void perf_env__exit(struct perf_env *env)
 {
+       int i;
+
        zfree(&env->hostname);
        zfree(&env->os_release);
        zfree(&env->version);
@@ -19,6 +21,10 @@ void perf_env__exit(struct perf_env *env)
        zfree(&env->numa_nodes);
        zfree(&env->pmu_mappings);
        zfree(&env->cpu);
+
+       for (i = 0; i < env->caches_cnt; i++)
+               cpu_cache_level__free(&env->caches[i]);
+       zfree(&env->caches);
 }
 
 int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[])
@@ -75,3 +81,10 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
        env->nr_cpus_avail = nr_cpus;
        return 0;
 }
+
+void cpu_cache_level__free(struct cpu_cache_level *cache)
+{
+       free(cache->type);
+       free(cache->map);
+       free(cache->size);
+}
index 0132b9557c02b56f7f31e3e51981d55b0d1027bd..56cffb60a0b42e456a11fb2bd489d74a69893b0c 100644 (file)
@@ -1,11 +1,23 @@
 #ifndef __PERF_ENV_H
 #define __PERF_ENV_H
 
+#include <linux/types.h>
+
 struct cpu_topology_map {
        int     socket_id;
        int     core_id;
 };
 
+struct cpu_cache_level {
+       u32     level;
+       u32     line_size;
+       u32     sets;
+       u32     ways;
+       char    *type;
+       char    *size;
+       char    *map;
+};
+
 struct perf_env {
        char                    *hostname;
        char                    *os_release;
@@ -31,6 +43,8 @@ struct perf_env {
        char                    *numa_nodes;
        char                    *pmu_mappings;
        struct cpu_topology_map *cpu;
+       struct cpu_cache_level  *caches;
+       int                      caches_cnt;
 };
 
 extern struct perf_env perf_env;
@@ -41,4 +55,5 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]);
 
 int perf_env__read_cpu_topology_map(struct perf_env *env);
 
+void cpu_cache_level__free(struct cpu_cache_level *cache);
 #endif /* __PERF_ENV_H */
index 85155e91b61ba9b70feaf867b109270df498b9b8..7bad5c3fa7b7175862f5e3bdf38ffca0e1b14ee1 100644 (file)
@@ -282,7 +282,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                strcpy(execname, "");
 
                /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n",
+               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %[^\n]\n",
                       &event->mmap2.start, &event->mmap2.len, prot,
                       &event->mmap2.pgoff, &event->mmap2.maj,
                       &event->mmap2.min,
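
The one-character format change above matters for maps entries whose paths contain spaces: %s stops at the first whitespace character, while %[^\n] consumes the rest of the line. A standalone illustration (the map line below is made up, not taken from a real /proc/<pid>/maps):

#include <stdio.h>

int main(void)
{
	const char *line =
		"00400000-0040c000 r-xp 00000000 fd:01 41038 /opt/My App/libfoo.so\n";
	char old_way[256], new_way[256];

	/* %s truncates the filename at the embedded space ... */
	sscanf(line, "%*s %*s %*s %*s %*s %255s", old_way);
	/* ... %[^\n] keeps everything up to the end of the line. */
	sscanf(line, "%*s %*s %*s %*s %*s %255[^\n]", new_way);

	printf("with %%s:      '%s'\n", old_way);
	printf("with %%[^\\n]:  '%s'\n", new_way);
	return 0;
}
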
index d81f13de24769963f6c32c3f809a714e6868dc5e..86a03836a83fc3f8ee8648d83317b8d91e3f48d8 100644 (file)
@@ -1181,12 +1181,12 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
         */
        if (cpus != evlist->cpus) {
                cpu_map__put(evlist->cpus);
-               evlist->cpus = cpus;
+               evlist->cpus = cpu_map__get(cpus);
        }
 
        if (threads != evlist->threads) {
                thread_map__put(evlist->threads);
-               evlist->threads = threads;
+               evlist->threads = thread_map__get(threads);
        }
 
        perf_evlist__propagate_maps(evlist);
@@ -1223,6 +1223,9 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter)
        int err = 0;
 
        evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+                       continue;
+
                err = perf_evsel__set_filter(evsel, filter);
                if (err)
                        break;
@@ -1624,7 +1627,7 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp)
        return printed + fprintf(fp, "\n");
 }
 
-int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
+int perf_evlist__strerror_open(struct perf_evlist *evlist,
                               int err, char *buf, size_t size)
 {
        int printed, value;
@@ -1652,7 +1655,25 @@ int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
                                    "Hint:\tTry: 'sudo sh -c \"echo -1 > /proc/sys/kernel/perf_event_paranoid\"'\n"
                                    "Hint:\tThe current value is %d.", value);
                break;
+       case EINVAL: {
+               struct perf_evsel *first = perf_evlist__first(evlist);
+               int max_freq;
+
+               if (sysctl__read_int("kernel/perf_event_max_sample_rate", &max_freq) < 0)
+                       goto out_default;
+
+               if (first->attr.sample_freq < (u64)max_freq)
+                       goto out_default;
+
+               printed = scnprintf(buf, size,
+                                   "Error:\t%s.\n"
+                                   "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n"
+                                   "Hint:\tThe current value is %d and %" PRIu64 " is being requested.",
+                                   emsg, max_freq, first->attr.sample_freq);
+               break;
+       }
        default:
+out_default:
                scnprintf(buf, size, "%s", emsg);
                break;
        }
@@ -1723,3 +1744,19 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
 
        tracking_evsel->tracking = true;
 }
+
+struct perf_evsel *
+perf_evlist__find_evsel_by_str(struct perf_evlist *evlist,
+                              const char *str)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               if (!evsel->name)
+                       continue;
+               if (strcmp(str, evsel->name) == 0)
+                       return evsel;
+       }
+
+       return NULL;
+}
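
A usage sketch for the new exact-name lookup (illustrative; set_filter_on() is a hypothetical helper, and it assumes an evlist that was populated elsewhere plus perf's internal headers):

#include "util/evlist.h"	/* perf-internal headers */
#include "util/evsel.h"

/* Apply a filter to one named event only; returns -1 when the event is
 * not in the list, since perf_evlist__find_evsel_by_str() returns NULL. */
static int set_filter_on(struct perf_evlist *evlist, const char *name,
			 const char *filter)
{
	struct perf_evsel *evsel = perf_evlist__find_evsel_by_str(evlist, name);

	if (evsel == NULL)
		return -1;

	return perf_evsel__set_filter(evsel, filter);
}
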
index 7c4d9a2067769b0e3de2d96a11e6f7dd7d62106a..a0d15221db6e878412126f1ec5de08cb030f132f 100644 (file)
@@ -294,4 +294,7 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
                                     struct perf_evsel *tracking_evsel);
 
 void perf_event_attr__set_max_precise_ip(struct perf_event_attr *attr);
+
+struct perf_evsel *
+perf_evlist__find_evsel_by_str(struct perf_evlist *evlist, const char *str);
 #endif /* __PERF_EVLIST_H */
index cdbaf9b51e428ad4537a38ded24ee07bec54e90c..0902fe418754ec0c3149d3daeafc122fee65e243 100644 (file)
@@ -225,6 +225,11 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
        if (evsel != NULL)
                perf_evsel__init(evsel, attr, idx);
 
+       if (perf_evsel__is_bpf_output(evsel)) {
+               evsel->attr.sample_type |= PERF_SAMPLE_RAW;
+               evsel->attr.sample_period = 1;
+       }
+
        return evsel;
 }
 
@@ -898,6 +903,16 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
        if (evsel->precise_max)
                perf_event_attr__set_max_precise_ip(attr);
 
+       if (opts->all_user) {
+               attr->exclude_kernel = 1;
+               attr->exclude_user   = 0;
+       }
+
+       if (opts->all_kernel) {
+               attr->exclude_kernel = 0;
+               attr->exclude_user   = 1;
+       }
+
        /*
         * Apply event specific term settings,
         * it overloads any global configuration.
@@ -2362,12 +2377,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
        case EPERM:
        case EACCES:
                return scnprintf(msg, size,
-                "You may not have permission to collect %sstats.\n"
-                "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n"
-                " -1 - Not paranoid at all\n"
-                "  0 - Disallow raw tracepoint access for unpriv\n"
-                "  1 - Disallow cpu events for unpriv\n"
-                "  2 - Disallow kernel profiling for unpriv",
+                "You may not have permission to collect %sstats.\n\n"
+                "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
+                "which controls use of the performance events system by\n"
+                "unprivileged users (without CAP_SYS_ADMIN).\n\n"
+                "The default value is 1:\n\n"
+                "  -1: Allow use of (almost) all events by all users\n"
+                ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n"
+                ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
+                ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN",
                                 target->system_wide ? "system-wide " : "");
        case ENOENT:
                return scnprintf(msg, size, "The %s event is not supported.",
index 8e75434bd01c671a8ed2e0c0b03a139212d7001c..501ea6e565f13a4a4817947957c79f15d805d130 100644 (file)
@@ -93,10 +93,8 @@ struct perf_evsel {
        const char              *unit;
        struct event_format     *tp_format;
        off_t                   id_offset;
-       union {
-               void            *priv;
-               u64             db_id;
-       };
+       void                    *priv;
+       u64                     db_id;
        struct cgroup_sel       *cgrp;
        void                    *handler;
        struct cpu_map          *cpus;
@@ -364,6 +362,14 @@ static inline bool perf_evsel__is_function_event(struct perf_evsel *evsel)
 #undef FUNCTION_EVENT
 }
 
+static inline bool perf_evsel__is_bpf_output(struct perf_evsel *evsel)
+{
+       struct perf_event_attr *attr = &evsel->attr;
+
+       return (attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
+               (attr->type == PERF_TYPE_SOFTWARE);
+}
+
 struct perf_attr_details {
        bool freq;
        bool verbose;
diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c
new file mode 100644 (file)
index 0000000..c1ef805
--- /dev/null
@@ -0,0 +1,449 @@
+/*
+ * genelf.c
+ * Copyright (C) 2014, Google, Inc
+ *
+ * Contributed by:
+ *     Stephane Eranian <eranian@gmail.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <stddef.h>
+#include <libelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <err.h>
+#include <dwarf.h>
+
+#include "perf.h"
+#include "genelf.h"
+#include "../util/jitdump.h"
+
+#define JVMTI
+
+#define BUILD_ID_URANDOM /* different uuid for each run */
+
+#ifdef HAVE_LIBCRYPTO
+
+#define BUILD_ID_MD5
+#undef BUILD_ID_SHA    /* does not seem to work well when linked with Java */
+#undef BUILD_ID_URANDOM /* different uuid for each run */
+
+#ifdef BUILD_ID_SHA
+#include <openssl/sha.h>
+#endif
+
+#ifdef BUILD_ID_MD5
+#include <openssl/md5.h>
+#endif
+#endif
+
+
+typedef struct {
+  unsigned int namesz;  /* Size of entry's owner string */
+  unsigned int descsz;  /* Size of the note descriptor */
+  unsigned int type;    /* Interpretation of the descriptor */
+  char         name[0]; /* Start of the name+desc data */
+} Elf_Note;
+
+struct options {
+       char *output;
+       int fd;
+};
+
+static char shd_string_table[] = {
+       0,
+       '.', 't', 'e', 'x', 't', 0,                     /*  1 */
+       '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', 0, /*  7 */
+       '.', 's', 'y', 'm', 't', 'a', 'b', 0,           /* 17 */
+       '.', 's', 't', 'r', 't', 'a', 'b', 0,           /* 25 */
+       '.', 'n', 'o', 't', 'e', '.', 'g', 'n', 'u', '.', 'b', 'u', 'i', 'l', 'd', '-', 'i', 'd', 0, /* 33 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'l', 'i', 'n', 'e', 0, /* 52 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'i', 'n', 'f', 'o', 0, /* 64 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'a', 'b', 'b', 'r', 'e', 'v', 0, /* 76 */
+};
+
+static struct buildid_note {
+       Elf_Note desc;          /* descsz: size of build-id, must be multiple of 4 */
+       char     name[4];       /* GNU\0 */
+       char     build_id[20];
+} bnote;
+
+static Elf_Sym symtab[]={
+       /* symbol 0 MUST be the undefined symbol */
+       { .st_name  = 0, /* index in sym_string table */
+         .st_info  = ELF_ST_TYPE(STT_NOTYPE),
+         .st_shndx = 0, /* for now */
+         .st_value = 0x0,
+         .st_other = ELF_ST_VIS(STV_DEFAULT),
+         .st_size  = 0,
+       },
+       { .st_name  = 1, /* index in sym_string table */
+         .st_info  = ELF_ST_BIND(STB_LOCAL) | ELF_ST_TYPE(STT_FUNC),
+         .st_shndx = 1,
+         .st_value = 0, /* for now */
+         .st_other = ELF_ST_VIS(STV_DEFAULT),
+         .st_size  = 0, /* for now */
+       }
+};
+
+#ifdef BUILD_ID_URANDOM
+static void
+gen_build_id(struct buildid_note *note,
+            unsigned long load_addr __maybe_unused,
+            const void *code __maybe_unused,
+            size_t csize __maybe_unused)
+{
+       int fd;
+       size_t sz = sizeof(note->build_id);
+       ssize_t sret;
+
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd == -1)
+               err(1, "cannot access /dev/urandom for builid");
+
+       sret = read(fd, note->build_id, sz);
+
+       close(fd);
+
+       if (sret != (ssize_t)sz)
+               memset(note->build_id, 0, sz);
+}
+#endif
+
+#ifdef BUILD_ID_SHA
+static void
+gen_build_id(struct buildid_note *note,
+            unsigned long load_addr __maybe_unused,
+            const void *code,
+            size_t csize)
+{
+       if (sizeof(note->build_id) < SHA_DIGEST_LENGTH)
+               errx(1, "build_id too small for SHA1");
+
+       SHA1(code, csize, (unsigned char *)note->build_id);
+}
+#endif
+
+#ifdef BUILD_ID_MD5
+static void
+gen_build_id(struct buildid_note *note, unsigned long load_addr, const void *code, size_t csize)
+{
+       MD5_CTX context;
+
+       if (sizeof(note->build_id) < 16)
+               errx(1, "build_id too small for MD5");
+
+       MD5_Init(&context);
+       MD5_Update(&context, &load_addr, sizeof(load_addr));
+       MD5_Update(&context, code, csize);
+       MD5_Final((unsigned char *)note->build_id, &context);
+}
+#endif
+
+/*
+ * fd: file descriptor open for writing for the output file
+ * load_addr: code load address (could be zero, just used for buildid)
+ * sym: function name (for native code - used as the symbol)
+ * code: the native code
+ * csize: the code size in bytes
+ */
+int
+jit_write_elf(int fd, uint64_t load_addr, const char *sym,
+             const void *code, int csize,
+             void *debug, int nr_debug_entries)
+{
+       Elf *e;
+       Elf_Data *d;
+       Elf_Scn *scn;
+       Elf_Ehdr *ehdr;
+       Elf_Shdr *shdr;
+       char *strsym = NULL;
+       int symlen;
+       int retval = -1;
+
+       if (elf_version(EV_CURRENT) == EV_NONE) {
+               warnx("ELF initialization failed");
+               return -1;
+       }
+
+       e = elf_begin(fd, ELF_C_WRITE, NULL);
+       if (!e) {
+               warnx("elf_begin failed");
+               goto error;
+       }
+
+       /*
+        * setup ELF header
+        */
+       ehdr = elf_newehdr(e);
+       if (!ehdr) {
+               warnx("cannot get ehdr");
+               goto error;
+       }
+
+       ehdr->e_ident[EI_DATA] = GEN_ELF_ENDIAN;
+       ehdr->e_ident[EI_CLASS] = GEN_ELF_CLASS;
+       ehdr->e_machine = GEN_ELF_ARCH;
+       ehdr->e_type = ET_DYN;
+       ehdr->e_entry = GEN_ELF_TEXT_OFFSET;
+       ehdr->e_version = EV_CURRENT;
+       ehdr->e_shstrndx= 2; /* shdr index for section name */
+
+       /*
+        * setup text section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 16;
+       d->d_off = 0LL;
+       d->d_buf = (void *)code;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = csize;
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 1;
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = GEN_ELF_TEXT_OFFSET;
+       shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup section headers string table
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = shd_string_table;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = sizeof(shd_string_table);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 7; /* offset of '.shstrtab' in shd_string_table */
+       shdr->sh_type = SHT_STRTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup symtab section
+        */
+       symtab[1].st_size  = csize;
+       symtab[1].st_value = GEN_ELF_TEXT_OFFSET;
+
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 8;
+       d->d_off = 0LL;
+       d->d_buf = symtab;
+       d->d_type = ELF_T_SYM;
+       d->d_size = sizeof(symtab);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 17; /* offset of '.symtab' in shd_string_table */
+       shdr->sh_type = SHT_SYMTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = sizeof(Elf_Sym);
+       shdr->sh_link = 4; /* index of .strtab section */
+
+       /*
+        * setup symbols string table
+        * 2 = 1 byte for the empty name of symbol 0, 1 for the NUL terminating sym
+        */
+       symlen = 2 + strlen(sym);
+       strsym = calloc(1, symlen);
+       if (!strsym) {
+               warnx("cannot allocate strsym");
+               goto error;
+       }
+       strcpy(strsym + 1, sym);
+
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = strsym;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = symlen;
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 25; /* offset in shd_string_table */
+       shdr->sh_type = SHT_STRTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup build-id section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       /*
+        * build-id generation
+        */
+       gen_build_id(&bnote, load_addr, code, csize);
+       bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */
+       bnote.desc.descsz = sizeof(bnote.build_id);
+       bnote.desc.type   = NT_GNU_BUILD_ID;
+       strcpy(bnote.name, "GNU");
+
+       d->d_align = 4;
+       d->d_off = 0LL;
+       d->d_buf = &bnote;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = sizeof(bnote);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 33; /* offset in shd_string_table */
+       shdr->sh_type = SHT_NOTE;
+       shdr->sh_addr = 0x0;
+       shdr->sh_flags = SHF_ALLOC;
+       shdr->sh_size = sizeof(bnote);
+       shdr->sh_entsize = 0;
+
+       if (debug && nr_debug_entries) {
+               retval = jit_add_debug_info(e, load_addr, debug, nr_debug_entries);
+               if (retval)
+                       goto error;
+       } else {
+               if (elf_update(e, ELF_C_WRITE) < 0) {
+                       warnx("elf_update 4 failed");
+                       goto error;
+               }
+       }
+
+       retval = 0;
+error:
+       (void)elf_end(e);
+
+       free(strsym);
+
+
+       return retval;
+}
+
+#ifndef JVMTI
+
+static unsigned char x86_code[] = {
+    0xBB, 0x2A, 0x00, 0x00, 0x00, /* movl $42, %ebx */
+    0xB8, 0x01, 0x00, 0x00, 0x00, /* movl $1, %eax */
+    0xCD, 0x80            /* int $0x80 */
+};
+
+static struct options options;
+
+int main(int argc, char **argv)
+{
+       int c, fd, ret;
+
+       while ((c = getopt(argc, argv, "o:h")) != -1) {
+               switch (c) {
+               case 'o':
+                       options.output = optarg;
+                       break;
+               case 'h':
+                       printf("Usage: genelf -o output_file [-h]\n");
+                       return 0;
+               default:
+                       errx(1, "unknown option");
+               }
+       }
+
+       fd = open(options.output, O_CREAT|O_TRUNC|O_RDWR, 0666);
+       if (fd == -1)
+               err(1, "cannot create file %s", options.output);
+
+       ret = jit_write_elf(fd, 0, "main", x86_code, sizeof(x86_code), NULL, 0);
+       close(fd);
+
+       if (ret != 0)
+               unlink(options.output);
+
+       return ret;
+}
+#endif
diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h
new file mode 100644 (file)
index 0000000..45bf9c6
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef __GENELF_H__
+#define __GENELF_H__
+
+/* genelf.c */
+extern int jit_write_elf(int fd, uint64_t code_addr, const char *sym,
+                        const void *code, int csize,
+                        void *debug, int nr_debug_entries);
+/* genelf_debug.c */
+extern int jit_add_debug_info(Elf *e, uint64_t code_addr,
+                             void *debug, int nr_debug_entries);
+
+#if   defined(__arm__)
+#define GEN_ELF_ARCH   EM_ARM
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS32
+#elif defined(__aarch64__)
+#define GEN_ELF_ARCH   EM_AARCH64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__x86_64__)
+#define GEN_ELF_ARCH   EM_X86_64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__i386__)
+#define GEN_ELF_ARCH   EM_386
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS32
+#elif defined(__ppcle__)
+#define GEN_ELF_ARCH   EM_PPC
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__powerpc__)
+#define GEN_ELF_ARCH   EM_PPC64
+#define GEN_ELF_ENDIAN ELFDATA2MSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__powerpcle__)
+#define GEN_ELF_ARCH   EM_PPC64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#else
+#error "unsupported architecture"
+#endif
+
+#if GEN_ELF_CLASS == ELFCLASS64
+#define elf_newehdr    elf64_newehdr
+#define elf_getshdr    elf64_getshdr
+#define Elf_Ehdr       Elf64_Ehdr
+#define Elf_Shdr       Elf64_Shdr
+#define Elf_Sym                Elf64_Sym
+#define ELF_ST_TYPE(a) ELF64_ST_TYPE(a)
+#define ELF_ST_BIND(a) ELF64_ST_BIND(a)
+#define ELF_ST_VIS(a)  ELF64_ST_VISIBILITY(a)
+#else
+#define elf_newehdr    elf32_newehdr
+#define elf_getshdr    elf32_getshdr
+#define Elf_Ehdr       Elf32_Ehdr
+#define Elf_Shdr       Elf32_Shdr
+#define Elf_Sym                Elf32_Sym
+#define ELF_ST_TYPE(a) ELF32_ST_TYPE(a)
+#define ELF_ST_BIND(a) ELF32_ST_BIND(a)
+#define ELF_ST_VIS(a)  ELF32_ST_VISIBILITY(a)
+#endif
+
+/* The .text section is directly after the ELF header */
+#define GEN_ELF_TEXT_OFFSET sizeof(Elf_Ehdr)
+
+#endif
diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c
new file mode 100644 (file)
index 0000000..5980f7d
--- /dev/null
@@ -0,0 +1,610 @@
+/*
+ * genelf_debug.c
+ * Copyright (C) 2015, Google, Inc
+ *
+ * Contributed by:
+ *     Stephane Eranian <eranian@google.com>
+ *
+ * Released under the GPL v2.
+ *
+ * based on GPLv2 source code from Oprofile
+ * @remark Copyright 2007 OProfile authors
+ * @author Philippe Elie
+ */
+#include <sys/types.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <stddef.h>
+#include <libelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <err.h>
+#include <dwarf.h>
+
+#include "perf.h"
+#include "genelf.h"
+#include "../util/jitdump.h"
+
+#define BUFFER_EXT_DFL_SIZE    (4 * 1024)
+
+typedef uint32_t uword;
+typedef uint16_t uhalf;
+typedef int32_t  sword;
+typedef int16_t  shalf;
+typedef uint8_t  ubyte;
+typedef int8_t   sbyte;
+
+struct buffer_ext {
+       size_t cur_pos;
+       size_t max_sz;
+       void *data;
+};
+
+static void
+buffer_ext_dump(struct buffer_ext *be, const char *msg)
+{
+       size_t i;
+       warnx("DUMP for %s", msg);
+       for (i = 0 ; i < be->cur_pos; i++)
+               warnx("%4zu 0x%02x", i, (((char *)be->data)[i]) & 0xff);
+}
+
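+/*
+ * Append 'sz' bytes from 'addr' to the buffer, growing the backing store
+ * (starting at BUFFER_EXT_DFL_SIZE, then doubling) until the data fits.
+ * Returns 0 on success, -1 if realloc() fails.
+ */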
+static inline int
+buffer_ext_add(struct buffer_ext *be, void *addr, size_t sz)
+{
+       void *tmp;
+       size_t be_sz = be->max_sz;
+
+retry:
+       if ((be->cur_pos + sz) < be_sz) {
+               memcpy(be->data + be->cur_pos, addr, sz);
+               be->cur_pos += sz;
+               return 0;
+       }
+
+       if (!be_sz)
+               be_sz = BUFFER_EXT_DFL_SIZE;
+       else
+               be_sz <<= 1;
+
+       tmp = realloc(be->data, be_sz);
+       if (!tmp)
+               return -1;
+
+       be->data   = tmp;
+       be->max_sz = be_sz;
+
+       goto retry;
+}
+
+static void
+buffer_ext_init(struct buffer_ext *be)
+{
+       be->data = NULL;
+       be->cur_pos = 0;
+       be->max_sz = 0;
+}
+
+static inline size_t
+buffer_ext_size(struct buffer_ext *be)
+{
+       return be->cur_pos;
+}
+
+static inline void *
+buffer_ext_addr(struct buffer_ext *be)
+{
+       return be->data;
+}
+
+struct debug_line_header {
+       // Not counting this field
+       uword total_length;
+       // version number (2 currently)
+       uhalf version;
+       // offset from the end of this field to the
+       // first program statement
+       uword prolog_length;
+       ubyte minimum_instruction_length;
+       ubyte default_is_stmt;
+       // line_base - see DWARF 2 specs
+       sbyte line_base;
+       // line_range - see DWARF 2 specs
+       ubyte line_range;
+       // number of standard opcodes + 1
+       ubyte opcode_base;
+       /* followed by the array of standard opcode argument counts: ubyte[opcode_base - 1] */
+       /* followed by the include directories: zero-terminated strings,
+        * the list itself terminated by an empty string.
+        */
+       /* followed by an array of { filename, LEB128, LEB128, LEB128 }: the
+        * directory index (0 means current directory), then mtime and
+        * filesize; the last entry is followed by an empty string.
+        */
+       /* followed by the first program statement */
+} __attribute__((packed));
+
+/* The DWARF 2 spec talks about only one compilation unit header, while
+ * binutils can handle two flavours of DWARF 2: 32 and 64 bits. This is not
+ * related to the target architecture; a 32-bit ELF can hold more than 4 GB
+ * of debug information. For now we handle only DWARF 2 32-bit compilation
+ * units. It only becomes a problem if we generate more than 4 GB of debug
+ * information.
+ */
+struct compilation_unit_header {
+       uword total_length;
+       uhalf version;
+       uword debug_abbrev_offset;
+       ubyte pointer_size;
+} __attribute__((packed));
+
+#define DW_LNS_num_opcode (DW_LNS_set_isa + 1)
+
+/* fields filled at run time are marked with -1 */
+static struct debug_line_header const default_debug_line_header = {
+       .total_length = -1,
+       .version = 2,
+       .prolog_length = -1,
+       .minimum_instruction_length = 1,        /* could be better when min instruction size != 1 */
+       .default_is_stmt = 1,   /* we don't care about basic blocks */
+       .line_base = -5,        /* sensible value for line base ... */
+       .line_range = -14,     /* ... and line range are guessed statically */
+       .opcode_base = DW_LNS_num_opcode
+};
+
+static ubyte standard_opcode_length[] =
+{
+       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1
+};
+#if 0
+{
+       [DW_LNS_advance_pc]   = 1,
+       [DW_LNS_advance_line] = 1,
+       [DW_LNS_set_file] =  1,
+       [DW_LNS_set_column] = 1,
+       [DW_LNS_fixed_advance_pc] = 1,
+       [DW_LNS_set_isa] = 1,
+};
+#endif
+
+/* fields filled at run time are marked with -1 */
+static struct compilation_unit_header default_comp_unit_header = {
+       .total_length = -1,
+       .version = 2,
+       .debug_abbrev_offset = 0,     /* we reuse the same abbrev entries for all comp units */
+       .pointer_size = sizeof(void *)
+};
+
+static void emit_uword(struct buffer_ext *be, uword data)
+{
+       buffer_ext_add(be, &data, sizeof(uword));
+}
+
+static void emit_string(struct buffer_ext *be, const char *s)
+{
+       buffer_ext_add(be, (void *)s, strlen(s) + 1);
+}
+
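+/*
+ * Unsigned LEB128: 7 data bits per output byte, least significant group
+ * first, with the high bit set on every byte except the last.
+ */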
+static void emit_unsigned_LEB128(struct buffer_ext *be,
+                                unsigned long data)
+{
+       do {
+               ubyte cur = data & 0x7F;
+               data >>= 7;
+               if (data)
+                       cur |= 0x80;
+               buffer_ext_add(be, &cur, 1);
+       } while (data);
+}
+
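+/*
+ * Signed LEB128: same 7-bits-per-byte scheme, emitting bytes until the
+ * remaining value is pure sign extension of bit 6 of the last byte.
+ */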
+static void emit_signed_LEB128(struct buffer_ext *be, long data)
+{
+       int more = 1;
+       int negative = data < 0;
+       int size = sizeof(long) * CHAR_BIT;
+       while (more) {
+               ubyte cur = data & 0x7F;
+               data >>= 7;
+               if (negative)
+                       data |= - (1 << (size - 7));
+               if ((data == 0 && !(cur & 0x40)) ||
+                   (data == -1l && (cur & 0x40)))
+                       more = 0;
+               else
+                       cur |= 0x80;
+               buffer_ext_add(be, &cur, 1);
+       }
+}
+
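+/*
+ * Extended opcodes are encoded as: a zero byte, the ULEB128 length of the
+ * sub-opcode plus its operands, the sub-opcode, then the operand bytes.
+ */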
+static void emit_extended_opcode(struct buffer_ext *be, ubyte opcode,
+                                void *data, size_t data_len)
+{
+       buffer_ext_add(be, (char *)"", 1);
+
+       emit_unsigned_LEB128(be, data_len + 1);
+
+       buffer_ext_add(be, &opcode, 1);
+       buffer_ext_add(be, data, data_len);
+}
+
+static void emit_opcode(struct buffer_ext *be, ubyte opcode)
+{
+       buffer_ext_add(be, &opcode, 1);
+}
+
+static void emit_opcode_signed(struct buffer_ext  *be,
+                              ubyte opcode, long data)
+{
+       buffer_ext_add(be, &opcode, 1);
+       emit_signed_LEB128(be, data);
+}
+
+static void emit_opcode_unsigned(struct buffer_ext *be, ubyte opcode,
+                                unsigned long data)
+{
+       buffer_ext_add(be, &opcode, 1);
+       emit_unsigned_LEB128(be, data);
+}
+
+static void emit_advance_pc(struct buffer_ext *be, unsigned long delta_pc)
+{
+       emit_opcode_unsigned(be, DW_LNS_advance_pc, delta_pc);
+}
+
+static void emit_advance_lineno(struct buffer_ext  *be, long delta_lineno)
+{
+       emit_opcode_signed(be, DW_LNS_advance_line, delta_lineno);
+}
+
+static void emit_lne_end_of_sequence(struct buffer_ext *be)
+{
+       emit_extended_opcode(be, DW_LNE_end_sequence, NULL, 0);
+}
+
+static void emit_set_file(struct buffer_ext *be, unsigned long idx)
+{
+       emit_opcode_unsigned(be, DW_LNS_set_file, idx);
+}
+
+static void emit_lne_define_filename(struct buffer_ext *be,
+                                    const char *filename)
+{
+       buffer_ext_add(be, (void *)"", 1);
+
+       /* LNE length: 1 opcode byte + strlen(filename) + null terminator + 3 bytes for the dir index, mtime and filesize */
+       emit_unsigned_LEB128(be, strlen(filename) + 5);
+       emit_opcode(be, DW_LNE_define_file);
+       emit_string(be, filename);
+       /* directory index 0=do not know */
+       emit_unsigned_LEB128(be, 0);
+       /* last modification date on file 0=do not know */
+       emit_unsigned_LEB128(be, 0);
+       /* filesize 0=do not know */
+       emit_unsigned_LEB128(be, 0);
+}
+
+static void emit_lne_set_address(struct buffer_ext *be,
+                                void *address)
+{
+       emit_extended_opcode(be, DW_LNE_set_address, &address, sizeof(unsigned long));
+}
+
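+/*
+ * Try to encode the (line, address) advance as a single DWARF special
+ * opcode:
+ *   opcode = (line delta - line_base)
+ *          + line_range * (address delta / minimum_instruction_length)
+ *          + opcode_base
+ * Returns 0 when the deltas cannot be represented this way and the caller
+ * must emit standard advance_line/advance_pc opcodes instead.
+ */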
+static ubyte get_special_opcode(struct debug_entry *ent,
+                               unsigned int last_line,
+                               unsigned long last_vma)
+{
+       unsigned int temp;
+       unsigned long delta_addr;
+
+       /*
+        * delta from line_base
+        */
+       temp = (ent->lineno - last_line) - default_debug_line_header.line_base;
+
+       if (temp >= default_debug_line_header.line_range)
+               return 0;
+
+       /*
+        * delta of addresses
+        */
+       delta_addr = (ent->addr - last_vma) / default_debug_line_header.minimum_instruction_length;
+
+       /* This is not sufficient to ensure the opcode will be in [0-255] but
+        * sufficient to ensure that, when summing with the line delta, we
+        * will not overflow the unsigned long opcode */
+
+       if (delta_addr <= 256 / default_debug_line_header.line_range) {
+               unsigned long opcode = temp +
+                       (delta_addr * default_debug_line_header.line_range) +
+                       default_debug_line_header.opcode_base;
+
+               return opcode <= 255 ? opcode : 0;
+       }
+       return 0;
+}
+
+static void emit_lineno_info(struct buffer_ext *be,
+                            struct debug_entry *ent, size_t nr_entry,
+                            unsigned long code_addr)
+{
+       size_t i;
+
+       /*
+        * Machine state at start of a statement program
+        * address = 0
+        * file    = 1
+        * line    = 1
+        * column  = 0
+        * is_stmt = default_is_stmt as given in the debug_line_header
+        * basic block = 0
+        * end sequence = 0
+        */
+
+       /* start state of the state machine we take care of */
+       unsigned long last_vma = code_addr;
+       char const  *cur_filename = NULL;
+       unsigned long cur_file_idx = 0;
+       int last_line = 1;
+
+       emit_lne_set_address(be, (void *)code_addr);
+
+       for (i = 0; i < nr_entry; i++, ent = debug_entry_next(ent)) {
+               int need_copy = 0;
+               ubyte special_opcode;
+
+               /*
+                * check if filename changed, if so add it
+                */
+               if (!cur_filename || strcmp(cur_filename, ent->name)) {
+                       emit_lne_define_filename(be, ent->name);
+                       cur_filename = ent->name;
+                       emit_set_file(be, ++cur_file_idx);
+                       need_copy = 1;
+               }
+
+               special_opcode = get_special_opcode(ent, last_line, last_vma);
+               if (special_opcode != 0) {
+                       last_line = ent->lineno;
+                       last_vma  = ent->addr;
+                       emit_opcode(be, special_opcode);
+               } else {
+                       /*
+                        * lines differ, emit line delta
+                        */
+                       if (last_line != ent->lineno) {
+                               emit_advance_lineno(be, ent->lineno - last_line);
+                               last_line = ent->lineno;
+                               need_copy = 1;
+                       }
+                       /*
+                        * addresses differ, emit address delta
+                        */
+                       if (last_vma != ent->addr) {
+                               emit_advance_pc(be, ent->addr - last_vma);
+                               last_vma = ent->addr;
+                               need_copy = 1;
+                       }
+                       /*
+                        * add new row to matrix
+                        */
+                       if (need_copy)
+                               emit_opcode(be, DW_LNS_copy);
+               }
+       }
+}
+
+static void add_debug_line(struct buffer_ext *be,
+       struct debug_entry *ent, size_t nr_entry,
+       unsigned long code_addr)
+{
+       struct debug_line_header * dbg_header;
+       size_t old_size;
+
+       old_size = buffer_ext_size(be);
+
+       buffer_ext_add(be, (void *)&default_debug_line_header,
+                sizeof(default_debug_line_header));
+
+       buffer_ext_add(be, &standard_opcode_length,  sizeof(standard_opcode_length));
+
+       // empty directory entry
+       buffer_ext_add(be, (void *)"", 1);
+
+       // empty filename directory
+       buffer_ext_add(be, (void *)"", 1);
+
+       dbg_header = buffer_ext_addr(be) + old_size;
+       dbg_header->prolog_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct debug_line_header, minimum_instruction_length);
+
+       emit_lineno_info(be, ent, nr_entry, code_addr);
+
+       emit_lne_end_of_sequence(be);
+
+       dbg_header = buffer_ext_addr(be) + old_size;
+       dbg_header->total_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct debug_line_header, version);
+}
+
+static void
+add_debug_abbrev(struct buffer_ext *be)
+{
+       emit_unsigned_LEB128(be, 1);
+       emit_unsigned_LEB128(be, DW_TAG_compile_unit);
+       emit_unsigned_LEB128(be, DW_CHILDREN_yes);
+       emit_unsigned_LEB128(be, DW_AT_stmt_list);
+       emit_unsigned_LEB128(be, DW_FORM_data4);
+       emit_unsigned_LEB128(be, 0);
+       emit_unsigned_LEB128(be, 0);
+       emit_unsigned_LEB128(be, 0);
+}
+
+static void
+add_compilation_unit(struct buffer_ext *be,
+                    size_t offset_debug_line)
+{
+       struct compilation_unit_header *comp_unit_header;
+       size_t old_size = buffer_ext_size(be);
+
+       buffer_ext_add(be, &default_comp_unit_header,
+                      sizeof(default_comp_unit_header));
+
+       emit_unsigned_LEB128(be, 1);
+       emit_uword(be, offset_debug_line);
+
+       comp_unit_header = buffer_ext_addr(be) + old_size;
+       comp_unit_header->total_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct compilation_unit_header, version);
+}
+
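+/*
+ * Rebase each debug entry address so it is relative to code_addr, then
+ * generate the .debug_line (dl), .debug_abbrev (da) and .debug_info (di)
+ * section payloads.
+ */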
+static int
+jit_process_debug_info(uint64_t code_addr,
+                      void *debug, int nr_debug_entries,
+                      struct buffer_ext *dl,
+                      struct buffer_ext *da,
+                      struct buffer_ext *di)
+{
+       struct debug_entry *ent = debug;
+       int i;
+
+       for (i = 0; i < nr_debug_entries; i++) {
+               ent->addr = ent->addr - code_addr;
+               ent = debug_entry_next(ent);
+       }
+       add_compilation_unit(di, buffer_ext_size(dl));
+       add_debug_line(dl, debug, nr_debug_entries, 0);
+       add_debug_abbrev(da);
+       if (0) buffer_ext_dump(da, "abbrev");
+
+       return 0;
+}
+
+int
+jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_entries)
+{
+       Elf_Data *d;
+       Elf_Scn *scn;
+       Elf_Shdr *shdr;
+       struct buffer_ext dl, di, da;
+       int ret;
+
+       buffer_ext_init(&dl);
+       buffer_ext_init(&di);
+       buffer_ext_init(&da);
+
+       ret = jit_process_debug_info(code_addr, debug, nr_debug_entries, &dl, &da, &di);
+       if (ret)
+               return -1;
+       /*
+        * setup .debug_line section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&dl);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&dl);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 52; /* .debug_line */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup .debug_info section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&di);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&di);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 64; /* .debug_info */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup .debug_abbrev section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&da);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&da);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 76; /* .debug_abbrev */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * now we update the ELF image with all the sections
+        */
+       if (elf_update(e, ELF_C_WRITE) < 0) {
+               warnx("elf_update debug failed");
+               return -1;
+       }
+       return 0;
+}
index f50b7235ecb6558d167a475bf4199e8b05f52143..73e38e472ecd7771695b7ac2d91829a35529f419 100644 (file)
@@ -23,6 +23,8 @@
 #include "strbuf.h"
 #include "build-id.h"
 #include "data.h"
+#include <api/fs/fs.h>
+#include "asm/bug.h"
 
 /*
  * magic2 = "PERFILE2"
@@ -868,6 +870,199 @@ static int write_auxtrace(int fd, struct perf_header *h,
        return err;
 }
 
+static int cpu_cache_level__sort(const void *a, const void *b)
+{
+       struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a;
+       struct cpu_cache_level *cache_b = (struct cpu_cache_level *)b;
+
+       return cache_a->level - cache_b->level;
+}
+
+static bool cpu_cache_level__cmp(struct cpu_cache_level *a, struct cpu_cache_level *b)
+{
+       if (a->level != b->level)
+               return false;
+
+       if (a->line_size != b->line_size)
+               return false;
+
+       if (a->sets != b->sets)
+               return false;
+
+       if (a->ways != b->ways)
+               return false;
+
+       if (strcmp(a->type, b->type))
+               return false;
+
+       if (strcmp(a->size, b->size))
+               return false;
+
+       if (strcmp(a->map, b->map))
+               return false;
+
+       return true;
+}
+
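+/*
+ * Fill one cache level description from
+ * <sysfs>/devices/system/cpu/cpu<N>/cache/index<level>/.
+ * Returns 0 on success, 1 if that cache index does not exist, -1 on error.
+ */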
+static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 level)
+{
+       char path[PATH_MAX], file[PATH_MAX];
+       struct stat st;
+       size_t len;
+
+       scnprintf(path, PATH_MAX, "devices/system/cpu/cpu%d/cache/index%d/", cpu, level);
+       scnprintf(file, PATH_MAX, "%s/%s", sysfs__mountpoint(), path);
+
+       if (stat(file, &st))
+               return 1;
+
+       scnprintf(file, PATH_MAX, "%s/level", path);
+       if (sysfs__read_int(file, (int *) &cache->level))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/coherency_line_size", path);
+       if (sysfs__read_int(file, (int *) &cache->line_size))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/number_of_sets", path);
+       if (sysfs__read_int(file, (int *) &cache->sets))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/ways_of_associativity", path);
+       if (sysfs__read_int(file, (int *) &cache->ways))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/type", path);
+       if (sysfs__read_str(file, &cache->type, &len))
+               return -1;
+
+       cache->type[len] = 0;
+       cache->type = rtrim(cache->type);
+
+       scnprintf(file, PATH_MAX, "%s/size", path);
+       if (sysfs__read_str(file, &cache->size, &len)) {
+               free(cache->type);
+               return -1;
+       }
+
+       cache->size[len] = 0;
+       cache->size = rtrim(cache->size);
+
+       scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path);
+       if (sysfs__read_str(file, &cache->map, &len)) {
+               free(cache->size);
+               free(cache->type);
+               return -1;
+       }
+
+       cache->map[len] = 0;
+       cache->map = rtrim(cache->map);
+       return 0;
+}
+
+static void cpu_cache_level__fprintf(FILE *out, struct cpu_cache_level *c)
+{
+       fprintf(out, "L%d %-15s %8s [%s]\n", c->level, c->type, c->size, c->map);
+}
+
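+/*
+ * Probe cache index 0..9 of every configured CPU and collect only the
+ * unique cache level descriptions into 'caches', returning the count in
+ * *cntp.
+ */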
+static int build_caches(struct cpu_cache_level caches[], u32 size, u32 *cntp)
+{
+       u32 i, cnt = 0;
+       long ncpus;
+       u32 nr, cpu;
+       u16 level;
+
+       ncpus = sysconf(_SC_NPROCESSORS_CONF);
+       if (ncpus < 0)
+               return -1;
+
+       nr = (u32)(ncpus & UINT_MAX);
+
+       for (cpu = 0; cpu < nr; cpu++) {
+               for (level = 0; level < 10; level++) {
+                       struct cpu_cache_level c;
+                       int err;
+
+                       err = cpu_cache_level__read(&c, cpu, level);
+                       if (err < 0)
+                               return err;
+
+                       if (err == 1)
+                               break;
+
+                       for (i = 0; i < cnt; i++) {
+                               if (cpu_cache_level__cmp(&c, &caches[i]))
+                                       break;
+                       }
+
+                       if (i == cnt)
+                               caches[cnt++] = c;
+                       else
+                               cpu_cache_level__free(&c);
+
+                       if (WARN_ONCE(cnt == size, "way too many cpu caches.."))
+                               goto out;
+               }
+       }
+ out:
+       *cntp = cnt;
+       return 0;
+}
+
+#define MAX_CACHES 2000
+
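+/*
+ * HEADER_CACHE on-disk layout: u32 version (1), u32 count, then for each
+ * cache level four u32 values (level, line_size, sets, ways) followed by
+ * the type, size and map strings.
+ */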
+static int write_cache(int fd, struct perf_header *h __maybe_unused,
+                         struct perf_evlist *evlist __maybe_unused)
+{
+       struct cpu_cache_level caches[MAX_CACHES];
+       u32 cnt = 0, i, version = 1;
+       int ret;
+
+       ret = build_caches(caches, MAX_CACHES, &cnt);
+       if (ret)
+               goto out;
+
+       qsort(&caches, cnt, sizeof(struct cpu_cache_level), cpu_cache_level__sort);
+
+       ret = do_write(fd, &version, sizeof(u32));
+       if (ret < 0)
+               goto out;
+
+       ret = do_write(fd, &cnt, sizeof(u32));
+       if (ret < 0)
+               goto out;
+
+       for (i = 0; i < cnt; i++) {
+               struct cpu_cache_level *c = &caches[i];
+
+               #define _W(v)                                   \
+                       ret = do_write(fd, &c->v, sizeof(u32)); \
+                       if (ret < 0)                            \
+                               goto out;
+
+               _W(level)
+               _W(line_size)
+               _W(sets)
+               _W(ways)
+               #undef _W
+
+               #define _W(v)                                           \
+                       ret = do_write_string(fd, (const char *) c->v); \
+                       if (ret < 0)                                    \
+                               goto out;
+
+               _W(type)
+               _W(size)
+               _W(map)
+               #undef _W
+       }
+
+out:
+       for (i = 0; i < cnt; i++)
+               cpu_cache_level__free(&caches[i]);
+       return ret;
+}
+
 static int write_stat(int fd __maybe_unused,
                      struct perf_header *h __maybe_unused,
                      struct perf_evlist *evlist __maybe_unused)
@@ -1172,6 +1367,18 @@ static void print_stat(struct perf_header *ph __maybe_unused,
        fprintf(fp, "# contains stat data\n");
 }
 
+static void print_cache(struct perf_header *ph __maybe_unused,
+                       int fd __maybe_unused, FILE *fp __maybe_unused)
+{
+       int i;
+
+       fprintf(fp, "# CPU cache info:\n");
+       for (i = 0; i < ph->env.caches_cnt; i++) {
+               fprintf(fp, "#  ");
+               cpu_cache_level__fprintf(fp, &ph->env.caches[i]);
+       }
+}
+
 static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
                               FILE *fp)
 {
@@ -1920,6 +2127,68 @@ static int process_auxtrace(struct perf_file_section *section,
        return err;
 }
 
+static int process_cache(struct perf_file_section *section __maybe_unused,
+                        struct perf_header *ph __maybe_unused, int fd __maybe_unused,
+                        void *data __maybe_unused)
+{
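+       /*
+        * Read back the HEADER_CACHE feature written by write_cache(),
+        * byte-swapping when the file endianness differs.
+        */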
+       struct cpu_cache_level *caches;
+       u32 cnt, i, version;
+
+       if (readn(fd, &version, sizeof(version)) != sizeof(version))
+               return -1;
+
+       if (ph->needs_swap)
+               version = bswap_32(version);
+
+       if (version != 1)
+               return -1;
+
+       if (readn(fd, &cnt, sizeof(cnt)) != sizeof(cnt))
+               return -1;
+
+       if (ph->needs_swap)
+               cnt = bswap_32(cnt);
+
+       caches = zalloc(sizeof(*caches) * cnt);
+       if (!caches)
+               return -1;
+
+       for (i = 0; i < cnt; i++) {
+               struct cpu_cache_level c;
+
+               #define _R(v)                                           \
+                       if (readn(fd, &c.v, sizeof(u32)) != sizeof(u32))\
+                               goto out_free_caches;                   \
+                       if (ph->needs_swap)                             \
+                               c.v = bswap_32(c.v);                    \
+
+               _R(level)
+               _R(line_size)
+               _R(sets)
+               _R(ways)
+               #undef _R
+
+               #define _R(v)                           \
+                       c.v = do_read_string(fd, ph);   \
+                       if (!c.v)                       \
+                               goto out_free_caches;
+
+               _R(type)
+               _R(size)
+               _R(map)
+               #undef _R
+
+               caches[i] = c;
+       }
+
+       ph->env.caches = caches;
+       ph->env.caches_cnt = cnt;
+       return 0;
+out_free_caches:
+       free(caches);
+       return -1;
+}
+
 struct feature_ops {
        int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
        void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1962,6 +2231,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
        FEAT_OPP(HEADER_GROUP_DESC,     group_desc),
        FEAT_OPP(HEADER_AUXTRACE,       auxtrace),
        FEAT_OPA(HEADER_STAT,           stat),
+       FEAT_OPF(HEADER_CACHE,          cache),
 };
 
 struct header_print_data {
index cff9892452ee393764726a12895ad95d36f766fe..3d87ca823c0ae55591e753f6f928802e831c0e3c 100644 (file)
@@ -32,6 +32,7 @@ enum {
        HEADER_GROUP_DESC,
        HEADER_AUXTRACE,
        HEADER_STAT,
+       HEADER_CACHE,
        HEADER_LAST_FEATURE,
        HEADER_FEAT_BITS        = 256,
 };
index dc1e41c9b054b186c0d9349d97e9d0f467099ac0..43a98a4dc1e1e90c7079fba26929022ec0959a31 100644 (file)
@@ -6,7 +6,8 @@
 static int autocorrect;
 static struct cmdnames aliases;
 
-static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+static int perf_unknown_cmd_config(const char *var, const char *value,
+                                  void *cb __maybe_unused)
 {
        if (!strcmp(var, "help.autocorrect"))
                autocorrect = perf_config_int(var,value);
@@ -14,7 +15,7 @@ static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
        if (!prefixcmp(var, "alias."))
                add_cmdname(&aliases, var + 6, strlen(var + 6));
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int levenshtein_compare(const void *p1, const void *p2)
index 68a7612019dc3c3be315604693b68170183044d6..290b3cbf68772de883bff4fee5c2294234dbefd3 100644 (file)
@@ -179,6 +179,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        if (h->transaction)
                hists__new_col_len(hists, HISTC_TRANSACTION,
                                   hist_entry__transaction_len());
+
+       if (h->trace_output)
+               hists__new_col_len(hists, HISTC_TRACE, strlen(h->trace_output));
 }
 
 void hists__output_recalc_col_len(struct hists *hists, int max_rows)
@@ -245,6 +248,8 @@ static void he_stat__decay(struct he_stat *he_stat)
        /* XXX need decay for weight too? */
 }
 
+static void hists__delete_entry(struct hists *hists, struct hist_entry *he);
+
 static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
 {
        u64 prev_period = he->stat.period;
@@ -260,21 +265,45 @@ static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
 
        diff = prev_period - he->stat.period;
 
-       hists->stats.total_period -= diff;
-       if (!he->filtered)
-               hists->stats.total_non_filtered_period -= diff;
+       if (!he->depth) {
+               hists->stats.total_period -= diff;
+               if (!he->filtered)
+                       hists->stats.total_non_filtered_period -= diff;
+       }
+
+       if (!he->leaf) {
+               struct hist_entry *child;
+               struct rb_node *node = rb_first(&he->hroot_out);
+               while (node) {
+                       child = rb_entry(node, struct hist_entry, rb_node);
+                       node = rb_next(node);
+
+                       if (hists__decay_entry(hists, child))
+                               hists__delete_entry(hists, child);
+               }
+       }
 
        return he->stat.period == 0;
 }
 
 static void hists__delete_entry(struct hists *hists, struct hist_entry *he)
 {
-       rb_erase(&he->rb_node, &hists->entries);
+       struct rb_root *root_in;
+       struct rb_root *root_out;
 
-       if (sort__need_collapse)
-               rb_erase(&he->rb_node_in, &hists->entries_collapsed);
-       else
-               rb_erase(&he->rb_node_in, hists->entries_in);
+       if (he->parent_he) {
+               root_in  = &he->parent_he->hroot_in;
+               root_out = &he->parent_he->hroot_out;
+       } else {
+               if (sort__need_collapse)
+                       root_in = &hists->entries_collapsed;
+               else
+                       root_in = hists->entries_in;
+               root_out = &hists->entries;
+       }
+
+       rb_erase(&he->rb_node_in, root_in);
+       rb_erase(&he->rb_node, root_out);
 
        --hists->nr_entries;
        if (!he->filtered)
@@ -393,6 +422,9 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
                }
                INIT_LIST_HEAD(&he->pairs.node);
                thread__get(he->thread);
+
+               if (!symbol_conf.report_hierarchy)
+                       he->leaf = true;
        }
 
        return he;
@@ -405,6 +437,16 @@ static u8 symbol__parent_filter(const struct symbol *parent)
        return 0;
 }
 
+static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
+{
+       if (!symbol_conf.use_callchain)
+               return;
+
+       he->hists->callchain_period += period;
+       if (!he->filtered)
+               he->hists->callchain_non_filtered_period += period;
+}
+
 static struct hist_entry *hists__findnew_entry(struct hists *hists,
                                               struct hist_entry *entry,
                                               struct addr_location *al,
@@ -432,8 +474,10 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
                cmp = hist_entry__cmp(he, entry);
 
                if (!cmp) {
-                       if (sample_self)
+                       if (sample_self) {
                                he_stat__add_period(&he->stat, period, weight);
+                               hist_entry__add_callchain_period(he, period);
+                       }
                        if (symbol_conf.cumulate_callchain)
                                he_stat__add_period(he->stat_acc, period, weight);
 
@@ -466,6 +510,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
        if (!he)
                return NULL;
 
+       if (sample_self)
+               hist_entry__add_callchain_period(he, period);
        hists->nr_entries++;
 
        rb_link_node(&he->rb_node_in, parent, p);
@@ -951,10 +997,15 @@ out:
 int64_t
 hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 {
+       struct hists *hists = left->hists;
        struct perf_hpp_fmt *fmt;
        int64_t cmp = 0;
 
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   !perf_hpp__defined_dynamic_entry(fmt, hists))
+                       continue;
+
                cmp = fmt->cmp(fmt, left, right);
                if (cmp)
                        break;
@@ -966,10 +1017,15 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 int64_t
 hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 {
+       struct hists *hists = left->hists;
        struct perf_hpp_fmt *fmt;
        int64_t cmp = 0;
 
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   !perf_hpp__defined_dynamic_entry(fmt, hists))
+                       continue;
+
                cmp = fmt->collapse(fmt, left, right);
                if (cmp)
                        break;
@@ -1005,18 +1061,251 @@ void hist_entry__delete(struct hist_entry *he)
        free(he);
 }
 
+/*
+ * If this is not the last column, then we need to pad it according to the
+ * pre-calculated max length for this column, otherwise don't bother adding
+ * spaces because that would break viewing this with, for instance, 'less',
+ * which would show tons of trailing spaces when a long C++ demangled method
+ * name is sampled.
+ */
+int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp,
+                                  struct perf_hpp_fmt *fmt, int printed)
+{
+       if (!list_is_last(&fmt->list, &he->hists->hpp_list->fields)) {
+               const int width = fmt->width(fmt, hpp, hists_to_evsel(he->hists));
+               if (printed < width) {
+                       advance_hpp(hpp, printed);
+                       printed = scnprintf(hpp->buf, hpp->size, "%-*s", width - printed, " ");
+               }
+       }
+
+       return printed;
+}
+
 /*
  * collapse the histogram
  */
 
-bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
-                                 struct rb_root *root, struct hist_entry *he)
+static void hists__apply_filters(struct hists *hists, struct hist_entry *he);
+static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *he,
+                                      enum hist_filter type);
+
+typedef bool (*fmt_chk_fn)(struct perf_hpp_fmt *fmt);
+
+static bool check_thread_entry(struct perf_hpp_fmt *fmt)
+{
+       return perf_hpp__is_thread_entry(fmt) || perf_hpp__is_comm_entry(fmt);
+}
+
+static void hist_entry__check_and_remove_filter(struct hist_entry *he,
+                                               enum hist_filter type,
+                                               fmt_chk_fn check)
+{
+       struct perf_hpp_fmt *fmt;
+       bool type_match = false;
+       struct hist_entry *parent = he->parent_he;
+
+       switch (type) {
+       case HIST_FILTER__THREAD:
+               if (symbol_conf.comm_list == NULL &&
+                   symbol_conf.pid_list == NULL &&
+                   symbol_conf.tid_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__DSO:
+               if (symbol_conf.dso_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__SYMBOL:
+               if (symbol_conf.sym_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__PARENT:
+       case HIST_FILTER__GUEST:
+       case HIST_FILTER__HOST:
+       case HIST_FILTER__SOCKET:
+       default:
+               return;
+       }
+
+       /* if the entry is filtered by its own format, it must have matching filter bits */
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               if (check(fmt)) {
+                       type_match = true;
+                       break;
+               }
+       }
+
+       if (type_match) {
+               /*
+                * If the filter is for current level entry, propagate
+                * filter marker to parents.  The marker bit was
+                * already set by default so it only needs to clear
+                * non-filtered entries.
+                */
+               if (!(he->filtered & (1 << type))) {
+                       while (parent) {
+                               parent->filtered &= ~(1 << type);
+                               parent = parent->parent_he;
+                       }
+               }
+       } else {
+               /*
+                * If current entry doesn't have matching formats, set
+                * filter marker for upper level entries.  It will be
+                * cleared if any of its lower level entries is not filtered.
+                *
+                * For lower-level entries, it inherits parent's
+                * filter bit so that lower level entries of a
+                * non-filtered entry won't set the filter marker.
+                */
+               if (parent == NULL)
+                       he->filtered |= (1 << type);
+               else
+                       he->filtered |= (parent->filtered & (1 << type));
+       }
+}
+
+static void hist_entry__apply_hierarchy_filters(struct hist_entry *he)
+{
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__THREAD,
+                                           check_thread_entry);
+
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__DSO,
+                                           perf_hpp__is_dso_entry);
+
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__SYMBOL,
+                                           perf_hpp__is_sym_entry);
+
+       hists__apply_filters(he->hists, he);
+}
+
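+/*
+ * Insert a copy of 'he' at one level of the hierarchy tree, merging its
+ * stats into an existing entry when all sort keys of this level compare
+ * equal.
+ */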
+static struct hist_entry *hierarchy_insert_entry(struct hists *hists,
+                                                struct rb_root *root,
+                                                struct hist_entry *he,
+                                                struct hist_entry *parent_he,
+                                                struct perf_hpp_list *hpp_list)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter, *new;
+       struct perf_hpp_fmt *fmt;
+       int64_t cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node_in);
+
+               cmp = 0;
+               perf_hpp_list__for_each_sort_list(hpp_list, fmt) {
+                       cmp = fmt->collapse(fmt, iter, he);
+                       if (cmp)
+                               break;
+               }
+
+               if (!cmp) {
+                       he_stat__add_stat(&iter->stat, &he->stat);
+                       return iter;
+               }
+
+               if (cmp < 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+
+       new = hist_entry__new(he, true);
+       if (new == NULL)
+               return NULL;
+
+       hists->nr_entries++;
+
+       /* save related format list for output */
+       new->hpp_list = hpp_list;
+       new->parent_he = parent_he;
+
+       hist_entry__apply_hierarchy_filters(new);
+
+       /* ownership of some fields moves to 'new' or stays with 'he', depending on the format */
+       perf_hpp_list__for_each_sort_list(hpp_list, fmt) {
+               if (perf_hpp__is_trace_entry(fmt) || perf_hpp__is_dynamic_entry(fmt))
+                       he->trace_output = NULL;
+               else
+                       new->trace_output = NULL;
+
+               if (perf_hpp__is_srcline_entry(fmt))
+                       he->srcline = NULL;
+               else
+                       new->srcline = NULL;
+
+               if (perf_hpp__is_srcfile_entry(fmt))
+                       he->srcfile = NULL;
+               else
+                       new->srcfile = NULL;
+       }
+
+       rb_link_node(&new->rb_node_in, parent, p);
+       rb_insert_color(&new->rb_node_in, root);
+       return new;
+}
+
+static int hists__hierarchy_insert_entry(struct hists *hists,
+                                        struct rb_root *root,
+                                        struct hist_entry *he)
+{
+       struct perf_hpp_list_node *node;
+       struct hist_entry *new_he = NULL;
+       struct hist_entry *parent = NULL;
+       int depth = 0;
+       int ret = 0;
+
+       list_for_each_entry(node, &hists->hpp_formats, list) {
+               /* skip period (overhead) and elided columns */
+               if (node->level == 0 || node->skip)
+                       continue;
+
+               /* insert copy of 'he' for each fmt into the hierarchy */
+               new_he = hierarchy_insert_entry(hists, root, he, parent, &node->hpp);
+               if (new_he == NULL) {
+                       ret = -1;
+                       break;
+               }
+
+               root = &new_he->hroot_in;
+               new_he->depth = depth++;
+               parent = new_he;
+       }
+
+       if (new_he) {
+               new_he->leaf = true;
+
+               if (symbol_conf.use_callchain) {
+                       callchain_cursor_reset(&callchain_cursor);
+                       if (callchain_merge(&callchain_cursor,
+                                           new_he->callchain,
+                                           he->callchain) < 0)
+                               ret = -1;
+               }
+       }
+
+       /* 'he' is no longer used */
+       hist_entry__delete(he);
+
+       /* return 0 (or -1) since it already applied filters */
+       return ret;
+}
+
+int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root,
+                                struct hist_entry *he)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct hist_entry *iter;
        int64_t cmp;
 
+       if (symbol_conf.report_hierarchy)
+               return hists__hierarchy_insert_entry(hists, root, he);
+
        while (*p != NULL) {
                parent = *p;
                iter = rb_entry(parent, struct hist_entry, rb_node_in);
@@ -1024,18 +1313,21 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
                cmp = hist_entry__collapse(iter, he);
 
                if (!cmp) {
+                       int ret = 0;
+
                        he_stat__add_stat(&iter->stat, &he->stat);
                        if (symbol_conf.cumulate_callchain)
                                he_stat__add_stat(iter->stat_acc, he->stat_acc);
 
                        if (symbol_conf.use_callchain) {
                                callchain_cursor_reset(&callchain_cursor);
-                               callchain_merge(&callchain_cursor,
-                                               iter->callchain,
-                                               he->callchain);
+                               if (callchain_merge(&callchain_cursor,
+                                                   iter->callchain,
+                                                   he->callchain) < 0)
+                                       ret = -1;
                        }
                        hist_entry__delete(he);
-                       return false;
+                       return ret;
                }
 
                if (cmp < 0)
@@ -1047,7 +1339,7 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
 
        rb_link_node(&he->rb_node_in, parent, p);
        rb_insert_color(&he->rb_node_in, root);
-       return true;
+       return 1;
 }
 
 struct rb_root *hists__get_rotate_entries_in(struct hists *hists)
@@ -1073,14 +1365,15 @@ static void hists__apply_filters(struct hists *hists, struct hist_entry *he)
        hists__filter_entry_by_socket(hists, he);
 }
 
-void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
+int hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
 {
        struct rb_root *root;
        struct rb_node *next;
        struct hist_entry *n;
+       int ret;
 
        if (!sort__need_collapse)
-               return;
+               return 0;
 
        hists->nr_entries = 0;
 
@@ -1095,7 +1388,11 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
                next = rb_next(&n->rb_node_in);
 
                rb_erase(&n->rb_node_in, root);
-               if (hists__collapse_insert_entry(hists, &hists->entries_collapsed, n)) {
+               ret = hists__collapse_insert_entry(hists, &hists->entries_collapsed, n);
+               if (ret < 0)
+                       return -1;
+
+               if (ret) {
                        /*
                         * If it wasn't combined with one of the entries already
                         * collapsed, we need to apply the filters that may have
@@ -1106,14 +1403,16 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
                if (prog)
                        ui_progress__update(prog, 1);
        }
+       return 0;
 }
 
 static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b)
 {
+       struct hists *hists = a->hists;
        struct perf_hpp_fmt *fmt;
        int64_t cmp = 0;
 
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
                if (perf_hpp__should_skip(fmt, a->hists))
                        continue;
 
@@ -1154,6 +1453,113 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h)
        hists->stats.total_period += h->stat.period;
 }
 
+static void hierarchy_recalc_total_periods(struct hists *hists)
+{
+       struct rb_node *node;
+       struct hist_entry *he;
+
+       node = rb_first(&hists->entries);
+
+       hists->stats.total_period = 0;
+       hists->stats.total_non_filtered_period = 0;
+
+       /*
+        * recalculate total period using top-level entries only
+        * since lower level entries only see non-filtered entries
+        * while upper level entries hold the sum of both.
+        */
+       while (node) {
+               he = rb_entry(node, struct hist_entry, rb_node);
+               node = rb_next(node);
+
+               hists->stats.total_period += he->stat.period;
+               if (!he->filtered)
+                       hists->stats.total_non_filtered_period += he->stat.period;
+       }
+}
+
+static void hierarchy_insert_output_entry(struct rb_root *root,
+                                         struct hist_entry *he)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       struct perf_hpp_fmt *fmt;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (hist_entry__sort(he, iter) > 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, root);
+
+       /* update column width of dynamic entry */
+       perf_hpp_list__for_each_sort_list(he->hpp_list, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt))
+                       fmt->sort(fmt, he, NULL);
+       }
+}
+
+static void hists__hierarchy_output_resort(struct hists *hists,
+                                          struct ui_progress *prog,
+                                          struct rb_root *root_in,
+                                          struct rb_root *root_out,
+                                          u64 min_callchain_hits,
+                                          bool use_callchain)
+{
+       struct rb_node *node;
+       struct hist_entry *he;
+
+       *root_out = RB_ROOT;
+       node = rb_first(root_in);
+
+       while (node) {
+               he = rb_entry(node, struct hist_entry, rb_node_in);
+               node = rb_next(node);
+
+               hierarchy_insert_output_entry(root_out, he);
+
+               if (prog)
+                       ui_progress__update(prog, 1);
+
+               if (!he->leaf) {
+                       hists__hierarchy_output_resort(hists, prog,
+                                                      &he->hroot_in,
+                                                      &he->hroot_out,
+                                                      min_callchain_hits,
+                                                      use_callchain);
+                       hists->nr_entries++;
+                       if (!he->filtered) {
+                               hists->nr_non_filtered_entries++;
+                               hists__calc_col_len(hists, he);
+                       }
+
+                       continue;
+               }
+
+               if (!use_callchain)
+                       continue;
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       u64 total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (callchain_param.min_percent / 100);
+               }
+
+               callchain_param.sort(&he->sorted_chain, he->callchain,
+                                    min_callchain_hits, &callchain_param);
+       }
+}
+
 static void __hists__insert_output_entry(struct rb_root *entries,
                                         struct hist_entry *he,
                                         u64 min_callchain_hits,
@@ -1162,10 +1568,20 @@ static void __hists__insert_output_entry(struct rb_root *entries,
        struct rb_node **p = &entries->rb_node;
        struct rb_node *parent = NULL;
        struct hist_entry *iter;
+       struct perf_hpp_fmt *fmt;
+
+       if (use_callchain) {
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       u64 total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
 
-       if (use_callchain)
+                       min_callchain_hits = total * (callchain_param.min_percent / 100);
+               }
                callchain_param.sort(&he->sorted_chain, he->callchain,
                                      min_callchain_hits, &callchain_param);
+       }
 
        while (*p != NULL) {
                parent = *p;
@@ -1179,23 +1595,41 @@ static void __hists__insert_output_entry(struct rb_root *entries,
 
        rb_link_node(&he->rb_node, parent, p);
        rb_insert_color(&he->rb_node, entries);
+
+       perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   perf_hpp__defined_dynamic_entry(fmt, he->hists))
+                       fmt->sort(fmt, he, NULL);  /* update column width */
+       }
 }
 
-void hists__output_resort(struct hists *hists, struct ui_progress *prog)
+static void output_resort(struct hists *hists, struct ui_progress *prog,
+                         bool use_callchain)
 {
        struct rb_root *root;
        struct rb_node *next;
        struct hist_entry *n;
+       u64 callchain_total;
        u64 min_callchain_hits;
-       struct perf_evsel *evsel = hists_to_evsel(hists);
-       bool use_callchain;
 
-       if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph)
-               use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN;
-       else
-               use_callchain = symbol_conf.use_callchain;
+       callchain_total = hists->callchain_period;
+       if (symbol_conf.filter_relative)
+               callchain_total = hists->callchain_non_filtered_period;
 
-       min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100);
+       min_callchain_hits = callchain_total * (callchain_param.min_percent / 100);
+
+       hists__reset_stats(hists);
+       hists__reset_col_len(hists);
+
+       if (symbol_conf.report_hierarchy) {
+               hists__hierarchy_output_resort(hists, prog,
+                                              &hists->entries_collapsed,
+                                              &hists->entries,
+                                              min_callchain_hits,
+                                              use_callchain);
+               hierarchy_recalc_total_periods(hists);
+               return;
+       }
 
        if (sort__need_collapse)
                root = &hists->entries_collapsed;
@@ -1205,9 +1639,6 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog)
        next = rb_first(root);
        hists->entries = RB_ROOT;
 
-       hists__reset_stats(hists);
-       hists__reset_col_len(hists);
-
        while (next) {
                n = rb_entry(next, struct hist_entry, rb_node_in);
                next = rb_next(&n->rb_node_in);
@@ -1223,15 +1654,136 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog)
        }
 }
 
+void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog)
+{
+       bool use_callchain;
+
+       if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph)
+               use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN;
+       else
+               use_callchain = symbol_conf.use_callchain;
+
+       output_resort(evsel__hists(evsel), prog, use_callchain);
+}
+
+void hists__output_resort(struct hists *hists, struct ui_progress *prog)
+{
+       output_resort(hists, prog, symbol_conf.use_callchain);
+}
+
+static bool can_goto_child(struct hist_entry *he, enum hierarchy_move_dir hmd)
+{
+       if (he->leaf || hmd == HMD_FORCE_SIBLING)
+               return false;
+
+       if (he->unfolded || hmd == HMD_FORCE_CHILD)
+               return true;
+
+       return false;
+}
+
+struct rb_node *rb_hierarchy_last(struct rb_node *node)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       while (can_goto_child(he, HMD_NORMAL)) {
+               node = rb_last(&he->hroot_out);
+               he = rb_entry(node, struct hist_entry, rb_node);
+       }
+       return node;
+}
+
+struct rb_node *__rb_hierarchy_next(struct rb_node *node, enum hierarchy_move_dir hmd)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       if (can_goto_child(he, hmd))
+               node = rb_first(&he->hroot_out);
+       else
+               node = rb_next(node);
+
+       while (node == NULL) {
+               he = he->parent_he;
+               if (he == NULL)
+                       break;
+
+               node = rb_next(&he->rb_node);
+       }
+       return node;
+}
+
+struct rb_node *rb_hierarchy_prev(struct rb_node *node)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       node = rb_prev(node);
+       if (node)
+               return rb_hierarchy_last(node);
+
+       he = he->parent_he;
+       if (he == NULL)
+               return NULL;
+
+       return &he->rb_node;
+}
+
+bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit)
+{
+       struct rb_node *node;
+       struct hist_entry *child;
+       float percent;
+
+       if (he->leaf)
+               return false;
+
+       node = rb_first(&he->hroot_out);
+       child = rb_entry(node, struct hist_entry, rb_node);
+
+       while (node && child->filtered) {
+               node = rb_next(node);
+               child = rb_entry(node, struct hist_entry, rb_node);
+       }
+
+       if (node)
+               percent = hist_entry__get_percent_limit(child);
+       else
+               percent = 0;
+
+       return node && percent >= limit;
+}
+
 static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h,
                                       enum hist_filter filter)
 {
        h->filtered &= ~(1 << filter);
+
+       if (symbol_conf.report_hierarchy) {
+               struct hist_entry *parent = h->parent_he;
+
+               while (parent) {
+                       he_stat__add_stat(&parent->stat, &h->stat);
+
+                       parent->filtered &= ~(1 << filter);
+
+                       if (parent->filtered)
+                               goto next;
+
+                       /* force fold unfiltered entry for simplicity */
+                       parent->unfolded = false;
+                       parent->has_no_entry = false;
+                       parent->row_offset = 0;
+                       parent->nr_rows = 0;
+next:
+                       parent = parent->parent_he;
+               }
+       }
+
        if (h->filtered)
                return;
 
        /* force fold unfiltered entry for simplicity */
        h->unfolded = false;
+       h->has_no_entry = false;
        h->row_offset = 0;
        h->nr_rows = 0;
 
@@ -1254,28 +1806,6 @@ static bool hists__filter_entry_by_dso(struct hists *hists,
        return false;
 }
 
-void hists__filter_by_dso(struct hists *hists)
-{
-       struct rb_node *nd;
-
-       hists->stats.nr_non_filtered_samples = 0;
-
-       hists__reset_filter_stats(hists);
-       hists__reset_col_len(hists);
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (symbol_conf.exclude_other && !h->parent)
-                       continue;
-
-               if (hists__filter_entry_by_dso(hists, h))
-                       continue;
-
-               hists__remove_entry_filter(hists, h, HIST_FILTER__DSO);
-       }
-}
-
 static bool hists__filter_entry_by_thread(struct hists *hists,
                                          struct hist_entry *he)
 {
@@ -1288,25 +1818,6 @@ static bool hists__filter_entry_by_thread(struct hists *hists,
        return false;
 }
 
-void hists__filter_by_thread(struct hists *hists)
-{
-       struct rb_node *nd;
-
-       hists->stats.nr_non_filtered_samples = 0;
-
-       hists__reset_filter_stats(hists);
-       hists__reset_col_len(hists);
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (hists__filter_entry_by_thread(hists, h))
-                       continue;
-
-               hists__remove_entry_filter(hists, h, HIST_FILTER__THREAD);
-       }
-}
-
 static bool hists__filter_entry_by_symbol(struct hists *hists,
                                          struct hist_entry *he)
 {
@@ -1320,7 +1831,21 @@ static bool hists__filter_entry_by_symbol(struct hists *hists,
        return false;
 }
 
-void hists__filter_by_symbol(struct hists *hists)
+static bool hists__filter_entry_by_socket(struct hists *hists,
+                                         struct hist_entry *he)
+{
+       if ((hists->socket_filter > -1) &&
+           (he->socket != hists->socket_filter)) {
+               he->filtered |= (1 << HIST_FILTER__SOCKET);
+               return true;
+       }
+
+       return false;
+}
+
+typedef bool (*filter_fn_t)(struct hists *hists, struct hist_entry *he);
+
+static void hists__filter_by_type(struct hists *hists, int type, filter_fn_t filter)
 {
        struct rb_node *nd;
 
@@ -1332,42 +1857,155 @@ void hists__filter_by_symbol(struct hists *hists)
        for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
-               if (hists__filter_entry_by_symbol(hists, h))
+               if (filter(hists, h))
                        continue;
 
-               hists__remove_entry_filter(hists, h, HIST_FILTER__SYMBOL);
+               hists__remove_entry_filter(hists, h, type);
        }
 }
 
-static bool hists__filter_entry_by_socket(struct hists *hists,
-                                         struct hist_entry *he)
+static void resort_filtered_entry(struct rb_root *root, struct hist_entry *he)
 {
-       if ((hists->socket_filter > -1) &&
-           (he->socket != hists->socket_filter)) {
-               he->filtered |= (1 << HIST_FILTER__SOCKET);
-               return true;
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       struct rb_root new_root = RB_ROOT;
+       struct rb_node *nd;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (hist_entry__sort(he, iter) > 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
        }
 
-       return false;
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, root);
+
+       if (he->leaf || he->filtered)
+               return;
+
+       nd = rb_first(&he->hroot_out);
+       while (nd) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               nd = rb_next(nd);
+               rb_erase(&h->rb_node, &he->hroot_out);
+
+               resort_filtered_entry(&new_root, h);
+       }
+
+       he->hroot_out = new_root;
 }
 
-void hists__filter_by_socket(struct hists *hists)
+static void hists__filter_hierarchy(struct hists *hists, int type, const void *arg)
 {
        struct rb_node *nd;
+       struct rb_root new_root = RB_ROOT;
 
        hists->stats.nr_non_filtered_samples = 0;
 
        hists__reset_filter_stats(hists);
        hists__reset_col_len(hists);
 
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+       nd = rb_first(&hists->entries);
+       while (nd) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+               int ret;
 
-               if (hists__filter_entry_by_socket(hists, h))
-                       continue;
+               ret = hist_entry__filter(h, type, arg);
 
-               hists__remove_entry_filter(hists, h, HIST_FILTER__SOCKET);
+               /*
+                * case 1. non-matching type
+                * zero out the period, set filter marker and move to child
+                */
+               if (ret < 0) {
+                       memset(&h->stat, 0, sizeof(h->stat));
+                       h->filtered |= (1 << type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_CHILD);
+               }
+               /*
+                * case 2. matched type (filter out)
+                * set filter marker and move to next
+                */
+               else if (ret == 1) {
+                       h->filtered |= (1 << type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING);
+               }
+               /*
+                * case 3. ok (not filtered)
+                * add period to hists and parents, erase the filter marker
+                * and move to next sibling
+                */
+               else {
+                       hists__remove_entry_filter(hists, h, type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING);
+               }
+       }
+
+       hierarchy_recalc_total_periods(hists);
+
+       /*
+        * resort output after applying a new filter since a filter in a lower
+        * hierarchy can change periods in an upper hierarchy.
+        */
+       nd = rb_first(&hists->entries);
+       while (nd) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               nd = rb_next(nd);
+               rb_erase(&h->rb_node, &hists->entries);
+
+               resort_filtered_entry(&new_root, h);
        }
+
+       hists->entries = new_root;
+}
+
+void hists__filter_by_thread(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__THREAD,
+                                       hists->thread_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__THREAD,
+                                     hists__filter_entry_by_thread);
+}
+
+void hists__filter_by_dso(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__DSO,
+                                       hists->dso_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__DSO,
+                                     hists__filter_entry_by_dso);
+}
+
+void hists__filter_by_symbol(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__SYMBOL,
+                                       hists->symbol_filter_str);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__SYMBOL,
+                                     hists__filter_entry_by_symbol);
+}
+
+void hists__filter_by_socket(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__SOCKET,
+                                       &hists->socket_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__SOCKET,
+                                     hists__filter_entry_by_socket);
 }
 
 void events_stats__inc(struct events_stats *stats, u32 type)
@@ -1585,7 +2223,7 @@ int perf_hist_config(const char *var, const char *value)
        return 0;
 }
 
-int __hists__init(struct hists *hists)
+int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list)
 {
        memset(hists, 0, sizeof(*hists));
        hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT;
@@ -1594,6 +2232,8 @@ int __hists__init(struct hists *hists)
        hists->entries = RB_ROOT;
        pthread_mutex_init(&hists->lock, NULL);
        hists->socket_filter = -1;
+       hists->hpp_list = hpp_list;
+       INIT_LIST_HEAD(&hists->hpp_formats);
        return 0;
 }
 
@@ -1622,15 +2262,26 @@ static void hists__delete_all_entries(struct hists *hists)
 static void hists_evsel__exit(struct perf_evsel *evsel)
 {
        struct hists *hists = evsel__hists(evsel);
+       struct perf_hpp_fmt *fmt, *pos;
+       struct perf_hpp_list_node *node, *tmp;
 
        hists__delete_all_entries(hists);
+
+       list_for_each_entry_safe(node, tmp, &hists->hpp_formats, list) {
+               perf_hpp_list__for_each_format_safe(&node->hpp, fmt, pos) {
+                       list_del(&fmt->list);
+                       free(fmt);
+               }
+               list_del(&node->list);
+               free(node);
+       }
 }
 
 static int hists_evsel__init(struct perf_evsel *evsel)
 {
        struct hists *hists = evsel__hists(evsel);
 
-       __hists__init(hists);
+       __hists__init(hists, &perf_hpp_list);
        return 0;
 }
 
@@ -1649,3 +2300,9 @@ int hists__init(void)
 
        return err;
 }
+
+void perf_hpp_list__init(struct perf_hpp_list *list)
+{
+       INIT_LIST_HEAD(&list->fields);
+       INIT_LIST_HEAD(&list->sorts);
+}
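
The flat (non-hierarchy) DSO/thread/symbol/socket filters above are now routed through one generic loop driven by a filter_fn_t predicate, while the hierarchy mode goes through hists__filter_hierarchy() followed by a re-sort. A minimal standalone sketch of that callback dispatch, with illustrative types and names rather than the perf ones:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for struct hist_entry: only what a filter looks at. */
struct entry {
        const char *dso;        /* object the sample hit     */
        unsigned    filtered;   /* bitmask of active filters */
};

typedef bool (*filter_fn_t)(const void *arg, const struct entry *e);

/* returns true when the entry must be filtered out */
static bool filter_by_dso(const void *arg, const struct entry *e)
{
        return arg && strcmp(e->dso, arg) != 0;
}

static void filter_by_type(struct entry *entries, int n, unsigned type,
                           const void *arg, filter_fn_t filter)
{
        for (int i = 0; i < n; i++) {
                if (filter(arg, &entries[i]))
                        entries[i].filtered |= 1u << type;   /* mark filtered  */
                else
                        entries[i].filtered &= ~(1u << type); /* clear the bit */
        }
}

int main(void)
{
        struct entry e[] = { { "libc.so", 0 }, { "vmlinux", 0 } };

        filter_by_type(e, 2, 0 /* DSO-like filter slot */, "vmlinux", filter_by_dso);
        printf("%s filtered=%u, %s filtered=%u\n",
               e[0].dso, e[0].filtered, e[1].dso, e[1].filtered);
        return 0;
}

The perf version additionally resets per-hists stats and column widths before the loop; that bookkeeping is omitted here.
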
index d4ec4822a1038611a7269aa0df2eb37171a647a6..ead18c82294faf881f1d4bc89be653e27ff5c708 100644 (file)
@@ -66,6 +66,8 @@ struct hists {
        struct rb_root          entries_collapsed;
        u64                     nr_entries;
        u64                     nr_non_filtered_entries;
+       u64                     callchain_period;
+       u64                     callchain_non_filtered_period;
        struct thread           *thread_filter;
        const struct dso        *dso_filter;
        const char              *uid_filter_str;
@@ -75,6 +77,9 @@ struct hists {
        u64                     event_stream;
        u16                     col_len[HISTC_NR_COLS];
        int                     socket_filter;
+       struct perf_hpp_list    *hpp_list;
+       struct list_head        hpp_formats;
+       int                     nr_hpp_node;
 };
 
 struct hist_entry_iter;
@@ -121,15 +126,21 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
 int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
                         int max_stack_depth, void *arg);
 
+struct perf_hpp;
+struct perf_hpp_fmt;
+
 int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
 int hist_entry__transaction_len(void);
 int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size,
                              struct hists *hists);
+int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp,
+                                  struct perf_hpp_fmt *fmt, int printed);
 void hist_entry__delete(struct hist_entry *he);
 
+void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog);
 void hists__output_resort(struct hists *hists, struct ui_progress *prog);
-void hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
+int hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
 
 void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel);
 void hists__delete_entries(struct hists *hists);
@@ -185,10 +196,10 @@ static inline struct hists *evsel__hists(struct perf_evsel *evsel)
 }
 
 int hists__init(void);
-int __hists__init(struct hists *hists);
+int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list);
 
 struct rb_root *hists__get_rotate_entries_in(struct hists *hists);
-bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
+int hists__collapse_insert_entry(struct hists *hists,
                                  struct rb_root *root, struct hist_entry *he);
 
 struct perf_hpp {
@@ -214,28 +225,64 @@ struct perf_hpp_fmt {
                            struct hist_entry *a, struct hist_entry *b);
        int64_t (*sort)(struct perf_hpp_fmt *fmt,
                        struct hist_entry *a, struct hist_entry *b);
+       bool (*equal)(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b);
+       void (*free)(struct perf_hpp_fmt *fmt);
 
        struct list_head list;
        struct list_head sort_list;
        bool elide;
        int len;
        int user_len;
+       int idx;
+       int level;
+};
+
+struct perf_hpp_list {
+       struct list_head fields;
+       struct list_head sorts;
 };
 
-extern struct list_head perf_hpp__list;
-extern struct list_head perf_hpp__sort_list;
+extern struct perf_hpp_list perf_hpp_list;
+
+struct perf_hpp_list_node {
+       struct list_head        list;
+       struct perf_hpp_list    hpp;
+       int                     level;
+       bool                    skip;
+};
+
+void perf_hpp_list__column_register(struct perf_hpp_list *list,
+                                   struct perf_hpp_fmt *format);
+void perf_hpp_list__register_sort_field(struct perf_hpp_list *list,
+                                       struct perf_hpp_fmt *format);
+
+static inline void perf_hpp__column_register(struct perf_hpp_fmt *format)
+{
+       perf_hpp_list__column_register(&perf_hpp_list, format);
+}
+
+static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format)
+{
+       perf_hpp_list__register_sort_field(&perf_hpp_list, format);
+}
+
+#define perf_hpp_list__for_each_format(_list, format) \
+       list_for_each_entry(format, &(_list)->fields, list)
 
-#define perf_hpp__for_each_format(format) \
-       list_for_each_entry(format, &perf_hpp__list, list)
+#define perf_hpp_list__for_each_format_safe(_list, format, tmp)        \
+       list_for_each_entry_safe(format, tmp, &(_list)->fields, list)
 
-#define perf_hpp__for_each_format_safe(format, tmp)    \
-       list_for_each_entry_safe(format, tmp, &perf_hpp__list, list)
+#define perf_hpp_list__for_each_sort_list(_list, format) \
+       list_for_each_entry(format, &(_list)->sorts, sort_list)
 
-#define perf_hpp__for_each_sort_list(format) \
-       list_for_each_entry(format, &perf_hpp__sort_list, sort_list)
+#define perf_hpp_list__for_each_sort_list_safe(_list, format, tmp)     \
+       list_for_each_entry_safe(format, tmp, &(_list)->sorts, sort_list)
 
-#define perf_hpp__for_each_sort_list_safe(format, tmp) \
-       list_for_each_entry_safe(format, tmp, &perf_hpp__sort_list, sort_list)
+#define hists__for_each_format(hists, format) \
+       perf_hpp_list__for_each_format((hists)->hpp_list, format)
+
+#define hists__for_each_sort_list(hists, format) \
+       perf_hpp_list__for_each_sort_list((hists)->hpp_list, format)
 
 extern struct perf_hpp_fmt perf_hpp__format[];
 
@@ -254,21 +301,29 @@ enum {
 };
 
 void perf_hpp__init(void);
-void perf_hpp__column_register(struct perf_hpp_fmt *format);
 void perf_hpp__column_unregister(struct perf_hpp_fmt *format);
-void perf_hpp__column_enable(unsigned col);
-void perf_hpp__column_disable(unsigned col);
 void perf_hpp__cancel_cumulate(void);
+void perf_hpp__setup_output_field(struct perf_hpp_list *list);
+void perf_hpp__reset_output_field(struct perf_hpp_list *list);
+void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
+int perf_hpp__setup_hists_formats(struct perf_hpp_list *list,
+                                 struct perf_evlist *evlist);
 
-void perf_hpp__register_sort_field(struct perf_hpp_fmt *format);
-void perf_hpp__setup_output_field(void);
-void perf_hpp__reset_output_field(void);
-void perf_hpp__append_sort_keys(void);
 
 bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format);
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b);
 bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format);
 bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists);
+bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
+
+struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
+
+int hist_entry__filter(struct hist_entry *he, int type, const void *arg);
 
 static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format,
                                         struct hists *hists)
@@ -372,6 +427,7 @@ static inline int script_browse(const char *script_opt __maybe_unused)
 #endif
 
 unsigned int hists__sort_list_width(struct hists *hists);
+unsigned int hists__overhead_width(struct hists *hists);
 
 void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
                          struct perf_sample *sample, bool nonany_branch_mode);
@@ -381,4 +437,26 @@ int parse_filter_percentage(const struct option *opt __maybe_unused,
                            const char *arg, int unset __maybe_unused);
 int perf_hist_config(const char *var, const char *value);
 
+void perf_hpp_list__init(struct perf_hpp_list *list);
+
+enum hierarchy_move_dir {
+       HMD_NORMAL,
+       HMD_FORCE_SIBLING,
+       HMD_FORCE_CHILD,
+};
+
+struct rb_node *rb_hierarchy_last(struct rb_node *node);
+struct rb_node *__rb_hierarchy_next(struct rb_node *node,
+                                   enum hierarchy_move_dir hmd);
+struct rb_node *rb_hierarchy_prev(struct rb_node *node);
+
+static inline struct rb_node *rb_hierarchy_next(struct rb_node *node)
+{
+       return __rb_hierarchy_next(node, HMD_NORMAL);
+}
+
+#define HIERARCHY_INDENT  3
+
+bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit);
+
 #endif /* __PERF_HIST_H */
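
The rb_hierarchy_*() helpers and the hierarchy_move_dir enum declared above let the browsers walk the nested hroot_out trees as one flat pre-order sequence, descending into a child level only when the entry is unfolded (or when HMD_FORCE_CHILD is requested). A self-contained sketch of that traversal on a toy parent/child/sibling tree, where the simplified flags below stand in for hist_entry::leaf and ::unfolded:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        const char *name;
        bool leaf;       /* stands in for hist_entry::leaf     */
        bool unfolded;   /* stands in for hist_entry::unfolded */
        struct node *parent, *first_child, *next_sibling;
};

enum move_dir { MD_NORMAL, MD_FORCE_SIBLING, MD_FORCE_CHILD };

static bool can_goto_child(const struct node *n, enum move_dir md)
{
        if (n->leaf || md == MD_FORCE_SIBLING)
                return false;
        return n->unfolded || md == MD_FORCE_CHILD;
}

/* pre-order step: child first when allowed, else next sibling,
 * else climb until some ancestor has a following sibling */
static struct node *hierarchy_next(struct node *n, enum move_dir md)
{
        struct node *next = can_goto_child(n, md) ? n->first_child
                                                  : n->next_sibling;

        while (next == NULL) {
                n = n->parent;
                if (n == NULL)
                        break;
                next = n->next_sibling;
        }
        return next;
}

int main(void)
{
        struct node bash_ls = { "ls",   true,  false, NULL, NULL, NULL };
        struct node bash    = { "bash", false, true,  NULL, &bash_ls, NULL };
        struct node perf    = { "perf", true,  false, NULL, NULL, NULL };

        bash.next_sibling = &perf;
        bash_ls.parent = &bash;

        for (struct node *n = &bash; n; n = hierarchy_next(n, MD_NORMAL))
                printf("%s\n", n->name);   /* bash, ls, perf */
        return 0;
}

rb_hierarchy_last() and rb_hierarchy_prev() are the mirror images of this step: deepest last child of an unfolded subtree, and predecessor via the parent chain.
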
diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h
new file mode 100644 (file)
index 0000000..a1e99da
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef __JIT_H__
+#define __JIT_H__
+
+#include <data.h>
+
+extern int jit_process(struct perf_session *session,
+                      struct perf_data_file *output,
+                      struct machine *machine,
+                      char *filename,
+                      pid_t pid,
+                      u64 *nbytes);
+
+extern int jit_inject_record(const char *filename);
+
+#endif /* __JIT_H__ */
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
new file mode 100644 (file)
index 0000000..cd272cc
--- /dev/null
@@ -0,0 +1,697 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <byteswap.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "util.h"
+#include "event.h"
+#include "debug.h"
+#include "evlist.h"
+#include "symbol.h"
+#include "strlist.h"
+#include <elf.h>
+
+#include "session.h"
+#include "jit.h"
+#include "jitdump.h"
+#include "genelf.h"
+#include "../builtin.h"
+
+struct jit_buf_desc {
+       struct perf_data_file *output;
+       struct perf_session *session;
+       struct machine *machine;
+       union jr_entry   *entry;
+       void             *buf;
+       uint64_t         sample_type;
+       size_t           bufsize;
+       FILE             *in;
+       bool             needs_bswap; /* handles cross-endianness */
+       void             *debug_data;
+       size_t           nr_debug_entries;
+       uint32_t         code_load_count;
+       u64              bytes_written;
+       struct rb_root   code_root;
+       char             dir[PATH_MAX];
+};
+
+struct debug_line_info {
+       unsigned long vma;
+       unsigned int lineno;
+       /* The filename format is unspecified: absolute path, relative, etc. */
+       char const filename[0];
+};
+
+struct jit_tool {
+       struct perf_tool tool;
+       struct perf_data_file   output;
+       struct perf_data_file   input;
+       u64 bytes_written;
+};
+
+#define hmax(a, b) ((a) > (b) ? (a) : (b))
+#define get_jit_tool(t) (container_of(tool, struct jit_tool, tool))
+
+static int
+jit_emit_elf(char *filename,
+            const char *sym,
+            uint64_t code_addr,
+            const void *code,
+            int csize,
+            void *debug,
+            int nr_debug_entries)
+{
+       int ret, fd;
+
+       if (verbose > 0)
+               fprintf(stderr, "write ELF image %s\n", filename);
+
+       fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0644);
+       if (fd == -1) {
+               pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(errno));
+               return -1;
+       }
+
+        ret = jit_write_elf(fd, code_addr, sym, (const void *)code, csize, debug, nr_debug_entries);
+
+        close(fd);
+
+        if (ret)
+                unlink(filename);
+
+       return ret;
+}
+
+static void
+jit_close(struct jit_buf_desc *jd)
+{
+       if (!(jd && jd->in))
+               return;
+       funlockfile(jd->in);
+       fclose(jd->in);
+       jd->in = NULL;
+}
+
+static int
+jit_validate_events(struct perf_session *session)
+{
+       struct perf_evsel *evsel;
+
+       /*
+        * check that all events use CLOCK_MONOTONIC
+        */
+       evlist__for_each(session->evlist, evsel) {
+               if (evsel->attr.use_clockid == 0 || evsel->attr.clockid != CLOCK_MONOTONIC)
+                       return -1;
+       }
+       return 0;
+}
+
+static int
+jit_open(struct jit_buf_desc *jd, const char *name)
+{
+       struct jitheader header;
+       struct jr_prefix *prefix;
+       ssize_t bs, bsz = 0;
+       void *n, *buf = NULL;
+       int ret, retval = -1;
+
+       jd->in = fopen(name, "r");
+       if (!jd->in)
+               return -1;
+
+       bsz = hmax(sizeof(header), sizeof(*prefix));
+
+       buf = malloc(bsz);
+       if (!buf)
+               goto error;
+
+       /*
+        * protect from writer modifying the file while we are reading it
+        */
+       flockfile(jd->in);
+
+       ret = fread(buf, sizeof(header), 1, jd->in);
+       if (ret != 1)
+               goto error;
+
+       memcpy(&header, buf, sizeof(header));
+
+       if (header.magic != JITHEADER_MAGIC) {
+               if (header.magic != JITHEADER_MAGIC_SW)
+                       goto error;
+               jd->needs_bswap = true;
+       }
+
+       if (jd->needs_bswap) {
+               header.version    = bswap_32(header.version);
+               header.total_size = bswap_32(header.total_size);
+               header.pid        = bswap_32(header.pid);
+               header.elf_mach   = bswap_32(header.elf_mach);
+               header.timestamp  = bswap_64(header.timestamp);
+               header.flags      = bswap_64(header.flags);
+       }
+
+       if (verbose > 2)
+               pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n",
+                       header.version,
+                       header.total_size,
+                       (unsigned long long)header.timestamp,
+                       header.pid,
+                       header.elf_mach);
+
+       if (header.flags & JITDUMP_FLAGS_RESERVED) {
+               pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n",
+                      (unsigned long long)header.flags & JITDUMP_FLAGS_RESERVED);
+               goto error;
+       }
+
+       /*
+        * validate event is using the correct clockid
+        */
+       if (jit_validate_events(jd->session)) {
+               pr_err("error, jitted code must be sampled with perf record -k 1\n");
+               goto error;
+       }
+
+       bs = header.total_size - sizeof(header);
+
+       if (bs > bsz) {
+               n = realloc(buf, bs);
+               if (!n)
+                       goto error;
+               bsz = bs;
+               buf = n;
+               /* read extra we do not know about */
+               ret = fread(buf, bs - bsz, 1, jd->in);
+               if (ret != 1)
+                       goto error;
+       }
+       /*
+        * keep dirname for generating files and mmap records
+        */
+       strcpy(jd->dir, name);
+       dirname(jd->dir);
+
+       return 0;
+error:
+       funlockfile(jd->in);
+       fclose(jd->in);
+       return retval;
+}
+
+static union jr_entry *
+jit_get_next_entry(struct jit_buf_desc *jd)
+{
+       struct jr_prefix *prefix;
+       union jr_entry *jr;
+       void *addr;
+       size_t bs, size;
+       int id, ret;
+
+       if (!(jd && jd->in))
+               return NULL;
+
+       if (jd->buf == NULL) {
+               size_t sz = getpagesize();
+               if (sz < sizeof(*prefix))
+                       sz = sizeof(*prefix);
+
+               jd->buf = malloc(sz);
+               if (jd->buf == NULL)
+                       return NULL;
+
+               jd->bufsize = sz;
+       }
+
+       prefix = jd->buf;
+
+       /*
+        * file is still locked at this point
+        */
+       ret = fread(prefix, sizeof(*prefix), 1, jd->in);
+       if (ret  != 1)
+               return NULL;
+
+       if (jd->needs_bswap) {
+               prefix->id         = bswap_32(prefix->id);
+               prefix->total_size = bswap_32(prefix->total_size);
+               prefix->timestamp  = bswap_64(prefix->timestamp);
+       }
+       id   = prefix->id;
+       size = prefix->total_size;
+
+       bs = (size_t)size;
+       if (bs < sizeof(*prefix))
+               return NULL;
+
+       if (id >= JIT_CODE_MAX) {
+               pr_warning("next_entry: unknown prefix %d, skipping\n", id);
+               return NULL;
+       }
+       if (bs > jd->bufsize) {
+               void *n;
+               n = realloc(jd->buf, bs);
+               if (!n)
+                       return NULL;
+               jd->buf = n;
+               jd->bufsize = bs;
+       }
+
+       addr = ((void *)jd->buf) + sizeof(*prefix);
+
+       ret = fread(addr, bs - sizeof(*prefix), 1, jd->in);
+       if (ret != 1)
+               return NULL;
+
+       jr = (union jr_entry *)jd->buf;
+
+       switch(id) {
+       case JIT_CODE_DEBUG_INFO:
+               if (jd->needs_bswap) {
+                       uint64_t n;
+                       jr->info.code_addr = bswap_64(jr->info.code_addr);
+                       jr->info.nr_entry  = bswap_64(jr->info.nr_entry);
+                       for (n = 0 ; n < jr->info.nr_entry; n++) {
+                               jr->info.entries[n].addr    = bswap_64(jr->info.entries[n].addr);
+                               jr->info.entries[n].lineno  = bswap_32(jr->info.entries[n].lineno);
+                               jr->info.entries[n].discrim = bswap_32(jr->info.entries[n].discrim);
+                       }
+               }
+               break;
+       case JIT_CODE_CLOSE:
+               break;
+       case JIT_CODE_LOAD:
+               if (jd->needs_bswap) {
+                       jr->load.pid       = bswap_32(jr->load.pid);
+                       jr->load.tid       = bswap_32(jr->load.tid);
+                       jr->load.vma       = bswap_64(jr->load.vma);
+                       jr->load.code_addr = bswap_64(jr->load.code_addr);
+                       jr->load.code_size = bswap_64(jr->load.code_size);
+                       jr->load.code_index= bswap_64(jr->load.code_index);
+               }
+               jd->code_load_count++;
+               break;
+       case JIT_CODE_MOVE:
+               if (jd->needs_bswap) {
+                       jr->move.pid           = bswap_32(jr->move.pid);
+                       jr->move.tid           = bswap_32(jr->move.tid);
+                       jr->move.vma           = bswap_64(jr->move.vma);
+                       jr->move.old_code_addr = bswap_64(jr->move.old_code_addr);
+                       jr->move.new_code_addr = bswap_64(jr->move.new_code_addr);
+                       jr->move.code_size     = bswap_64(jr->move.code_size);
+                       jr->move.code_index    = bswap_64(jr->move.code_index);
+               }
+               break;
+       case JIT_CODE_MAX:
+       default:
+               return NULL;
+       }
+       return jr;
+}
+
+static int
+jit_inject_event(struct jit_buf_desc *jd, union perf_event *event)
+{
+       ssize_t size;
+
+       size = perf_data_file__write(jd->output, event, event->header.size);
+       if (size < 0)
+               return -1;
+
+       jd->bytes_written += size;
+       return 0;
+}
+
+static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       struct perf_sample sample;
+       union perf_event *event;
+       struct perf_tool *tool = jd->session->tool;
+       uint64_t code, addr;
+       uintptr_t uaddr;
+       char *filename;
+       struct stat st;
+       size_t size;
+       u16 idr_size;
+       const char *sym;
+       uint32_t count;
+       int ret, csize;
+       pid_t pid, tid;
+       struct {
+               u32 pid, tid;
+               u64 time;
+       } *id;
+
+       pid   = jr->load.pid;
+       tid   = jr->load.tid;
+       csize = jr->load.code_size;
+       addr  = jr->load.code_addr;
+       sym   = (void *)((unsigned long)jr + sizeof(jr->load));
+       code  = (unsigned long)jr + jr->load.p.total_size - csize;
+       count = jr->load.code_index;
+       idr_size = jd->machine->id_hdr_size;
+
+       event = calloc(1, sizeof(*event) + idr_size);
+       if (!event)
+               return -1;
+
+       filename = event->mmap2.filename;
+       size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%u.so",
+                       jd->dir,
+                       pid,
+                       count);
+
+       size++; /* for \0 */
+
+       size = PERF_ALIGN(size, sizeof(u64));
+       uaddr = (uintptr_t)code;
+       ret = jit_emit_elf(filename, sym, addr, (const void *)uaddr, csize, jd->debug_data, jd->nr_debug_entries);
+
+       if (jd->debug_data && jd->nr_debug_entries) {
+               free(jd->debug_data);
+               jd->debug_data = NULL;
+               jd->nr_debug_entries = 0;
+       }
+
+       if (ret) {
+               free(event);
+               return -1;
+       }
+       if (stat(filename, &st))
+               memset(&st, 0, sizeof(st));
+
+       event->mmap2.header.type = PERF_RECORD_MMAP2;
+       event->mmap2.header.misc = PERF_RECORD_MISC_USER;
+       event->mmap2.header.size = (sizeof(event->mmap2) -
+                       (sizeof(event->mmap2.filename) - size) + idr_size);
+
+       event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET;
+       event->mmap2.start = addr;
+       event->mmap2.len   = csize;
+       event->mmap2.pid   = pid;
+       event->mmap2.tid   = tid;
+       event->mmap2.ino   = st.st_ino;
+       event->mmap2.maj   = major(st.st_dev);
+       event->mmap2.min   = minor(st.st_dev);
+       event->mmap2.prot  = st.st_mode;
+       event->mmap2.flags = MAP_SHARED;
+       event->mmap2.ino_generation = 1;
+
+       id = (void *)((unsigned long)event + event->mmap.header.size - idr_size);
+       if (jd->sample_type & PERF_SAMPLE_TID) {
+               id->pid  = pid;
+               id->tid  = tid;
+       }
+       if (jd->sample_type & PERF_SAMPLE_TIME)
+               id->time = jr->load.p.timestamp;
+
+       /*
+        * create pseudo sample to induce dso hit increment
+        * use first address as sample address
+        */
+       memset(&sample, 0, sizeof(sample));
+       sample.pid  = pid;
+       sample.tid  = tid;
+       sample.time = id->time;
+       sample.ip   = addr;
+
+       ret = perf_event__process_mmap2(tool, event, &sample, jd->machine);
+       if (ret)
+               return ret;
+
+       ret = jit_inject_event(jd, event);
+       /*
+        * mark dso as used to generate buildid in the header
+        */
+       if (!ret)
+               build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine);
+
+       return ret;
+}
+
+static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       struct perf_sample sample;
+       union perf_event *event;
+       struct perf_tool *tool = jd->session->tool;
+       char *filename;
+       size_t size;
+       struct stat st;
+       u16 idr_size;
+       int ret;
+       pid_t pid, tid;
+       struct {
+               u32 pid, tid;
+               u64 time;
+       } *id;
+
+       pid = jr->move.pid;
+       tid =  jr->move.tid;
+       idr_size = jd->machine->id_hdr_size;
+
+       /*
+        * +16 to account for sample_id_all (hack)
+        */
+       event = calloc(1, sizeof(*event) + 16);
+       if (!event)
+               return -1;
+
+       filename = event->mmap2.filename;
+       size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%"PRIu64,
+                jd->dir,
+                pid,
+                jr->move.code_index);
+
+       size++; /* for \0 */
+
+       if (stat(filename, &st))
+               memset(&st, 0, sizeof(st));
+
+       size = PERF_ALIGN(size, sizeof(u64));
+
+       event->mmap2.header.type = PERF_RECORD_MMAP2;
+       event->mmap2.header.misc = PERF_RECORD_MISC_USER;
+       event->mmap2.header.size = (sizeof(event->mmap2) -
+                       (sizeof(event->mmap2.filename) - size) + idr_size);
+       event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET;
+       event->mmap2.start = jr->move.new_code_addr;
+       event->mmap2.len   = jr->move.code_size;
+       event->mmap2.pid   = pid;
+       event->mmap2.tid   = tid;
+       event->mmap2.ino   = st.st_ino;
+       event->mmap2.maj   = major(st.st_dev);
+       event->mmap2.min   = minor(st.st_dev);
+       event->mmap2.prot  = st.st_mode;
+       event->mmap2.flags = MAP_SHARED;
+       event->mmap2.ino_generation = 1;
+
+       id = (void *)((unsigned long)event + event->mmap.header.size - idr_size);
+       if (jd->sample_type & PERF_SAMPLE_TID) {
+               id->pid  = pid;
+               id->tid  = tid;
+       }
+       if (jd->sample_type & PERF_SAMPLE_TIME)
+               id->time = jr->load.p.timestamp;
+
+       /*
+        * create pseudo sample to induce dso hit increment
+        * use first address as sample address
+        */
+       memset(&sample, 0, sizeof(sample));
+       sample.pid  = pid;
+       sample.tid  = tid;
+       sample.time = id->time;
+       sample.ip   = jr->move.new_code_addr;
+
+       ret = perf_event__process_mmap2(tool, event, &sample, jd->machine);
+       if (ret)
+               return ret;
+
+       ret = jit_inject_event(jd, event);
+       if (!ret)
+               build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine);
+
+       return ret;
+}
+
+static int jit_repipe_debug_info(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       void *data;
+       size_t sz;
+
+       if (!(jd && jr))
+               return -1;
+
+       sz  = jr->prefix.total_size - sizeof(jr->info);
+       data = malloc(sz);
+       if (!data)
+               return -1;
+
+       memcpy(data, &jr->info.entries, sz);
+
+       jd->debug_data       = data;
+
+       /*
+        * we must use nr_entry instead of size here because
+        * we cannot distinguish actual entry from padding otherwise
+        */
+       jd->nr_debug_entries = jr->info.nr_entry;
+
+       return 0;
+}
+
+static int
+jit_process_dump(struct jit_buf_desc *jd)
+{
+       union jr_entry *jr;
+       int ret = 0;
+
+       while ((jr = jit_get_next_entry(jd))) {
+               switch(jr->prefix.id) {
+               case JIT_CODE_LOAD:
+                       ret = jit_repipe_code_load(jd, jr);
+                       break;
+               case JIT_CODE_MOVE:
+                       ret = jit_repipe_code_move(jd, jr);
+                       break;
+               case JIT_CODE_DEBUG_INFO:
+                       ret = jit_repipe_debug_info(jd, jr);
+                       break;
+               default:
+                       ret = 0;
+                       continue;
+               }
+       }
+       return ret;
+}
+
+static int
+jit_inject(struct jit_buf_desc *jd, char *path)
+{
+       int ret;
+
+       if (verbose > 0)
+               fprintf(stderr, "injecting: %s\n", path);
+
+       ret = jit_open(jd, path);
+       if (ret)
+               return -1;
+
+       ret = jit_process_dump(jd);
+
+       jit_close(jd);
+
+       if (verbose > 0)
+               fprintf(stderr, "injected: %s (%d)\n", path, ret);
+
+       return 0;
+}
+
+/*
+ * File must be with pattern .../jit-XXXX.dump
+ * where XXXX is the PID of the process which did the mmap()
+ * as captured in the RECORD_MMAP record
+ */
+static int
+jit_detect(char *mmap_name, pid_t pid)
+{
+       char *p;
+       char *end = NULL;
+       pid_t pid2;
+
+       if (verbose > 2)
+               fprintf(stderr, "jit marker trying : %s\n", mmap_name);
+       /*
+        * get file name
+        */
+       p = strrchr(mmap_name, '/');
+       if (!p)
+               return -1;
+
+       /*
+        * match prefix
+        */
+       if (strncmp(p, "/jit-", 5))
+               return -1;
+
+       /*
+        * skip prefix
+        */
+       p += 5;
+
+       /*
+        * must be followed by a pid
+        */
+       if (!isdigit(*p))
+               return -1;
+
+       pid2 = (int)strtol(p, &end, 10);
+       if (!end)
+               return -1;
+
+       /*
+        * pid does not match mmap pid
+        * pid==0 in system-wide mode (synthesized)
+        */
+       if (pid && pid2 != pid)
+               return -1;
+       /*
+        * validate suffix
+        */
+       if (strcmp(end, ".dump"))
+               return -1;
+
+       if (verbose > 0)
+               fprintf(stderr, "jit marker found: %s\n", mmap_name);
+
+       return 0;
+}
+
+int
+jit_process(struct perf_session *session,
+           struct perf_data_file *output,
+           struct machine *machine,
+           char *filename,
+           pid_t pid,
+           u64 *nbytes)
+{
+       struct perf_evsel *first;
+       struct jit_buf_desc jd;
+       int ret;
+
+       /*
+        * first, detect marker mmap (i.e., the jitdump mmap)
+        */
+       if (jit_detect(filename, pid))
+               return 0;
+
+       memset(&jd, 0, sizeof(jd));
+
+       jd.session = session;
+       jd.output  = output;
+       jd.machine = machine;
+
+       /*
+        * track sample_type to compute id_all layout
+        * perf sets the same sample type to all events as of now
+        */
+       first = perf_evlist__first(session->evlist);
+       jd.sample_type = first->attr.sample_type;
+
+       *nbytes = 0;
+
+       ret = jit_inject(&jd, filename);
+       if (!ret) {
+               *nbytes = jd.bytes_written;
+               ret = 1;
+       }
+
+       return ret;
+}
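
jit_detect() above only fires if an mmap of .../jit-<pid>.dump made it into perf.data. One plausible way for a JIT agent to guarantee that (a hedged sketch, not part of this patch): keep a page of the dump file mapped executable for the lifetime of the process, so perf record logs the marker mapping that perf inject --jit later matches.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Hypothetical agent-side helper: map one executable page of the jitdump
 * file so the kernel emits an mmap record for .../jit-<pid>.dump.
 */
static void *jit_marker_mmap(const char *dump_path)
{
        long psz = sysconf(_SC_PAGESIZE);
        int fd = open(dump_path, O_RDONLY);
        void *marker;

        if (fd < 0)
                return NULL;

        marker = mmap(NULL, psz, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
        close(fd);                      /* the mapping stays valid */

        return marker == MAP_FAILED ? NULL : marker;
}

int main(void)
{
        char path[64];

        snprintf(path, sizeof(path), "jit-%d.dump", getpid());
        if (!jit_marker_mmap(path))
                perror("jit_marker_mmap");
        return 0;
}

Combined with 'perf record -k 1' (CLOCK_MONOTONIC timestamps, as jit_validate_events() enforces), this is enough for jit_process() to pick the file up.
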
diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h
new file mode 100644 (file)
index 0000000..b66c1f5
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * jitdump.h: jitted code info encapsulation file format
+ *
+ * Adapted from OProfile GPLv2 support jitdump.h:
+ * Copyright 2007 OProfile authors
+ * Jens Wilke
+ * Daniel Hansel
+ * Copyright IBM Corporation 2007
+ */
+#ifndef JITDUMP_H
+#define JITDUMP_H
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdint.h>
+
+/* JiTD */
+#define JITHEADER_MAGIC                0x4A695444
+#define JITHEADER_MAGIC_SW     0x4454694A
+
+#define PADDING_8ALIGNED(x) ((((x) + 7) & 7) ^ 7)
+
+#define JITHEADER_VERSION 1
+
+enum jitdump_flags_bits {
+       JITDUMP_FLAGS_MAX_BIT,
+};
+
+#define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? \
+                               (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0)
+
+struct jitheader {
+       uint32_t magic;         /* characters "jItD" */
+       uint32_t version;       /* header version */
+       uint32_t total_size;    /* total size of header */
+       uint32_t elf_mach;      /* elf mach target */
+       uint32_t pad1;          /* reserved */
+       uint32_t pid;           /* JIT process id */
+       uint64_t timestamp;     /* timestamp */
+       uint64_t flags;         /* flags */
+};
+
+enum jit_record_type {
+       JIT_CODE_LOAD           = 0,
+        JIT_CODE_MOVE           = 1,
+       JIT_CODE_DEBUG_INFO     = 2,
+       JIT_CODE_CLOSE          = 3,
+
+       JIT_CODE_MAX,
+};
+
+/* record prefix (mandatory in each record) */
+struct jr_prefix {
+       uint32_t id;
+       uint32_t total_size;
+       uint64_t timestamp;
+};
+
+struct jr_code_load {
+       struct jr_prefix p;
+
+       uint32_t pid;
+       uint32_t tid;
+       uint64_t vma;
+       uint64_t code_addr;
+       uint64_t code_size;
+       uint64_t code_index;
+};
+
+struct jr_code_close {
+       struct jr_prefix p;
+};
+
+struct jr_code_move {
+       struct jr_prefix p;
+
+       uint32_t pid;
+       uint32_t tid;
+       uint64_t vma;
+       uint64_t old_code_addr;
+       uint64_t new_code_addr;
+       uint64_t code_size;
+       uint64_t code_index;
+};
+
+struct debug_entry {
+       uint64_t addr;
+       int lineno;         /* source line number starting at 1 */
+       int discrim;        /* column discriminator, 0 is default */
+       const char name[0]; /* null terminated filename, \xff\0 if same as previous entry */
+};
+
+struct jr_code_debug_info {
+       struct jr_prefix p;
+
+       uint64_t code_addr;
+       uint64_t nr_entry;
+       struct debug_entry entries[0];
+};
+
+union jr_entry {
+        struct jr_code_debug_info info;
+        struct jr_code_close close;
+        struct jr_code_load load;
+        struct jr_code_move move;
+        struct jr_prefix prefix;
+};
+
+static inline struct debug_entry *
+debug_entry_next(struct debug_entry *ent)
+{
+       void *a = ent + 1;
+       size_t l = strlen(ent->name) + 1;
+       return a + l;
+}
+
+static inline char *
+debug_entry_file(struct debug_entry *ent)
+{
+       void *a = ent + 1;
+       return a;
+}
+
+#endif /* !JITDUMP_H */
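
A minimal producer for the format defined above, as a hedged sketch: the structs are trimmed copies of the ones in this header, x86-64 (EM_X86_64) is assumed, the code blob, symbol name and addresses are hypothetical placeholders, and timestamps come from CLOCK_MONOTONIC to satisfy jit_validate_events(). The JIT_CODE_LOAD payload layout matches what jit_repipe_code_load() expects: the record struct, then the NUL-terminated symbol name, then the native code bytes.

#include <elf.h>      /* EM_X86_64 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/* trimmed copies of the structures defined above */
struct jitheader {
        uint32_t magic, version, total_size, elf_mach, pad1, pid;
        uint64_t timestamp, flags;
};
struct jr_prefix { uint32_t id, total_size; uint64_t timestamp; };
struct jr_code_load {
        struct jr_prefix p;
        uint32_t pid, tid;
        uint64_t vma, code_addr, code_size, code_index;
};

static uint64_t monotonic_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);   /* pairs with: perf record -k 1 */
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
        static const unsigned char code[] = { 0xc3 };   /* hypothetical jitted blob: ret */
        const char *sym = "jitted_func";                /* hypothetical symbol name      */
        uint64_t addr = 0x100000;                       /* hypothetical code address     */
        char path[64];

        snprintf(path, sizeof(path), "jit-%d.dump", getpid());  /* name jit_detect() expects */
        FILE *f = fopen(path, "w");
        if (!f)
                return 1;

        struct jitheader h = {
                .magic = 0x4A695444,            /* JITHEADER_MAGIC   */
                .version = 1,                   /* JITHEADER_VERSION */
                .total_size = sizeof(h),
                .elf_mach = EM_X86_64,
                .pid = getpid(),
                .timestamp = monotonic_ns(),
        };
        fwrite(&h, sizeof(h), 1, f);

        struct jr_code_load r = {
                .p = { .id = 0 /* JIT_CODE_LOAD */,
                       .total_size = sizeof(r) + strlen(sym) + 1 + sizeof(code),
                       .timestamp = monotonic_ns(), },
                .pid = getpid(), .tid = getpid(),
                .vma = addr, .code_addr = addr,
                .code_size = sizeof(code), .code_index = 0,
        };
        fwrite(&r, sizeof(r), 1, f);
        fwrite(sym, strlen(sym) + 1, 1, f);   /* NUL-terminated symbol name */
        fwrite(code, sizeof(code), 1, f);     /* native code bytes          */

        fclose(f);
        return 0;
}

A real agent would also keep the file mmap'ed, as sketched after jitdump.c above, so the marker mapping is recorded.
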
index ae825d4ec110fcfe6a06ef5e4daf50b03a214a87..d01e73592f6e34347f0c26531b58aa6ce1d57d52 100644 (file)
@@ -122,6 +122,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
 
 bool kvm_exit_event(struct perf_evsel *evsel);
 bool kvm_entry_event(struct perf_evsel *evsel);
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
 
 #define define_exit_reasons_table(name, symbols)       \
        static struct exit_reasons_table name[] = {     \
@@ -133,8 +134,13 @@ bool kvm_entry_event(struct perf_evsel *evsel);
  */
 int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid);
 
-extern const char * const kvm_events_tp[];
+extern const char *kvm_events_tp[];
 extern struct kvm_reg_events_ops kvm_reg_events_ops[];
 extern const char * const kvm_skip_events[];
+extern const char *vcpu_id_str;
+extern const int decode_str_len;
+extern const char *kvm_exit_reason;
+extern const char *kvm_entry_trace;
+extern const char *kvm_exit_trace;
 
 #endif /* __PERF_KVM_STAT_H */
index 2c2b443df5ba796c43ee12c6c0f2061b5d7c7b80..1a3e45baf97fb575c02afe49707727aff0f628ec 100644 (file)
@@ -179,6 +179,16 @@ struct symbol *machine__find_kernel_symbol(struct machine *machine,
                                       mapp, filter);
 }
 
+static inline
+struct symbol *machine__find_kernel_symbol_by_name(struct machine *machine,
+                                                  enum map_type type, const char *name,
+                                                  struct map **mapp,
+                                                  symbol_filter_t filter)
+{
+       return map_groups__find_symbol_by_name(&machine->kmaps, type, name,
+                                              mapp, filter);
+}
+
 static inline
 struct symbol *machine__find_kernel_function(struct machine *machine, u64 addr,
                                             struct map **mapp,
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
new file mode 100644 (file)
index 0000000..75465f8
--- /dev/null
@@ -0,0 +1,255 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <api/fs/fs.h>
+#include "mem-events.h"
+#include "debug.h"
+#include "symbol.h"
+
+#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+
+struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+       E("ldlat-loads",        "cpu/mem-loads,ldlat=30/P",     "mem-loads"),
+       E("ldlat-stores",       "cpu/mem-stores/P",             "mem-stores"),
+};
+#undef E
+
+char *perf_mem_events__name(int i)
+{
+       return (char *)perf_mem_events[i].name;
+}
+
+int perf_mem_events__parse(const char *str)
+{
+       char *tok, *saveptr = NULL;
+       bool found = false;
+       char *buf;
+       int j;
+
+       /* We need a buffer that we know we can write to. */
+       buf = malloc(strlen(str) + 1);
+       if (!buf)
+               return -ENOMEM;
+
+       strcpy(buf, str);
+
+       tok = strtok_r((char *)buf, ",", &saveptr);
+
+       while (tok) {
+               for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+                       struct perf_mem_event *e = &perf_mem_events[j];
+
+                       if (strstr(e->tag, tok))
+                               e->record = found = true;
+               }
+
+               tok = strtok_r(NULL, ",", &saveptr);
+       }
+
+       free(buf);
+
+       if (found)
+               return 0;
+
+       pr_err("failed: event '%s' not found, use '-e list' to get list of available events\n", str);
+       return -1;
+}
+
+int perf_mem_events__init(void)
+{
+       const char *mnt = sysfs__mount();
+       bool found = false;
+       int j;
+
+       if (!mnt)
+               return -ENOENT;
+
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               char path[PATH_MAX];
+               struct perf_mem_event *e = &perf_mem_events[j];
+               struct stat st;
+
+               scnprintf(path, PATH_MAX, "%s/devices/cpu/events/%s",
+                         mnt, e->sysfs_name);
+
+               if (!stat(path, &st))
+                       e->supported = found = true;
+       }
+
+       return found ? 0 : -ENOENT;
+}
+
+static const char * const tlb_access[] = {
+       "N/A",
+       "HIT",
+       "MISS",
+       "L1",
+       "L2",
+       "Walker",
+       "Fault",
+};
+
+int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t l = 0, i;
+       u64 m = PERF_MEM_TLB_NA;
+       u64 hit, miss;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       if (mem_info)
+               m = mem_info->data_src.mem_dtlb;
+
+       hit = m & PERF_MEM_TLB_HIT;
+       miss = m & PERF_MEM_TLB_MISS;
+
+       /* already taken care of */
+       m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS);
+
+       for (i = 0; m && i < ARRAY_SIZE(tlb_access); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, tlb_access[i]);
+       }
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+       if (hit)
+               l += scnprintf(out + l, sz - l, " hit");
+       if (miss)
+               l += scnprintf(out + l, sz - l, " miss");
+
+       return l;
+}
+
+static const char * const mem_lvl[] = {
+       "N/A",
+       "HIT",
+       "MISS",
+       "L1",
+       "LFB",
+       "L2",
+       "L3",
+       "Local RAM",
+       "Remote RAM (1 hop)",
+       "Remote RAM (2 hops)",
+       "Remote Cache (1 hop)",
+       "Remote Cache (2 hops)",
+       "I/O",
+       "Uncached",
+};
+
+int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t i, l = 0;
+       u64 m =  PERF_MEM_LVL_NA;
+       u64 hit, miss;
+
+       if (mem_info)
+               m  = mem_info->data_src.mem_lvl;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       hit = m & PERF_MEM_LVL_HIT;
+       miss = m & PERF_MEM_LVL_MISS;
+
+       /* already taken care of */
+       m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
+
+       for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, mem_lvl[i]);
+       }
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+       if (hit)
+               l += scnprintf(out + l, sz - l, " hit");
+       if (miss)
+               l += scnprintf(out + l, sz - l, " miss");
+
+       return l;
+}
+
+static const char * const snoop_access[] = {
+       "N/A",
+       "None",
+       "Miss",
+       "Hit",
+       "HitM",
+};
+
+int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t i, l = 0;
+       u64 m = PERF_MEM_SNOOP_NA;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       if (mem_info)
+               m = mem_info->data_src.mem_snoop;
+
+       for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, snoop_access[i]);
+       }
+
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+
+       return l;
+}
+
+int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       u64 mask = PERF_MEM_LOCK_NA;
+       int l;
+
+       if (mem_info)
+               mask = mem_info->data_src.mem_lock;
+
+       if (mask & PERF_MEM_LOCK_NA)
+               l = scnprintf(out, sz, "N/A");
+       else if (mask & PERF_MEM_LOCK_LOCKED)
+               l = scnprintf(out, sz, "Yes");
+       else
+               l = scnprintf(out, sz, "No");
+
+       return l;
+}
+
+int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       int i = 0;
+
+       i += perf_mem__lvl_scnprintf(out, sz, mem_info);
+       i += scnprintf(out + i, sz - i, "|SNP ");
+       i += perf_mem__snp_scnprintf(out + i, sz - i, mem_info);
+       i += scnprintf(out + i, sz - i, "|TLB ");
+       i += perf_mem__tlb_scnprintf(out + i, sz - i, mem_info);
+       i += scnprintf(out + i, sz - i, "|LCK ");
+       i += perf_mem__lck_scnprintf(out + i, sz - i, mem_info);
+
+       return i;
+}
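
All four decoders above share the same shape: walk the source bitmask, append each set bit's name, join the names with " or ", and fall back to "N/A" when nothing matched. A stripped-down standalone version of that loop, using plain snprintf() instead of perf's scnprintf() and illustrative bit names rather than the PERF_MEM_* ABI values:

#include <stdio.h>

/* illustrative names only; the real decoders use the PERF_MEM_LVL_* bits */
static const char * const lvl_name[] = {
        "N/A", "HIT", "MISS", "L1", "LFB", "L2", "L3",
};

static int lvl_snprintf(char *out, size_t sz, unsigned long long m)
{
        size_t l = 0, i;

        out[0] = '\0';
        for (i = 0; m && i < sizeof(lvl_name) / sizeof(lvl_name[0]); i++, m >>= 1) {
                if (!(m & 0x1))
                        continue;
                if (l)
                        l += snprintf(out + l, sz - l, " or ");
                l += snprintf(out + l, sz - l, "%s", lvl_name[i]);
        }
        if (!l)
                l = snprintf(out, sz, "N/A");
        return l;
}

int main(void)
{
        char buf[64];

        lvl_snprintf(buf, sizeof(buf), (1ULL << 3) | (1ULL << 5));
        puts(buf);   /* prints: L1 or L2 */
        return 0;
}
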
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
new file mode 100644 (file)
index 0000000..5d6d930
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef __PERF_MEM_EVENTS_H
+#define __PERF_MEM_EVENTS_H
+
+#include <stdbool.h>
+
+struct perf_mem_event {
+       bool            record;
+       bool            supported;
+       const char      *tag;
+       const char      *name;
+       const char      *sysfs_name;
+};
+
+enum {
+       PERF_MEM_EVENTS__LOAD,
+       PERF_MEM_EVENTS__STORE,
+       PERF_MEM_EVENTS__MAX,
+};
+
+extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX];
+
+int perf_mem_events__parse(const char *str);
+int perf_mem_events__init(void);
+
+char *perf_mem_events__name(int i);
+
+struct mem_info;
+int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+
+int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info);
+
+#endif /* __PERF_MEM_EVENTS_H */
index 813d9b272c813b0dd749470f6209aa3a0a0607b4..4c19d5e79d8c4d626eb3fa91486cc1d83447aeeb 100644 (file)
@@ -279,7 +279,24 @@ const char *event_type(int type)
        return "unknown";
 }
 
+static int parse_events__is_name_term(struct parse_events_term *term)
+{
+       return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
+}
 
+static char *get_config_name(struct list_head *head_terms)
+{
+       struct parse_events_term *term;
+
+       if (!head_terms)
+               return NULL;
+
+       list_for_each_entry(term, head_terms, list)
+               if (parse_events__is_name_term(term))
+                       return term->val.str;
+
+       return NULL;
+}
 
 static struct perf_evsel *
 __add_event(struct list_head *list, int *idx,
@@ -333,11 +350,25 @@ static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES]
        return -1;
 }
 
+typedef int config_term_func_t(struct perf_event_attr *attr,
+                              struct parse_events_term *term,
+                              struct parse_events_error *err);
+static int config_term_common(struct perf_event_attr *attr,
+                             struct parse_events_term *term,
+                             struct parse_events_error *err);
+static int config_attr(struct perf_event_attr *attr,
+                      struct list_head *head,
+                      struct parse_events_error *err,
+                      config_term_func_t config_term);
+
 int parse_events_add_cache(struct list_head *list, int *idx,
-                          char *type, char *op_result1, char *op_result2)
+                          char *type, char *op_result1, char *op_result2,
+                          struct parse_events_error *err,
+                          struct list_head *head_config)
 {
        struct perf_event_attr attr;
-       char name[MAX_NAME_LEN];
+       LIST_HEAD(config_terms);
+       char name[MAX_NAME_LEN], *config_name;
        int cache_type = -1, cache_op = -1, cache_result = -1;
        char *op_result[2] = { op_result1, op_result2 };
        int i, n;
@@ -351,6 +382,7 @@ int parse_events_add_cache(struct list_head *list, int *idx,
        if (cache_type == -1)
                return -EINVAL;
 
+       config_name = get_config_name(head_config);
        n = snprintf(name, MAX_NAME_LEN, "%s", type);
 
        for (i = 0; (i < 2) && (op_result[i]); i++) {
@@ -391,7 +423,16 @@ int parse_events_add_cache(struct list_head *list, int *idx,
        memset(&attr, 0, sizeof(attr));
        attr.config = cache_type | (cache_op << 8) | (cache_result << 16);
        attr.type = PERF_TYPE_HW_CACHE;
-       return add_event(list, idx, &attr, name, NULL);
+
+       if (head_config) {
+               if (config_attr(&attr, head_config, err,
+                               config_term_common))
+                       return -EINVAL;
+
+               if (get_config_terms(head_config, &config_terms))
+                       return -ENOMEM;
+       }
+       return add_event(list, idx, &attr, config_name ? : name, &config_terms);
 }
 
 static void tracepoint_error(struct parse_events_error *e, int err,
@@ -540,6 +581,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
 struct __add_bpf_event_param {
        struct parse_events_evlist *data;
        struct list_head *list;
+       struct list_head *head_config;
 };
 
 static int add_bpf_event(struct probe_trace_event *tev, int fd,
@@ -556,7 +598,8 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd,
                 tev->group, tev->event, fd);
 
        err = parse_events_add_tracepoint(&new_evsels, &evlist->idx, tev->group,
-                                         tev->event, evlist->error, NULL);
+                                         tev->event, evlist->error,
+                                         param->head_config);
        if (err) {
                struct perf_evsel *evsel, *tmp;
 
@@ -581,11 +624,12 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd,
 
 int parse_events_load_bpf_obj(struct parse_events_evlist *data,
                              struct list_head *list,
-                             struct bpf_object *obj)
+                             struct bpf_object *obj,
+                             struct list_head *head_config)
 {
        int err;
        char errbuf[BUFSIZ];
-       struct __add_bpf_event_param param = {data, list};
+       struct __add_bpf_event_param param = {data, list, head_config};
        static bool registered_unprobe_atexit = false;
 
        if (IS_ERR(obj) || !obj) {
@@ -631,17 +675,99 @@ errout:
        return err;
 }
 
+static int
+parse_events_config_bpf(struct parse_events_evlist *data,
+                       struct bpf_object *obj,
+                       struct list_head *head_config)
+{
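+       /* apply object config terms (e.g. 'map:...') to the loaded BPF object */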
+       struct parse_events_term *term;
+       int error_pos;
+
+       if (!head_config || list_empty(head_config))
+               return 0;
+
+       list_for_each_entry(term, head_config, list) {
+               char errbuf[BUFSIZ];
+               int err;
+
+               if (term->type_term != PARSE_EVENTS__TERM_TYPE_USER) {
+                       snprintf(errbuf, sizeof(errbuf),
+                                "Invalid config term for BPF object");
+                       errbuf[BUFSIZ - 1] = '\0';
+
+                       data->error->idx = term->err_term;
+                       data->error->str = strdup(errbuf);
+                       return -EINVAL;
+               }
+
+               err = bpf__config_obj(obj, term, data->evlist, &error_pos);
+               if (err) {
+                       bpf__strerror_config_obj(obj, term, data->evlist,
+                                                &error_pos, err, errbuf,
+                                                sizeof(errbuf));
+                       data->error->help = strdup(
+"Hint:\tValid config terms:\n"
+"     \tmap:[<arraymap>].value<indices>=[value]\n"
+"     \tmap:[<eventmap>].event<indices>=[event]\n"
+"\n"
+"     \twhere <indices> is something like [0,3...5] or [all]\n"
+"     \t(add -v to see detail)");
+                       data->error->str = strdup(errbuf);
+                       if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE)
+                               data->error->idx = term->err_val;
+                       else
+                               data->error->idx = term->err_term + error_pos;
+                       return err;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Split config terms:
+ * perf record -e bpf.c/call-graph=fp,map:array.value[0]=1/ ...
+ *  'call-graph=fp' is 'evt config', which should be applied to each
+ *  event in bpf.c.
+ * 'map:array.value[0]=1' is 'obj config', which should be processed
+ * with parse_events_config_bpf().
+ *
+ * Move object config terms from the first list to obj_head_config.
+ */
+static void
+split_bpf_config_terms(struct list_head *evt_head_config,
+                      struct list_head *obj_head_config)
+{
+       struct parse_events_term *term, *temp;
+
+       /*
+        * Currently, all possible user config terms
+        * belong to the bpf object. parse_events__is_hardcoded_term()
+        * happens to be a good flag.
+        *
+        * See parse_events_config_bpf() and
+        * config_term_tracepoint().
+        */
+       list_for_each_entry_safe(term, temp, evt_head_config, list)
+               if (!parse_events__is_hardcoded_term(term))
+                       list_move_tail(&term->list, obj_head_config);
+}
+
 int parse_events_load_bpf(struct parse_events_evlist *data,
                          struct list_head *list,
                          char *bpf_file_name,
-                         bool source)
+                         bool source,
+                         struct list_head *head_config)
 {
+       int err;
        struct bpf_object *obj;
+       LIST_HEAD(obj_head_config);
+
+       if (head_config)
+               split_bpf_config_terms(head_config, &obj_head_config);
 
        obj = bpf__prepare_load(bpf_file_name, source);
        if (IS_ERR(obj)) {
                char errbuf[BUFSIZ];
-               int err;
 
                err = PTR_ERR(obj);
 
@@ -659,7 +785,18 @@ int parse_events_load_bpf(struct parse_events_evlist *data,
                return err;
        }
 
-       return parse_events_load_bpf_obj(data, list, obj);
+       err = parse_events_load_bpf_obj(data, list, obj, head_config);
+       if (err)
+               return err;
+       err = parse_events_config_bpf(data, obj, &obj_head_config);
+
+       /*
+        * The caller doesn't know anything about obj_head_config,
+        * so splice it back into head_config before returning.
+        */
+       if (head_config)
+               list_splice_tail(&obj_head_config, head_config);
+       return err;
 }
 
 static int
@@ -746,9 +883,59 @@ static int check_type_val(struct parse_events_term *term,
        return -EINVAL;
 }
 
-typedef int config_term_func_t(struct perf_event_attr *attr,
-                              struct parse_events_term *term,
-                              struct parse_events_error *err);
+/*
+ * Update according to parse-events.l
+ */
+static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = {
+       [PARSE_EVENTS__TERM_TYPE_USER]                  = "<sysfs term>",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG]                = "config",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG1]               = "config1",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG2]               = "config2",
+       [PARSE_EVENTS__TERM_TYPE_NAME]                  = "name",
+       [PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD]         = "period",
+       [PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ]           = "freq",
+       [PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE]    = "branch_type",
+       [PARSE_EVENTS__TERM_TYPE_TIME]                  = "time",
+       [PARSE_EVENTS__TERM_TYPE_CALLGRAPH]             = "call-graph",
+       [PARSE_EVENTS__TERM_TYPE_STACKSIZE]             = "stack-size",
+       [PARSE_EVENTS__TERM_TYPE_NOINHERIT]             = "no-inherit",
+       [PARSE_EVENTS__TERM_TYPE_INHERIT]               = "inherit",
+};
+
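+/*
+ * Set via parse_events__shrink_config_terms(): when true, config_term_avail()
+ * only accepts the config, config1, config2 and name terms.
+ */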
+static bool config_term_shrinked;
+
+static bool
+config_term_avail(int term_type, struct parse_events_error *err)
+{
+       if (term_type < 0 || term_type >= __PARSE_EVENTS__TERM_TYPE_NR) {
+               err->str = strdup("Invalid term_type");
+               return false;
+       }
+       if (!config_term_shrinked)
+               return true;
+
+       switch (term_type) {
+       case PARSE_EVENTS__TERM_TYPE_CONFIG:
+       case PARSE_EVENTS__TERM_TYPE_CONFIG1:
+       case PARSE_EVENTS__TERM_TYPE_CONFIG2:
+       case PARSE_EVENTS__TERM_TYPE_NAME:
+               return true;
+       default:
+               if (!err)
+                       return false;
+
+               /* term_type is validated so indexing is safe */
+               if (asprintf(&err->str, "'%s' is not usable in 'perf stat'",
+                            config_term_names[term_type]) < 0)
+                       err->str = NULL;
+               return false;
+       }
+}
+
+void parse_events__shrink_config_terms(void)
+{
+       config_term_shrinked = true;
+}
 
 static int config_term_common(struct perf_event_attr *attr,
                              struct parse_events_term *term,
@@ -815,6 +1002,17 @@ do {                                                                         \
                return -EINVAL;
        }
 
+       /*
+        * Check term availability after basic checking so
+        * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered.
+        *
+        * If we checked availability at the entry of this function,
+        * the user would see "'<sysfs term>' is not usable in 'perf stat'"
+        * when an invalid config term is provided for a legacy event
+        * (for example, instructions/badterm/...), which is confusing.
+        */
+       if (!config_term_avail(term->type_term, err))
+               return -EINVAL;
        return 0;
 #undef CHECK_TYPE_VAL
 }
@@ -961,23 +1159,8 @@ int parse_events_add_numeric(struct parse_events_evlist *data,
                        return -ENOMEM;
        }
 
-       return add_event(list, &data->idx, &attr, NULL, &config_terms);
-}
-
-static int parse_events__is_name_term(struct parse_events_term *term)
-{
-       return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
-}
-
-static char *pmu_event_name(struct list_head *head_terms)
-{
-       struct parse_events_term *term;
-
-       list_for_each_entry(term, head_terms, list)
-               if (parse_events__is_name_term(term))
-                       return term->val.str;
-
-       return NULL;
+       return add_event(list, &data->idx, &attr,
+                        get_config_name(head_config), &config_terms);
 }
 
 int parse_events_add_pmu(struct parse_events_evlist *data,
@@ -1024,7 +1207,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
                return -EINVAL;
 
        evsel = __add_event(list, &data->idx, &attr,
-                           pmu_event_name(head_config), pmu->cpus,
+                           get_config_name(head_config), pmu->cpus,
                            &config_terms);
        if (evsel) {
                evsel->unit = info.unit;
@@ -1386,8 +1569,7 @@ int parse_events_terms(struct list_head *terms, const char *str)
                return 0;
        }
 
-       if (data.terms)
-               parse_events__free_terms(data.terms);
+       parse_events_terms__delete(data.terms);
        return ret;
 }
 
@@ -1395,9 +1577,10 @@ int parse_events(struct perf_evlist *evlist, const char *str,
                 struct parse_events_error *err)
 {
        struct parse_events_evlist data = {
-               .list  = LIST_HEAD_INIT(data.list),
-               .idx   = evlist->nr_entries,
-               .error = err,
+               .list   = LIST_HEAD_INIT(data.list),
+               .idx    = evlist->nr_entries,
+               .error  = err,
+               .evlist = evlist,
        };
        int ret;
 
@@ -2068,12 +2251,29 @@ int parse_events_term__clone(struct parse_events_term **new,
                        term->err_term, term->err_val);
 }
 
-void parse_events__free_terms(struct list_head *terms)
+void parse_events_terms__purge(struct list_head *terms)
 {
        struct parse_events_term *term, *h;
 
-       list_for_each_entry_safe(term, h, terms, list)
+       list_for_each_entry_safe(term, h, terms, list) {
+               if (term->array.nr_ranges)
+                       free(term->array.ranges);
+               list_del_init(&term->list);
                free(term);
+       }
+}
+
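+/* Like parse_events_terms__purge(), but also frees the list head itself */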
+void parse_events_terms__delete(struct list_head *terms)
+{
+       if (!terms)
+               return;
+       parse_events_terms__purge(terms);
+       free(terms);
+}
+
+void parse_events__clear_array(struct parse_events_array *a)
+{
+       free(a->ranges);
 }
 
 void parse_events_evlist_error(struct parse_events_evlist *data,
@@ -2088,6 +2288,33 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
        WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
 }
 
+static void config_terms_list(char *buf, size_t buf_sz)
+{
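+       /* comma separated list of the config term names currently available */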
+       int i;
+       bool first = true;
+
+       buf[0] = '\0';
+       for (i = 0; i < __PARSE_EVENTS__TERM_TYPE_NR; i++) {
+               const char *name = config_term_names[i];
+
+               if (!config_term_avail(i, NULL))
+                       continue;
+               if (!name)
+                       continue;
+               if (name[0] == '<')
+                       continue;
+
+               if (strlen(buf) + strlen(name) + 2 >= buf_sz)
+                       return;
+
+               if (!first)
+                       strcat(buf, ",");
+               else
+                       first = false;
+               strcat(buf, name);
+       }
+}
+
 /*
  * Return string contains valid config terms of an event.
  * @additional_terms: For terms such as PMU sysfs terms.
@@ -2095,17 +2322,18 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
 char *parse_events_formats_error_string(char *additional_terms)
 {
        char *str;
-       static const char *static_terms = "config,config1,config2,name,"
-                                         "period,freq,branch_type,time,"
-                                         "call-graph,stack-size\n";
+       /* "branch_type" is the longest name */
+       char static_terms[__PARSE_EVENTS__TERM_TYPE_NR *
+                         (sizeof("branch_type") - 1)];
 
+       config_terms_list(static_terms, sizeof(static_terms));
        /* valid terms */
        if (additional_terms) {
-               if (!asprintf(&str, "valid terms: %s,%s",
-                             additional_terms, static_terms))
+               if (asprintf(&str, "valid terms: %s,%s",
+                            additional_terms, static_terms) < 0)
                        goto fail;
        } else {
-               if (!asprintf(&str, "valid terms: %s", static_terms))
+               if (asprintf(&str, "valid terms: %s", static_terms) < 0)
                        goto fail;
        }
        return str;
index f1a6db107241b1c8ffaf03a3514ba1549df6fd1b..67e493088e81c29c17819e94b37da4f83fae073f 100644 (file)
@@ -68,11 +68,21 @@ enum {
        PARSE_EVENTS__TERM_TYPE_CALLGRAPH,
        PARSE_EVENTS__TERM_TYPE_STACKSIZE,
        PARSE_EVENTS__TERM_TYPE_NOINHERIT,
-       PARSE_EVENTS__TERM_TYPE_INHERIT
+       PARSE_EVENTS__TERM_TYPE_INHERIT,
+       __PARSE_EVENTS__TERM_TYPE_NR,
+};
+
+struct parse_events_array {
+       size_t nr_ranges;
+       struct {
+               unsigned int start;
+               size_t length;
+       } *ranges;
 };
 
 struct parse_events_term {
        char *config;
+       struct parse_events_array array;
        union {
                char *str;
                u64  num;
@@ -98,12 +108,14 @@ struct parse_events_evlist {
        int                        idx;
        int                        nr_groups;
        struct parse_events_error *error;
+       struct perf_evlist        *evlist;
 };
 
 struct parse_events_terms {
        struct list_head *terms;
 };
 
+void parse_events__shrink_config_terms(void);
 int parse_events__is_hardcoded_term(struct parse_events_term *term);
 int parse_events_term__num(struct parse_events_term **term,
                           int type_term, char *config, u64 num,
@@ -115,7 +127,9 @@ int parse_events_term__sym_hw(struct parse_events_term **term,
                              char *config, unsigned idx);
 int parse_events_term__clone(struct parse_events_term **new,
                             struct parse_events_term *term);
-void parse_events__free_terms(struct list_head *terms);
+void parse_events_terms__delete(struct list_head *terms);
+void parse_events_terms__purge(struct list_head *terms);
+void parse_events__clear_array(struct parse_events_array *a);
 int parse_events__modifier_event(struct list_head *list, char *str, bool add);
 int parse_events__modifier_group(struct list_head *list, char *event_mod);
 int parse_events_name(struct list_head *list, char *name);
@@ -126,18 +140,22 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
 int parse_events_load_bpf(struct parse_events_evlist *data,
                          struct list_head *list,
                          char *bpf_file_name,
-                         bool source);
+                         bool source,
+                         struct list_head *head_config);
 /* Provide this function for perf test */
 struct bpf_object;
 int parse_events_load_bpf_obj(struct parse_events_evlist *data,
                              struct list_head *list,
-                             struct bpf_object *obj);
+                             struct bpf_object *obj,
+                             struct list_head *head_config);
 int parse_events_add_numeric(struct parse_events_evlist *data,
                             struct list_head *list,
                             u32 type, u64 config,
                             struct list_head *head_config);
 int parse_events_add_cache(struct list_head *list, int *idx,
-                          char *type, char *op_result1, char *op_result2);
+                          char *type, char *op_result1, char *op_result2,
+                          struct parse_events_error *error,
+                          struct list_head *head_config);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
                                void *ptr, char *type, u64 len);
 int parse_events_add_pmu(struct parse_events_evlist *data,
index 58c5831ffd5c22133f48a4c1a3a07721c71362fa..1477fbc78993c7b31d7bb1531f630cca8587d93f 100644 (file)
@@ -9,8 +9,8 @@
 %{
 #include <errno.h>
 #include "../perf.h"
-#include "parse-events-bison.h"
 #include "parse-events.h"
+#include "parse-events-bison.h"
 
 char *parse_events_get_text(yyscan_t yyscanner);
 YYSTYPE *parse_events_get_lval(yyscan_t yyscanner);
@@ -111,6 +111,7 @@ do {                                                        \
 %x mem
 %s config
 %x event
+%x array
 
 group          [^,{}/]*[{][^}]*[}][^,{}/]*
 event_pmu      [^,{}/]+[/][^/]*[/][^,{}/]*
@@ -122,7 +123,7 @@ num_dec             [0-9]+
 num_hex                0x[a-fA-F0-9]+
 num_raw_hex    [a-fA-F0-9]+
 name           [a-zA-Z_*?][a-zA-Z0-9_*?.]*
-name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.]*
+name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]*
 /* If you add a modifier you need to update check_modifier() */
 modifier_event [ukhpPGHSDI]+
 modifier_bp    [rwx]{1,3}
@@ -176,10 +177,17 @@ modifier_bp       [rwx]{1,3}
 
 }
 
+<array>{
+"]"                    { BEGIN(config); return ']'; }
+{num_dec}              { return value(yyscanner, 10); }
+{num_hex}              { return value(yyscanner, 16); }
+,                      { return ','; }
+"\.\.\."               { return PE_ARRAY_RANGE; }
+}
+
 <config>{
        /*
-        * Please update parse_events_formats_error_string any time
-        * new static term is added.
+        * Please update config_term_names when a new static term is added.
         */
 config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
 config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
@@ -196,6 +204,8 @@ no-inherit          { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
 ,                      { return ','; }
 "/"                    { BEGIN(INITIAL); return '/'; }
 {name_minus}           { return str(yyscanner, PE_NAME); }
+\[all\]                        { return PE_ARRAY_ALL; }
+"["                    { BEGIN(array); return '['; }
 }
 
 <mem>{
@@ -238,6 +248,7 @@ cpu-migrations|migrations                   { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COU
 alignment-faults                               { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); }
 emulation-faults                               { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy                                          { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
+bpf-output                                     { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
 
        /*
         * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately.
index ad379968d4c10c0fb7bb2ddc3bacce3dc1166f43..5be4a5f216d6d23889e07e225810fec40df67c30 100644 (file)
@@ -28,7 +28,7 @@ do { \
        INIT_LIST_HEAD(list);         \
 } while (0)
 
-static inc_group_count(struct list_head *list,
+static void inc_group_count(struct list_head *list,
                       struct parse_events_evlist *data)
 {
        /* Count groups only have more than 1 members */
@@ -48,6 +48,7 @@ static inc_group_count(struct list_head *list,
 %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP
 %token PE_ERROR
 %token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
+%token PE_ARRAY_ALL PE_ARRAY_RANGE
 %type <num> PE_VALUE
 %type <num> PE_VALUE_SYM_HW
 %type <num> PE_VALUE_SYM_SW
@@ -64,6 +65,7 @@ static inc_group_count(struct list_head *list,
 %type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
 %type <num> value_sym
 %type <head> event_config
+%type <head> opt_event_config
 %type <term> event_term
 %type <head> event_pmu
 %type <head> event_legacy_symbol
@@ -82,6 +84,9 @@ static inc_group_count(struct list_head *list,
 %type <head> group_def
 %type <head> group
 %type <head> groups
+%type <array> array
+%type <array> array_term
+%type <array> array_terms
 
 %union
 {
@@ -93,6 +98,7 @@ static inc_group_count(struct list_head *list,
                char *sys;
                char *event;
        } tracepoint_name;
+       struct parse_events_array array;
 }
 %%
 
@@ -211,24 +217,14 @@ event_def: event_pmu |
           event_bpf_file
 
 event_pmu:
-PE_NAME '/' event_config '/'
+PE_NAME opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, $3));
-       parse_events__free_terms($3);
-       $$ = list;
-}
-|
-PE_NAME '/' '/'
-{
-       struct parse_events_evlist *data = _data;
-       struct list_head *list;
-
-       ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, NULL));
+       ABORT_ON(parse_events_add_pmu(data, list, $1, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 |
@@ -246,7 +242,7 @@ PE_KERNEL_PMU_EVENT sep_dc
 
        ALLOC_LIST(list);
        ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events__free_terms(head);
+       parse_events_terms__delete(head);
        $$ = list;
 }
 |
@@ -266,7 +262,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 
        ALLOC_LIST(list);
        ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events__free_terms(head);
+       parse_events_terms__delete(head);
        $$ = list;
 }
 
@@ -285,7 +281,7 @@ value_sym '/' event_config '/'
 
        ALLOC_LIST(list);
        ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
-       parse_events__free_terms($3);
+       parse_events_terms__delete($3);
        $$ = list;
 }
 |
@@ -302,33 +298,39 @@ value_sym sep_slash_dc
 }
 
 event_legacy_cache:
-PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT
+PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 {
        struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5, error, $6));
+       parse_events_terms__delete($6);
        $$ = list;
 }
 |
-PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT
+PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 {
        struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL, error, $4));
+       parse_events_terms__delete($4);
        $$ = list;
 }
 |
-PE_NAME_CACHE_TYPE
+PE_NAME_CACHE_TYPE opt_event_config
 {
        struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL, error, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 
@@ -378,24 +380,7 @@ PE_PREFIX_MEM PE_VALUE sep_dc
 }
 
 event_legacy_tracepoint:
-tracepoint_name
-{
-       struct parse_events_evlist *data = _data;
-       struct parse_events_error *error = data->error;
-       struct list_head *list;
-
-       ALLOC_LIST(list);
-       if (error)
-               error->idx = @1.first_column;
-
-       if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
-                                       error, NULL))
-               return -1;
-
-       $$ = list;
-}
-|
-tracepoint_name '/' event_config '/'
+tracepoint_name opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct parse_events_error *error = data->error;
@@ -406,7 +391,7 @@ tracepoint_name '/' event_config '/'
                error->idx = @1.first_column;
 
        if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
-                                       error, $3))
+                                       error, $2))
                return -1;
 
        $$ = list;
@@ -433,49 +418,68 @@ PE_NAME ':' PE_NAME
 }
 
 event_legacy_numeric:
-PE_VALUE ':' PE_VALUE
+PE_VALUE ':' PE_VALUE opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, $4));
+       parse_events_terms__delete($4);
        $$ = list;
 }
 
 event_legacy_raw:
-PE_RAW
+PE_RAW opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 
 event_bpf_file:
-PE_BPF_OBJECT
+PE_BPF_OBJECT opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_load_bpf(data, list, $1, false));
+       ABORT_ON(parse_events_load_bpf(data, list, $1, false, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 |
-PE_BPF_SOURCE
+PE_BPF_SOURCE opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_load_bpf(data, list, $1, true));
+       ABORT_ON(parse_events_load_bpf(data, list, $1, true, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 
+opt_event_config:
+'/' event_config '/'
+{
+       $$ = $2;
+}
+|
+'/' '/'
+{
+       $$ = NULL;
+}
+|
+{
+       $$ = NULL;
+}
+
 start_terms: event_config
 {
        struct parse_events_terms *data = _data;
@@ -573,6 +577,86 @@ PE_TERM
        ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
        $$ = term;
 }
+|
+PE_NAME array '=' PE_NAME
+{
+       struct parse_events_term *term;
+
+       ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                       $1, $4, &@1, &@4));
+
+       term->array = $2;
+       $$ = term;
+}
+|
+PE_NAME array '=' PE_VALUE
+{
+       struct parse_events_term *term;
+
+       ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                       $1, $4, &@1, &@4));
+       term->array = $2;
+       $$ = term;
+}
+
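+/*
+ * Array indices for a config term: either explicit ranges like '[1,3...5]'
+ * or '[all]' (PE_ARRAY_ALL), which is represented as an empty range list.
+ */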
+array:
+'[' array_terms ']'
+{
+       $$ = $2;
+}
+|
+PE_ARRAY_ALL
+{
+       $$.nr_ranges = 0;
+       $$.ranges = NULL;
+}
+
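+/* comma separated array terms; their ranges are concatenated */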
+array_terms:
+array_terms ',' array_term
+{
+       struct parse_events_array new_array;
+
+       new_array.nr_ranges = $1.nr_ranges + $3.nr_ranges;
+       new_array.ranges = malloc(sizeof(new_array.ranges[0]) *
+                                 new_array.nr_ranges);
+       ABORT_ON(!new_array.ranges);
+       memcpy(&new_array.ranges[0], $1.ranges,
+              $1.nr_ranges * sizeof(new_array.ranges[0]));
+       memcpy(&new_array.ranges[$1.nr_ranges], $3.ranges,
+              $3.nr_ranges * sizeof(new_array.ranges[0]));
+       free($1.ranges);
+       free($3.ranges);
+       $$ = new_array;
+}
+|
+array_term
+
+array_term:
+PE_VALUE
+{
+       struct parse_events_array array;
+
+       array.nr_ranges = 1;
+       array.ranges = malloc(sizeof(array.ranges[0]));
+       ABORT_ON(!array.ranges);
+       array.ranges[0].start = $1;
+       array.ranges[0].length = 1;
+       $$ = array;
+}
+|
+PE_VALUE PE_ARRAY_RANGE PE_VALUE
+{
+       struct parse_events_array array;
+
+       ABORT_ON($3 < $1);
+       array.nr_ranges = 1;
+       array.ranges = malloc(sizeof(array.ranges[0]));
+       ABORT_ON(!array.ranges);
+       array.ranges[0].start = $1;
+       array.ranges[0].length = $3 - $1 + 1;
+       $$ = array;
+}
 
 sep_dc: ':' |
 
index b597bcc8fc781f4fa2c631044bddaab447b756e9..adef23b1352e836fec9f0f9a5290c578bb25cc43 100644 (file)
@@ -98,7 +98,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
        char scale[128];
        int fd, ret = -1;
        char path[PATH_MAX];
-       const char *lc;
+       char *lc;
 
        snprintf(path, PATH_MAX, "%s/%s.scale", dir, name);
 
@@ -123,6 +123,17 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
         */
        lc = setlocale(LC_NUMERIC, NULL);
 
+       /*
+        * The lc string may be allocated in static storage,
+        * so get a dynamic copy to make it survive the
+        * setlocale() call below.
+        */
+       lc = strdup(lc);
+       if (!lc) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
        /*
         * force to C locale to ensure kernel
         * scale string is converted correctly.
@@ -135,6 +146,8 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
        /* restore locale */
        setlocale(LC_NUMERIC, lc);
 
+       free(lc);
+
        ret = 0;
 error:
        close(fd);
@@ -153,7 +166,7 @@ static int perf_pmu__parse_unit(struct perf_pmu_alias *alias, char *dir, char *n
        if (fd == -1)
                return -1;
 
-               sret = read(fd, alias->unit, UNIT_MAX_LEN);
+       sret = read(fd, alias->unit, UNIT_MAX_LEN);
        if (sret < 0)
                goto error;
 
@@ -284,13 +297,12 @@ static int pmu_aliases_parse(char *dir, struct list_head *head)
 {
        struct dirent *evt_ent;
        DIR *event_dir;
-       int ret = 0;
 
        event_dir = opendir(dir);
        if (!event_dir)
                return -EINVAL;
 
-       while (!ret && (evt_ent = readdir(event_dir))) {
+       while ((evt_ent = readdir(event_dir))) {
                char path[PATH_MAX];
                char *name = evt_ent->d_name;
                FILE *file;
@@ -306,17 +318,19 @@ static int pmu_aliases_parse(char *dir, struct list_head *head)
 
                snprintf(path, PATH_MAX, "%s/%s", dir, name);
 
-               ret = -EINVAL;
                file = fopen(path, "r");
-               if (!file)
-                       break;
+               if (!file) {
+                       pr_debug("Cannot open %s\n", path);
+                       continue;
+               }
 
-               ret = perf_pmu__new_alias(head, dir, name, file);
+               if (perf_pmu__new_alias(head, dir, name, file) < 0)
+                       pr_debug("Cannot set up %s\n", name);
                fclose(file);
        }
 
        closedir(event_dir);
-       return ret;
+       return 0;
 }
 
 /*
@@ -354,7 +368,7 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias,
        list_for_each_entry(term, &alias->terms, list) {
                ret = parse_events_term__clone(&cloned, term);
                if (ret) {
-                       parse_events__free_terms(&list);
+                       parse_events_terms__purge(&list);
                        return ret;
                }
                list_add_tail(&cloned->list, &list);
index 544509c159cec4111e5865007b513e1197c576c4..b3aabc0d4eb0096fff41fb078a09d77988f103ff 100644 (file)
@@ -187,6 +187,9 @@ static void define_event_symbols(struct event_format *event,
                                 const char *ev_name,
                                 struct print_arg *args)
 {
+       if (args == NULL)
+               return;
+
        switch (args->type) {
        case PRINT_NULL:
                break;
index d72fafc1c800db1a699ac37e4ecb4acd0f1e3326..fbd05242b4e59786ca0e081a52729248d780f5a0 100644 (file)
@@ -205,6 +205,9 @@ static void define_event_symbols(struct event_format *event,
                                 const char *ev_name,
                                 struct print_arg *args)
 {
+       if (args == NULL)
+               return;
+
        switch (args->type) {
        case PRINT_NULL:
                break;
@@ -1091,8 +1094,6 @@ static int python_start_script(const char *script, int argc, const char **argv)
                goto error;
        }
 
-       free(command_line);
-
        set_table_handlers(tables);
 
        if (tables->db_export_mode) {
@@ -1101,6 +1102,8 @@ static int python_start_script(const char *script, int argc, const char **argv)
                        goto error;
        }
 
+       free(command_line);
+
        return err;
 error:
        Py_Finalize();
index 40b7a0d0905b8d7f01972c6e38e225f108b15133..60b3593d210dbc3b52997a693bcc8f9645687f08 100644 (file)
@@ -240,14 +240,6 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused,
        return 0;
 }
 
-static int process_build_id_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
 static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
                                       union perf_event *event __maybe_unused,
                                       struct ordered_events *oe __maybe_unused)
@@ -260,23 +252,6 @@ static int process_finished_round(struct perf_tool *tool,
                                  union perf_event *event,
                                  struct ordered_events *oe);
 
-static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *perf_session
-                                __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
-static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
-                               union perf_event *event __maybe_unused,
-                               struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
 static int skipn(int fd, off_t n)
 {
        char buf[4096];
@@ -303,10 +278,9 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
        return event->auxtrace.size;
 }
 
-static
-int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
-                                     union perf_event *event __maybe_unused,
-                                     struct perf_session *session __maybe_unused)
+static int process_event_op2_stub(struct perf_tool *tool __maybe_unused,
+                                 union perf_event *event __maybe_unused,
+                                 struct perf_session *session __maybe_unused)
 {
        dump_printf(": unhandled!\n");
        return 0;
@@ -410,7 +384,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
        if (tool->tracing_data == NULL)
                tool->tracing_data = process_event_synth_tracing_data_stub;
        if (tool->build_id == NULL)
-               tool->build_id = process_build_id_stub;
+               tool->build_id = process_event_op2_stub;
        if (tool->finished_round == NULL) {
                if (tool->ordered_events)
                        tool->finished_round = process_finished_round;
@@ -418,13 +392,13 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
                        tool->finished_round = process_finished_round_stub;
        }
        if (tool->id_index == NULL)
-               tool->id_index = process_id_index_stub;
+               tool->id_index = process_event_op2_stub;
        if (tool->auxtrace_info == NULL)
-               tool->auxtrace_info = process_event_auxtrace_info_stub;
+               tool->auxtrace_info = process_event_op2_stub;
        if (tool->auxtrace == NULL)
                tool->auxtrace = process_event_auxtrace_stub;
        if (tool->auxtrace_error == NULL)
-               tool->auxtrace_error = process_event_auxtrace_error_stub;
+               tool->auxtrace_error = process_event_op2_stub;
        if (tool->thread_map == NULL)
                tool->thread_map = process_event_thread_map_stub;
        if (tool->cpu_map == NULL)
index 1833103768cb99fbdbb92ea8910b08488fd2aac3..c8680984d2d6680f56a974a1b54056a5e74263e1 100644 (file)
@@ -22,6 +22,7 @@ cflags = getenv('CFLAGS', '').split()
 # switch off several checks (need to be at the end of cflags list)
 cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter' ]
 
+src_perf  = getenv('srctree') + '/tools/perf'
 build_lib = getenv('PYTHON_EXTBUILD_LIB')
 build_tmp = getenv('PYTHON_EXTBUILD_TMP')
 libtraceevent = getenv('LIBTRACEEVENT')
@@ -30,6 +31,9 @@ libapikfs = getenv('LIBAPI')
 ext_sources = [f.strip() for f in file('util/python-ext-sources')
                                if len(f.strip()) > 0 and f[0] != '#']
 
+# use full paths with source files
+ext_sources = map(lambda x: '%s/%s' % (src_perf, x), ext_sources)
+
 perf = Extension('perf',
                  sources = ext_sources,
                  include_dirs = ['util/include'],
index ec722346e6ffb8dd6531e94576bb15b548a1487b..93fa136b0025c02e1e4383be523fcf37c092f36e 100644 (file)
@@ -6,6 +6,7 @@
 #include "evsel.h"
 #include "evlist.h"
 #include <traceevent/event-parse.h>
+#include "mem-events.h"
 
 regex_t                parent_regex;
 const char     default_parent_pattern[] = "^sys_|^do_page_fault";
@@ -25,9 +26,19 @@ int          sort__has_parent = 0;
 int            sort__has_sym = 0;
 int            sort__has_dso = 0;
 int            sort__has_socket = 0;
+int            sort__has_thread = 0;
+int            sort__has_comm = 0;
 enum sort_mode sort__mode = SORT_MODE__NORMAL;
 
-
+/*
+ * Replaces all occurrences of the character used with the
+ *
+ * -t, --field-separator
+ *
+ * option: that option selects a special separator character and disables
+ * padding with spaces, so all occurrences of the separator in symbol names
+ * (and other output) are replaced with a '.', making it the only character
+ * that is not valid within a field.
+ */
 static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...)
 {
        int n;
@@ -80,10 +91,21 @@ static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf,
                               width, width, comm ?: "");
 }
 
+static int hist_entry__thread_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const struct thread *th = arg;
+
+       if (type != HIST_FILTER__THREAD)
+               return -1;
+
+       return th && he->thread != th;
+}
+
 struct sort_entry sort_thread = {
        .se_header      = "  Pid:Command",
        .se_cmp         = sort__thread_cmp,
        .se_snprintf    = hist_entry__thread_snprintf,
+       .se_filter      = hist_entry__thread_filter,
        .se_width_idx   = HISTC_THREAD,
 };
 
@@ -121,6 +143,7 @@ struct sort_entry sort_comm = {
        .se_collapse    = sort__comm_collapse,
        .se_sort        = sort__comm_sort,
        .se_snprintf    = hist_entry__comm_snprintf,
+       .se_filter      = hist_entry__thread_filter,
        .se_width_idx   = HISTC_COMM,
 };
 
@@ -170,10 +193,21 @@ static int hist_entry__dso_snprintf(struct hist_entry *he, char *bf,
        return _hist_entry__dso_snprintf(he->ms.map, bf, size, width);
 }
 
+static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->ms.map || he->ms.map->dso != dso);
+}
+
 struct sort_entry sort_dso = {
        .se_header      = "Shared Object",
        .se_cmp         = sort__dso_cmp,
        .se_snprintf    = hist_entry__dso_snprintf,
+       .se_filter      = hist_entry__dso_filter,
        .se_width_idx   = HISTC_DSO,
 };
 
@@ -246,10 +280,8 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
                        ret += repsep_snprintf(bf + ret, size - ret, "%s", sym->name);
                        ret += repsep_snprintf(bf + ret, size - ret, "+0x%llx",
                                        ip - map->unmap_ip(map, sym->start));
-                       ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
-                                      width - ret, "");
                } else {
-                       ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+                       ret += repsep_snprintf(bf + ret, size - ret, "%.*s",
                                               width - ret,
                                               sym->name);
                }
@@ -257,14 +289,9 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
                size_t len = BITS_PER_LONG / 4;
                ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx",
                                       len, ip);
-               ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
-                                      width - ret, "");
        }
 
-       if (ret > width)
-               bf[width] = '\0';
-
-       return width;
+       return ret;
 }
 
 static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
@@ -274,46 +301,56 @@ static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
                                         he->level, bf, size, width);
 }
 
+static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && (!he->ms.sym || !strstr(he->ms.sym->name, sym));
+}
+
 struct sort_entry sort_sym = {
        .se_header      = "Symbol",
        .se_cmp         = sort__sym_cmp,
        .se_sort        = sort__sym_sort,
        .se_snprintf    = hist_entry__sym_snprintf,
+       .se_filter      = hist_entry__sym_filter,
        .se_width_idx   = HISTC_SYMBOL,
 };
 
 /* --sort srcline */
 
+static char *hist_entry__get_srcline(struct hist_entry *he)
+{
+       struct map *map = he->ms.map;
+
+       if (!map)
+               return SRCLINE_UNKNOWN;
+
+       return get_srcline(map->dso, map__rip_2objdump(map, he->ip),
+                          he->ms.sym, true);
+}
+
 static int64_t
 sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-       if (!left->srcline) {
-               if (!left->ms.map)
-                       left->srcline = SRCLINE_UNKNOWN;
-               else {
-                       struct map *map = left->ms.map;
-                       left->srcline = get_srcline(map->dso,
-                                          map__rip_2objdump(map, left->ip),
-                                                   left->ms.sym, true);
-               }
-       }
-       if (!right->srcline) {
-               if (!right->ms.map)
-                       right->srcline = SRCLINE_UNKNOWN;
-               else {
-                       struct map *map = right->ms.map;
-                       right->srcline = get_srcline(map->dso,
-                                            map__rip_2objdump(map, right->ip),
-                                                    right->ms.sym, true);
-               }
-       }
+       if (!left->srcline)
+               left->srcline = hist_entry__get_srcline(left);
+       if (!right->srcline)
+               right->srcline = hist_entry__get_srcline(right);
+
        return strcmp(right->srcline, left->srcline);
 }
 
 static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
                                        size_t size, unsigned int width)
 {
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcline);
+       if (!he->srcline)
+               he->srcline = hist_entry__get_srcline(he);
+
+       return repsep_snprintf(bf, size, "%-.*s", width, he->srcline);
 }
 
 struct sort_entry sort_srcline = {
@@ -327,11 +364,14 @@ struct sort_entry sort_srcline = {
 
 static char no_srcfile[1];
 
-static char *get_srcfile(struct hist_entry *e)
+static char *hist_entry__get_srcfile(struct hist_entry *e)
 {
        char *sf, *p;
        struct map *map = e->ms.map;
 
+       if (!map)
+               return no_srcfile;
+
        sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
                         e->ms.sym, false, true);
        if (!strcmp(sf, SRCLINE_UNKNOWN))
@@ -348,25 +388,21 @@ static char *get_srcfile(struct hist_entry *e)
 static int64_t
 sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-       if (!left->srcfile) {
-               if (!left->ms.map)
-                       left->srcfile = no_srcfile;
-               else
-                       left->srcfile = get_srcfile(left);
-       }
-       if (!right->srcfile) {
-               if (!right->ms.map)
-                       right->srcfile = no_srcfile;
-               else
-                       right->srcfile = get_srcfile(right);
-       }
+       if (!left->srcfile)
+               left->srcfile = hist_entry__get_srcfile(left);
+       if (!right->srcfile)
+               right->srcfile = hist_entry__get_srcfile(right);
+
        return strcmp(right->srcfile, left->srcfile);
 }
 
 static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
                                        size_t size, unsigned int width)
 {
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcfile);
+       if (!he->srcfile)
+               he->srcfile = hist_entry__get_srcfile(he);
+
+       return repsep_snprintf(bf, size, "%-.*s", width, he->srcfile);
 }
 
 struct sort_entry sort_srcfile = {
@@ -439,10 +475,21 @@ static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf,
        return repsep_snprintf(bf, size, "%*.*d", width, width-3, he->socket);
 }
 
+static int hist_entry__socket_filter(struct hist_entry *he, int type, const void *arg)
+{
+       int sk = *(const int *)arg;
+
+       if (type != HIST_FILTER__SOCKET)
+               return -1;
+
+       return sk >= 0 && he->socket != sk;
+}
+
 struct sort_entry sort_socket = {
        .se_header      = "Socket",
        .se_cmp         = sort__socket_cmp,
        .se_snprintf    = hist_entry__socket_snprintf,
+       .se_filter      = hist_entry__socket_filter,
        .se_width_idx   = HISTC_SOCKET,
 };
 
@@ -483,9 +530,6 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
        if (right->trace_output == NULL)
                right->trace_output = get_trace_output(right);
 
-       hists__new_col_len(left->hists, HISTC_TRACE, strlen(left->trace_output));
-       hists__new_col_len(right->hists, HISTC_TRACE, strlen(right->trace_output));
-
        return strcmp(right->trace_output, left->trace_output);
 }
 
@@ -496,11 +540,11 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
 
        evsel = hists_to_evsel(he->hists);
        if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
-               return scnprintf(bf, size, "%-*.*s", width, width, "N/A");
+               return scnprintf(bf, size, "%-.*s", width, "N/A");
 
        if (he->trace_output == NULL)
                he->trace_output = get_trace_output(he);
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->trace_output);
+       return repsep_snprintf(bf, size, "%-.*s", width, he->trace_output);
 }
 
 struct sort_entry sort_trace = {
@@ -532,6 +576,18 @@ static int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf,
                return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
 }
 
+static int hist_entry__dso_from_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->branch_info || !he->branch_info->from.map ||
+                      he->branch_info->from.map->dso != dso);
+}
+
 static int64_t
 sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -552,6 +608,18 @@ static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf,
                return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
 }
 
+static int hist_entry__dso_to_filter(struct hist_entry *he, int type,
+                                    const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->branch_info || !he->branch_info->to.map ||
+                      he->branch_info->to.map->dso != dso);
+}
+
 static int64_t
 sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -613,10 +681,35 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *he, char *bf,
        return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
 }
 
+static int hist_entry__sym_from_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && !(he->branch_info && he->branch_info->from.sym &&
+                       strstr(he->branch_info->from.sym->name, sym));
+}
+
+static int hist_entry__sym_to_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && !(he->branch_info && he->branch_info->to.sym &&
+                       strstr(he->branch_info->to.sym->name, sym));
+}
+
 struct sort_entry sort_dso_from = {
        .se_header      = "Source Shared Object",
        .se_cmp         = sort__dso_from_cmp,
        .se_snprintf    = hist_entry__dso_from_snprintf,
+       .se_filter      = hist_entry__dso_from_filter,
        .se_width_idx   = HISTC_DSO_FROM,
 };
 
@@ -624,6 +717,7 @@ struct sort_entry sort_dso_to = {
        .se_header      = "Target Shared Object",
        .se_cmp         = sort__dso_to_cmp,
        .se_snprintf    = hist_entry__dso_to_snprintf,
+       .se_filter      = hist_entry__dso_to_filter,
        .se_width_idx   = HISTC_DSO_TO,
 };
 
@@ -631,6 +725,7 @@ struct sort_entry sort_sym_from = {
        .se_header      = "Source Symbol",
        .se_cmp         = sort__sym_from_cmp,
        .se_snprintf    = hist_entry__sym_from_snprintf,
+       .se_filter      = hist_entry__sym_from_filter,
        .se_width_idx   = HISTC_SYMBOL_FROM,
 };
 
@@ -638,6 +733,7 @@ struct sort_entry sort_sym_to = {
        .se_header      = "Target Symbol",
        .se_cmp         = sort__sym_to_cmp,
        .se_snprintf    = hist_entry__sym_to_snprintf,
+       .se_filter      = hist_entry__sym_to_filter,
        .se_width_idx   = HISTC_SYMBOL_TO,
 };
 
@@ -797,20 +893,10 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right)
 static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
-       const char *out;
-       u64 mask = PERF_MEM_LOCK_NA;
+       char out[10];
 
-       if (he->mem_info)
-               mask = he->mem_info->data_src.mem_lock;
-
-       if (mask & PERF_MEM_LOCK_NA)
-               out = "N/A";
-       else if (mask & PERF_MEM_LOCK_LOCKED)
-               out = "Yes";
-       else
-               out = "No";
-
-       return repsep_snprintf(bf, size, "%-*s", width, out);
+       perf_mem__lck_scnprintf(out, sizeof(out), he->mem_info);
+       return repsep_snprintf(bf, size, "%.*s", width, out);
 }
 
 static int64_t
@@ -832,54 +918,12 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
        return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
 }
 
-static const char * const tlb_access[] = {
-       "N/A",
-       "HIT",
-       "MISS",
-       "L1",
-       "L2",
-       "Walker",
-       "Fault",
-};
-#define NUM_TLB_ACCESS (sizeof(tlb_access)/sizeof(const char *))
-
 static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
        char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t l = 0, i;
-       u64 m = PERF_MEM_TLB_NA;
-       u64 hit, miss;
-
-       out[0] = '\0';
-
-       if (he->mem_info)
-               m = he->mem_info->data_src.mem_dtlb;
-
-       hit = m & PERF_MEM_TLB_HIT;
-       miss = m & PERF_MEM_TLB_MISS;
-
-       /* already taken care of */
-       m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS);
-
-       for (i = 0; m && i < NUM_TLB_ACCESS; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, tlb_access[i], sz - l);
-               l += strlen(tlb_access[i]);
-       }
-       if (*out == '\0')
-               strcpy(out, "N/A");
-       if (hit)
-               strncat(out, " hit", sz - l);
-       if (miss)
-               strncat(out, " miss", sz - l);
 
+       perf_mem__tlb_scnprintf(out, sizeof(out), he->mem_info);
        return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
@@ -902,61 +946,12 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right)
        return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
 }
 
-static const char * const mem_lvl[] = {
-       "N/A",
-       "HIT",
-       "MISS",
-       "L1",
-       "LFB",
-       "L2",
-       "L3",
-       "Local RAM",
-       "Remote RAM (1 hop)",
-       "Remote RAM (2 hops)",
-       "Remote Cache (1 hop)",
-       "Remote Cache (2 hops)",
-       "I/O",
-       "Uncached",
-};
-#define NUM_MEM_LVL (sizeof(mem_lvl)/sizeof(const char *))
-
 static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
        char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t i, l = 0;
-       u64 m =  PERF_MEM_LVL_NA;
-       u64 hit, miss;
-
-       if (he->mem_info)
-               m  = he->mem_info->data_src.mem_lvl;
-
-       out[0] = '\0';
-
-       hit = m & PERF_MEM_LVL_HIT;
-       miss = m & PERF_MEM_LVL_MISS;
-
-       /* already taken care of */
-       m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
-
-       for (i = 0; m && i < NUM_MEM_LVL; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, mem_lvl[i], sz - l);
-               l += strlen(mem_lvl[i]);
-       }
-       if (*out == '\0')
-               strcpy(out, "N/A");
-       if (hit)
-               strncat(out, " hit", sz - l);
-       if (miss)
-               strncat(out, " miss", sz - l);
 
+       perf_mem__lvl_scnprintf(out, sizeof(out), he->mem_info);
        return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
@@ -979,51 +974,15 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right)
        return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
 }
 
-static const char * const snoop_access[] = {
-       "N/A",
-       "None",
-       "Miss",
-       "Hit",
-       "HitM",
-};
-#define NUM_SNOOP_ACCESS (sizeof(snoop_access)/sizeof(const char *))
-
 static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
        char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t i, l = 0;
-       u64 m = PERF_MEM_SNOOP_NA;
-
-       out[0] = '\0';
-
-       if (he->mem_info)
-               m = he->mem_info->data_src.mem_snoop;
-
-       for (i = 0; m && i < NUM_SNOOP_ACCESS; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, snoop_access[i], sz - l);
-               l += strlen(snoop_access[i]);
-       }
-
-       if (*out == '\0')
-               strcpy(out, "N/A");
 
+       perf_mem__snp_scnprintf(out, sizeof(out), he->mem_info);
        return repsep_snprintf(bf, size, "%-*s", width, out);
 }
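
The open-coded lock/TLB/level/snoop formatters in this file are being replaced by shared perf_mem__{lck,tlb,lvl,snp}_scnprintf() helpers, presumably so other perf commands can emit the same strings. A minimal sketch of the calling convention as used at these call sites; it assumes the helper declarations are available from perf's internal headers, which are not part of this hunk:

	/* Illustrative only: format the memory-access columns of one hist
	 * entry with the shared helpers; buffer sizes follow the call
	 * sites in this file. */
	static void sketch_mem_columns(struct hist_entry *he, FILE *fp)
	{
		char lck[10], tlb[64], lvl[64], snp[64];

		perf_mem__lck_scnprintf(lck, sizeof(lck), he->mem_info);
		perf_mem__tlb_scnprintf(tlb, sizeof(tlb), he->mem_info);
		perf_mem__lvl_scnprintf(lvl, sizeof(lvl), he->mem_info);
		perf_mem__snp_scnprintf(snp, sizeof(snp), he->mem_info);

		fprintf(fp, "%-10s %-24s %-24s %-10s\n", lck, lvl, tlb, snp);
	}
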
 
-static inline  u64 cl_address(u64 address)
-{
-       /* return the cacheline of the address */
-       return (address & ~(cacheline_size - 1));
-}
-
 static int64_t
 sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -1440,20 +1399,6 @@ struct hpp_sort_entry {
        struct sort_entry *se;
 };
 
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
-{
-       struct hpp_sort_entry *hse_a;
-       struct hpp_sort_entry *hse_b;
-
-       if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b))
-               return false;
-
-       hse_a = container_of(a, struct hpp_sort_entry, hpp);
-       hse_b = container_of(b, struct hpp_sort_entry, hpp);
-
-       return hse_a->se == hse_b->se;
-}
-
 void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists)
 {
        struct hpp_sort_entry *hse;
@@ -1539,8 +1484,56 @@ static int64_t __sort__hpp_sort(struct perf_hpp_fmt *fmt,
        return sort_fn(a, b);
 }
 
+bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format)
+{
+       return format->header == __sort__hpp_header;
+}
+
+#define MK_SORT_ENTRY_CHK(key)                                 \
+bool perf_hpp__is_ ## key ## _entry(struct perf_hpp_fmt *fmt)  \
+{                                                              \
+       struct hpp_sort_entry *hse;                             \
+                                                               \
+       if (!perf_hpp__is_sort_entry(fmt))                      \
+               return false;                                   \
+                                                               \
+       hse = container_of(fmt, struct hpp_sort_entry, hpp);    \
+       return hse->se == &sort_ ## key ;                       \
+}
+
+MK_SORT_ENTRY_CHK(trace)
+MK_SORT_ENTRY_CHK(srcline)
+MK_SORT_ENTRY_CHK(srcfile)
+MK_SORT_ENTRY_CHK(thread)
+MK_SORT_ENTRY_CHK(comm)
+MK_SORT_ENTRY_CHK(dso)
+MK_SORT_ENTRY_CHK(sym)
+
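
For reference, each MK_SORT_ENTRY_CHK() invocation above expands to a per-key predicate of this shape (shown for the sym key):

	bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt)
	{
		struct hpp_sort_entry *hse;

		if (!perf_hpp__is_sort_entry(fmt))
			return false;

		hse = container_of(fmt, struct hpp_sort_entry, hpp);
		return hse->se == &sort_sym;
	}
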
+static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       struct hpp_sort_entry *hse_a;
+       struct hpp_sort_entry *hse_b;
+
+       if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b))
+               return false;
+
+       hse_a = container_of(a, struct hpp_sort_entry, hpp);
+       hse_b = container_of(b, struct hpp_sort_entry, hpp);
+
+       return hse_a->se == hse_b->se;
+}
+
+static void hse_free(struct perf_hpp_fmt *fmt)
+{
+       struct hpp_sort_entry *hse;
+
+       hse = container_of(fmt, struct hpp_sort_entry, hpp);
+       free(hse);
+}
+
 static struct hpp_sort_entry *
-__sort_dimension__alloc_hpp(struct sort_dimension *sd)
+__sort_dimension__alloc_hpp(struct sort_dimension *sd, int level)
 {
        struct hpp_sort_entry *hse;
 
@@ -1560,40 +1553,92 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd)
        hse->hpp.cmp = __sort__hpp_cmp;
        hse->hpp.collapse = __sort__hpp_collapse;
        hse->hpp.sort = __sort__hpp_sort;
+       hse->hpp.equal = __sort__hpp_equal;
+       hse->hpp.free = hse_free;
 
        INIT_LIST_HEAD(&hse->hpp.list);
        INIT_LIST_HEAD(&hse->hpp.sort_list);
        hse->hpp.elide = false;
        hse->hpp.len = 0;
        hse->hpp.user_len = 0;
+       hse->hpp.level = level;
 
        return hse;
 }
 
-bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format)
+static void hpp_free(struct perf_hpp_fmt *fmt)
 {
-       return format->header == __sort__hpp_header;
+       free(fmt);
+}
+
+static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd,
+                                                      int level)
+{
+       struct perf_hpp_fmt *fmt;
+
+       fmt = memdup(hd->fmt, sizeof(*fmt));
+       if (fmt) {
+               INIT_LIST_HEAD(&fmt->list);
+               INIT_LIST_HEAD(&fmt->sort_list);
+               fmt->free = hpp_free;
+               fmt->level = level;
+       }
+
+       return fmt;
+}
+
+int hist_entry__filter(struct hist_entry *he, int type, const void *arg)
+{
+       struct perf_hpp_fmt *fmt;
+       struct hpp_sort_entry *hse;
+       int ret = -1;
+       int r;
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               if (!perf_hpp__is_sort_entry(fmt))
+                       continue;
+
+               hse = container_of(fmt, struct hpp_sort_entry, hpp);
+               if (hse->se->se_filter == NULL)
+                       continue;
+
+       /*
+        * The hist entry is filtered if any sort key in the hpp list
+        * applies a filter to it; sort keys that do not handle this
+        * filter type are skipped.
+        */
+               r = hse->se->se_filter(he, type, arg);
+               if (r >= 0) {
+                       if (ret < 0)
+                               ret = 0;
+                       ret |= r;
+               }
+       }
+
+       return ret;
 }
 
-static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd)
+static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd,
+                                         struct perf_hpp_list *list,
+                                         int level)
 {
-       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd);
+       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, level);
 
        if (hse == NULL)
                return -1;
 
-       perf_hpp__register_sort_field(&hse->hpp);
+       perf_hpp_list__register_sort_field(list, &hse->hpp);
        return 0;
 }
 
-static int __sort_dimension__add_hpp_output(struct sort_dimension *sd)
+static int __sort_dimension__add_hpp_output(struct sort_dimension *sd,
+                                           struct perf_hpp_list *list)
 {
-       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd);
+       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, 0);
 
        if (hse == NULL)
                return -1;
 
-       perf_hpp__column_register(&hse->hpp);
+       perf_hpp_list__column_register(list, &hse->hpp);
        return 0;
 }
 
@@ -1727,6 +1772,9 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
        if (hde->raw_trace)
                goto raw_field;
 
+       if (!he->trace_output)
+               he->trace_output = get_trace_output(he);
+
        field = hde->field;
        namelen = strlen(field->name);
        str = he->trace_output;
@@ -1776,6 +1824,11 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
 
        hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
 
+       if (b == NULL) {
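+               /*
+                * A NULL @b means "measure only": account @a's field for the
+                * dynamic column width and report the entries as equal.
+                */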
+               update_dynamic_len(hde, a);
+               return 0;
+       }
+
        field = hde->field;
        if (field->flags & FIELD_IS_DYNAMIC) {
                unsigned long long dyn;
@@ -1790,9 +1843,6 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
        } else {
                offset = field->offset;
                size = field->size;
-
-               update_dynamic_len(hde, a);
-               update_dynamic_len(hde, b);
        }
 
        return memcmp(a->raw_data + offset, b->raw_data + offset, size);
@@ -1803,8 +1853,31 @@ bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt)
        return fmt->cmp == __sort__hde_cmp;
 }
 
+static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       struct hpp_dynamic_entry *hde_a;
+       struct hpp_dynamic_entry *hde_b;
+
+       if (!perf_hpp__is_dynamic_entry(a) || !perf_hpp__is_dynamic_entry(b))
+               return false;
+
+       hde_a = container_of(a, struct hpp_dynamic_entry, hpp);
+       hde_b = container_of(b, struct hpp_dynamic_entry, hpp);
+
+       return hde_a->field == hde_b->field;
+}
+
+static void hde_free(struct perf_hpp_fmt *fmt)
+{
+       struct hpp_dynamic_entry *hde;
+
+       hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+       free(hde);
+}
+
 static struct hpp_dynamic_entry *
-__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field)
+__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field,
+                     int level)
 {
        struct hpp_dynamic_entry *hde;
 
@@ -1827,16 +1900,47 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field)
        hde->hpp.cmp = __sort__hde_cmp;
        hde->hpp.collapse = __sort__hde_cmp;
        hde->hpp.sort = __sort__hde_cmp;
+       hde->hpp.equal = __sort__hde_equal;
+       hde->hpp.free = hde_free;
 
        INIT_LIST_HEAD(&hde->hpp.list);
        INIT_LIST_HEAD(&hde->hpp.sort_list);
        hde->hpp.elide = false;
        hde->hpp.len = 0;
        hde->hpp.user_len = 0;
+       hde->hpp.level = level;
 
        return hde;
 }
 
+struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt)
+{
+       struct perf_hpp_fmt *new_fmt = NULL;
+
+       if (perf_hpp__is_sort_entry(fmt)) {
+               struct hpp_sort_entry *hse, *new_hse;
+
+               hse = container_of(fmt, struct hpp_sort_entry, hpp);
+               new_hse = memdup(hse, sizeof(*hse));
+               if (new_hse)
+                       new_fmt = &new_hse->hpp;
+       } else if (perf_hpp__is_dynamic_entry(fmt)) {
+               struct hpp_dynamic_entry *hde, *new_hde;
+
+               hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+               new_hde = memdup(hde, sizeof(*hde));
+               if (new_hde)
+                       new_fmt = &new_hde->hpp;
+       } else {
+               new_fmt = memdup(fmt, sizeof(*fmt));
+       }
+
+       if (new_fmt) {
+               INIT_LIST_HEAD(&new_fmt->list);
+               INIT_LIST_HEAD(&new_fmt->sort_list);
+       }
+
+       return new_fmt;
+}
+
 static int parse_field_name(char *str, char **event, char **field, char **opt)
 {
        char *event_name, *field_name, *opt_name;
@@ -1908,11 +2012,11 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam
 
 static int __dynamic_dimension__add(struct perf_evsel *evsel,
                                    struct format_field *field,
-                                   bool raw_trace)
+                                   bool raw_trace, int level)
 {
        struct hpp_dynamic_entry *hde;
 
-       hde = __alloc_dynamic_entry(evsel, field);
+       hde = __alloc_dynamic_entry(evsel, field, level);
        if (hde == NULL)
                return -ENOMEM;
 
@@ -1922,14 +2026,14 @@ static int __dynamic_dimension__add(struct perf_evsel *evsel,
        return 0;
 }
 
-static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace)
+static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace, int level)
 {
        int ret;
        struct format_field *field;
 
        field = evsel->tp_format->format.fields;
        while (field) {
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
                if (ret < 0)
                        return ret;
 
@@ -1938,7 +2042,8 @@ static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace)
        return 0;
 }
 
-static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
+static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace,
+                                 int level)
 {
        int ret;
        struct perf_evsel *evsel;
@@ -1947,7 +2052,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
                if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
                        continue;
 
-               ret = add_evsel_fields(evsel, raw_trace);
+               ret = add_evsel_fields(evsel, raw_trace, level);
                if (ret < 0)
                        return ret;
        }
@@ -1955,7 +2060,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
 }
 
 static int add_all_matching_fields(struct perf_evlist *evlist,
-                                  char *field_name, bool raw_trace)
+                                  char *field_name, bool raw_trace, int level)
 {
        int ret = -ESRCH;
        struct perf_evsel *evsel;
@@ -1969,14 +2074,15 @@ static int add_all_matching_fields(struct perf_evlist *evlist,
                if (field == NULL)
                        continue;
 
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
                if (ret < 0)
                        break;
        }
        return ret;
 }
 
-static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
+static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok,
+                            int level)
 {
        char *str, *event_name, *field_name, *opt_name;
        struct perf_evsel *evsel;
@@ -2006,12 +2112,12 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
        }
 
        if (!strcmp(field_name, "trace_fields")) {
-               ret = add_all_dynamic_fields(evlist, raw_trace);
+               ret = add_all_dynamic_fields(evlist, raw_trace, level);
                goto out;
        }
 
        if (event_name == NULL) {
-               ret = add_all_matching_fields(evlist, field_name, raw_trace);
+               ret = add_all_matching_fields(evlist, field_name, raw_trace, level);
                goto out;
        }
 
@@ -2029,7 +2135,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
        }
 
        if (!strcmp(field_name, "*")) {
-               ret = add_evsel_fields(evsel, raw_trace);
+               ret = add_evsel_fields(evsel, raw_trace, level);
        } else {
                field = pevent_find_any_field(evsel->tp_format, field_name);
                if (field == NULL) {
@@ -2038,7 +2144,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
                        return -ENOENT;
                }
 
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
        }
 
 out:
@@ -2046,12 +2152,14 @@ out:
        return ret;
 }
 
-static int __sort_dimension__add(struct sort_dimension *sd)
+static int __sort_dimension__add(struct sort_dimension *sd,
+                                struct perf_hpp_list *list,
+                                int level)
 {
        if (sd->taken)
                return 0;
 
-       if (__sort_dimension__add_hpp_sort(sd) < 0)
+       if (__sort_dimension__add_hpp_sort(sd, list, level) < 0)
                return -1;
 
        if (sd->entry->se_collapse)
@@ -2062,46 +2170,63 @@ static int __sort_dimension__add(struct sort_dimension *sd)
        return 0;
 }
 
-static int __hpp_dimension__add(struct hpp_dimension *hd)
+static int __hpp_dimension__add(struct hpp_dimension *hd,
+                               struct perf_hpp_list *list,
+                               int level)
 {
-       if (!hd->taken) {
-               hd->taken = 1;
+       struct perf_hpp_fmt *fmt;
 
-               perf_hpp__register_sort_field(hd->fmt);
-       }
+       if (hd->taken)
+               return 0;
+
+       fmt = __hpp_dimension__alloc_hpp(hd, level);
+       if (!fmt)
+               return -1;
+
+       hd->taken = 1;
+       perf_hpp_list__register_sort_field(list, fmt);
        return 0;
 }
 
-static int __sort_dimension__add_output(struct sort_dimension *sd)
+static int __sort_dimension__add_output(struct perf_hpp_list *list,
+                                       struct sort_dimension *sd)
 {
        if (sd->taken)
                return 0;
 
-       if (__sort_dimension__add_hpp_output(sd) < 0)
+       if (__sort_dimension__add_hpp_output(sd, list) < 0)
                return -1;
 
        sd->taken = 1;
        return 0;
 }
 
-static int __hpp_dimension__add_output(struct hpp_dimension *hd)
+static int __hpp_dimension__add_output(struct perf_hpp_list *list,
+                                      struct hpp_dimension *hd)
 {
-       if (!hd->taken) {
-               hd->taken = 1;
+       struct perf_hpp_fmt *fmt;
 
-               perf_hpp__column_register(hd->fmt);
-       }
+       if (hd->taken)
+               return 0;
+
+       fmt = __hpp_dimension__alloc_hpp(hd, 0);
+       if (!fmt)
+               return -1;
+
+       hd->taken = 1;
+       perf_hpp_list__column_register(list, fmt);
        return 0;
 }
 
 int hpp_dimension__add_output(unsigned col)
 {
        BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       return __hpp_dimension__add_output(&hpp_sort_dimensions[col]);
+       return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
 }
 
-static int sort_dimension__add(const char *tok,
-                              struct perf_evlist *evlist __maybe_unused)
+static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
+                              struct perf_evlist *evlist __maybe_unused,
+                              int level)
 {
        unsigned int i;
 
@@ -2136,9 +2261,13 @@ static int sort_dimension__add(const char *tok,
                        sort__has_dso = 1;
                } else if (sd->entry == &sort_socket) {
                        sort__has_socket = 1;
+               } else if (sd->entry == &sort_thread) {
+                       sort__has_thread = 1;
+               } else if (sd->entry == &sort_comm) {
+                       sort__has_comm = 1;
                }
 
-               return __sort_dimension__add(sd);
+               return __sort_dimension__add(sd, list, level);
        }
 
        for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) {
@@ -2147,7 +2276,7 @@ static int sort_dimension__add(const char *tok,
                if (strncasecmp(tok, hd->name, strlen(tok)))
                        continue;
 
-               return __hpp_dimension__add(hd);
+               return __hpp_dimension__add(hd, list, level);
        }
 
        for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
@@ -2162,7 +2291,7 @@ static int sort_dimension__add(const char *tok,
                if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
                        sort__has_sym = 1;
 
-               __sort_dimension__add(sd);
+               __sort_dimension__add(sd, list, level);
                return 0;
        }
 
@@ -2178,16 +2307,60 @@ static int sort_dimension__add(const char *tok,
                if (sd->entry == &sort_mem_daddr_sym)
                        sort__has_sym = 1;
 
-               __sort_dimension__add(sd);
+               __sort_dimension__add(sd, list, level);
                return 0;
        }
 
-       if (!add_dynamic_entry(evlist, tok))
+       if (!add_dynamic_entry(evlist, tok, level))
                return 0;
 
        return -ESRCH;
 }
 
+static int setup_sort_list(struct perf_hpp_list *list, char *str,
+                          struct perf_evlist *evlist)
+{
+       char *tmp, *tok;
+       int ret = 0;
+       int level = 0;
+       int next_level = 1;
+       bool in_group = false;
+
+       do {
+               tok = str;
+               tmp = strpbrk(str, "{}, ");
+               if (tmp) {
+                       if (in_group)
+                               next_level = level;
+                       else
+                               next_level = level + 1;
+
+                       if (*tmp == '{')
+                               in_group = true;
+                       else if (*tmp == '}')
+                               in_group = false;
+
+                       *tmp = '\0';
+                       str = tmp + 1;
+               }
+
+               if (*tok) {
+                       ret = sort_dimension__add(list, tok, evlist, level);
+                       if (ret == -EINVAL) {
+                               error("Invalid --sort key: `%s'", tok);
+                               break;
+                       } else if (ret == -ESRCH) {
+                               error("Unknown --sort key: `%s'", tok);
+                               break;
+                       }
+               }
+
+               level = next_level;
+       } while (tmp);
+
+       return ret;
+}
+
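
setup_sort_list() assigns each sort key a hierarchy level: keys grouped inside braces share one level, and every other separator starts a new one (only the grouping matters, not the absolute numbers). A self-contained illustration of the same strpbrk() walk on a sample key string; it prints level 0 for comm, level 2 for both dso and sym, and level 3 for cpu:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[] = "comm,{dso,sym},cpu";
		char *str = buf, *tok, *tmp;
		int level = 0, next_level = 1;
		int in_group = 0;

		do {
			tok = str;
			tmp = strpbrk(str, "{}, ");
			if (tmp) {
				next_level = in_group ? level : level + 1;
				if (*tmp == '{')
					in_group = 1;
				else if (*tmp == '}')
					in_group = 0;
				*tmp = '\0';
				str = tmp + 1;
			}
			if (*tok)
				printf("%-4s -> level %d\n", tok, level);
			level = next_level;
		} while (tmp);

		return 0;
	}
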
 static const char *get_default_sort_order(struct perf_evlist *evlist)
 {
        const char *default_sort_orders[] = {
@@ -2282,7 +2455,7 @@ static char *setup_overhead(char *keys)
 
 static int __setup_sorting(struct perf_evlist *evlist)
 {
-       char *tmp, *tok, *str;
+       char *str;
        const char *sort_keys;
        int ret = 0;
 
@@ -2320,17 +2493,7 @@ static int __setup_sorting(struct perf_evlist *evlist)
                }
        }
 
-       for (tok = strtok_r(str, ", ", &tmp);
-                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
-               ret = sort_dimension__add(tok, evlist);
-               if (ret == -EINVAL) {
-                       error("Invalid --sort key: `%s'", tok);
-                       break;
-               } else if (ret == -ESRCH) {
-                       error("Unknown --sort key: `%s'", tok);
-                       break;
-               }
-       }
+       ret = setup_sort_list(&perf_hpp_list, str, evlist);
 
        free(str);
        return ret;
@@ -2341,7 +2504,7 @@ void perf_hpp__set_elide(int idx, bool elide)
        struct perf_hpp_fmt *fmt;
        struct hpp_sort_entry *hse;
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2401,7 +2564,7 @@ void sort__setup_elide(FILE *output)
        struct perf_hpp_fmt *fmt;
        struct hpp_sort_entry *hse;
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2413,7 +2576,7 @@ void sort__setup_elide(FILE *output)
         * It makes no sense to elide all of sort entries.
         * Just revert them to show up again.
         */
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2421,7 +2584,7 @@ void sort__setup_elide(FILE *output)
                        return;
        }
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2429,7 +2592,7 @@ void sort__setup_elide(FILE *output)
        }
 }
 
-static int output_field_add(char *tok)
+static int output_field_add(struct perf_hpp_list *list, char *tok)
 {
        unsigned int i;
 
@@ -2439,7 +2602,7 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
        }
 
        for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) {
@@ -2448,7 +2611,7 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, hd->name, strlen(tok)))
                        continue;
 
-               return __hpp_dimension__add_output(hd);
+               return __hpp_dimension__add_output(list, hd);
        }
 
        for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
@@ -2457,7 +2620,7 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
        }
 
        for (i = 0; i < ARRAY_SIZE(memory_sort_dimensions); i++) {
@@ -2466,12 +2629,32 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
        }
 
        return -ESRCH;
 }
 
+static int setup_output_list(struct perf_hpp_list *list, char *str)
+{
+       char *tmp, *tok;
+       int ret = 0;
+
+       for (tok = strtok_r(str, ", ", &tmp);
+                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
+               ret = output_field_add(list, tok);
+               if (ret == -EINVAL) {
+                       error("Invalid --fields key: `%s'", tok);
+                       break;
+               } else if (ret == -ESRCH) {
+                       error("Unknown --fields key: `%s'", tok);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
 static void reset_dimensions(void)
 {
        unsigned int i;
@@ -2496,7 +2679,7 @@ bool is_strict_order(const char *order)
 
 static int __setup_output_field(void)
 {
-       char *tmp, *tok, *str, *strp;
+       char *str, *strp;
        int ret = -EINVAL;
 
        if (field_order == NULL)
@@ -2516,17 +2699,7 @@ static int __setup_output_field(void)
                goto out;
        }
 
-       for (tok = strtok_r(strp, ", ", &tmp);
-                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
-               ret = output_field_add(tok);
-               if (ret == -EINVAL) {
-                       error("Invalid --fields key: `%s'", tok);
-                       break;
-               } else if (ret == -ESRCH) {
-                       error("Unknown --fields key: `%s'", tok);
-                       break;
-               }
-       }
+       ret = setup_output_list(&perf_hpp_list, strp);
 
 out:
        free(str);
@@ -2542,7 +2715,7 @@ int setup_sorting(struct perf_evlist *evlist)
                return err;
 
        if (parent_pattern != default_parent_pattern) {
-               err = sort_dimension__add("parent", evlist);
+               err = sort_dimension__add(&perf_hpp_list, "parent", evlist, -1);
                if (err < 0)
                        return err;
        }
@@ -2560,9 +2733,13 @@ int setup_sorting(struct perf_evlist *evlist)
                return err;
 
        /* copy sort keys to output fields */
-       perf_hpp__setup_output_field();
+       perf_hpp__setup_output_field(&perf_hpp_list);
        /* and then copy output fields to sort keys */
-       perf_hpp__append_sort_keys();
+       perf_hpp__append_sort_keys(&perf_hpp_list);
+
+       /* setup hists-specific output fields */
+       if (perf_hpp__setup_hists_formats(&perf_hpp_list, evlist) < 0)
+               return -1;
 
        return 0;
 }
@@ -2578,5 +2755,5 @@ void reset_output_field(void)
        sort_order = NULL;
 
        reset_dimensions();
-       perf_hpp__reset_output_field();
+       perf_hpp__reset_output_field(&perf_hpp_list);
 }
index 687bbb1244281ba65a99c3cb78f5bcb8a1eaa41b..3f4e359981192ac50b56e770aa472749666e1f98 100644
@@ -32,9 +32,12 @@ extern const char default_sort_order[];
 extern regex_t ignore_callees_regex;
 extern int have_ignore_callees;
 extern int sort__need_collapse;
+extern int sort__has_dso;
 extern int sort__has_parent;
 extern int sort__has_sym;
 extern int sort__has_socket;
+extern int sort__has_thread;
+extern int sort__has_comm;
 extern enum sort_mode sort__mode;
 extern struct sort_entry sort_comm;
 extern struct sort_entry sort_dso;
@@ -94,9 +97,11 @@ struct hist_entry {
        s32                     socket;
        s32                     cpu;
        u8                      cpumode;
+       u8                      depth;
 
        /* We are added by hists__add_dummy_entry. */
        bool                    dummy;
+       bool                    leaf;
 
        char                    level;
        u8                      filtered;
@@ -113,18 +118,28 @@ struct hist_entry {
                        bool    init_have_children;
                        bool    unfolded;
                        bool    has_children;
+                       bool    has_no_entry;
                };
        };
        char                    *srcline;
        char                    *srcfile;
        struct symbol           *parent;
-       struct rb_root          sorted_chain;
        struct branch_info      *branch_info;
        struct hists            *hists;
        struct mem_info         *mem_info;
        void                    *raw_data;
        u32                     raw_size;
        void                    *trace_output;
+       struct perf_hpp_list    *hpp_list;
+       struct hist_entry       *parent_he;
+       union {
+               /* this is for hierarchical entry structure */
+               struct {
+                       struct rb_root  hroot_in;
+                       struct rb_root  hroot_out;
+               };                              /* non-leaf entries */
+               struct rb_root  sorted_chain;   /* leaf entry has callchains */
+       };
        struct callchain_root   callchain[0]; /* must be last member */
 };
 
@@ -160,6 +175,17 @@ static inline float hist_entry__get_percent_limit(struct hist_entry *he)
        return period * 100.0 / total_period;
 }
 
+static inline u64 cl_address(u64 address)
+{
+       /* return the cacheline of the address */
+       return (address & ~(cacheline_size - 1));
+}
+
+static inline u64 cl_offset(u64 address)
+{
+       /* return the offset of the address within its cacheline */
+       return (address & (cacheline_size - 1));
+}
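
cl_address() masks an address down to the start of its cacheline and cl_offset() keeps the remainder. A quick self-contained check, assuming a 64-byte line (perf determines cacheline_size at runtime):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t cacheline_size = 64;
		uint64_t addr = 0x1234;

		/* prints: line 0x1200 offset 0x34 */
		printf("line 0x%llx offset 0x%llx\n",
		       (unsigned long long)(addr & ~(cacheline_size - 1)),
		       (unsigned long long)(addr & (cacheline_size - 1)));
		return 0;
	}
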
 
 enum sort_mode {
        SORT_MODE__NORMAL,
@@ -221,6 +247,7 @@ struct sort_entry {
        int64_t (*se_sort)(struct hist_entry *, struct hist_entry *);
        int     (*se_snprintf)(struct hist_entry *he, char *bf, size_t size,
                               unsigned int width);
+       int     (*se_filter)(struct hist_entry *he, int type, const void *arg);
        u8      se_width_idx;
 };
 
index 6ac03146889d29be60707efd6e04c27c919f64de..b33ffb2af2cf5900378fb9fb8ef60e9eddaaf9db 100644
@@ -2,6 +2,7 @@
 #include "evsel.h"
 #include "stat.h"
 #include "color.h"
+#include "pmu.h"
 
 enum {
        CTX_BIT_USER    = 1 << 0,
@@ -14,6 +15,13 @@ enum {
 
 #define NUM_CTX CTX_BIT_MAX
 
+/*
+ * AGGR_GLOBAL: Use CPU 0
+ * AGGR_SOCKET: Use first CPU of socket
+ * AGGR_CORE: Use first CPU of core
+ * AGGR_NONE: Use matching CPU
+ * AGGR_THREAD: Not supported?
+ */
 static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
 static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
@@ -28,9 +36,15 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+static bool have_frontend_stalled;
 
 struct stats walltime_nsecs_stats;
 
+void perf_stat__init_shadow_stats(void)
+{
+       have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
+}
+
 static int evsel_context(struct perf_evsel *evsel)
 {
        int ctx = 0;
@@ -137,9 +151,10 @@ static const char *get_ratio_color(enum grc_type type, double ratio)
        return color;
 }
 
-static void print_stalled_cycles_frontend(FILE *out, int cpu,
+static void print_stalled_cycles_frontend(int cpu,
                                          struct perf_evsel *evsel
-                                         __maybe_unused, double avg)
+                                         __maybe_unused, double avg,
+                                         struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -152,14 +167,17 @@ static void print_stalled_cycles_frontend(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " frontend cycles idle   ");
+       if (ratio)
+               out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
+                                 ratio);
+       else
+               out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
 }
 
-static void print_stalled_cycles_backend(FILE *out, int cpu,
+static void print_stalled_cycles_backend(int cpu,
                                         struct perf_evsel *evsel
-                                        __maybe_unused, double avg)
+                                        __maybe_unused, double avg,
+                                        struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -172,14 +190,13 @@ static void print_stalled_cycles_backend(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " backend  cycles idle   ");
+       out->print_metric(out->ctx, color, "%6.2f%%", "backend cycles idle", ratio);
 }
 
-static void print_branch_misses(FILE *out, int cpu,
+static void print_branch_misses(int cpu,
                                struct perf_evsel *evsel __maybe_unused,
-                               double avg)
+                               double avg,
+                               struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -192,14 +209,13 @@ static void print_branch_misses(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all branches        ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
 }
 
-static void print_l1_dcache_misses(FILE *out, int cpu,
+static void print_l1_dcache_misses(int cpu,
                                   struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
+                                  double avg,
+                                  struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -212,14 +228,13 @@ static void print_l1_dcache_misses(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all L1-dcache hits  ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
 }
 
-static void print_l1_icache_misses(FILE *out, int cpu,
+static void print_l1_icache_misses(int cpu,
                                   struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
+                                  double avg,
+                                  struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -231,15 +246,13 @@ static void print_l1_icache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all L1-icache hits  ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
 }
 
-static void print_dtlb_cache_misses(FILE *out, int cpu,
+static void print_dtlb_cache_misses(int cpu,
                                    struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
+                                   double avg,
+                                   struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -251,15 +264,13 @@ static void print_dtlb_cache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all dTLB cache hits ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
 }
 
-static void print_itlb_cache_misses(FILE *out, int cpu,
+static void print_itlb_cache_misses(int cpu,
                                    struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
+                                   double avg,
+                                   struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -271,15 +282,13 @@ static void print_itlb_cache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all iTLB cache hits ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
 }
 
-static void print_ll_cache_misses(FILE *out, int cpu,
+static void print_ll_cache_misses(int cpu,
                                  struct perf_evsel *evsel __maybe_unused,
-                                 double avg)
+                                 double avg,
+                                 struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -291,15 +300,15 @@ static void print_ll_cache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all LL-cache hits   ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
 }
 
-void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
-                                  double avg, int cpu, enum aggr_mode aggr)
+void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+                                  double avg, int cpu,
+                                  struct perf_stat_output_ctx *out)
 {
+       void *ctxp = out->ctx;
+       print_metric_t print_metric = out->print_metric;
        double total, ratio = 0.0, total2;
        int ctx = evsel_context(evsel);
 
@@ -307,119 +316,145 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
                total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                if (total) {
                        ratio = avg / total;
-                       fprintf(out, " #   %5.2f  insns per cycle        ", ratio);
+                       print_metric(ctxp, NULL, "%7.2f ",
+                                       "insn per cycle", ratio);
                } else {
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
                }
                total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
                total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
 
                if (total && avg) {
+                       out->new_line(ctxp);
                        ratio = total / avg;
-                       fprintf(out, "\n");
-                       if (aggr == AGGR_NONE)
-                               fprintf(out, "        ");
-                       fprintf(out, "                                                  #   %5.2f  stalled cycles per insn", ratio);
+                       print_metric(ctxp, NULL, "%7.2f ",
+                                       "stalled cycles per insn",
+                                       ratio);
+               } else if (have_frontend_stalled) {
+                       print_metric(ctxp, NULL, NULL,
+                                    "stalled cycles per insn", 0);
                }
-
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
-                       runtime_branches_stats[ctx][cpu].n != 0) {
-               print_branch_misses(out, cpu, evsel, avg);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
+               if (runtime_branches_stats[ctx][cpu].n != 0)
+                       print_branch_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all branches", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_dcache_stats[ctx][cpu].n != 0) {
-               print_l1_dcache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
+                       print_l1_dcache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_icache_stats[ctx][cpu].n != 0) {
-               print_l1_icache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_l1_icache_stats[ctx][cpu].n != 0)
+                       print_l1_icache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
-               print_dtlb_cache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
+                       print_dtlb_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_itlb_cache_stats[ctx][cpu].n != 0) {
-               print_itlb_cache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
+                       print_itlb_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_ll_cache_stats[ctx][cpu].n != 0) {
-               print_ll_cache_misses(out, cpu, evsel, avg);
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
-                       runtime_cacherefs_stats[ctx][cpu].n != 0) {
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_ll_cache_stats[ctx][cpu].n != 0)
+                       print_ll_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
                total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
 
                if (total)
                        ratio = avg * 100 / total;
 
-               fprintf(out, " # %8.3f %% of all cache refs    ", ratio);
-
+               if (runtime_cacherefs_stats[ctx][cpu].n != 0)
+                       print_metric(ctxp, NULL, "%8.3f %%",
+                                    "of all cache refs", ratio);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-               print_stalled_cycles_frontend(out, cpu, evsel, avg);
+               print_stalled_cycles_frontend(cpu, evsel, avg, out);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-               print_stalled_cycles_backend(out, cpu, evsel, avg);
+               print_stalled_cycles_backend(cpu, evsel, avg, out);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
                total = avg_stats(&runtime_nsecs_stats[cpu]);
 
                if (total) {
                        ratio = avg / total;
-                       fprintf(out, " # %8.3f GHz                    ", ratio);
+                       print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
                } else {
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "GHz", 0);
                }
        } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
                total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                if (total)
-                       fprintf(out,
-                               " #   %5.2f%% transactional cycles   ",
-                               100.0 * (avg / total));
+                       print_metric(ctxp, NULL,
+                                       "%7.2f%%", "transactional cycles",
+                                       100.0 * (avg / total));
+               else
+                       print_metric(ctxp, NULL, NULL, "transactional cycles",
+                                    0);
        } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
                total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
                if (total2 < avg)
                        total2 = avg;
                if (total)
-                       fprintf(out,
-                               " #   %5.2f%% aborted cycles         ",
+                       print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
                                100.0 * ((total2-avg) / total));
-       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START) &&
-                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               else
+                       print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
+       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
                total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
                if (avg)
                        ratio = total / avg;
 
-               fprintf(out, " # %8.0f cycles / transaction   ", ratio);
-       } else if (perf_stat_evsel__is(evsel, ELISION_START) &&
-                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
+                       print_metric(ctxp, NULL, "%8.0f",
+                                    "cycles / transaction", ratio);
+               else
+                       print_metric(ctxp, NULL, NULL, "cycles / transaction",
+                                    0);
+       } else if (perf_stat_evsel__is(evsel, ELISION_START)) {
                total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
                if (avg)
                        ratio = total / avg;
 
-               fprintf(out, " # %8.0f cycles / elision       ", ratio);
+               print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
        } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) {
                if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
-                       fprintf(out, " # %8.3f CPUs utilized          ", avg / ratio);
+                       print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
+                                    avg / ratio);
                else
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
        } else if (runtime_nsecs_stats[cpu].n != 0) {
                char unit = 'M';
+               char unit_buf[10];
 
                total = avg_stats(&runtime_nsecs_stats[cpu]);
 
@@ -429,9 +464,9 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
                        ratio *= 1000;
                        unit = 'K';
                }
-
-               fprintf(out, " # %8.3f %c/sec                  ", ratio, unit);
+               snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
+               print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
        } else {
-               fprintf(out, "                                   ");
+               print_metric(ctxp, NULL, NULL, NULL, 0);
        }
 }
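
After this change perf_stat__print_shadow_stats() no longer writes to a FILE * itself: every derived metric goes through a caller-supplied callback carried in struct perf_stat_output_ctx (declared in the stat.h hunk further below), and a NULL format string means "metric not available, print an empty placeholder". A self-contained sketch of one possible callback; parameter names follow how the call sites above use them, and this is not perf's actual stdio or CSV printer:

	#include <stdio.h>

	typedef void (*print_metric_t)(void *ctx, const char *color,
				       const char *fmt, const char *unit,
				       double val);
	typedef void (*new_line_t)(void *ctx);

	struct perf_stat_output_ctx {
		void *ctx;
		print_metric_t print_metric;
		new_line_t new_line;
	};

	static void print_metric_stdio(void *ctx, const char *color,
				       const char *fmt, const char *unit,
				       double val)
	{
		FILE *out = ctx;

		(void)color;		/* color handling omitted in this sketch */
		if (!fmt) {		/* metric not available */
			fprintf(out, " #  %-32s", "");
			return;
		}
		fprintf(out, " #  ");
		fprintf(out, fmt, val);
		fprintf(out, " %s", unit);
	}

	static void new_line_stdio(void *ctx)
	{
		fputc('\n', ctx);
	}

	int main(void)
	{
		struct perf_stat_output_ctx out = {
			.ctx		= stdout,
			.print_metric	= print_metric_stdio,
			.new_line	= new_line_stdio,
		};

		out.print_metric(out.ctx, NULL, "%7.2f", "insn per cycle", 1.23);
		out.new_line(out.ctx);
		out.print_metric(out.ctx, NULL, NULL, "insn per cycle", 0);
		out.new_line(out.ctx);
		return 0;
	}
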
index afb0c45eba34ba8db7a6cb6d258326f545f7aca1..4d9b481cf3b6edbb7d6161cbd238709241dedc5b 100644
@@ -97,7 +97,7 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel)
        }
 }
 
-void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
+static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
 {
        int i;
        struct perf_stat_evsel *ps = evsel->priv;
@@ -108,7 +108,7 @@ void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
        perf_stat_evsel_id_init(evsel);
 }
 
-int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
 {
        evsel->priv = zalloc(sizeof(struct perf_stat_evsel));
        if (evsel->priv == NULL)
@@ -117,13 +117,13 @@ int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
        return 0;
 }
 
-void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
 {
        zfree(&evsel->priv);
 }
 
-int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
-                                     int ncpus, int nthreads)
+static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
+                                            int ncpus, int nthreads)
 {
        struct perf_counts *counts;
 
@@ -134,13 +134,13 @@ int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
        return counts ? 0 : -ENOMEM;
 }
 
-void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
+static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
 {
        perf_counts__delete(evsel->prev_raw_counts);
        evsel->prev_raw_counts = NULL;
 }
 
-int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw)
+static int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw)
 {
        int ncpus = perf_evsel__nr_cpus(evsel);
        int nthreads = thread_map__nr(evsel->threads);
index 086f4e128d6351f0e0de862c2ebcc44801cbc2f8..0150e786ccc7c1f48e407323b606f47155a076c9 100644
@@ -68,21 +68,23 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel);
 
 extern struct stats walltime_nsecs_stats;
 
+typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit,
+                              const char *fmt, double val);
+typedef void (*new_line_t )(void *ctx);
+
+void perf_stat__init_shadow_stats(void);
 void perf_stat__reset_shadow_stats(void);
 void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
                                    int cpu);
-void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
-                                  double avg, int cpu, enum aggr_mode aggr);
-
-void perf_evsel__reset_stat_priv(struct perf_evsel *evsel);
-int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel);
-void perf_evsel__free_stat_priv(struct perf_evsel *evsel);
-
-int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
-                                     int ncpus, int nthreads);
-void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel);
+struct perf_stat_output_ctx {
+       void *ctx;
+       print_metric_t print_metric;
+       new_line_t new_line;
+};
 
-int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw);
+void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+                                  double avg, int cpu,
+                                  struct perf_stat_output_ctx *out);
 
 int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
 void perf_evlist__free_stats(struct perf_evlist *evlist);
index 25671fa166188413c66758a978082e89a1ba08d2..d3d279275432ee663641a1a6dbb0a0cf8d3b8334 100644
@@ -51,30 +51,6 @@ void strbuf_grow(struct strbuf *sb, size_t extra)
        ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
 }
 
-static void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
-                                  const void *data, size_t dlen)
-{
-       if (pos + len < pos)
-               die("you want to use way too much memory");
-       if (pos > sb->len)
-               die("`pos' is too far after the end of the buffer");
-       if (pos + len > sb->len)
-               die("`pos + len' is too far after the end of the buffer");
-
-       if (dlen >= len)
-               strbuf_grow(sb, dlen - len);
-       memmove(sb->buf + pos + dlen,
-                       sb->buf + pos + len,
-                       sb->len - pos - len);
-       memcpy(sb->buf + pos, data, dlen);
-       strbuf_setlen(sb, sb->len + dlen - len);
-}
-
-void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
-{
-       strbuf_splice(sb, pos, len, NULL, 0);
-}
-
 void strbuf_add(struct strbuf *sb, const void *data, size_t len)
 {
        strbuf_grow(sb, len);
index 529f2f03524915ab9cae7c5608de444fd875812d..7a32c838884d8de6feaa3223e388199c5c939301 100644 (file)
@@ -77,8 +77,6 @@ static inline void strbuf_addch(struct strbuf *sb, int c) {
        sb->buf[sb->len] = '\0';
 }
 
-extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
-
 extern void strbuf_add(struct strbuf *, const void *, size_t);
 static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
        strbuf_add(sb, s, strlen(s));
index 562b8ebeae5b2414b6bac867c928cb22ee9a5f13..b1dd68f358fcd8e7390b5929ed736a357c7853c1 100644 (file)
@@ -6,6 +6,7 @@
 #include <inttypes.h>
 
 #include "symbol.h"
+#include "demangle-java.h"
 #include "machine.h"
 #include "vdso.h"
 #include <symbol/kallsyms.h>
@@ -1077,6 +1078,8 @@ new_symbol:
                                demangle_flags = DMGL_PARAMS | DMGL_ANSI;
 
                        demangled = bfd_demangle(NULL, elf_name, demangle_flags);
+                       if (demangled == NULL)
+                               demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
                        if (demangled != NULL)
                                elf_name = demangled;
                }
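
With this change, symbol names that bfd_demangle() rejects get a second chance through the Java demangler, so JIT-emitted Java symbols come out readable instead of raw. The structure is a plain try-A-then-B fallback; here is a self-contained illustration of that shape with two invented stub demanglers (neither is perf's real implementation):

/* Fallback-chain sketch; both "demanglers" are stand-in stubs, not perf's. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *stub_cxx_demangle(const char *name)
{
        /* Pretend only "_Z"-prefixed names are C++ mangled. */
        return strncmp(name, "_Z", 2) == 0 ? strdup("cxx::demangled()") : NULL;
}

static char *stub_java_demangle(const char *name)
{
        /* Pretend only "L"-prefixed names are Java signatures. */
        return name[0] == 'L' ? strdup("java.lang.String.get()") : NULL;
}

int main(void)
{
        const char *elf_name = "Ljava/lang/String;get()";
        char *demangled = stub_cxx_demangle(elf_name);

        if (demangled == NULL)                  /* same shape as the hunk above */
                demangled = stub_java_demangle(elf_name);
        if (demangled != NULL)
                elf_name = demangled;

        printf("%s\n", elf_name);
        free(demangled);
        return 0;
}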
index ab02209a7cf3b162431bfc006287f5fbd8c68f43..e7588dc915181729394c1d2195c78576c47d20df 100644 (file)
@@ -1466,7 +1466,8 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
         * Read the build id if possible. This is required for
         * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work
         */
-       if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0)
+       if (is_regular_file(name) &&
+           filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0)
                dso__set_build_id(dso, build_id);
 
        /*
@@ -1487,6 +1488,9 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                                                   root_dir, name, PATH_MAX))
                        continue;
 
+               if (!is_regular_file(name))
+                       continue;
+
                /* Name is now the name of the next image to try */
                if (symsrc__init(ss, dso, name, symtab_type) < 0)
                        continue;
@@ -1525,6 +1529,10 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
        if (!runtime_ss && syms_ss)
                runtime_ss = syms_ss;
 
+       if (syms_ss && syms_ss->type == DSO_BINARY_TYPE__BUILD_ID_CACHE)
+               if (dso__build_id_is_kmod(dso, name, PATH_MAX))
+                       kmod = true;
+
        if (syms_ss)
                ret = dso__load_sym(dso, map, syms_ss, runtime_ss, filter, kmod);
        else
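
dso__load() now refuses to treat anything that is not a regular file as a symbol source, which keeps it from trying to read build-ids or ELF data from directories, pipes, or device nodes that happen to sit at a candidate path. The helper is the stat()-based is_regular_file() added to util.c later in this series; a standalone version with a toy main() for illustration:

/* Standalone sketch of the is_regular_file() guard; main() is demo-only. */
#include <stdio.h>
#include <stdbool.h>
#include <sys/stat.h>

static bool is_regular_file(const char *file)
{
        struct stat st;

        if (stat(file, &st))
                return false;

        return S_ISREG(st.st_mode);
}

int main(int argc, char **argv)
{
        const char *name = argc > 1 ? argv[1] : "/etc/hostname";

        if (!is_regular_file(name)) {
                fprintf(stderr, "%s: not a regular file, skipping\n", name);
                return 1;
        }
        printf("%s: would try to read build-id / symbols here\n", name);
        return 0;
}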
index ccd1caa40e116645be37025665388886d7c97546..a937053a0ae07a6214fb03753faa66819ed40884 100644 (file)
@@ -110,7 +110,8 @@ struct symbol_conf {
                        has_filter,
                        show_ref_callgraph,
                        hide_unresolved,
-                       raw_trace;
+                       raw_trace,
+                       report_hierarchy;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,
index 802bb868d446cafa7c5383982193ad13d87b785a..8ae051e0ec79090e674fdf9a6cc9fbece8275ebb 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/err.h>
 #include <traceevent/event-parse.h>
 #include <api/fs/tracing_path.h>
+#include <api/fs/fs.h>
 #include "trace-event.h"
 #include "machine.h"
 #include "util.h"
index 4d4210d4e13d1d7fbc32a5c1f4afff58a0810fd0..1b741646eed00b7754ba0a618b4fb57b8c87d9ac 100644 (file)
@@ -19,7 +19,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc)
        u64 quot, rem;
 
        quot = cyc >> tc->time_shift;
-       rem  = cyc & ((1 << tc->time_shift) - 1);
+       rem  = cyc & (((u64)1 << tc->time_shift) - 1);
        return tc->time_zero + quot * tc->time_mult +
               ((rem * tc->time_mult) >> tc->time_shift);
 }
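
The conversion splits the cycle count into quotient and remainder of 2^time_shift so neither multiply can overflow; the remainder mask has to be built in 64-bit arithmetic because, with a plain int constant, 1 << time_shift overflows once time_shift reaches 31. A standalone sketch of the corrected arithmetic, with the perf_tsc_conversion fields reduced to plain parameters and example inputs only:

/* Reference sketch: time = zero + quot * mult + ((rem * mult) >> shift). */
#include <stdint.h>
#include <stdio.h>

static uint64_t cyc_to_time(uint64_t cyc, uint16_t time_shift,
                            uint32_t time_mult, uint64_t time_zero)
{
        uint64_t quot = cyc >> time_shift;
        /* The (uint64_t) cast keeps the mask 64 bits wide for shifts >= 31. */
        uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

        return time_zero + quot * time_mult +
               ((rem * time_mult) >> time_shift);
}

int main(void)
{
        /* Example parameters; real values come from the mmapped perf_event page. */
        printf("%llu\n",
               (unsigned long long)cyc_to_time(1234567890ULL, 31, 3, 0));
        return 0;
}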
index ead9509835d23e2aee29b9f3548613973a5937fb..b7766c577b015d978fd3e9960c451692f81daa6e 100644 (file)
@@ -14,6 +14,7 @@
 #include <limits.h>
 #include <byteswap.h>
 #include <linux/kernel.h>
+#include <linux/log2.h>
 #include <unistd.h>
 #include "callchain.h"
 #include "strlist.h"
@@ -507,54 +508,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
        return ret;
 }
 
-int filename__read_str(const char *filename, char **buf, size_t *sizep)
-{
-       size_t size = 0, alloc_size = 0;
-       void *bf = NULL, *nbf;
-       int fd, n, err = 0;
-       char sbuf[STRERR_BUFSIZE];
-
-       fd = open(filename, O_RDONLY);
-       if (fd < 0)
-               return -errno;
-
-       do {
-               if (size == alloc_size) {
-                       alloc_size += BUFSIZ;
-                       nbf = realloc(bf, alloc_size);
-                       if (!nbf) {
-                               err = -ENOMEM;
-                               break;
-                       }
-
-                       bf = nbf;
-               }
-
-               n = read(fd, bf + size, alloc_size - size);
-               if (n < 0) {
-                       if (size) {
-                               pr_warning("read failed %d: %s\n", errno,
-                                        strerror_r(errno, sbuf, sizeof(sbuf)));
-                               err = 0;
-                       } else
-                               err = -errno;
-
-                       break;
-               }
-
-               size += n;
-       } while (n > 0);
-
-       if (!err) {
-               *sizep = size;
-               *buf   = bf;
-       } else
-               free(bf);
-
-       close(fd);
-       return err;
-}
-
 const char *get_filename_for_perf_kvm(void)
 {
        const char *filename;
@@ -691,3 +644,66 @@ out:
 
        return tip;
 }
+
+bool is_regular_file(const char *file)
+{
+       struct stat st;
+
+       if (stat(file, &st))
+               return false;
+
+       return S_ISREG(st.st_mode);
+}
+
+int fetch_current_timestamp(char *buf, size_t sz)
+{
+       struct timeval tv;
+       struct tm tm;
+       char dt[32];
+
+       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
+               return -1;
+
+       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
+               return -1;
+
+       scnprintf(buf, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
+
+       return 0;
+}
+
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra)
+{
+       size_t i, j, mask;
+
+       if (!printer)
+               return;
+
+       bytes_per_line = roundup_pow_of_two(bytes_per_line);
+       mask = bytes_per_line - 1;
+
+       printer(BINARY_PRINT_DATA_BEGIN, 0, extra);
+       for (i = 0; i < len; i++) {
+               if ((i & mask) == 0) {
+                       printer(BINARY_PRINT_LINE_BEGIN, -1, extra);
+                       printer(BINARY_PRINT_ADDR, i, extra);
+               }
+
+               printer(BINARY_PRINT_NUM_DATA, data[i], extra);
+
+               if (((i & mask) == mask) || i == len - 1) {
+                       for (j = 0; j < mask-(i & mask); j++)
+                               printer(BINARY_PRINT_NUM_PAD, -1, extra);
+
+                       printer(BINARY_PRINT_SEP, i, extra);
+                       for (j = i & ~mask; j <= i; j++)
+                               printer(BINARY_PRINT_CHAR_DATA, data[j], extra);
+                       for (j = 0; j < mask-(i & mask); j++)
+                               printer(BINARY_PRINT_CHAR_PAD, i, extra);
+                       printer(BINARY_PRINT_LINE_END, -1, extra);
+               }
+       }
+       printer(BINARY_PRINT_DATA_END, -1, extra);
+}
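
print_binary() rounds bytes_per_line up to a power of two so line boundaries can be detected with a mask, and leaves all actual formatting to the caller's callback. A possible hex-dump style callback is sketched below; the enum is repeated from the util.h hunk further down so the fragment compiles on its own, while the callback body itself is only an illustration, not the printer perf installs.

/* Illustrative print_binary() callback producing "addr: hex bytes  ascii" lines. */
#include <ctype.h>
#include <stdio.h>

enum binary_printer_ops {                       /* mirrors the util.h declaration */
        BINARY_PRINT_DATA_BEGIN,
        BINARY_PRINT_LINE_BEGIN,
        BINARY_PRINT_ADDR,
        BINARY_PRINT_NUM_DATA,
        BINARY_PRINT_NUM_PAD,
        BINARY_PRINT_SEP,
        BINARY_PRINT_CHAR_DATA,
        BINARY_PRINT_CHAR_PAD,
        BINARY_PRINT_LINE_END,
        BINARY_PRINT_DATA_END,
};

static void hexdump_printer(enum binary_printer_ops op, unsigned int val,
                            void *extra)
{
        FILE *fp = extra;

        switch (op) {
        case BINARY_PRINT_ADDR:
                fprintf(fp, "%08x:", val);
                break;
        case BINARY_PRINT_NUM_DATA:
                fprintf(fp, " %02x", val);
                break;
        case BINARY_PRINT_NUM_PAD:
                fputs("   ", fp);
                break;
        case BINARY_PRINT_SEP:
                fputs("  ", fp);
                break;
        case BINARY_PRINT_CHAR_DATA:
                fputc(isprint(val) ? val : '.', fp);
                break;
        case BINARY_PRINT_LINE_END:
                fputc('\n', fp);
                break;
        default:        /* begin/end markers and character padding: nothing */
                break;
        }
}

/* Usage: print_binary(buf, len, 16, hexdump_printer, stdout); */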
index fe915e616f9b65388e15be963d0ef5cf58c325fd..d0d50cef8b2af31703c7d298f68d93b68326c3c5 100644 (file)
@@ -82,6 +82,8 @@
 
 extern const char *graph_line;
 extern const char *graph_dotted_line;
+extern const char *spaces;
+extern const char *dots;
 extern char buildid_dir[];
 
 /* On most systems <limits.h> would have given us this, but
@@ -303,7 +305,6 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
                  bool show_sym, bool unwind_inlines);
 void free_srcline(char *srcline);
 
-int filename__read_str(const char *filename, char **buf, size_t *sizep);
 int perf_event_paranoid(void);
 
 void mem_bswap_64(void *src, int byte_size);
@@ -343,5 +344,27 @@ int fetch_kernel_version(unsigned int *puint,
 #define KVER_PARAM(x)  KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x)
 
 const char *perf_tip(const char *dirpath);
+bool is_regular_file(const char *file);
+int fetch_current_timestamp(char *buf, size_t sz);
+
+enum binary_printer_ops {
+       BINARY_PRINT_DATA_BEGIN,
+       BINARY_PRINT_LINE_BEGIN,
+       BINARY_PRINT_ADDR,
+       BINARY_PRINT_NUM_DATA,
+       BINARY_PRINT_NUM_PAD,
+       BINARY_PRINT_SEP,
+       BINARY_PRINT_CHAR_DATA,
+       BINARY_PRINT_CHAR_PAD,
+       BINARY_PRINT_LINE_END,
+       BINARY_PRINT_DATA_END,
+};
+
+typedef void (*print_binary_t)(enum binary_printer_ops,
+                              unsigned int val,
+                              void *extra);
 
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra);
 #endif /* GIT_COMPAT_UTIL_H */
index 0dac7e05a6ac9e5f1500eca1cebbce2d900ab511..3fa94e291d16a2891b3799797c1a8da289d89b20 100644 (file)
@@ -1970,7 +1970,7 @@ int has_config_tdp(unsigned int family, unsigned int model)
 }
 
 static void
-dump_cstate_pstate_config_info(family, model)
+dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
 {
        if (!do_nhm_platform_info)
                return;
@@ -2142,7 +2142,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
 #define        RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
 #define        RAPL_TIME_GRANULARITY   0x3F /* 6 bit time granularity */
 
-double get_tdp(model)
+double get_tdp(unsigned int model)
 {
        unsigned long long msr;
 
@@ -2256,7 +2256,7 @@ void rapl_probe(unsigned int family, unsigned int model)
        return;
 }
 
-void perf_limit_reasons_probe(family, model)
+void perf_limit_reasons_probe(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
                return;
@@ -2792,7 +2792,7 @@ void process_cpuid()
        perf_limit_reasons_probe(family, model);
 
        if (debug)
-               dump_cstate_pstate_config_info();
+               dump_cstate_pstate_config_info(family, model);
 
        if (has_skl_msrs(family, model))
                calculate_tsc_tweak();
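
These turbostat hunks convert old-style (K&R) definitions, whose parameters implicitly defaulted to int and were never checked against call sites, into ANSI prototypes, and fix the one caller that had been invoking dump_cstate_pstate_config_info() with no arguments at all. A tiny example of why the prototype matters (nothing turbostat-specific is assumed):

/* With an old-style definition, calling show() with no arguments still compiles
 * and reads garbage; with an ANSI prototype the same mistake is a hard error. */
#include <stdio.h>

static void show(unsigned int family, unsigned int model)
{
        printf("family %u model %u\n", family, model);
}

int main(void)
{
        show(6, 63);    /* OK */
        /* show();      <-- rejected at compile time thanks to the prototype */
        return 0;
}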
index db2dd3335c6a6de0c2f0dd9fe89e462d16f5ba81..65da997b430a8e8c2b3d8eb9de93fb6b42d00f80 100644 (file)
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
         * This memory barrier pairs with prepare_to_wait's set_current_state()
         */
        smp_mb();
-       if (waitqueue_active(&vcpu->wq))
-               wake_up_interruptible(&vcpu->wq);
+       if (swait_active(&vcpu->wq))
+               swake_up(&vcpu->wq);
 
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
index a11cfd20a6a0d2aa86b1d06b8552bc41a8bfd8c9..5af50c3ddd535a5094b8de788bd88cc17085fb17 100644 (file)
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
-       vcpu->halt_poll_ns = 0;
-       init_waitqueue_head(&vcpu->wq);
+       init_swait_queue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
 
        vcpu->pre_pcpu = -1;
@@ -1952,6 +1951,9 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
        else
                val *= halt_poll_ns_grow;
 
+       if (val > halt_poll_ns)
+               val = halt_poll_ns;
+
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
 }
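
grow_halt_poll_ns() now clamps the grown polling window to the halt_poll_ns module parameter, so a run of successful polls can no longer push a vCPU's window past the administrator's limit. A small userspace model of just the grow-and-clamp step shown in this hunk (the starting window and parameter values are made up):

/* Toy model of the grow-and-clamp step; all numbers are illustrative only. */
#include <stdio.h>

static unsigned int halt_poll_ns      = 500000; /* stand-in for the module param */
static unsigned int halt_poll_ns_grow = 2;      /* stand-in for the growth factor */

static unsigned int grow(unsigned int val)
{
        val *= halt_poll_ns_grow;       /* growth step visible in the hunk */
        if (val > halt_poll_ns)
                val = halt_poll_ns;     /* the clamp this hunk adds */
        return val;
}

int main(void)
{
        unsigned int val = 10000;       /* arbitrary starting window, in ns */
        int i;

        for (i = 0; i < 8; i++) {
                val = grow(val);
                printf("poll window: %u ns\n", val);
        }
        return 0;
}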
@@ -1990,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
        ktime_t start, cur;
-       DEFINE_WAIT(wait);
+       DECLARE_SWAITQUEUE(wait);
        bool waited = false;
        u64 block_ns;
 
@@ -2015,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        kvm_arch_vcpu_blocking(vcpu);
 
        for (;;) {
-               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
@@ -2024,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                schedule();
        }
 
-       finish_wait(&vcpu->wq, &wait);
+       finish_swait(&vcpu->wq, &wait);
        cur = ktime_get();
 
        kvm_arch_vcpu_unblocking(vcpu);
@@ -2056,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
        int me;
        int cpu = vcpu->cpu;
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
-       if (waitqueue_active(wqp)) {
-               wake_up_interruptible(wqp);
+       if (swait_active(wqp)) {
+               swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
 
@@ -2161,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
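
Taken together, these kvm_main.c hunks finish moving the per-vCPU wait queue to the simple-wait-queue API: the blocking side pairs DECLARE_SWAITQUEUE() with prepare_to_swait()/finish_swait(), and every waker tests swait_active() before calling swake_up(). A condensed kernel-style sketch of that pairing, with the vCPU specifics replaced by a placeholder flag:

/* Sketch of the simple-wait-queue pattern used above (placeholder condition). */
#include <linux/sched.h>
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
static bool example_event;

/* Waiter side, mirroring kvm_vcpu_block(): */
static void example_wait(void)
{
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
                prepare_to_swait(&example_wq, &wait, TASK_INTERRUPTIBLE);
                if (READ_ONCE(example_event))
                        break;
                schedule();
        }
        finish_swait(&example_wq, &wait);
}

/* Waker side, mirroring kvm_vcpu_kick()/async_pf_execute(): */
static void example_wake(void)
{
        WRITE_ONCE(example_event, true);
        smp_mb();       /* pairs with prepare_to_swait()'s set_current_state() */
        if (swait_active(&example_wq))
                swake_up(&example_wq);
}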