Merge branch 'for-next' of git://git.samba.org/sfrench/cifs-2.6

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 21 May 2016 02:16:12 +0000 (19:16 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 21 May 2016 02:16:12 +0000 (19:16 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 May 2016 02:16:12 +0000 (19:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 May 2016 02:16:12 +0000 (19:16 -0700)
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl

index 7fd737eed38acd7f794daea91d9561a435a67e00..4ba0a2a61926251edf33e5f94a4eff45028d44a8 100644 (file)
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -233,3 +233,11 @@ Description:       read/write
                 0 = don't trust, the image may be different (default)
                 1 = trust that the image will not change.
  Users:         https://github.com/ibm-capi/libcxl
+
+What:           /sys/class/cxl/<card>/psl_timebase_synced
+Date:           March 2016
+Contact:        linuxppc-dev@lists.ozlabs.org
+Description:    read only
+                Returns 1 if the psl timebase register is synchronized
+                with the core timebase register, 0 otherwise.
+Users:          https://github.com/ibm-capi/libcxl
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl

index 24979f691e3e88dd1ab82b3b00633e683445093f..7e4f34fde69752b7e314c662cb2066b89e917bc6 100644 (file)
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -316,8 +316,8 @@
         </itemizedlist>
        </para>
        <para>
-       The function returns 1 when the fixup was successful,
-       otherwise 0. The return value is used to update the
+       The function returns true when the fixup was successful,
+       otherwise false. The return value is used to update the
         statistics.
        </para>
        <para>
@@ -341,8 +341,8 @@
         </itemizedlist>
        </para>
        <para>
-       The function returns 1 when the fixup was successful,
-       otherwise 0. The return value is used to update the
+       The function returns true when the fixup was successful,
+       otherwise false. The return value is used to update the
         statistics.
        </para>
        <para>
@@ -359,7 +359,8 @@
         statically initialized object or not. In case it is it calls
         debug_object_init() and debug_object_activate() to make the
         object known to the tracker and marked active. In this case
-       the function should return 0 because this is not a real fixup.
+       the function should return false because this is not a real
+       fixup.
        </para>
      </sect1>
  
@@ -376,8 +377,8 @@
         </itemizedlist>
        </para>
        <para>
-       The function returns 1 when the fixup was successful,
-       otherwise 0. The return value is used to update the
+       The function returns true when the fixup was successful,
+       otherwise false. The return value is used to update the
         statistics.
        </para>
      </sect1>
@@ -397,8 +398,8 @@
         </itemizedlist>
        </para>
        <para>
-       The function returns 1 when the fixup was successful,
-       otherwise 0. The return value is used to update the
+       The function returns true when the fixup was successful,
+       otherwise false. The return value is used to update the
         statistics.
        </para>
      </sect1>
@@ -414,8 +415,8 @@
         debug bucket.
        </para>
        <para>
-       The function returns 1 when the fixup was successful,
-       otherwise 0. The return value is used to update the
+       The function returns true when the fixup was successful,
+       otherwise false. The return value is used to update the
         statistics.
        </para>
        <para>
@@ -427,7 +428,8 @@
         case. The fixup function should check if this is a legitimate
         case of a statically initialized object or not. In this case only
         debug_object_init() should be called to make the object known to
-       the tracker. Then the function should return 0 because this is not
+       the tracker. Then the function should return false because this
+       is not
         a real fixup.
        </para>
      </sect1>
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.txt b/Documentation/devicetree/bindings/arm/atmel-at91.txt

index 1d800463347942fffb3a8fbb4433116f6da7ff97..e1f5ad855f14f3e8ccbed1f1387eb3c10c04e841 100644 (file)
--- a/Documentation/devicetree/bindings/arm/atmel-at91.txt
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.txt
@@ -151,6 +151,65 @@ Example:
                 clocks = <&clk32k>;
         };
  
+SHDWC SAMA5D2-Compatible Shutdown Controller
+
+1) shdwc node
+
+required properties:
+- compatible: should be "atmel,sama5d2-shdwc".
+- reg: should contain registers location and length
+- clocks: phandle to input clock.
+- #address-cells: should be one. The cell is the wake-up input index.
+- #size-cells: should be zero.
+
+optional properties:
+
+- debounce-delay-us: minimum wake-up inputs debouncer period in
+  microseconds. It's usually a board-related property.
+- atmel,wakeup-rtc-timer: boolean to enable Real-Time Clock wake-up.
+
+The node contains child nodes for each wake-up input that the platform uses.
+
+2) input nodes
+
+Wake-up input nodes are usually described in the "board" part of the Device
+Tree. Note also that input 0 is linked to the wake-up pin and is frequently
+used.
+
+Required properties:
+- reg: should contain the wake-up input index [0 - 15].
+
+Optional properties:
+- atmel,wakeup-active-high: boolean, the corresponding wake-up input described
+  by the child, forces the wake-up of the core power supply on a high level.
+  The default is to be active low.
+
+Example:
+
+On the SoC side:
+       shdwc@f8048010 {
+               compatible = "atmel,sama5d2-shdwc";
+               reg = <0xf8048010 0x10>;
+               clocks = <&clk32k>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               atmel,wakeup-rtc-timer;
+       };
+
+On the board side:
+       shdwc@f8048010 {
+               debounce-delay-us = <976>;
+
+               input@0 {
+                       reg = <0>;
+               };
+
+               input@1 {
+                       reg = <1>;
+                       atmel,wakeup-active-high;
+               };
+       };
+
  Special Function Registers (SFR)
  
  Special Function Registers (SFR) manage specific aspects of the integrated
diff --git a/Documentation/devicetree/bindings/arm/cci.txt b/Documentation/devicetree/bindings/arm/cci.txt

index a1a5a7ecc2fb0d5d087152b6a21eaca618394576..0f2153e8fa7ec93f04b4d6f6d8b4c505497b12c3 100644 (file)
--- a/Documentation/devicetree/bindings/arm/cci.txt
+++ b/Documentation/devicetree/bindings/arm/cci.txt
@@ -100,7 +100,7 @@ specific to ARM.
                                  "arm,cci-400-pmu,r0"
                                  "arm,cci-400-pmu,r1"
                                  "arm,cci-400-pmu"  - DEPRECATED, permitted only where OS has
-                                                     secure acces to CCI registers
+                                                     secure access to CCI registers
                                  "arm,cci-500-pmu,r0"
                                  "arm,cci-550-pmu,r0"
                 - reg:
diff --git a/Documentation/devicetree/bindings/arm/l2c2x0.txt b/Documentation/devicetree/bindings/arm/l2c2x0.txt

index fe0398c5c77b3ba94c2dc9f4030a0b0eaa7f227d..c453ab5553cd81e323d72a0cd3e6f77749384b11 100644 (file)
--- a/Documentation/devicetree/bindings/arm/l2c2x0.txt
+++ b/Documentation/devicetree/bindings/arm/l2c2x0.txt
@@ -84,6 +84,12 @@ Optional properties:
  - prefetch-instr : Instruction prefetch. Value: <0> (forcibly disable),
    <1> (forcibly enable), property absent (retain settings set by
    firmware)
+- arm,dynamic-clock-gating : L2 dynamic clock gating. Value: <0> (forcibly
+  disable), <1> (forcibly enable), property absent (OS specific behavior,
+  preferably retain firmware settings)
+- arm,standby-mode: L2 standby mode enable. Value <0> (forcibly disable),
+  <1> (forcibly enable), property absent (OS specific behavior,
+  preferably retain firmware settings)
  
  Example:
  
diff --git a/Documentation/devicetree/bindings/arm/omap/crossbar.txt b/Documentation/devicetree/bindings/arm/omap/crossbar.txt

index a9b28d74d9023ebddc131742f9815ff9f9673dc0..bb5727ae004ac287c6725d53ef6a4b90a9e41480 100644 (file)
--- a/Documentation/devicetree/bindings/arm/omap/crossbar.txt
+++ b/Documentation/devicetree/bindings/arm/omap/crossbar.txt
@@ -42,7 +42,8 @@ Examples:
  Consumer:
  ========
  See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt and
-Documentation/devicetree/bindings/arm/gic.txt for further details.
+Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt for
+further details.
  
  An interrupt consumer on an SoC using crossbar will use:
         interrupts = <GIC_SPI request_number interrupt_level>
diff --git a/Documentation/devicetree/bindings/arm/spear-misc.txt b/Documentation/devicetree/bindings/arm/spear-misc.txt

index cf649827ffcd77b31b50a35aeee938861142074a..e404e2556b4a3df2e4131ba64ff384c24a25a755 100644 (file)
--- a/Documentation/devicetree/bindings/arm/spear-misc.txt
+++ b/Documentation/devicetree/bindings/arm/spear-misc.txt
@@ -6,4 +6,4 @@ few properties of different peripheral controllers.
  misc node required properties:
  
  - compatible Should be "st,spear1340-misc", "syscon".
-- reg: Address range of misc space upto 8K
+- reg: Address range of misc space up to 8K
diff --git a/Documentation/devicetree/bindings/arm/ux500/boards.txt b/Documentation/devicetree/bindings/arm/ux500/boards.txt

index b8737a8de71896ebc4876e62db73b86bcf3ef6be..7334c24625fccf309c5a3708e3e0c1facd0344e5 100644 (file)
--- a/Documentation/devicetree/bindings/arm/ux500/boards.txt
+++ b/Documentation/devicetree/bindings/arm/ux500/boards.txt
@@ -23,7 +23,7 @@ scu:
         see binding for arm/scu.txt
  
  interrupt-controller:
-       see binding for arm/gic.txt
+       see binding for interrupt-controller/arm,gic.txt
  
  timer:
         see binding for arm/twd.txt
diff --git a/Documentation/devicetree/bindings/ata/nvidia,tegra124-ahci.txt b/Documentation/devicetree/bindings/ata/nvidia,tegra124-ahci.txt

new file mode 100644 (file)

index 0000000..66c83c3
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/nvidia,tegra124-ahci.txt
@@ -0,0 +1,32 @@
+Tegra124 SoC SATA AHCI controller
+
+Required properties :
+- compatible : For Tegra124, must contain "nvidia,tegra124-ahci".  Otherwise,
+  must contain '"nvidia,<chip>-ahci", "nvidia,tegra124-ahci"', where <chip>
+  is tegra132.
+- reg : Should contain 2 entries:
+  - AHCI register set (SATA BAR5)
+  - SATA register set
+- interrupts : Defines the interrupt used by SATA
+- clocks : Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names : Must include the following entries:
+  - sata
+  - sata-oob
+  - cml1
+  - pll_e
+- resets : Must contain an entry for each entry in reset-names.
+  See ../reset/reset.txt for details.
+- reset-names : Must include the following entries:
+  - sata
+  - sata-oob
+  - sata-cold
+- phys : Must contain an entry for each entry in phy-names.
+  See ../phy/phy-bindings.txt for details.
+- phy-names : Must include the following entries:
+  - sata-phy : XUSB PADCTL SATA PHY
+- hvdd-supply : Defines the SATA HVDD regulator
+- vddio-supply : Defines the SATA VDDIO regulator
+- avdd-supply : Defines the SATA AVDD regulator
+- target-5v-supply : Defines the SATA 5V power regulator
+- target-12v-supply : Defines the SATA 12V power regulator
diff --git a/Documentation/devicetree/bindings/ata/tegra-sata.txt b/Documentation/devicetree/bindings/ata/tegra-sata.txt

deleted file mode 100644 (file)

index 66c83c3..0000000
--- a/Documentation/devicetree/bindings/ata/tegra-sata.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Tegra124 SoC SATA AHCI controller
-
-Required properties :
-- compatible : For Tegra124, must contain "nvidia,tegra124-ahci".  Otherwise,
-  must contain '"nvidia,<chip>-ahci", "nvidia,tegra124-ahci"', where <chip>
-  is tegra132.
-- reg : Should contain 2 entries:
-  - AHCI register set (SATA BAR5)
-  - SATA register set
-- interrupts : Defines the interrupt used by SATA
-- clocks : Must contain an entry for each entry in clock-names.
-  See ../clocks/clock-bindings.txt for details.
-- clock-names : Must include the following entries:
-  - sata
-  - sata-oob
-  - cml1
-  - pll_e
-- resets : Must contain an entry for each entry in reset-names.
-  See ../reset/reset.txt for details.
-- reset-names : Must include the following entries:
-  - sata
-  - sata-oob
-  - sata-cold
-- phys : Must contain an entry for each entry in phy-names.
-  See ../phy/phy-bindings.txt for details.
-- phy-names : Must include the following entries:
-  - sata-phy : XUSB PADCTL SATA PHY
-- hvdd-supply : Defines the SATA HVDD regulator
-- vddio-supply : Defines the SATA VDDIO regulator
-- avdd-supply : Defines the SATA AVDD regulator
-- target-5v-supply : Defines the SATA 5V power regulator
-- target-12v-supply : Defines the SATA 12V power regulator
diff --git a/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt b/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt

index ee7e5fd4a50b45d1411a3b959f786528ce4cb38a..63f9d8277d48bb06a1b6b9c436865524764ab399 100644 (file)
--- a/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt
+++ b/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt
@@ -50,7 +50,7 @@ Required properties for I2C mode:
  
  Example:
  
-clock@0,70110000 {
+clock@70110000 {
          compatible = "nvidia,tegra124-dfll";
          reg = <0 0x70110000 0 0x100>, /* DFLL control */
                <0 0x70110000 0 0x100>, /* I2C output control */
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt

index 0c2bf5eba43efbf18c463876401da8c8cd4ea1a8..7f368530a2e4397653f1bc5069b0ed0587d507bb 100644 (file)
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt
@@ -16,7 +16,7 @@ Required Properties:
  Optional Properties:
  
  - rockchip,grf: phandle to the syscon managing the "general register files"
-  If missing pll rates are not changable, due to the missing pll lock status.
+  If missing pll rates are not changeable, due to the missing pll lock status.
  
  Each clock is assigned an identifier and client nodes can use this identifier
  to specify the clock which they consume. All available clocks are defined as
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt

index c9fbb76573e1c81c5503943c649db77541c77329..8cb47c39ba53922e100bff2efd10420ed5af626a 100644 (file)
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt
@@ -15,7 +15,7 @@ Required Properties:
  Optional Properties:
  
  - rockchip,grf: phandle to the syscon managing the "general register files"
-  If missing pll rates are not changable, due to the missing pll lock status.
+  If missing pll rates are not changeable, due to the missing pll lock status.
  
  Each clock is assigned an identifier and client nodes can use this identifier
  to specify the clock which they consume. All available clocks are defined as
diff --git a/Documentation/devicetree/bindings/clock/st/st,clkgen.txt b/Documentation/devicetree/bindings/clock/st/st,clkgen.txt

index 78978f1f515870e776e37eabdd3163dc42c0f132..b18bf86f926f68a1d81ce49effd43a1fecd35155 100644 (file)
--- a/Documentation/devicetree/bindings/clock/st/st,clkgen.txt
+++ b/Documentation/devicetree/bindings/clock/st/st,clkgen.txt
@@ -40,7 +40,7 @@ address is common of all subnode.
         };
  
  This binding uses the common clock binding[1].
-Each subnode should use the binding discribe in [2]..[7]
+Each subnode should use the binding described in [2]..[7]
  
  [1] Documentation/devicetree/bindings/clock/clock-bindings.txt
  [2] Documentation/devicetree/bindings/clock/st,clkgen-divmux.txt
diff --git a/Documentation/devicetree/bindings/cpufreq/nvidia,tegra124-cpufreq.txt b/Documentation/devicetree/bindings/cpufreq/nvidia,tegra124-cpufreq.txt

new file mode 100644 (file)

index 0000000..b1669fb
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/nvidia,tegra124-cpufreq.txt
@@ -0,0 +1,44 @@
+Tegra124 CPU frequency scaling driver bindings
+----------------------------------------------
+
+Both required and optional properties listed below must be defined
+under node /cpus/cpu@0.
+
+Required properties:
+- clocks: Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names: Must include the following entries:
+  - cpu_g: Clock mux for the fast CPU cluster.
+  - cpu_lp: Clock mux for the low-power CPU cluster.
+  - pll_x: Fast PLL clocksource.
+  - pll_p: Auxiliary PLL used during fast PLL rate changes.
+  - dfll: Fast DFLL clocksource that also automatically scales CPU voltage.
+- vdd-cpu-supply: Regulator for CPU voltage
+
+Optional properties:
+- clock-latency: Specify the possible maximum transition latency for clock,
+  in unit of nanoseconds.
+
+Example:
+--------
+cpus {
+       #address-cells = <1>;
+       #size-cells = <0>;
+
+       cpu@0 {
+               device_type = "cpu";
+               compatible = "arm,cortex-a15";
+               reg = <0>;
+
+               clocks = <&tegra_car TEGRA124_CLK_CCLK_G>,
+                        <&tegra_car TEGRA124_CLK_CCLK_LP>,
+                        <&tegra_car TEGRA124_CLK_PLL_X>,
+                        <&tegra_car TEGRA124_CLK_PLL_P>,
+                        <&dfll>;
+               clock-names = "cpu_g", "cpu_lp", "pll_x", "pll_p", "dfll";
+               clock-latency = <300000>;
+               vdd-cpu-supply = <&vdd_cpu>;
+       };
+
+       <...>
+};
diff --git a/Documentation/devicetree/bindings/cpufreq/tegra124-cpufreq.txt b/Documentation/devicetree/bindings/cpufreq/tegra124-cpufreq.txt

deleted file mode 100644 (file)

index b1669fb..0000000
--- a/Documentation/devicetree/bindings/cpufreq/tegra124-cpufreq.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-Tegra124 CPU frequency scaling driver bindings
-----------------------------------------------
-
-Both required and optional properties listed below must be defined
-under node /cpus/cpu@0.
-
-Required properties:
-- clocks: Must contain an entry for each entry in clock-names.
-  See ../clocks/clock-bindings.txt for details.
-- clock-names: Must include the following entries:
-  - cpu_g: Clock mux for the fast CPU cluster.
-  - cpu_lp: Clock mux for the low-power CPU cluster.
-  - pll_x: Fast PLL clocksource.
-  - pll_p: Auxiliary PLL used during fast PLL rate changes.
-  - dfll: Fast DFLL clocksource that also automatically scales CPU voltage.
-- vdd-cpu-supply: Regulator for CPU voltage
-
-Optional properties:
-- clock-latency: Specify the possible maximum transition latency for clock,
-  in unit of nanoseconds.
-
-Example:
---------
-cpus {
-       #address-cells = <1>;
-       #size-cells = <0>;
-
-       cpu@0 {
-               device_type = "cpu";
-               compatible = "arm,cortex-a15";
-               reg = <0>;
-
-               clocks = <&tegra_car TEGRA124_CLK_CCLK_G>,
-                        <&tegra_car TEGRA124_CLK_CCLK_LP>,
-                        <&tegra_car TEGRA124_CLK_PLL_X>,
-                        <&tegra_car TEGRA124_CLK_PLL_P>,
-                        <&dfll>;
-               clock-names = "cpu_g", "cpu_lp", "pll_x", "pll_p", "dfll";
-               clock-latency = <300000>;
-               vdd-cpu-supply: <&vdd_cpu>;
-       };
-
-       <...>
-};
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt b/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt

index 22756b3dede2a3a839ea9889c23562b5ac305dfc..a78265993665a65bae524bb19606a7c16f248770 100644 (file)
--- a/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt
+++ b/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt
@@ -41,7 +41,7 @@ Video interfaces:
    endpoint node connected from mic node (reg = 0):
      - remote-endpoint: specifies the endpoint in mic node. This node is required
                        for Exynos5433 mipi dsi. So mic can access to panel node
-                      thoughout this dsi node.
+                      throughout this dsi node.
    endpoint node connected to panel node (reg = 1):
      - remote-endpoint: specifies the endpoint in panel node. This node is
                        required in all kinds of exynos mipi dsi to represent
diff --git a/Documentation/devicetree/bindings/dma/nvidia,tegra20-apbdma.txt b/Documentation/devicetree/bindings/dma/nvidia,tegra20-apbdma.txt

new file mode 100644 (file)

index 0000000..c6908e7
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/nvidia,tegra20-apbdma.txt
@@ -0,0 +1,44 @@
+* NVIDIA Tegra APB DMA controller
+
+Required properties:
+- compatible: Should be "nvidia,<chip>-apbdma"
+- reg: Should contain DMA registers location and length. This should include
+  all of the per-channel registers.
+- interrupts: Should contain all of the per-channel DMA interrupts.
+- clocks: Must contain one entry, for the module clock.
+  See ../clocks/clock-bindings.txt for details.
+- resets : Must contain an entry for each entry in reset-names.
+  See ../reset/reset.txt for details.
+- reset-names : Must include the following entries:
+  - dma
+- #dma-cells : Must be <1>. This dictates the length of DMA specifiers in
+  client nodes' dmas properties. The specifier represents the DMA request
+  select value for the peripheral. For more details, consult the Tegra TRM's
+  documentation of the APB DMA channel control register REQ_SEL field.
+
+Examples:
+
+apbdma: dma@6000a000 {
+       compatible = "nvidia,tegra20-apbdma";
+       reg = <0x6000a000 0x1200>;
+       interrupts = < 0 136 0x04
+                      0 137 0x04
+                      0 138 0x04
+                      0 139 0x04
+                      0 140 0x04
+                      0 141 0x04
+                      0 142 0x04
+                      0 143 0x04
+                      0 144 0x04
+                      0 145 0x04
+                      0 146 0x04
+                      0 147 0x04
+                      0 148 0x04
+                      0 149 0x04
+                      0 150 0x04
+                      0 151 0x04 >;
+       clocks = <&tegra_car 34>;
+       resets = <&tegra_car 34>;
+       reset-names = "dma";
+       #dma-cells = <1>;
+};
diff --git a/Documentation/devicetree/bindings/dma/tegra20-apbdma.txt b/Documentation/devicetree/bindings/dma/tegra20-apbdma.txt

deleted file mode 100644 (file)

index c6908e7..0000000
--- a/Documentation/devicetree/bindings/dma/tegra20-apbdma.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-* NVIDIA Tegra APB DMA controller
-
-Required properties:
-- compatible: Should be "nvidia,<chip>-apbdma"
-- reg: Should contain DMA registers location and length. This shuld include
-  all of the per-channel registers.
-- interrupts: Should contain all of the per-channel DMA interrupts.
-- clocks: Must contain one entry, for the module clock.
-  See ../clocks/clock-bindings.txt for details.
-- resets : Must contain an entry for each entry in reset-names.
-  See ../reset/reset.txt for details.
-- reset-names : Must include the following entries:
-  - dma
-- #dma-cells : Must be <1>. This dictates the length of DMA specifiers in
-  client nodes' dmas properties. The specifier represents the DMA request
-  select value for the peripheral. For more details, consult the Tegra TRM's
-  documentation of the APB DMA channel control register REQ_SEL field.
-
-Examples:
-
-apbdma: dma@6000a000 {
-       compatible = "nvidia,tegra20-apbdma";
-       reg = <0x6000a000 0x1200>;
-       interrupts = < 0 136 0x04
-                      0 137 0x04
-                      0 138 0x04
-                      0 139 0x04
-                      0 140 0x04
-                      0 141 0x04
-                      0 142 0x04
-                      0 143 0x04
-                      0 144 0x04
-                      0 145 0x04
-                      0 146 0x04
-                      0 147 0x04
-                      0 148 0x04
-                      0 149 0x04
-                      0 150 0x04
-                      0 151 0x04 >;
-       clocks = <&tegra_car 34>;
-       resets = <&tegra_car 34>;
-       reset-names = "dma";
-       #dma-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt

index 2291c4098730f6e81332b38fe8d045f4f3293347..3cf0072d3141962342f56df2f251ffbeb094879d 100644 (file)
--- a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
+++ b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
@@ -7,7 +7,7 @@ Required properties:
  - compatible: Should be "xlnx,axi-dma-1.00.a"
  - #dma-cells: Should be <1>, see "dmas" property below
  - reg: Should contain DMA registers location and length.
-- dma-channel child node: Should have atleast one channel and can have upto
+- dma-channel child node: Should have at least one channel and can have up to
         two channels per device. This node specifies the properties of each
         DMA channel (see child node properties below).
  
diff --git a/Documentation/devicetree/bindings/gpio/ibm,ppc4xx-gpio.txt b/Documentation/devicetree/bindings/gpio/ibm,ppc4xx-gpio.txt

new file mode 100644 (file)

index 0000000..d58b395
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/ibm,ppc4xx-gpio.txt
@@ -0,0 +1,24 @@
+* IBM/AMCC/APM GPIO Controller for PowerPC 4XX series and compatible SoCs
+
+All GPIOs are pin-shared with other functions. DCRs control whether a
+particular pin that has GPIO capabilities acts as a GPIO or is used for
+another purpose. GPIO outputs are separately programmable to emulate
+an open-drain driver.
+
+Required properties:
+       - compatible: must be "ibm,ppc4xx-gpio"
+       - reg: address and length of the register set for the device
+       - #gpio-cells: must be set to 2. The first cell is the pin number
+               and the second cell is used to specify the gpio polarity:
+               0 = active high
+               1 = active low
+       - gpio-controller: marks the device node as a gpio controller.
+
+Example:
+
+GPIO0: gpio@ef600b00 {
+       compatible = "ibm,ppc4xx-gpio";
+       reg = <0xef600b00 0x00000048>;
+       #gpio-cells = <2>;
+       gpio-controller;
+};
diff --git a/Documentation/devicetree/bindings/input/ads7846.txt b/Documentation/devicetree/bindings/input/ads7846.txt

index c6cfe2e3ed4119f4d4831312b962b47cd7009c85..9fc47b006fd132508b87208751d52f4efb14f4d6 100644 (file)
--- a/Documentation/devicetree/bindings/input/ads7846.txt
+++ b/Documentation/devicetree/bindings/input/ads7846.txt
@@ -29,7 +29,7 @@ Optional properties:
         ti,vref-delay-usecs             vref supply delay in usecs, 0 for
                                         external vref (u16).
         ti,vref-mv                      The VREF voltage, in millivolts (u16).
-                                       Set to 0 to use internal refernce
+                                       Set to 0 to use internal reference
                                         (ADS7846).
         ti,keep-vref-on                 set to keep vref on for differential
                                         measurements as well
diff --git a/Documentation/devicetree/bindings/input/touchscreen/fsl-mx25-tcq.txt b/Documentation/devicetree/bindings/input/touchscreen/fsl-mx25-tcq.txt

index cdf05f9b232967075bb86d4998984e01dc31624e..abfcab3edc668bede50340f91ee799be35059142 100644 (file)
--- a/Documentation/devicetree/bindings/input/touchscreen/fsl-mx25-tcq.txt
+++ b/Documentation/devicetree/bindings/input/touchscreen/fsl-mx25-tcq.txt
@@ -15,7 +15,7 @@ Optional properties:
   - fsl,pen-debounce-ns: Pen debounce time in nanoseconds.
   - fsl,pen-threshold: Pen-down threshold for the touchscreen. This is a value
     between 1 and 4096. It is the ratio between the internal reference voltage
-   and the measured voltage after the plate was precharged. Resistence between
+   and the measured voltage after the plate was precharged. Resistance between
     plates and therefore the voltage decreases with pressure so that a smaller
     value is equivalent to a higher pressure.
   - fsl,settling-time-ns: Settling time in nanoseconds. The settling time is before
diff --git a/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt b/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt

index b8e1674c7837c274c6b485cfa3bb3977bd97232d..8cf564d083d2dda06446ef9e14aa2a4dfb2c5ad2 100644 (file)
--- a/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt
@@ -16,8 +16,7 @@ Required properties:
         "mediatek,mt6577-sysirq"
         "mediatek,mt2701-sysirq"
  - interrupt-controller : Identifies the node as an interrupt controller
-- #interrupt-cells : Use the same format as specified by GIC in
-  Documentation/devicetree/bindings/arm/gic.txt
+- #interrupt-cells : Use the same format as specified by GIC in arm,gic.txt.
  - interrupt-parent: phandle of irq parent for sysirq. The parent must
    use the same interrupt-cells format as GIC.
  - reg: Physical base address of the intpol registers and length of memory
diff --git a/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra-ictlr.txt b/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra-ictlr.txt

deleted file mode 100644 (file)

index 1099fe0..0000000
--- a/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra-ictlr.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-NVIDIA Legacy Interrupt Controller
-
-All Tegra SoCs contain a legacy interrupt controller that routes
-interrupts to the GIC, and also serves as a wakeup source. It is also
-referred to as "ictlr", hence the name of the binding.
-
-The HW block exposes a number of interrupt controllers, each
-implementing a set of 32 interrupts.
-
-Required properties:
-
-- compatible : should be: "nvidia,tegra<chip>-ictlr". The LIC on
-  subsequent SoCs remained backwards-compatible with Tegra30, so on
-  Tegra generations later than Tegra30 the compatible value should
-  include "nvidia,tegra30-ictlr".      
-- reg : Specifies base physical address and size of the registers.
-  Each controller must be described separately (Tegra20 has 4 of them,
-  whereas Tegra30 and later have 5"  
-- interrupt-controller : Identifies the node as an interrupt controller.
-- #interrupt-cells : Specifies the number of cells needed to encode an
-  interrupt source. The value must be 3.
-- interrupt-parent : a phandle to the GIC these interrupts are routed
-  to.
-
-Notes:
-
-- Because this HW ultimately routes interrupts to the GIC, the
-  interrupt specifier must be that of the GIC.
-- Only SPIs can use the ictlr as an interrupt parent. SGIs and PPIs
-  are explicitly forbidden.
-
-Example:
-
-       ictlr: interrupt-controller@60004000 {
-               compatible = "nvidia,tegra20-ictlr", "nvidia,tegra-ictlr";
-               reg = <0x60004000 64>,
-                     <0x60004100 64>,
-                     <0x60004200 64>,
-                     <0x60004300 64>;
-               interrupt-controller;
-               #interrupt-cells = <3>;
-               interrupt-parent = <&intc>;
-       };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt b/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt

new file mode 100644 (file)

index 0000000..1099fe0
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt
@@ -0,0 +1,43 @@
+NVIDIA Legacy Interrupt Controller
+
+All Tegra SoCs contain a legacy interrupt controller that routes
+interrupts to the GIC, and also serves as a wakeup source. It is also
+referred to as "ictlr", hence the name of the binding.
+
+The HW block exposes a number of interrupt controllers, each
+implementing a set of 32 interrupts.
+
+Required properties:
+
+- compatible : should be: "nvidia,tegra<chip>-ictlr". The LIC on
+  subsequent SoCs remained backwards-compatible with Tegra30, so on
+  Tegra generations later than Tegra30 the compatible value should
+  include "nvidia,tegra30-ictlr".      
+- reg : Specifies base physical address and size of the registers.
+  Each controller must be described separately (Tegra20 has 4 of them,
+  whereas Tegra30 and later have 5).
+- interrupt-controller : Identifies the node as an interrupt controller.
+- #interrupt-cells : Specifies the number of cells needed to encode an
+  interrupt source. The value must be 3.
+- interrupt-parent : a phandle to the GIC these interrupts are routed
+  to.
+
+Notes:
+
+- Because this HW ultimately routes interrupts to the GIC, the
+  interrupt specifier must be that of the GIC.
+- Only SPIs can use the ictlr as an interrupt parent. SGIs and PPIs
+  are explicitly forbidden.
+
+Example:
+
+       ictlr: interrupt-controller@60004000 {
+               compatible = "nvidia,tegra20-ictlr", "nvidia,tegra-ictlr";
+               reg = <0x60004000 64>,
+                     <0x60004100 64>,
+                     <0x60004200 64>,
+                     <0x60004300 64>;
+               interrupt-controller;
+               #interrupt-cells = <3>;
+               interrupt-parent = <&intc>;
+       };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,omap4-wugen-mpu b/Documentation/devicetree/bindings/interrupt-controller/ti,omap4-wugen-mpu

index 43effa0a4fe7c46c0443bbc8c9b3d67c843cd85a..18d4f407bf0e858fcb5732f510113c06b1388018 100644 (file)
--- a/Documentation/devicetree/bindings/interrupt-controller/ti,omap4-wugen-mpu
+++ b/Documentation/devicetree/bindings/interrupt-controller/ti,omap4-wugen-mpu
@@ -4,7 +4,7 @@ All TI OMAP4/5 (and their derivatives) an interrupt controller that
  routes interrupts to the GIC, and also serves as a wakeup source. It
  is also referred to as "WUGEN-MPU", hence the name of the binding.
  
-Reguired properties:
+Required properties:
  
  - compatible : should contain at least "ti,omap4-wugen-mpu" or
    "ti,omap5-wugen-mpu"
@@ -20,7 +20,7 @@ Notes:
  - Because this HW ultimately routes interrupts to the GIC, the
    interrupt specifier must be that of the GIC.
  - Only SPIs can use the WUGEN as an interrupt parent. SGIs and PPIs
-  are explicitly forbiden.
+  are explicitly forbidden.
  
  Example:
  
diff --git a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra-mc.txt b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra-mc.txt

deleted file mode 100644 (file)

index 3338a28..0000000
--- a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra-mc.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-NVIDIA Tegra Memory Controller device tree bindings
-===================================================
-
-memory-controller node
-----------------------
-
-Required properties:
-- compatible: Should be "nvidia,tegra<chip>-mc"
-- reg: Physical base address and length of the controller's registers.
-- clocks: Must contain an entry for each entry in clock-names.
-  See ../clocks/clock-bindings.txt for details.
-- clock-names: Must include the following entries:
-  - mc: the module's clock input
-- interrupts: The interrupt outputs from the controller.
-- #iommu-cells: Should be 1. The single cell of the IOMMU specifier defines
-  the SWGROUP of the master.
-
-This device implements an IOMMU that complies with the generic IOMMU binding.
-See ../iommu/iommu.txt for details.
-
-emc-timings subnode
--------------------
-
-The node should contain a "emc-timings" subnode for each supported RAM type (see field RAM_CODE in
-register PMC_STRAPPING_OPT_A).
-
-Required properties for "emc-timings" nodes :
-- nvidia,ram-code : Should contain the value of RAM_CODE this timing set is used for.
-
-timing subnode
---------------
-
-Each "emc-timings" node should contain a subnode for every supported EMC clock rate.
-
-Required properties for timing nodes :
-- clock-frequency : Should contain the memory clock rate in Hz.
-- nvidia,emem-configuration : Values to be written to the EMEM register block. For the Tegra124 SoC
-(see section "15.6.1 MC Registers" in the TRM), these are the registers whose values need to be
-specified, according to the board documentation:
-
-       MC_EMEM_ARB_CFG
-       MC_EMEM_ARB_OUTSTANDING_REQ
-       MC_EMEM_ARB_TIMING_RCD
-       MC_EMEM_ARB_TIMING_RP
-       MC_EMEM_ARB_TIMING_RC
-       MC_EMEM_ARB_TIMING_RAS
-       MC_EMEM_ARB_TIMING_FAW
-       MC_EMEM_ARB_TIMING_RRD
-       MC_EMEM_ARB_TIMING_RAP2PRE
-       MC_EMEM_ARB_TIMING_WAP2PRE
-       MC_EMEM_ARB_TIMING_R2R
-       MC_EMEM_ARB_TIMING_W2W
-       MC_EMEM_ARB_TIMING_R2W
-       MC_EMEM_ARB_TIMING_W2R
-       MC_EMEM_ARB_DA_TURNS
-       MC_EMEM_ARB_DA_COVERS
-       MC_EMEM_ARB_MISC0
-       MC_EMEM_ARB_MISC1
-       MC_EMEM_ARB_RING1_THROTTLE
-
-Example SoC include file:
-
-/ {
-       mc: memory-controller@0,70019000 {
-               compatible = "nvidia,tegra124-mc";
-               reg = <0x0 0x70019000 0x0 0x1000>;
-               clocks = <&tegra_car TEGRA124_CLK_MC>;
-               clock-names = "mc";
-
-               interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>;
-
-               #iommu-cells = <1>;
-       };
-
-       sdhci@0,700b0000 {
-               compatible = "nvidia,tegra124-sdhci";
-               ...
-               iommus = <&mc TEGRA_SWGROUP_SDMMC1A>;
-       };
-};
-
-Example board file:
-
-/ {
-       memory-controller@0,70019000 {
-               emc-timings-3 {
-                       nvidia,ram-code = <3>;
-
-                       timing-12750000 {
-                               clock-frequency = <12750000>;
-
-                               nvidia,emem-configuration = <
-                                       0x40040001 /* MC_EMEM_ARB_CFG */
-                                       0x8000000a /* MC_EMEM_ARB_OUTSTANDING_REQ */
-                                       0x00000001 /* MC_EMEM_ARB_TIMING_RCD */
-                                       0x00000001 /* MC_EMEM_ARB_TIMING_RP */
-                                       0x00000002 /* MC_EMEM_ARB_TIMING_RC */
-                                       0x00000000 /* MC_EMEM_ARB_TIMING_RAS */
-                                       0x00000002 /* MC_EMEM_ARB_TIMING_FAW */
-                                       0x00000001 /* MC_EMEM_ARB_TIMING_RRD */
-                                       0x00000002 /* MC_EMEM_ARB_TIMING_RAP2PRE */
-                                       0x00000008 /* MC_EMEM_ARB_TIMING_WAP2PRE */
-                                       0x00000003 /* MC_EMEM_ARB_TIMING_R2R */
-                                       0x00000002 /* MC_EMEM_ARB_TIMING_W2W */
-                                       0x00000003 /* MC_EMEM_ARB_TIMING_R2W */
-                                       0x00000006 /* MC_EMEM_ARB_TIMING_W2R */
-                                       0x06030203 /* MC_EMEM_ARB_DA_TURNS */
-                                       0x000a0402 /* MC_EMEM_ARB_DA_COVERS */
-                                       0x77e30303 /* MC_EMEM_ARB_MISC0 */
-                                       0x70000f03 /* MC_EMEM_ARB_MISC1 */
-                                       0x001f0000 /* MC_EMEM_ARB_RING1_THROTTLE */
-                               >;
-                       };
-               };
-       };
-};
diff --git a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.txt b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.txt

new file mode 100644 (file)

index 0000000..ba0bc3f
--- /dev/null
+++ b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.txt
@@ -0,0 +1,374 @@
+NVIDIA Tegra124 SoC EMC (external memory controller)
+====================================================
+
+Required properties :
+- compatible : Should be "nvidia,tegra124-emc".
+- reg : physical base address and length of the controller's registers.
+- nvidia,memory-controller : phandle of the memory controller (MC) node.
+
+The node should contain an "emc-timings" subnode for each supported RAM type
+(see field RAM_CODE in register PMC_STRAPPING_OPT_A), with its unit address
+being its RAM_CODE.
+
+Required properties for "emc-timings" nodes :
+- nvidia,ram-code : Should contain the value of RAM_CODE this timing set is
+used for.
+
+Each "emc-timings" node should contain a "timing" subnode for every supported
+EMC clock rate. The "timing" subnodes should have the clock rate in Hz as
+their unit address.
+
+Required properties for "timing" nodes :
+- clock-frequency : Should contain the memory clock rate in Hz.
+- The following properties contain EMC timing characterization values
+(specified in the board documentation) :
+  - nvidia,emc-auto-cal-config : EMC_AUTO_CAL_CONFIG
+  - nvidia,emc-auto-cal-config2 : EMC_AUTO_CAL_CONFIG2
+  - nvidia,emc-auto-cal-config3 : EMC_AUTO_CAL_CONFIG3
+  - nvidia,emc-auto-cal-interval : EMC_AUTO_CAL_INTERVAL
+  - nvidia,emc-bgbias-ctl0 : EMC_BGBIAS_CTL0
+  - nvidia,emc-cfg : EMC_CFG
+  - nvidia,emc-cfg-2 : EMC_CFG_2
+  - nvidia,emc-ctt-term-ctrl : EMC_CTT_TERM_CTRL
+  - nvidia,emc-mode-1 : Mode Register 1
+  - nvidia,emc-mode-2 : Mode Register 2
+  - nvidia,emc-mode-4 : Mode Register 4
+  - nvidia,emc-mode-reset : Mode Register 0
+  - nvidia,emc-mrs-wait-cnt : EMC_MRS_WAIT_CNT
+  - nvidia,emc-sel-dpd-ctrl : EMC_SEL_DPD_CTRL
+  - nvidia,emc-xm2dqspadctrl2 : EMC_XM2DQSPADCTRL2
+  - nvidia,emc-zcal-cnt-long : EMC_ZCAL_WAIT_CNT after clock change
+  - nvidia,emc-zcal-interval : EMC_ZCAL_INTERVAL
+- nvidia,emc-configuration : EMC timing characterization data. These are the
+registers (see section "15.6.2 EMC Registers" in the TRM) whose values need to
+be specified, according to the board documentation:
+
+       EMC_RC
+       EMC_RFC
+       EMC_RFC_SLR
+       EMC_RAS
+       EMC_RP
+       EMC_R2W
+       EMC_W2R
+       EMC_R2P
+       EMC_W2P
+       EMC_RD_RCD
+       EMC_WR_RCD
+       EMC_RRD
+       EMC_REXT
+       EMC_WEXT
+       EMC_WDV
+       EMC_WDV_MASK
+       EMC_QUSE
+       EMC_QUSE_WIDTH
+       EMC_IBDLY
+       EMC_EINPUT
+       EMC_EINPUT_DURATION
+       EMC_PUTERM_EXTRA
+       EMC_PUTERM_WIDTH
+       EMC_PUTERM_ADJ
+       EMC_CDB_CNTL_1
+       EMC_CDB_CNTL_2
+       EMC_CDB_CNTL_3
+       EMC_QRST
+       EMC_QSAFE
+       EMC_RDV
+       EMC_RDV_MASK
+       EMC_REFRESH
+       EMC_BURST_REFRESH_NUM
+       EMC_PRE_REFRESH_REQ_CNT
+       EMC_PDEX2WR
+       EMC_PDEX2RD
+       EMC_PCHG2PDEN
+       EMC_ACT2PDEN
+       EMC_AR2PDEN
+       EMC_RW2PDEN
+       EMC_TXSR
+       EMC_TXSRDLL
+       EMC_TCKE
+       EMC_TCKESR
+       EMC_TPD
+       EMC_TFAW
+       EMC_TRPAB
+       EMC_TCLKSTABLE
+       EMC_TCLKSTOP
+       EMC_TREFBW
+       EMC_FBIO_CFG6
+       EMC_ODT_WRITE
+       EMC_ODT_READ
+       EMC_FBIO_CFG5
+       EMC_CFG_DIG_DLL
+       EMC_CFG_DIG_DLL_PERIOD
+       EMC_DLL_XFORM_DQS0
+       EMC_DLL_XFORM_DQS1
+       EMC_DLL_XFORM_DQS2
+       EMC_DLL_XFORM_DQS3
+       EMC_DLL_XFORM_DQS4
+       EMC_DLL_XFORM_DQS5
+       EMC_DLL_XFORM_DQS6
+       EMC_DLL_XFORM_DQS7
+       EMC_DLL_XFORM_DQS8
+       EMC_DLL_XFORM_DQS9
+       EMC_DLL_XFORM_DQS10
+       EMC_DLL_XFORM_DQS11
+       EMC_DLL_XFORM_DQS12
+       EMC_DLL_XFORM_DQS13
+       EMC_DLL_XFORM_DQS14
+       EMC_DLL_XFORM_DQS15
+       EMC_DLL_XFORM_QUSE0
+       EMC_DLL_XFORM_QUSE1
+       EMC_DLL_XFORM_QUSE2
+       EMC_DLL_XFORM_QUSE3
+       EMC_DLL_XFORM_QUSE4
+       EMC_DLL_XFORM_QUSE5
+       EMC_DLL_XFORM_QUSE6
+       EMC_DLL_XFORM_QUSE7
+       EMC_DLL_XFORM_ADDR0
+       EMC_DLL_XFORM_ADDR1
+       EMC_DLL_XFORM_ADDR2
+       EMC_DLL_XFORM_ADDR3
+       EMC_DLL_XFORM_ADDR4
+       EMC_DLL_XFORM_ADDR5
+       EMC_DLL_XFORM_QUSE8
+       EMC_DLL_XFORM_QUSE9
+       EMC_DLL_XFORM_QUSE10
+       EMC_DLL_XFORM_QUSE11
+       EMC_DLL_XFORM_QUSE12
+       EMC_DLL_XFORM_QUSE13
+       EMC_DLL_XFORM_QUSE14
+       EMC_DLL_XFORM_QUSE15
+       EMC_DLI_TRIM_TXDQS0
+       EMC_DLI_TRIM_TXDQS1
+       EMC_DLI_TRIM_TXDQS2
+       EMC_DLI_TRIM_TXDQS3
+       EMC_DLI_TRIM_TXDQS4
+       EMC_DLI_TRIM_TXDQS5
+       EMC_DLI_TRIM_TXDQS6
+       EMC_DLI_TRIM_TXDQS7
+       EMC_DLI_TRIM_TXDQS8
+       EMC_DLI_TRIM_TXDQS9
+       EMC_DLI_TRIM_TXDQS10
+       EMC_DLI_TRIM_TXDQS11
+       EMC_DLI_TRIM_TXDQS12
+       EMC_DLI_TRIM_TXDQS13
+       EMC_DLI_TRIM_TXDQS14
+       EMC_DLI_TRIM_TXDQS15
+       EMC_DLL_XFORM_DQ0
+       EMC_DLL_XFORM_DQ1
+       EMC_DLL_XFORM_DQ2
+       EMC_DLL_XFORM_DQ3
+       EMC_DLL_XFORM_DQ4
+       EMC_DLL_XFORM_DQ5
+       EMC_DLL_XFORM_DQ6
+       EMC_DLL_XFORM_DQ7
+       EMC_XM2CMDPADCTRL
+       EMC_XM2CMDPADCTRL4
+       EMC_XM2CMDPADCTRL5
+       EMC_XM2DQPADCTRL2
+       EMC_XM2DQPADCTRL3
+       EMC_XM2CLKPADCTRL
+       EMC_XM2CLKPADCTRL2
+       EMC_XM2COMPPADCTRL
+       EMC_XM2VTTGENPADCTRL
+       EMC_XM2VTTGENPADCTRL2
+       EMC_XM2VTTGENPADCTRL3
+       EMC_XM2DQSPADCTRL3
+       EMC_XM2DQSPADCTRL4
+       EMC_XM2DQSPADCTRL5
+       EMC_XM2DQSPADCTRL6
+       EMC_DSR_VTTGEN_DRV
+       EMC_TXDSRVTTGEN
+       EMC_FBIO_SPARE
+       EMC_ZCAL_WAIT_CNT
+       EMC_MRS_WAIT_CNT2
+       EMC_CTT
+       EMC_CTT_DURATION
+       EMC_CFG_PIPE
+       EMC_DYN_SELF_REF_CONTROL
+       EMC_QPOP
+
+Example SoC include file:
+
+/ {
+       emc@7001b000 {
+               compatible = "nvidia,tegra124-emc";
+               reg = <0x0 0x7001b000 0x0 0x1000>;
+
+               nvidia,memory-controller = <&mc>;
+       };
+};
+
+Example board file:
+
+/ {
+       emc@7001b000 {
+               emc-timings-3 {
+                       nvidia,ram-code = <3>;
+
+                       timing-12750000 {
+                               clock-frequency = <12750000>;
+
+                               nvidia,emc-zcal-cnt-long = <0x00000042>;
+                               nvidia,emc-auto-cal-interval = <0x001fffff>;
+                               nvidia,emc-ctt-term-ctrl = <0x00000802>;
+                               nvidia,emc-cfg = <0x73240000>;
+                               nvidia,emc-cfg-2 = <0x000008c5>;
+                               nvidia,emc-sel-dpd-ctrl = <0x00040128>;
+                               nvidia,emc-bgbias-ctl0 = <0x00000008>;
+                               nvidia,emc-auto-cal-config = <0xa1430000>;
+                               nvidia,emc-auto-cal-config2 = <0x00000000>;
+                               nvidia,emc-auto-cal-config3 = <0x00000000>;
+                               nvidia,emc-mode-reset = <0x80001221>;
+                               nvidia,emc-mode-1 = <0x80100003>;
+                               nvidia,emc-mode-2 = <0x80200008>;
+                               nvidia,emc-mode-4 = <0x00000000>;
+
+                               nvidia,emc-configuration = <
+                                       0x00000000 /* EMC_RC */
+                                       0x00000003 /* EMC_RFC */
+                                       0x00000000 /* EMC_RFC_SLR */
+                                       0x00000000 /* EMC_RAS */
+                                       0x00000000 /* EMC_RP */
+                                       0x00000004 /* EMC_R2W */
+                                       0x0000000a /* EMC_W2R */
+                                       0x00000003 /* EMC_R2P */
+                                       0x0000000b /* EMC_W2P */
+                                       0x00000000 /* EMC_RD_RCD */
+                                       0x00000000 /* EMC_WR_RCD */
+                                       0x00000003 /* EMC_RRD */
+                                       0x00000003 /* EMC_REXT */
+                                       0x00000000 /* EMC_WEXT */
+                                       0x00000006 /* EMC_WDV */
+                                       0x00000006 /* EMC_WDV_MASK */
+                                       0x00000006 /* EMC_QUSE */
+                                       0x00000002 /* EMC_QUSE_WIDTH */
+                                       0x00000000 /* EMC_IBDLY */
+                                       0x00000005 /* EMC_EINPUT */
+                                       0x00000005 /* EMC_EINPUT_DURATION */
+                                       0x00010000 /* EMC_PUTERM_EXTRA */
+                                       0x00000003 /* EMC_PUTERM_WIDTH */
+                                       0x00000000 /* EMC_PUTERM_ADJ */
+                                       0x00000000 /* EMC_CDB_CNTL_1 */
+                                       0x00000000 /* EMC_CDB_CNTL_2 */
+                                       0x00000000 /* EMC_CDB_CNTL_3 */
+                                       0x00000004 /* EMC_QRST */
+                                       0x0000000c /* EMC_QSAFE */
+                                       0x0000000d /* EMC_RDV */
+                                       0x0000000f /* EMC_RDV_MASK */
+                                       0x00000060 /* EMC_REFRESH */
+                                       0x00000000 /* EMC_BURST_REFRESH_NUM */
+                                       0x00000018 /* EMC_PRE_REFRESH_REQ_CNT */
+                                       0x00000002 /* EMC_PDEX2WR */
+                                       0x00000002 /* EMC_PDEX2RD */
+                                       0x00000001 /* EMC_PCHG2PDEN */
+                                       0x00000000 /* EMC_ACT2PDEN */
+                                       0x00000007 /* EMC_AR2PDEN */
+                                       0x0000000f /* EMC_RW2PDEN */
+                                       0x00000005 /* EMC_TXSR */
+                                       0x00000005 /* EMC_TXSRDLL */
+                                       0x00000004 /* EMC_TCKE */
+                                       0x00000005 /* EMC_TCKESR */
+                                       0x00000004 /* EMC_TPD */
+                                       0x00000000 /* EMC_TFAW */
+                                       0x00000000 /* EMC_TRPAB */
+                                       0x00000005 /* EMC_TCLKSTABLE */
+                                       0x00000005 /* EMC_TCLKSTOP */
+                                       0x00000064 /* EMC_TREFBW */
+                                       0x00000000 /* EMC_FBIO_CFG6 */
+                                       0x00000000 /* EMC_ODT_WRITE */
+                                       0x00000000 /* EMC_ODT_READ */
+                                       0x106aa298 /* EMC_FBIO_CFG5 */
+                                       0x002c00a0 /* EMC_CFG_DIG_DLL */
+                                       0x00008000 /* EMC_CFG_DIG_DLL_PERIOD */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS0 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS1 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS2 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS3 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS4 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS5 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS6 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS7 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS8 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS9 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS10 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS11 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS12 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS13 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS14 */
+                                       0x00064000 /* EMC_DLL_XFORM_DQS15 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE0 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE1 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE2 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE3 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE4 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE5 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE6 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE7 */
+                                       0x00000000 /* EMC_DLL_XFORM_ADDR0 */
+                                       0x00000000 /* EMC_DLL_XFORM_ADDR1 */
+                                       0x00000000 /* EMC_DLL_XFORM_ADDR2 */
+                                       0x00000000 /* EMC_DLL_XFORM_ADDR3 */
+                                       0x00000000 /* EMC_DLL_XFORM_ADDR4 */
+                                       0x00000000 /* EMC_DLL_XFORM_ADDR5 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE8 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE9 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE10 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE11 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE12 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE13 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE14 */
+                                       0x00000000 /* EMC_DLL_XFORM_QUSE15 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS0 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS1 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS2 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS3 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS4 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS5 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS6 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS7 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS8 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS9 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS10 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS11 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS12 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS13 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS14 */
+                                       0x00000000 /* EMC_DLI_TRIM_TXDQS15 */
+                                       0x000fc000 /* EMC_DLL_XFORM_DQ0 */
+                                       0x000fc000 /* EMC_DLL_XFORM_DQ1 */
+                                       0x000fc000 /* EMC_DLL_XFORM_DQ2 */
+                                       0x000fc000 /* EMC_DLL_XFORM_DQ3 */
+                                       0x0000fc00 /* EMC_DLL_XFORM_DQ4 */
+                                       0x0000fc00 /* EMC_DLL_XFORM_DQ5 */
+                                       0x0000fc00 /* EMC_DLL_XFORM_DQ6 */
+                                       0x0000fc00 /* EMC_DLL_XFORM_DQ7 */
+                                       0x10000280 /* EMC_XM2CMDPADCTRL */
+                                       0x00000000 /* EMC_XM2CMDPADCTRL4 */
+                                       0x00111111 /* EMC_XM2CMDPADCTRL5 */
+                                       0x00000000 /* EMC_XM2DQPADCTRL2 */
+                                       0x00000000 /* EMC_XM2DQPADCTRL3 */
+                                       0x77ffc081 /* EMC_XM2CLKPADCTRL */
+                                       0x00000e0e /* EMC_XM2CLKPADCTRL2 */
+                                       0x81f1f108 /* EMC_XM2COMPPADCTRL */
+                                       0x07070004 /* EMC_XM2VTTGENPADCTRL */
+                                       0x0000003f /* EMC_XM2VTTGENPADCTRL2 */
+                                       0x016eeeee /* EMC_XM2VTTGENPADCTRL3 */
+                                       0x51451400 /* EMC_XM2DQSPADCTRL3 */
+                                       0x00514514 /* EMC_XM2DQSPADCTRL4 */
+                                       0x00514514 /* EMC_XM2DQSPADCTRL5 */
+                                       0x51451400 /* EMC_XM2DQSPADCTRL6 */
+                                       0x0000003f /* EMC_DSR_VTTGEN_DRV */
+                                       0x00000007 /* EMC_TXDSRVTTGEN */
+                                       0x00000000 /* EMC_FBIO_SPARE */
+                                       0x00000042 /* EMC_ZCAL_WAIT_CNT */
+                                       0x000e000e /* EMC_MRS_WAIT_CNT2 */
+                                       0x00000000 /* EMC_CTT */
+                                       0x00000003 /* EMC_CTT_DURATION */
+                                       0x0000f2f3 /* EMC_CFG_PIPE */
+                                       0x800001c5 /* EMC_DYN_SELF_REF_CONTROL */
+                                       0x0000000a /* EMC_QPOP */
+                               >;
+                       };
+               };
+       };
+};
diff --git a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra30-mc.txt b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra30-mc.txt

new file mode 100644 (file)

index 0000000..8dbe470
--- /dev/null
+++ b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra30-mc.txt
@@ -0,0 +1,116 @@
+NVIDIA Tegra Memory Controller device tree bindings
+===================================================
+
+memory-controller node
+----------------------
+
+Required properties:
+- compatible: Should be "nvidia,tegra<chip>-mc"
+- reg: Physical base address and length of the controller's registers.
+- clocks: Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names: Must include the following entries:
+  - mc: the module's clock input
+- interrupts: The interrupt outputs from the controller.
+- #iommu-cells: Should be 1. The single cell of the IOMMU specifier defines
+  the SWGROUP of the master.
+
+This device implements an IOMMU that complies with the generic IOMMU binding.
+See ../iommu/iommu.txt for details.
+
+emc-timings subnode
+-------------------
+
+The node should contain an "emc-timings" subnode for each supported RAM type (see field RAM_CODE in
+register PMC_STRAPPING_OPT_A).
+
+Required properties for "emc-timings" nodes :
+- nvidia,ram-code : Should contain the value of RAM_CODE this timing set is used for.
+
+timing subnode
+--------------
+
+Each "emc-timings" node should contain a subnode for every supported EMC clock rate.
+
+Required properties for timing nodes :
+- clock-frequency : Should contain the memory clock rate in Hz.
+- nvidia,emem-configuration : Values to be written to the EMEM register block. For the Tegra124 SoC
+(see section "15.6.1 MC Registers" in the TRM), these are the registers whose values need to be
+specified, according to the board documentation:
+
+       MC_EMEM_ARB_CFG
+       MC_EMEM_ARB_OUTSTANDING_REQ
+       MC_EMEM_ARB_TIMING_RCD
+       MC_EMEM_ARB_TIMING_RP
+       MC_EMEM_ARB_TIMING_RC
+       MC_EMEM_ARB_TIMING_RAS
+       MC_EMEM_ARB_TIMING_FAW
+       MC_EMEM_ARB_TIMING_RRD
+       MC_EMEM_ARB_TIMING_RAP2PRE
+       MC_EMEM_ARB_TIMING_WAP2PRE
+       MC_EMEM_ARB_TIMING_R2R
+       MC_EMEM_ARB_TIMING_W2W
+       MC_EMEM_ARB_TIMING_R2W
+       MC_EMEM_ARB_TIMING_W2R
+       MC_EMEM_ARB_DA_TURNS
+       MC_EMEM_ARB_DA_COVERS
+       MC_EMEM_ARB_MISC0
+       MC_EMEM_ARB_MISC1
+       MC_EMEM_ARB_RING1_THROTTLE
+
+Example SoC include file:
+
+/ {
+       mc: memory-controller@70019000 {
+               compatible = "nvidia,tegra124-mc";
+               reg = <0x0 0x70019000 0x0 0x1000>;
+               clocks = <&tegra_car TEGRA124_CLK_MC>;
+               clock-names = "mc";
+
+               interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>;
+
+               #iommu-cells = <1>;
+       };
+
+       sdhci@700b0000 {
+               compatible = "nvidia,tegra124-sdhci";
+               ...
+               iommus = <&mc TEGRA_SWGROUP_SDMMC1A>;
+       };
+};
+
+Example board file:
+
+/ {
+       memory-controller@70019000 {
+               emc-timings-3 {
+                       nvidia,ram-code = <3>;
+
+                       timing-12750000 {
+                               clock-frequency = <12750000>;
+
+                               nvidia,emem-configuration = <
+                                       0x40040001 /* MC_EMEM_ARB_CFG */
+                                       0x8000000a /* MC_EMEM_ARB_OUTSTANDING_REQ */
+                                       0x00000001 /* MC_EMEM_ARB_TIMING_RCD */
+                                       0x00000001 /* MC_EMEM_ARB_TIMING_RP */
+                                       0x00000002 /* MC_EMEM_ARB_TIMING_RC */
+                                       0x00000000 /* MC_EMEM_ARB_TIMING_RAS */
+                                       0x00000002 /* MC_EMEM_ARB_TIMING_FAW */
+                                       0x00000001 /* MC_EMEM_ARB_TIMING_RRD */
+                                       0x00000002 /* MC_EMEM_ARB_TIMING_RAP2PRE */
+                                       0x00000008 /* MC_EMEM_ARB_TIMING_WAP2PRE */
+                                       0x00000003 /* MC_EMEM_ARB_TIMING_R2R */
+                                       0x00000002 /* MC_EMEM_ARB_TIMING_W2W */
+                                       0x00000003 /* MC_EMEM_ARB_TIMING_R2W */
+                                       0x00000006 /* MC_EMEM_ARB_TIMING_W2R */
+                                       0x06030203 /* MC_EMEM_ARB_DA_TURNS */
+                                       0x000a0402 /* MC_EMEM_ARB_DA_COVERS */
+                                       0x77e30303 /* MC_EMEM_ARB_MISC0 */
+                                       0x70000f03 /* MC_EMEM_ARB_MISC1 */
+                                       0x001f0000 /* MC_EMEM_ARB_RING1_THROTTLE */
+                               >;
+                       };
+               };
+       };
+};
diff --git a/Documentation/devicetree/bindings/memory-controllers/tegra-emc.txt b/Documentation/devicetree/bindings/memory-controllers/tegra-emc.txt

deleted file mode 100644 (file)

index b59c625..0000000
--- a/Documentation/devicetree/bindings/memory-controllers/tegra-emc.txt
+++ /dev/null
@@ -1,374 +0,0 @@
-NVIDIA Tegra124 SoC EMC (external memory controller)
-====================================================
-
-Required properties :
-- compatible : Should be "nvidia,tegra124-emc".
-- reg : physical base address and length of the controller's registers.
-- nvidia,memory-controller : phandle of the MC driver.
-
-The node should contain a "emc-timings" subnode for each supported RAM type
-(see field RAM_CODE in register PMC_STRAPPING_OPT_A), with its unit address
-being its RAM_CODE.
-
-Required properties for "emc-timings" nodes :
-- nvidia,ram-code : Should contain the value of RAM_CODE this timing set is
-used for.
-
-Each "emc-timings" node should contain a "timing" subnode for every supported
-EMC clock rate. The "timing" subnodes should have the clock rate in Hz as
-their unit address.
-
-Required properties for "timing" nodes :
-- clock-frequency : Should contain the memory clock rate in Hz.
-- The following properties contain EMC timing characterization values
-(specified in the board documentation) :
-  - nvidia,emc-auto-cal-config : EMC_AUTO_CAL_CONFIG
-  - nvidia,emc-auto-cal-config2 : EMC_AUTO_CAL_CONFIG2
-  - nvidia,emc-auto-cal-config3 : EMC_AUTO_CAL_CONFIG3
-  - nvidia,emc-auto-cal-interval : EMC_AUTO_CAL_INTERVAL
-  - nvidia,emc-bgbias-ctl0 : EMC_BGBIAS_CTL0
-  - nvidia,emc-cfg : EMC_CFG
-  - nvidia,emc-cfg-2 : EMC_CFG_2
-  - nvidia,emc-ctt-term-ctrl : EMC_CTT_TERM_CTRL
-  - nvidia,emc-mode-1 : Mode Register 1
-  - nvidia,emc-mode-2 : Mode Register 2
-  - nvidia,emc-mode-4 : Mode Register 4
-  - nvidia,emc-mode-reset : Mode Register 0
-  - nvidia,emc-mrs-wait-cnt : EMC_MRS_WAIT_CNT
-  - nvidia,emc-sel-dpd-ctrl : EMC_SEL_DPD_CTRL
-  - nvidia,emc-xm2dqspadctrl2 : EMC_XM2DQSPADCTRL2
-  - nvidia,emc-zcal-cnt-long : EMC_ZCAL_WAIT_CNT after clock change
-  - nvidia,emc-zcal-interval : EMC_ZCAL_INTERVAL
-- nvidia,emc-configuration : EMC timing characterization data. These are the
-registers (see section "15.6.2 EMC Registers" in the TRM) whose values need to
-be specified, according to the board documentation:
-
-       EMC_RC
-       EMC_RFC
-       EMC_RFC_SLR
-       EMC_RAS
-       EMC_RP
-       EMC_R2W
-       EMC_W2R
-       EMC_R2P
-       EMC_W2P
-       EMC_RD_RCD
-       EMC_WR_RCD
-       EMC_RRD
-       EMC_REXT
-       EMC_WEXT
-       EMC_WDV
-       EMC_WDV_MASK
-       EMC_QUSE
-       EMC_QUSE_WIDTH
-       EMC_IBDLY
-       EMC_EINPUT
-       EMC_EINPUT_DURATION
-       EMC_PUTERM_EXTRA
-       EMC_PUTERM_WIDTH
-       EMC_PUTERM_ADJ
-       EMC_CDB_CNTL_1
-       EMC_CDB_CNTL_2
-       EMC_CDB_CNTL_3
-       EMC_QRST
-       EMC_QSAFE
-       EMC_RDV
-       EMC_RDV_MASK
-       EMC_REFRESH
-       EMC_BURST_REFRESH_NUM
-       EMC_PRE_REFRESH_REQ_CNT
-       EMC_PDEX2WR
-       EMC_PDEX2RD
-       EMC_PCHG2PDEN
-       EMC_ACT2PDEN
-       EMC_AR2PDEN
-       EMC_RW2PDEN
-       EMC_TXSR
-       EMC_TXSRDLL
-       EMC_TCKE
-       EMC_TCKESR
-       EMC_TPD
-       EMC_TFAW
-       EMC_TRPAB
-       EMC_TCLKSTABLE
-       EMC_TCLKSTOP
-       EMC_TREFBW
-       EMC_FBIO_CFG6
-       EMC_ODT_WRITE
-       EMC_ODT_READ
-       EMC_FBIO_CFG5
-       EMC_CFG_DIG_DLL
-       EMC_CFG_DIG_DLL_PERIOD
-       EMC_DLL_XFORM_DQS0
-       EMC_DLL_XFORM_DQS1
-       EMC_DLL_XFORM_DQS2
-       EMC_DLL_XFORM_DQS3
-       EMC_DLL_XFORM_DQS4
-       EMC_DLL_XFORM_DQS5
-       EMC_DLL_XFORM_DQS6
-       EMC_DLL_XFORM_DQS7
-       EMC_DLL_XFORM_DQS8
-       EMC_DLL_XFORM_DQS9
-       EMC_DLL_XFORM_DQS10
-       EMC_DLL_XFORM_DQS11
-       EMC_DLL_XFORM_DQS12
-       EMC_DLL_XFORM_DQS13
-       EMC_DLL_XFORM_DQS14
-       EMC_DLL_XFORM_DQS15
-       EMC_DLL_XFORM_QUSE0
-       EMC_DLL_XFORM_QUSE1
-       EMC_DLL_XFORM_QUSE2
-       EMC_DLL_XFORM_QUSE3
-       EMC_DLL_XFORM_QUSE4
-       EMC_DLL_XFORM_QUSE5
-       EMC_DLL_XFORM_QUSE6
-       EMC_DLL_XFORM_QUSE7
-       EMC_DLL_XFORM_ADDR0
-       EMC_DLL_XFORM_ADDR1
-       EMC_DLL_XFORM_ADDR2
-       EMC_DLL_XFORM_ADDR3
-       EMC_DLL_XFORM_ADDR4
-       EMC_DLL_XFORM_ADDR5
-       EMC_DLL_XFORM_QUSE8
-       EMC_DLL_XFORM_QUSE9
-       EMC_DLL_XFORM_QUSE10
-       EMC_DLL_XFORM_QUSE11
-       EMC_DLL_XFORM_QUSE12
-       EMC_DLL_XFORM_QUSE13
-       EMC_DLL_XFORM_QUSE14
-       EMC_DLL_XFORM_QUSE15
-       EMC_DLI_TRIM_TXDQS0
-       EMC_DLI_TRIM_TXDQS1
-       EMC_DLI_TRIM_TXDQS2
-       EMC_DLI_TRIM_TXDQS3
-       EMC_DLI_TRIM_TXDQS4
-       EMC_DLI_TRIM_TXDQS5
-       EMC_DLI_TRIM_TXDQS6
-       EMC_DLI_TRIM_TXDQS7
-       EMC_DLI_TRIM_TXDQS8
-       EMC_DLI_TRIM_TXDQS9
-       EMC_DLI_TRIM_TXDQS10
-       EMC_DLI_TRIM_TXDQS11
-       EMC_DLI_TRIM_TXDQS12
-       EMC_DLI_TRIM_TXDQS13
-       EMC_DLI_TRIM_TXDQS14
-       EMC_DLI_TRIM_TXDQS15
-       EMC_DLL_XFORM_DQ0
-       EMC_DLL_XFORM_DQ1
-       EMC_DLL_XFORM_DQ2
-       EMC_DLL_XFORM_DQ3
-       EMC_DLL_XFORM_DQ4
-       EMC_DLL_XFORM_DQ5
-       EMC_DLL_XFORM_DQ6
-       EMC_DLL_XFORM_DQ7
-       EMC_XM2CMDPADCTRL
-       EMC_XM2CMDPADCTRL4
-       EMC_XM2CMDPADCTRL5
-       EMC_XM2DQPADCTRL2
-       EMC_XM2DQPADCTRL3
-       EMC_XM2CLKPADCTRL
-       EMC_XM2CLKPADCTRL2
-       EMC_XM2COMPPADCTRL
-       EMC_XM2VTTGENPADCTRL
-       EMC_XM2VTTGENPADCTRL2
-       EMC_XM2VTTGENPADCTRL3
-       EMC_XM2DQSPADCTRL3
-       EMC_XM2DQSPADCTRL4
-       EMC_XM2DQSPADCTRL5
-       EMC_XM2DQSPADCTRL6
-       EMC_DSR_VTTGEN_DRV
-       EMC_TXDSRVTTGEN
-       EMC_FBIO_SPARE
-       EMC_ZCAL_WAIT_CNT
-       EMC_MRS_WAIT_CNT2
-       EMC_CTT
-       EMC_CTT_DURATION
-       EMC_CFG_PIPE
-       EMC_DYN_SELF_REF_CONTROL
-       EMC_QPOP
-
-Example SoC include file:
-
-/ {
-       emc@0,7001b000 {
-               compatible = "nvidia,tegra124-emc";
-               reg = <0x0 0x7001b000 0x0 0x1000>;
-
-               nvidia,memory-controller = <&mc>;
-       };
-};
-
-Example board file:
-
-/ {
-       emc@0,7001b000 {
-               emc-timings-3 {
-                       nvidia,ram-code = <3>;
-
-                       timing-12750000 {
-                               clock-frequency = <12750000>;
-
-                               nvidia,emc-zcal-cnt-long = <0x00000042>;
-                               nvidia,emc-auto-cal-interval = <0x001fffff>;
-                               nvidia,emc-ctt-term-ctrl = <0x00000802>;
-                               nvidia,emc-cfg = <0x73240000>;
-                               nvidia,emc-cfg-2 = <0x000008c5>;
-                               nvidia,emc-sel-dpd-ctrl = <0x00040128>;
-                               nvidia,emc-bgbias-ctl0 = <0x00000008>;
-                               nvidia,emc-auto-cal-config = <0xa1430000>;
-                               nvidia,emc-auto-cal-config2 = <0x00000000>;
-                               nvidia,emc-auto-cal-config3 = <0x00000000>;
-                               nvidia,emc-mode-reset = <0x80001221>;
-                               nvidia,emc-mode-1 = <0x80100003>;
-                               nvidia,emc-mode-2 = <0x80200008>;
-                               nvidia,emc-mode-4 = <0x00000000>;
-
-                               nvidia,emc-configuration = <
-                                       0x00000000 /* EMC_RC */
-                                       0x00000003 /* EMC_RFC */
-                                       0x00000000 /* EMC_RFC_SLR */
-                                       0x00000000 /* EMC_RAS */
-                                       0x00000000 /* EMC_RP */
-                                       0x00000004 /* EMC_R2W */
-                                       0x0000000a /* EMC_W2R */
-                                       0x00000003 /* EMC_R2P */
-                                       0x0000000b /* EMC_W2P */
-                                       0x00000000 /* EMC_RD_RCD */
-                                       0x00000000 /* EMC_WR_RCD */
-                                       0x00000003 /* EMC_RRD */
-                                       0x00000003 /* EMC_REXT */
-                                       0x00000000 /* EMC_WEXT */
-                                       0x00000006 /* EMC_WDV */
-                                       0x00000006 /* EMC_WDV_MASK */
-                                       0x00000006 /* EMC_QUSE */
-                                       0x00000002 /* EMC_QUSE_WIDTH */
-                                       0x00000000 /* EMC_IBDLY */
-                                       0x00000005 /* EMC_EINPUT */
-                                       0x00000005 /* EMC_EINPUT_DURATION */
-                                       0x00010000 /* EMC_PUTERM_EXTRA */
-                                       0x00000003 /* EMC_PUTERM_WIDTH */
-                                       0x00000000 /* EMC_PUTERM_ADJ */
-                                       0x00000000 /* EMC_CDB_CNTL_1 */
-                                       0x00000000 /* EMC_CDB_CNTL_2 */
-                                       0x00000000 /* EMC_CDB_CNTL_3 */
-                                       0x00000004 /* EMC_QRST */
-                                       0x0000000c /* EMC_QSAFE */
-                                       0x0000000d /* EMC_RDV */
-                                       0x0000000f /* EMC_RDV_MASK */
-                                       0x00000060 /* EMC_REFRESH */
-                                       0x00000000 /* EMC_BURST_REFRESH_NUM */
-                                       0x00000018 /* EMC_PRE_REFRESH_REQ_CNT */
-                                       0x00000002 /* EMC_PDEX2WR */
-                                       0x00000002 /* EMC_PDEX2RD */
-                                       0x00000001 /* EMC_PCHG2PDEN */
-                                       0x00000000 /* EMC_ACT2PDEN */
-                                       0x00000007 /* EMC_AR2PDEN */
-                                       0x0000000f /* EMC_RW2PDEN */
-                                       0x00000005 /* EMC_TXSR */
-                                       0x00000005 /* EMC_TXSRDLL */
-                                       0x00000004 /* EMC_TCKE */
-                                       0x00000005 /* EMC_TCKESR */
-                                       0x00000004 /* EMC_TPD */
-                                       0x00000000 /* EMC_TFAW */
-                                       0x00000000 /* EMC_TRPAB */
-                                       0x00000005 /* EMC_TCLKSTABLE */
-                                       0x00000005 /* EMC_TCLKSTOP */
-                                       0x00000064 /* EMC_TREFBW */
-                                       0x00000000 /* EMC_FBIO_CFG6 */
-                                       0x00000000 /* EMC_ODT_WRITE */
-                                       0x00000000 /* EMC_ODT_READ */
-                                       0x106aa298 /* EMC_FBIO_CFG5 */
-                                       0x002c00a0 /* EMC_CFG_DIG_DLL */
-                                       0x00008000 /* EMC_CFG_DIG_DLL_PERIOD */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS0 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS1 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS2 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS3 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS4 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS5 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS6 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS7 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS8 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS9 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS10 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS11 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS12 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS13 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS14 */
-                                       0x00064000 /* EMC_DLL_XFORM_DQS15 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE0 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE1 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE2 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE3 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE4 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE5 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE6 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE7 */
-                                       0x00000000 /* EMC_DLL_XFORM_ADDR0 */
-                                       0x00000000 /* EMC_DLL_XFORM_ADDR1 */
-                                       0x00000000 /* EMC_DLL_XFORM_ADDR2 */
-                                       0x00000000 /* EMC_DLL_XFORM_ADDR3 */
-                                       0x00000000 /* EMC_DLL_XFORM_ADDR4 */
-                                       0x00000000 /* EMC_DLL_XFORM_ADDR5 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE8 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE9 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE10 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE11 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE12 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE13 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE14 */
-                                       0x00000000 /* EMC_DLL_XFORM_QUSE15 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS0 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS1 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS2 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS3 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS4 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS5 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS6 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS7 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS8 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS9 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS10 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS11 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS12 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS13 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS14 */
-                                       0x00000000 /* EMC_DLI_TRIM_TXDQS15 */
-                                       0x000fc000 /* EMC_DLL_XFORM_DQ0 */
-                                       0x000fc000 /* EMC_DLL_XFORM_DQ1 */
-                                       0x000fc000 /* EMC_DLL_XFORM_DQ2 */
-                                       0x000fc000 /* EMC_DLL_XFORM_DQ3 */
-                                       0x0000fc00 /* EMC_DLL_XFORM_DQ4 */
-                                       0x0000fc00 /* EMC_DLL_XFORM_DQ5 */
-                                       0x0000fc00 /* EMC_DLL_XFORM_DQ6 */
-                                       0x0000fc00 /* EMC_DLL_XFORM_DQ7 */
-                                       0x10000280 /* EMC_XM2CMDPADCTRL */
-                                       0x00000000 /* EMC_XM2CMDPADCTRL4 */
-                                       0x00111111 /* EMC_XM2CMDPADCTRL5 */
-                                       0x00000000 /* EMC_XM2DQPADCTRL2 */
-                                       0x00000000 /* EMC_XM2DQPADCTRL3 */
-                                       0x77ffc081 /* EMC_XM2CLKPADCTRL */
-                                       0x00000e0e /* EMC_XM2CLKPADCTRL2 */
-                                       0x81f1f108 /* EMC_XM2COMPPADCTRL */
-                                       0x07070004 /* EMC_XM2VTTGENPADCTRL */
-                                       0x0000003f /* EMC_XM2VTTGENPADCTRL2 */
-                                       0x016eeeee /* EMC_XM2VTTGENPADCTRL3 */
-                                       0x51451400 /* EMC_XM2DQSPADCTRL3 */
-                                       0x00514514 /* EMC_XM2DQSPADCTRL4 */
-                                       0x00514514 /* EMC_XM2DQSPADCTRL5 */
-                                       0x51451400 /* EMC_XM2DQSPADCTRL6 */
-                                       0x0000003f /* EMC_DSR_VTTGEN_DRV */
-                                       0x00000007 /* EMC_TXDSRVTTGEN */
-                                       0x00000000 /* EMC_FBIO_SPARE */
-                                       0x00000042 /* EMC_ZCAL_WAIT_CNT */
-                                       0x000e000e /* EMC_MRS_WAIT_CNT2 */
-                                       0x00000000 /* EMC_CTT */
-                                       0x00000003 /* EMC_CTT_DURATION */
-                                       0x0000f2f3 /* EMC_CFG_PIPE */
-                                       0x800001c5 /* EMC_DYN_SELF_REF_CONTROL */
-                                       0x0000000a /* EMC_QPOP */
-                               >;
-                       };
-               };
-       };
-};
diff --git a/Documentation/devicetree/bindings/mfd/axp20x.txt b/Documentation/devicetree/bindings/mfd/axp20x.txt

index fd39fa54571b799ad8a19fcebfc19599735027ec..d20b1034e96760add2a8784ac0e879e05f1a660d 100644 (file)
--- a/Documentation/devicetree/bindings/mfd/axp20x.txt
+++ b/Documentation/devicetree/bindings/mfd/axp20x.txt
@@ -6,10 +6,11 @@ axp202 (X-Powers)
  axp209 (X-Powers)
  axp221 (X-Powers)
  axp223 (X-Powers)
+axp809 (X-Powers)
  
  Required properties:
  - compatible: "x-powers,axp152", "x-powers,axp202", "x-powers,axp209",
-             "x-powers,axp221", "x-powers,axp223"
+             "x-powers,axp221", "x-powers,axp223", "x-powers,axp809"
  - reg: The I2C slave address or RSB hardware address for the AXP chip
  - interrupt-parent: The parent interrupt controller
  - interrupts: SoC NMI / GPIO interrupt connected to the PMIC's IRQ pin
@@ -18,7 +19,9 @@ Required properties:
  
  Optional properties:
  - x-powers,dcdc-freq: defines the work frequency of DC-DC in KHz
-                     (range: 750-1875). Default: 1.5MHz
+                     AXP152/20X: range:  750-1875, Default: 1.5 MHz
+                     AXP22X/80X: range: 1800-4050, Default: 3   MHz
+
  - <input>-supply: a phandle to the regulator supply node. May be omitted if
                   inputs are unregulated, such as using the IPSOUT output
                   from the PMIC.
@@ -77,6 +80,30 @@ LDO_IO0              : LDO           : ips-supply            : GPIO 0
  LDO_IO1                : LDO           : ips-supply            : GPIO 1
  RTC_LDO                : LDO           : ips-supply            : always on
  
+AXP809 regulators, type, and corresponding input supply names:
+
+Regulator        Type            Supply Name             Notes
+---------        ----            -----------             -----
+DCDC1          : DC-DC buck    : vin1-supply
+DCDC2          : DC-DC buck    : vin2-supply
+DCDC3          : DC-DC buck    : vin3-supply
+DCDC4          : DC-DC buck    : vin4-supply
+DCDC5          : DC-DC buck    : vin5-supply
+DC1SW          : On/Off Switch :                       : DCDC1 secondary output
+DC5LDO         : LDO           :                       : input from DCDC5
+ALDO1          : LDO           : aldoin-supply         : shared supply
+ALDO2          : LDO           : aldoin-supply         : shared supply
+ALDO3          : LDO           : aldoin-supply         : shared supply
+DLDO1          : LDO           : dldoin-supply         : shared supply
+DLDO2          : LDO           : dldoin-supply         : shared supply
+ELDO1          : LDO           : eldoin-supply         : shared supply
+ELDO2          : LDO           : eldoin-supply         : shared supply
+ELDO3          : LDO           : eldoin-supply         : shared supply
+LDO_IO0                : LDO           : ips-supply            : GPIO 0
+LDO_IO1                : LDO           : ips-supply            : GPIO 1
+RTC_LDO                : LDO           : ips-supply            : always on
+SW             : On/Off Switch : swin-supply
+
  Example:
  
  axp209: pmic@34 {
diff --git a/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt

new file mode 100644 (file)

index 0000000..0548569
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/hisilicon,hi655x.txt
@@ -0,0 +1,27 @@
+Hisilicon Hi655x Power Management Integrated Circuit (PMIC)
+
+The hardware layout for accessing the Hi655x PMIC from the Hi6220 AP SoC.
+Between PMIC Hi655x and Hi6220, the physical signal channel is SSI.
+We can use memory-mapped I/O to communicate.
+
++----------------+             +-------------+
+|                |             |             |
+|    Hi6220      |   SSI bus   |   Hi655x    |
+|                |-------------|             |
+|                |(REGMAP_MMIO)|             |
++----------------+             +-------------+
+
+Required properties:
+- compatible:           Should be "hisilicon,hi655x-pmic".
+- reg:                  Base address of PMIC on Hi6220 SoC.
+- interrupt-controller: Hi655x has internal IRQs (has own IRQ domain).
+- pmic-gpios:           The GPIO used by PMIC IRQ.
+
+Example:
+       pmic: pmic@f8000000 {
+               compatible = "hisilicon,hi655x-pmic";
+               reg = <0x0 0xf8000000 0x0 0x1000>;
+               interrupt-controller;
+               #interrupt-cells = <2>;
+               pmic-gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>;
+       };
diff --git a/Documentation/devicetree/bindings/mfd/max77620.txt b/Documentation/devicetree/bindings/mfd/max77620.txt

new file mode 100644 (file)

index 0000000..2ad44f7
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/max77620.txt
@@ -0,0 +1,143 @@
+MAX77620 Power management IC from Maxim Semiconductor.
+
+Required properties:
+-------------------
+- compatible: Must be one of
+               "maxim,max77620"
+               "maxim,max20024".
+- reg: I2C device address.
+
+Optional properties:
+-------------------
+- interrupts:          The interrupt on the parent the controller is
+                       connected to.
+- interrupt-controller: Marks the device node as an interrupt controller.
+- #interrupt-cells:    is <2> and their usage is compliant to the 2 cells
+                       variant of <../interrupt-controller/interrupts.txt>
+                       IRQ numbers for different interrupt source of MAX77620
+                       are defined at dt-bindings/mfd/max77620.h.
+
+Optional subnodes and their properties:
+=======================================
+
+Flexible power sequence configurations:
+--------------------------------------
+The Flexible Power Sequencer (FPS) allows each regulator to power up under
+hardware or software control. Additionally, each regulator can power on
+independently or among a group of other regulators with an adjustable power-up
+and power-down delays (sequencing). GPIO1, GPIO2, and GPIO3 can be programmed
+to be part of a sequence allowing external regulators to be sequenced along
+with internal regulators. 32KHz clock can be programmed to be part of a
+sequence.
+
+The flexible sequencing structure consists of two hardware enable inputs
+(EN0, EN1), and 3 master sequencing timers called FPS0, FPS1 and FPS2.
+Each master sequencing timer is programmable through its configuration
+register to have a hardware enable source (EN0 or EN1) or a software enable
+source (SW). When enabled/disabled, the master sequencing timer generates
+eight sequencing events on different time periods called slots. The time
+period between each event is programmable within the configuration register.
+Each regulator, GPIO1, GPIO2, GPIO3, and 32KHz clock has a flexible power
+sequence slave register which allows its enable source to be specified as
+a flexible power sequencer timer or a software bit. When a FPS source of
+regulators, GPIOs and clocks specifies the enable source to be a flexible
+power sequencer, the power up and power down delays can be specified in
+the regulators, GPIOs and clocks flexible power sequencer configuration
+registers.
+
+When an FPS event is cleared (set to LOW), regulators, GPIOs and the 32KHz
+clock are set into one of the following states at the sequencing event that
+corresponds to their flexible sequencer configuration register.
+       Sleep state:                    In this state, regulators, GPIOs
+                                       and 32KHz clock get disabled at
+                                       the sequencing event.
+       Global Low Power Mode (GLPM):   In this state, regulators are set in
+                                       low power mode at the sequencing event.
+
+The configuration parameters of the FPS are provided through the sub-node
+"fps" and its per-FPS child nodes. The child node names are "fps0",
+"fps1", and "fps2" for FPS0, FPS1 and FPS2 respectively.
+
+The FPS configurations like FPS source, power up and power down slots for
+regulators, GPIOs and 32kHz clocks are provided in their respective
+configuration nodes which is explained in respective sub-system DT
+binding document.
+
+There is a need for different FPS configuration parameters based on system
+state like when system state changed from active to suspend or active to
+power off (shutdown).
+
+Optional properties:
+-------------------
+-maxim,fps-event-source:               u32, FPS event source like external
+                                       hardware input to PMIC i.e. EN0, EN1 or
+                                       software (SW).
+                                       The macros are defined on
+                                               dt-bindings/mfd/max77620.h
+                                       for different control source.
+                                       - MAX77620_FPS_EVENT_SRC_EN0
+                                               for hardware input pin EN0.
+                                       - MAX77620_FPS_EVENT_SRC_EN1
+                                               for hardware input pin EN1.
+                                       - MAX77620_FPS_EVENT_SRC_SW
+                                               for software control.
+
+-maxim,shutdown-fps-time-period-us:    u32, FPS time period in microseconds
+                                       when system enters in to shutdown
+                                       state.
+
+-maxim,suspend-fps-time-period-us:     u32, FPS time period in microseconds
+                                       when system enters in to suspend state.
+
+-maxim,device-state-on-disabled-event: u32, describe the PMIC state when FPS
+                                       event cleared (set to LOW) whether it
+                                       should go to sleep state or low-power
+                                       state. Following are valid values:
+                                       - MAX77620_FPS_INACTIVE_STATE_SLEEP
+                                               to set the PMIC state to sleep.
+                                       - MAX77620_FPS_INACTIVE_STATE_LOW_POWER
+                                               to set the PMIC state to low
+                                               power.
+                                       Absence of this property or other value
+                                       will not change device state when FPS
+                                       event get cleared.
+
+Here supported time periods by device in microseconds are as follows:
+MAX77620 supports 40, 80, 160, 320, 640, 1280, 2560 and 5120 microseconds.
+MAX20024 supports 20, 40, 80, 160, 320, 640, 1280 and 2560 microseconds.
+
+For DT binding details of different sub modules like GPIO, pincontrol,
+regulator, power, please refer to the respective device-tree binding
+documents under their respective sub-system directories.
+
+Example:
+--------
+#include <dt-bindings/mfd/max77620.h>
+
+max77620@3c {
+       compatible = "maxim,max77620";
+       reg = <0x3c>;
+
+       interrupt-parent = <&intc>;
+       interrupts = <0 86 IRQ_TYPE_NONE>;
+
+       interrupt-controller;
+       #interrupt-cells = <2>;
+
+       fps {
+               fps0 {
+                       maxim,shutdown-fps-time-period-us = <1280>;
+                       maxim,fps-event-source = <MAX77620_FPS_EVENT_SRC_EN1>;
+               };
+
+               fps1 {
+                       maxim,shutdown-fps-time-period-us = <1280>;
+                       maxim,fps-event-source = <MAX77620_FPS_EVENT_SRC_EN0>;
+               };
+
+               fps2 {
+                       maxim,shutdown-fps-time-period-us = <1280>;
+                       maxim,fps-event-source = <MAX77620_FPS_EVENT_SRC_SW>;
+               };
+       };
+};
diff --git a/Documentation/devicetree/bindings/mfd/qcom-rpm.txt b/Documentation/devicetree/bindings/mfd/qcom-rpm.txt

index 5e97a9593ad71bbca2c1edf3199cbf53fb7ff45f..b98b291a31ba048fa57531bfa3a7a05b1b2f5262 100644 (file)
--- a/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
+++ b/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
@@ -178,7 +178,7 @@ see regulator.txt - with additional custom properties described below:
  - qcom,force-mode:
         Usage: optional (default if no other qcom,force-mode is specified)
         Value type: <u32>
-       Defintion: indicates that the regulator should be forced to a
+       Definition: indicates that the regulator should be forced to a
                    particular mode, valid values are:
                    QCOM_RPM_FORCE_MODE_NONE - do not force any mode
                    QCOM_RPM_FORCE_MODE_LPM - force into low power mode
@@ -204,7 +204,7 @@ see regulator.txt - with additional custom properties described below:
  - qcom,force-mode:
         Usage: optional
         Value type: <u32>
-       Defintion: indicates that the regulator should not be forced to any
+       Definition: indicates that the regulator should not be forced to any
                    particular mode, valid values are:
                    QCOM_RPM_FORCE_MODE_NONE - do not force any mode
                    QCOM_RPM_FORCE_MODE_LPM - force into low power mode
diff --git a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-emmc.txt b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-emmc.txt

index 0cb827bf94353124a331fce81412f5c1e0129bb7..3d965d57e00b302f1a216e5a3ecd7a4051902fbb 100644 (file)
--- a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-emmc.txt
+++ b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-emmc.txt
@@ -1,7 +1,7 @@
  * The simple eMMC hardware reset provider
  
  The purpose of this driver is to perform standard eMMC hw reset
-procedure, as descibed by Jedec 4.4 specification. This procedure is
+procedure, as described by Jedec 4.4 specification. This procedure is
  performed just after MMC core enabled power to the given mmc host (to
  fix possible issues if bootloader has left eMMC card in initialized or
  unknown state), and before performing complete system reboot (also in
diff --git a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt

index c2546ced9c02a379b2c67c498f0cdc2fbccd03a4..0f6985b5de49afb7eb38991d86197ba407cb87cf 100644 (file)
--- a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
+++ b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
@@ -52,7 +52,7 @@ Optional properties:
                                v7.0. Use this property to describe the rare
                                earlier versions of this core that include WP
  
- -- Additonal SoC-specific NAND controller properties --
+ -- Additional SoC-specific NAND controller properties --
  
  The NAND controller is integrated differently on the variety of SoCs on which it
  is found. Part of this integration involves providing status and enable bits
diff --git a/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt b/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt

index b9ff4ba6454e49e98e89ca2f99d1f4d03f09e275..f0421ee3c7149fe08419eda6e4657a25872771d2 100644 (file)
--- a/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt
+++ b/Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt
@@ -8,7 +8,7 @@ Required properties:
    specifies a reference to the associating hardware driver node.
    see Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt
  - port-id: is the index of port provided by DSAF (the accelerator). DSAF can
-  connect to 8 PHYs. Port 0 to 1 are both used for adminstration purpose. They
+  connect to 8 PHYs. Port 0 to 1 are both used for administration purpose. They
    are called debug ports.
  
    The remaining 6 PHYs are taken according to the mode of DSAF.
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt

index 4d302db657c0bcb33039feb4fdd47e27d105e4c7..95816c5fc589b971e924fba438a3e5e42f024670 100644 (file)
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -51,8 +51,8 @@ Optional properties:
                            AXI register inside the DMA module:
         - snps,lpi_en: enable Low Power Interface
         - snps,xit_frm: unlock on WoL
-       - snps,wr_osr_lmt: max write oustanding req. limit
-       - snps,rd_osr_lmt: max read oustanding req. limit
+       - snps,wr_osr_lmt: max write outstanding req. limit
+       - snps,rd_osr_lmt: max read outstanding req. limit
         - snps,kbbe: do not cross 1KiB boundary.
         - snps,axi_all: align address
         - snps,blen: this is a vector of supported burst length.
diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt

index 58d935b58598102f4fab0b132ca0fbfadf544fab..5d21141a68b547104d5f152da0214dc847f4a3e9 100644 (file)
--- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
+++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
@@ -2,7 +2,7 @@
  
  Required properties:
         - reg - The ID number for the phy, usually a small integer
-       - ti,rx-internal-delay - RGMII Recieve Clock Delay - see dt-bindings/net/ti-dp83867.h
+       - ti,rx-internal-delay - RGMII Receive Clock Delay - see dt-bindings/net/ti-dp83867.h
                 for applicable values
         - ti,tx-internal-delay - RGMII Transmit Clock Delay - see dt-bindings/net/ti-dp83867.h
                 for applicable values
diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt

index 601256fe8c0dd99d2df3ff5a77b2cee23a852e5d..ee91cbdd95ee137d089013317411d0c05319f2bc 100644 (file)
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -45,7 +45,7 @@ Devices supporting OPPs must set their "operating-points-v2" property with
  phandle to a OPP table in their DT node. The OPP core will use this phandle to
  find the operating points for the device.
  
-If required, this can be extended for SoC vendor specfic bindings. Such bindings
+If required, this can be extended for SoC vendor specific bindings. Such bindings
  should be documented as Documentation/devicetree/bindings/power/<vendor>-opp.txt
  and should have a compatible description like: "operating-points-v2-<vendor>".
  
diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt

index 64f2fff121288cb0560a6186b6cddce6ecc33978..6c5322c55411b6c91e8a633ffc43de886eb4da7b 100644 (file)
--- a/Documentation/devicetree/bindings/pci/designware-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt
@@ -31,7 +31,7 @@ Optional properties:
  
  Example configuration:
  
-       pcie: pcie@0xdffff000 {
+       pcie: pcie@dffff000 {
                 compatible = "snps,dw-pcie";
                 reg = <0xdffff000 0x1000>, /* Controller registers */
                       <0xd0000000 0x2000>; /* PCI config space */
diff --git a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt

index b721beacfe4dae6c0bff3f429365080e46043d4a..59c2f47aa303ae2490dc18e9c1ee0b30f754518e 100644 (file)
--- a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
@@ -34,11 +34,11 @@ Hip05 Example (note that Hip06 is the same except compatible):
                 ranges = <0x82000000 0 0x00000000 0x220 0x00000000 0 0x10000000>;
                 num-lanes = <8>;
                 port-id = <1>;
-               #interrupts-cells = <1>;
-               interrupts-map-mask = <0xf800 0 0 7>;
-               interrupts-map = <0x0 0 0 1 &mbigen_pcie 1 10
-                                 0x0 0 0 2 &mbigen_pcie 2 11
-                                 0x0 0 0 3 &mbigen_pcie 3 12
-                                 0x0 0 0 4 &mbigen_pcie 4 13>;
+               #interrupt-cells = <1>;
+               interrupt-map-mask = <0xf800 0 0 7>;
+               interrupt-map = <0x0 0 0 1 &mbigen_pcie 1 10
+                                0x0 0 0 2 &mbigen_pcie 2 11
+                                0x0 0 0 3 &mbigen_pcie 3 12
+                                0x0 0 0 4 &mbigen_pcie 4 13>;
                 status = "ok";
         };
diff --git a/Documentation/devicetree/bindings/phy/phy-stih41x-usb.txt b/Documentation/devicetree/bindings/phy/phy-stih41x-usb.txt

index 00944a05ee6b4998873a226f5c07bb48ef9d010a..744b4809542edd3b3c8a3b64a17bcc4fc4012ce3 100644 (file)
--- a/Documentation/devicetree/bindings/phy/phy-stih41x-usb.txt
+++ b/Documentation/devicetree/bindings/phy/phy-stih41x-usb.txt
@@ -17,7 +17,7 @@ Example:
  
  usb2_phy: usb2phy@0 {
         compatible      = "st,stih416-usb-phy";
-       #phy-cell       = <0>;
+       #phy-cells      = <0>;
         st,syscfg       = <&syscfg_rear>;
         clocks          = <&clk_sysin>;
         clock-names     = "osc_phy";
diff --git a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-xusb-padctl.txt b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-xusb-padctl.txt

index 8a6223dbc143e26d96a20a47c3930eb9bb556933..4048f43a9d29bfa96269fa4d87828dc313088a70 100644 (file)
--- a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-xusb-padctl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-xusb-padctl.txt
@@ -85,7 +85,7 @@ Example:
  SoC file extract:
  -----------------
  
-       padctl@0,7009f000 {
+       padctl@7009f000 {
                 compatible = "nvidia,tegra124-xusb-padctl";
                 reg = <0x0 0x7009f000 0x0 0x1000>;
                 resets = <&tegra_car 142>;
@@ -97,7 +97,7 @@ SoC file extract:
  Board file extract:
  -------------------
  
-       pcie-controller@0,01003000 {
+       pcie-controller@01003000 {
                 ...
  
                 phys = <&padctl 0>;
@@ -108,7 +108,7 @@ Board file extract:
  
         ...
  
-       padctl: padctl@0,7009f000 {
+       padctl: padctl@7009f000 {
                 pinctrl-0 = <&padctl_default>;
                 pinctrl-names = "default";
  
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt

index a90c812ad6429abe0ef5282982e6fea66560b599..a54c39ebbf8bc5348c45d9b669ba609ad24a4d78 100644 (file)
--- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt
@@ -122,7 +122,7 @@ to specify in a pin configuration subnode:
                     2: 1.5uA                    (PMIC_GPIO_PULL_UP_1P5)
                     3: 31.5uA                   (PMIC_GPIO_PULL_UP_31P5)
                     4: 1.5uA + 30uA boost       (PMIC_GPIO_PULL_UP_1P5_30)
-                   If this property is ommited 30uA strength will be used if
+                   If this property is omitted 30uA strength will be used if
                     pull up is selected
  
  - bias-high-impedance:
diff --git a/Documentation/devicetree/bindings/power/qcom,coincell-charger.txt b/Documentation/devicetree/bindings/power/qcom,coincell-charger.txt

index 0e6d8754e7ecbb23909817156eb84abc82d411fe..747899223262fbc8b44867e39c1b5d7a09a16c88 100644 (file)
--- a/Documentation/devicetree/bindings/power/qcom,coincell-charger.txt
+++ b/Documentation/devicetree/bindings/power/qcom,coincell-charger.txt
@@ -29,7 +29,7 @@ IC (PMIC)
  - qcom,charger-disable:
         Usage: optional
         Value type: <boolean>
-       Definition: definining this property disables charging
+       Definition: defining this property disables charging
  
  This charger is a sub-node of one of the 8941 PMIC blocks, and is specified
  as a child node in DTS of that node.  See ../mfd/qcom,spmi-pmic.txt and
diff --git a/Documentation/devicetree/bindings/regulator/palmas-pmic.txt b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt

index 725393c8a7f290f93de13fe5e0f69dd13e138560..99872819604fb961beec57d2d7d7cc554a1662c6 100644 (file)
--- a/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
+++ b/Documentation/devicetree/bindings/regulator/palmas-pmic.txt
@@ -1,5 +1,12 @@
  * palmas regulator IP block devicetree bindings
  
+The tps659038 for the AM57x class have OTP spins that
+have different part numbers but the same functionality. There
+is not a need to add the OTP spins to the palmas driver. The
+spin devices should use the tps659038 as its compatible value.
+This is the list of those devices:
+tps659037
+
  Required properties:
  - compatible : Should be from the list
    ti,twl6035-pmic
@@ -8,6 +15,7 @@ Required properties:
    ti,tps65913-pmic
    ti,tps65914-pmic
    ti,tps65917-pmic
+  ti,tps659038-pmic
  and also the generic series names
    ti,palmas-pmic
  - interrupt-parent : The parent interrupt controller which is palmas.
diff --git a/Documentation/devicetree/bindings/rtc/rtc-palmas.txt b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt

index adbccc0a51e190d3397d12079f28463dd82c2261..eb1c7fdeb4135d24d34caa98d26aeb940b1b3a4b 100644 (file)
--- a/Documentation/devicetree/bindings/rtc/rtc-palmas.txt
+++ b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt
@@ -15,9 +15,9 @@ Optional properties:
         battery is chargeable or not. If charging battery then driver can
         enable the charging.
  - ti,backup-battery-charge-high-current: Enable high current charging in
-       backup battery. Device supports the < 100mA and > 100mA charging.
-       The high current will be > 100mA. Absence of this property will
-       charge battery to lower current i.e. < 100mA.
+       backup battery. Device supports the < 100uA and > 100uA charging.
+       The high current will be > 100uA. Absence of this property will
+       charge battery to lower current i.e. < 100uA.
  
  Example:
         palmas: tps65913@58 {
diff --git a/Documentation/devicetree/bindings/serial/mvebu-uart.txt b/Documentation/devicetree/bindings/serial/mvebu-uart.txt

new file mode 100644 (file)

index 0000000..6087def
--- /dev/null
+++ b/Documentation/devicetree/bindings/serial/mvebu-uart.txt
@@ -0,0 +1,13 @@
+* Marvell UART : Non standard UART used in some of Marvell EBU SoCs (e.g., Armada-3700)
+
+Required properties:
+- compatible: "marvell,armada-3700-uart"
+- reg: offset and length of the register set for the device.
+- interrupts: device interrupt
+
+Example:
+       serial@12000 {
+               compatible = "marvell,armada-3700-uart";
+               reg = <0x12000 0x400>;
+               interrupts = <43>;
+       };
diff --git a/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt b/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt

index d1ce21a4904dee8aa6f203c3424c047ed247dc14..64c66a5644e7551c2637b4aebff3d81f12cdf48f 100644 (file)
--- a/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
+++ b/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
@@ -42,7 +42,7 @@ Required properties:
  - queue-pools  : child node classifying the queue ranges into pools.
                   Queue ranges are grouped into 3 type of pools:
                   - qpend           : pool of qpend(interruptible) queues
-                 - general-purpose : pool of general queues, primarly used
+                 - general-purpose : pool of general queues, primarily used
                                       as free descriptor queues or the
                                       transmit DMA queues.
                   - accumulator     : pool of queues on PDSP accumulator channel
@@ -50,7 +50,7 @@ Required properties:
    -- qrange            : number of queues to use per queue range, specified as
                           <"base queue #" "# of queues">.
    -- interrupts                : Optional property to specify the interrupt mapping
-                         for interruptible queues. The driver additionaly sets
+                         for interruptible queues. The driver additionally sets
                           the interrupt affinity hint based on the cpu mask.
    -- qalloc-by-id      : Optional property to specify that the queues in this
                           range can only be allocated by queue id.
@@ -80,7 +80,7 @@ Required properties:
                           latency     : time to delay the interrupt, specified
                                         in microseconds.
    -- multi-queue       : Optional property to specify that the channel has to
-                         monitor upto 32 queues starting at the base queue #.
+                         monitor up to 32 queues starting at the base queue #.
  - descriptor-regions   : child node describing the memory regions for keystone
                           navigator packet DMA descriptors. The memory for
                           descriptors will be allocated by the driver.
diff --git a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.txt b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.txt

index 275c6ea356f66327b34df92c0cc5751212140ca0..44d27456e8a4862a11e67785b459fd5ef721cc82 100644 (file)
--- a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.txt
+++ b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.txt
@@ -15,7 +15,7 @@ Required properties:
  
  Example:
  
-hda@0,70030000 {
+hda@70030000 {
         compatible = "nvidia,tegra124-hda", "nvidia,tegra30-hda";
         reg = <0x0 0x70030000 0x0 0x10000>;
         interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>;
diff --git a/Documentation/devicetree/bindings/sram/sram.txt b/Documentation/devicetree/bindings/sram/sram.txt

index 227e3a341af1e2b525a4c59a3bc842041ddff1c3..add48f09015e212e0c8bea10aad23165b9882f48 100644 (file)
--- a/Documentation/devicetree/bindings/sram/sram.txt
+++ b/Documentation/devicetree/bindings/sram/sram.txt
@@ -51,7 +51,7 @@ sram: sram@5c000000 {
         compatible = "mmio-sram";
         reg = <0x5c000000 0x40000>; /* 256 KiB SRAM at address 0x5c000000 */
  
-       #adress-cells = <1>;
+       #address-cells = <1>;
         #size-cells = <1>;
         ranges = <0 0x5c000000 0x40000>;
  
diff --git a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt

new file mode 100644 (file)

index 0000000..6908d3a
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
@@ -0,0 +1,55 @@
+Tegra124 SOCTHERM thermal management system
+
+The SOCTHERM IP block contains thermal sensors, support for polled
+or interrupt-based thermal monitoring, CPU and GPU throttling based
+on temperature trip points, and handling external overcurrent
+notifications. It is also used to manage emergency shutdown in an
+overheating situation.
+
+Required properties :
+- compatible : For Tegra124, must contain "nvidia,tegra124-soctherm".
+  For Tegra132, must contain "nvidia,tegra132-soctherm".
+  For Tegra210, must contain "nvidia,tegra210-soctherm".
+- reg : Should contain 1 entry:
+  - SOCTHERM register set
+- interrupts : Defines the interrupt used by SOCTHERM
+- clocks : Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names : Must include the following entries:
+  - tsensor
+  - soctherm
+- resets : Must contain an entry for each entry in reset-names.
+  See ../reset/reset.txt for details.
+- reset-names : Must include the following entries:
+  - soctherm
+- #thermal-sensor-cells : Should be 1. See ./thermal.txt for a description
+    of this property. See <dt-bindings/thermal/tegra124-soctherm.h> for a
+    list of valid values when referring to thermal sensors.
+
+
+Example :
+
+       soctherm@700e2000 {
+               compatible = "nvidia,tegra124-soctherm";
+               reg = <0x0 0x700e2000 0x0 0x1000>;
+               interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
+                       <&tegra_car TEGRA124_CLK_SOC_THERM>;
+               clock-names = "tsensor", "soctherm";
+               resets = <&tegra_car 78>;
+               reset-names = "soctherm";
+
+               #thermal-sensor-cells = <1>;
+       };
+
+Example: referring to thermal sensors :
+
+       thermal-zones {
+                cpu {
+                        polling-delay-passive = <1000>;
+                        polling-delay = <1000>;
+
+                        thermal-sensors =
+                                <&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+                };
+       };
diff --git a/Documentation/devicetree/bindings/thermal/tegra-soctherm.txt b/Documentation/devicetree/bindings/thermal/tegra-soctherm.txt

deleted file mode 100644 (file)

index 6b68cd1..0000000
--- a/Documentation/devicetree/bindings/thermal/tegra-soctherm.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Tegra124 SOCTHERM thermal management system
-
-The SOCTHERM IP block contains thermal sensors, support for polled
-or interrupt-based thermal monitoring, CPU and GPU throttling based
-on temperature trip points, and handling external overcurrent
-notifications. It is also used to manage emergency shutdown in an
-overheating situation.
-
-Required properties :
-- compatible : For Tegra124, must contain "nvidia,tegra124-soctherm".
-  For Tegra132, must contain "nvidia,tegra132-soctherm".
-  For Tegra210, must contain "nvidia,tegra210-soctherm".
-- reg : Should contain 1 entry:
-  - SOCTHERM register set
-- interrupts : Defines the interrupt used by SOCTHERM
-- clocks : Must contain an entry for each entry in clock-names.
-  See ../clocks/clock-bindings.txt for details.
-- clock-names : Must include the following entries:
-  - tsensor
-  - soctherm
-- resets : Must contain an entry for each entry in reset-names.
-  See ../reset/reset.txt for details.
-- reset-names : Must include the following entries:
-  - soctherm
-- #thermal-sensor-cells : Should be 1. See ./thermal.txt for a description
-    of this property. See <dt-bindings/thermal/tegra124-soctherm.h> for a
-    list of valid values when referring to thermal sensors.
-
-
-Example :
-
-       soctherm@0,700e2000 {
-               compatible = "nvidia,tegra124-soctherm";
-               reg = <0x0 0x700e2000 0x0 0x1000>;
-               interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
-               clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
-                       <&tegra_car TEGRA124_CLK_SOC_THERM>;
-               clock-names = "tsensor", "soctherm";
-               resets = <&tegra_car 78>;
-               reset-names = "soctherm";
-
-               #thermal-sensor-cells = <1>;
-       };
-
-Example: referring to thermal sensors :
-
-       thermal-zones {
-                cpu {
-                        polling-delay-passive = <1000>;
-                        polling-delay = <1000>;
-
-                        thermal-sensors =
-                                <&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
-                };
-       };
diff --git a/Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt b/Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt

deleted file mode 100644 (file)

index 6087def..0000000
--- a/Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-* Marvell UART : Non standard UART used in some of Marvell EBU SoCs (e.g., Armada-3700)
-
-Required properties:
-- compatible: "marvell,armada-3700-uart"
-- reg: offset and length of the register set for the device.
-- interrupts: device interrupt
-
-Example:
-       serial@12000 {
-               compatible = "marvell,armada-3700-uart";
-               reg = <0x12000 0x400>;
-               interrupts = <43>;
-       };
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt

index 316412dc79135a472886a331caa9ff4a22932bfe..32f965807a07e42d0c1f47d2f7e6da7eceb5972c 100644 (file)
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -28,6 +28,7 @@ aptina        Aptina Imaging
  arasan Arasan Chip Systems
  arm    ARM Ltd.
  armadeus       ARMadeus Systems SARL
+arrow  Arrow Electronics
  artesyn        Artesyn Embedded Technologies Inc.
  asahi-kasei    Asahi Kasei Corp.
  aspeed ASPEED Technology Inc.
@@ -60,6 +61,7 @@ cnxt  Conexant Systems, Inc.
  compulab       CompuLab Ltd.
  cortina        Cortina Systems, Inc.
  cosmic Cosmic Circuits
+creative       Creative Technology Ltd
  crystalfontz   Crystalfontz America, Inc.
  cubietech      Cubietech, Ltd.
  cypress        Cypress Semiconductor Corporation
@@ -79,6 +81,7 @@ ebv   EBV Elektronik
  edt    Emerging Display Technologies
  eeti   eGalax_eMPIA Technology Inc
  elan   Elan Microelectronic Corp.
+embest Shenzhen Embest Technology Co., Ltd.
  emmicro        EM Microelectronic
  energymicro    Silicon Laboratories (formerly Energy Micro AS)
  epcos  EPCOS AG
@@ -124,6 +127,7 @@ idt Integrated Device Technologies, Inc.
  ifi    Ingenieurburo Fur Ic-Technologie (I/F/I)
  iom    Iomega Corporation
  img    Imagination Technologies Ltd.
+inforce        Inforce Computing
  ingenic        Ingenic Semiconductor
  innolux        Innolux Corporation
  intel  Intel Corporation
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt

index 7281fb4b43164624f076648477098e753243c33a..6c4478ce582df71cce821611b5df6fe73c497592 100644 (file)
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -320,6 +320,9 @@ MEM
    devm_kvasprintf()
    devm_kzalloc()
  
+MFD
+ devm_mfd_add_devices()
+
  PCI
    pcim_enable_device() : after success, all PCI ops become managed
    pcim_pin_device()    : keep PCI device enabled after release
diff --git a/Documentation/fb/udlfb.txt b/Documentation/fb/udlfb.txt

index 57d2f2908b12bcb01eae4669e3f0d50eeabf9d7e..c985cb65dd06e0a031fa10a0ae81ec70ce636e5e 100644 (file)
--- a/Documentation/fb/udlfb.txt
+++ b/Documentation/fb/udlfb.txt
@@ -9,7 +9,7 @@ pairing that with a hardware framebuffer (16MB) on the other end of the
  USB wire.  That hardware framebuffer is able to drive the VGA, DVI, or HDMI
  monitor with no CPU involvement until a pixel has to change.
  
-The CPU or other local resource does all the rendering; optinally compares the
+The CPU or other local resource does all the rendering; optionally compares the
  result with a local shadow of the remote hardware framebuffer to identify
  the minimal set of pixels that have changed; and compresses and sends those
  pixels line-by-line via USB bulk transfers.
@@ -66,10 +66,10 @@ means that from a hardware and fbdev software perspective, everything is good.
  At that point, a /dev/fb? interface will be present for user-mode applications
  to open and begin writing to the framebuffer of the DisplayLink device using
  standard fbdev calls.  Note that if mmap() is used, by default the user mode
-application must send down damage notifcations to trigger repaints of the
+application must send down damage notifications to trigger repaints of the
  changed regions.  Alternatively, udlfb can be recompiled with experimental
  defio support enabled, to support a page-fault based detection mechanism
-that can work without explicit notifcation.
+that can work without explicit notification.
  
  The most common client of udlfb is xf86-video-displaylink or a modified
  xf86-video-fbdev X server. These servers have no real DisplayLink specific
diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt

index e2b4a78ec5439b2d36332bc00e481910a491d49e..f179b1fb26ef49734ff84bf1deee48bfe8915fdd 100644 (file)
--- a/Documentation/features/perf/perf-regs/arch-support.txt
+++ b/Documentation/features/perf/perf-regs/arch-support.txt
@@ -27,7 +27,7 @@
      |       nios2: | TODO |
      |    openrisc: | TODO |
      |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
      |        s390: | TODO |
      |       score: | TODO |
      |          sh: | TODO |
diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt

index 3dc24b0673c0bf6659db5677ccb822fc3954692d..85777c5c6353712c51b6ae58ad322af7ebcd9d74 100644 (file)
--- a/Documentation/features/perf/perf-stackdump/arch-support.txt
+++ b/Documentation/features/perf/perf-stackdump/arch-support.txt
@@ -27,7 +27,7 @@
      |       nios2: | TODO |
      |    openrisc: | TODO |
      |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
      |        s390: | TODO |
      |       score: | TODO |
      |          sh: | TODO |
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt

index bc4bd5a44b88b9abedec6b4d859f1163c3c2a6ac..88ff63d5fde3c63663a6b0f0e5e9a845c550060a 100644 (file)
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -263,12 +263,6 @@ The syntax is:
      crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
      range=start-[end]
  
-Please note, on arm, the offset is required.
-    crashkernel=<range1>:<size1>[,<range2>:<size2>,...]@offset
-    range=start-[end]
-
-    'start' is inclusive and 'end' is exclusive.
-
  For example:
  
      crashkernel=512M-2G:64M,2G-:128M
@@ -307,10 +301,9 @@ Boot into System Kernel
     on the memory consumption of the kdump system. In general this is not
     dependent on the memory size of the production system.
  
-   On arm, use "crashkernel=Y@X". Note that the start address of the kernel
-   will be aligned to 128MiB (0x08000000), so if the start address is not then
-   any space below the alignment point may be overwritten by the dump-capture kernel,
-   which means it is possible that the vmcore is not that precise as expected.
+   On arm, the use of "crashkernel=Y@X" is no longer necessary; the
+   kernel will automatically locate the crash kernel image within the
+   first 512MB of RAM if X is not given.
  
  
  Load the Dump-capture Kernel
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index f5c35901144ce6121f5892b0709f2d5d8a2983b4..4e76a349b6f800c746fb0c3e029961975117ae4b 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2168,6 +2168,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         [KNL,SH] Allow user to override the default size for
                         per-device physically contiguous DMA buffers.
  
+        memhp_default_state=online/offline
+                       [KNL] Set the initial state for the memory hotplug
+                       onlining policy. If not specified, the default value is
+                       set according to the
+                       CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+                       option.
+                       See Documentation/memory-hotplug.txt.
+
         memmap=exactmap [KNL,X86] Enable setting of an exact
                         E820 memory map, as specified by the user.
                         Such memmap=exactmap lines can be constructed based on
@@ -2951,11 +2959,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                                 for broken drivers that don't call it.
                 skip_isa_align  [X86] do not align io start addr, so can
                                 handle more pci cards
-               firmware        [ARM] Do not re-enumerate the bus but instead
-                               just use the configuration from the
-                               bootloader. This is currently used on
-                               IXP2000 systems where the bus has to be
-                               configured a certain way for adjunct CPUs.
                 noearly         [X86] Don't do any early type 1 scanning.
                                 This might help on some broken boards which
                                 machine check when some devices' config space
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt

index 443f4b44ad97cddc2d8b645c29a55ff88573f864..0d7cb955aa01db7017cc63b47bf1a6d55b5b880e 100644 (file)
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -261,10 +261,11 @@ it according to the policy which can be read from "auto_online_blocks" file:
  
  % cat /sys/devices/system/memory/auto_online_blocks
  
-The default is "offline" which means the newly added memory is not in a
-ready-to-use state and you have to "online" the newly added memory blocks
-manually. Automatic onlining can be requested by writing "online" to
-"auto_online_blocks" file:
+The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+option. If it is disabled the default is "offline" which means the newly added
+memory is not in a ready-to-use state and you have to "online" the newly added
+memory blocks manually. Automatic onlining can be requested by writing "online"
+to "auto_online_blocks" file:
  
  % echo online > /sys/devices/system/memory/auto_online_blocks
  
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt

index 9d4e33df624c2390e03cc75cdbf245e50bca1294..678189280bb409f2590d962aba784b4768eee003 100644 (file)
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -12,7 +12,7 @@ Overview:
  The IBM POWER-based pSeries and iSeries computers include PCI bus
  controller chips that have extended capabilities for detecting and
  reporting a large variety of PCI bus error conditions.  These features
-go under the name of "EEH", for "Extended Error Handling".  The EEH
+go under the name of "EEH", for "Enhanced Error Handling".  The EEH
  hardware features allow PCI bus errors to be cleared and a PCI
  card to be "rebooted", without also having to reboot the operating
  system.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt

index 34a5fece31216320181fccfbdaf3a85ad91043be..720355cbdf452dd22cc82c92ab7a43677acec04a 100644 (file)
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
  - panic_on_oom
  - percpu_pagelist_fraction
  - stat_interval
+- stat_refresh
  - swappiness
  - user_reserve_kbytes
  - vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
  
  ==============================================================
  
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
  swappiness
  
  This control is used to define how aggressive the kernel will swap
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt

index d9cb65cf5cfdf786650f3d8345f6d6346f05f930..fb0e1f2a19cc1fbe860f1db3ddfeaeaa65c7b804 100644 (file)
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock.
  Refcounting on THP is mostly consistent with refcounting on other compound
  pages:
  
-  - get_page()/put_page() and GUP operate in head page's ->_count.
+  - get_page()/put_page() and GUP operate in head page's ->_refcount.
  
-  - ->_count in tail pages is always zero: get_page_unless_zero() never
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
      succeed on tail pages.
  
    - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
@@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to
  sum of mapcount of all sub-pages plus one (split_huge_page caller must
  have reference for head page).
  
-split_huge_page uses migration entries to stabilize page->_count and
+split_huge_page uses migration entries to stabilize page->_refcount and
  page->_mapcount.
  
  We safe against physical memory scanners too: the only legitimate way
  scanner can get reference to a page is get_page_unless_zero().
  
-All tail pages has zero ->_count until atomic_add(). It prevent scanner
+All tail pages has zero ->_refcount until atomic_add(). It prevent scanner
  from geting reference to tail page up to the point. After the atomic_add()
-we don't care about ->_count value.  We already known how many references
+we don't care about ->_refcount value.  We already known how many references
  with should uncharge from head page.
  
  For head page get_page_unless_zero() will succeed and we don't mind. It's
diff --git a/MAINTAINERS b/MAINTAINERS

index 65f3277a8cf0ca4f1322a119943e4ebdace08ad5..bb176dbfe81f3c21bbfbc03d7147fc24607f5049 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2048,6 +2048,11 @@ M:       Nicolas Ferre <nicolas.ferre@atmel.com>
  S:     Supported
  F:     drivers/tty/serial/atmel_serial.c
  
+ATMEL AT91 SAMA5D2-Compatible Shutdown Controller
+M:     Nicolas Ferre <nicolas.ferre@atmel.com>
+S:     Supported
+F:     drivers/power/reset/at91-sama5d2_shdwc.c
+
  ATMEL SAMA5D2 ADC DRIVER
  M:     Ludovic Desroches <ludovic.desroches@atmel.com>
  L:     linux-iio@vger.kernel.org
@@ -6675,6 +6680,19 @@ T:       git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
  S:     Supported
  F:     Documentation/powerpc/
  F:     arch/powerpc/
+F:     drivers/char/tpm/tpm_ibmvtpm*
+F:     drivers/crypto/nx/
+F:     drivers/crypto/vmx/
+F:     drivers/net/ethernet/ibm/ibmveth.*
+F:     drivers/net/ethernet/ibm/ibmvnic.*
+F:     drivers/pci/hotplug/rpa*
+F:     drivers/scsi/ibmvscsi/
+N:     opal
+N:     /pmac
+N:     powermac
+N:     powernv
+N:     [^a-z0-9]ps3
+N:     pseries
  
  LINUX FOR POWER MACINTOSH
  M:     Benjamin Herrenschmidt <benh@kernel.crashing.org>
@@ -12278,6 +12296,12 @@ F:     include/linux/workqueue.h
  F:     kernel/workqueue.c
  F:     Documentation/workqueue.txt
  
+X-POWERS MULTIFUNCTION PMIC DEVICE DRIVERS
+M:     Chen-Yu Tsai <wens@csie.org>
+L:     linux-kernel@vger.kernel.org
+S:     Maintained
+N:     axp[128]
+
  X.25 NETWORK LAYER
  M:     Andrew Hendry <andrew.hendry@gmail.com>
  L:     linux-x25@vger.kernel.org
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h

index 7afe3356b770287d30fba05c7c6d16db531efa43..317ff773e1ca5f4de6e7ab03dc1c9f184426a2d2 100644 (file)
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -61,8 +61,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
  extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                                  pmd_t *pmd);
  
-#define has_transparent_hugepage() 1
-
  /* Generic variants assume pgtable_t is struct page *, hence need for these */
  #define __HAVE_ARCH_PGTABLE_DEPOSIT
  extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile

index 48fab15cfc0218e1a8e7a0bab389dc766d8ce3e7..446705a4325aafb9126484100090a69c71b0aff1 100644 (file)
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -88,7 +88,7 @@ $(obj)/bootpImage: $(obj)/bootp/bootp FORCE
         $(call if_changed,objcopy)
         @$(kecho) '  Kernel: $@ is ready'
  
-PHONY += initrd
+PHONY += initrd install zinstall uinstall
  initrd:
         @test "$(INITRD_PHYS)" != "" || \
         (echo This machine does not support INITRD; exit -1)
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h

index 02283eb2f5b22b0e66db8aad31660d5100e707cb..a83570f1012462f053feaabb2fce0b6ea1e06dc4 100644 (file)
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -162,8 +162,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
  
  static inline void dma_mark_clean(void *addr, size_t size) { }
  
-extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
-
  /**
   * arm_dma_alloc - allocate consistent memory for DMA
   * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h

index 485982084fe96aef7218aeb134527b41f90beb1e..781ef5fe235d4924372f0cc37f29c41770f319d4 100644 (file)
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -392,9 +392,18 @@ void __iomem *ioremap(resource_size_t res_cookie, size_t size);
  #define ioremap ioremap
  #define ioremap_nocache ioremap
  
+/*
+ * Do not use ioremap_cache for mapping memory. Use memremap instead.
+ */
  void __iomem *ioremap_cache(resource_size_t res_cookie, size_t size);
  #define ioremap_cache ioremap_cache
  
+/*
+ * Do not use ioremap_cached in new code. Provided for the benefit of
+ * the pxa2xx-flash MTD driver only.
+ */
+void __iomem *ioremap_cached(resource_size_t res_cookie, size_t size);
+
  void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size);
  #define ioremap_wc ioremap_wc
  #define ioremap_wt ioremap_wc
@@ -402,6 +411,9 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size);
  void iounmap(volatile void __iomem *iomem_cookie);
  #define iounmap iounmap
  
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size);
+#define arch_memremap_wb arch_memremap_wb
+
  /*
   * io{read,write}{16,32}be() macros
   */
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h

index 9427fd6325527e95ff9e5ad0a3f2bd973481e22a..31c07a2cc10049a3e7339913542d9398ebe135ff 100644 (file)
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -288,19 +288,43 @@ static inline void *phys_to_virt(phys_addr_t x)
  #define __va(x)                        ((void *)__phys_to_virt((phys_addr_t)(x)))
  #define pfn_to_kaddr(pfn)      __va((phys_addr_t)(pfn) << PAGE_SHIFT)
  
-extern unsigned long (*arch_virt_to_idmap)(unsigned long x);
+extern long long arch_phys_to_idmap_offset;
  
  /*
- * These are for systems that have a hardware interconnect supported alias of
- * physical memory for idmap purposes.  Most cases should leave these
+ * These are for systems that have a hardware interconnect supported alias
+ * of physical memory for idmap purposes.  Most cases should leave these
   * untouched.  Note: this can only return addresses less than 4GiB.
   */
+static inline bool arm_has_idmap_alias(void)
+{
+       return IS_ENABLED(CONFIG_MMU) && arch_phys_to_idmap_offset != 0;
+}
+
+#define IDMAP_INVALID_ADDR ((u32)~0)
+
+static inline unsigned long phys_to_idmap(phys_addr_t addr)
+{
+       if (IS_ENABLED(CONFIG_MMU) && arch_phys_to_idmap_offset) {
+               addr += arch_phys_to_idmap_offset;
+               if (addr > (u32)~0)
+                       addr = IDMAP_INVALID_ADDR;
+       }
+       return addr;
+}
+
+static inline phys_addr_t idmap_to_phys(unsigned long idmap)
+{
+       phys_addr_t addr = idmap;
+
+       if (IS_ENABLED(CONFIG_MMU) && arch_phys_to_idmap_offset)
+               addr -= arch_phys_to_idmap_offset;
+
+       return addr;
+}
+
  static inline unsigned long __virt_to_idmap(unsigned long x)
  {
-       if (IS_ENABLED(CONFIG_MMU) && arch_virt_to_idmap)
-               return arch_virt_to_idmap(x);
-       else
-               return __virt_to_phys(x);
+       return phys_to_idmap(__virt_to_phys(x));
  }
  
  #define virt_to_idmap(x)       __virt_to_idmap((unsigned long)(x))
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h

index dc46398bc3a528ccf51fb01cdabd143036df10de..fa70db7c714b605ff98e4efeb264c944214aabab 100644 (file)
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -281,11 +281,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
         flush_pmd_entry(pmdp);
  }
  
-static inline int has_transparent_hugepage(void)
-{
-       return 1;
-}
-
  #endif /* __ASSEMBLY__ */
  
  #endif /* _ASM_PGTABLE_3LEVEL_H */
diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c

index 066f7f9ba411e09b5e492a84f03034b86305e38f..05e61a2eeabe9e24aaa5011d2f05dd769019a485 100644 (file)
--- a/arch/arm/kernel/bios32.c
+++ b/arch/arm/kernel/bios32.c
@@ -550,9 +550,6 @@ char * __init pcibios_setup(char *str)
         if (!strcmp(str, "debug")) {
                 debug_pci = 1;
                 return NULL;
-       } else if (!strcmp(str, "firmware")) {
-               pci_add_flags(PCI_PROBE_ONLY);
-               return NULL;
         }
         return str;
  }
diff --git a/arch/arm/kernel/reboot.c b/arch/arm/kernel/reboot.c

index 71a2ff9ec4900c58677f12114c85c82e8cfaa575..3fa867a2aae672755c6ce6448f4148c989dbf964 100644 (file)
--- a/arch/arm/kernel/reboot.c
+++ b/arch/arm/kernel/reboot.c
@@ -104,8 +104,6 @@ void machine_halt(void)
  {
         local_irq_disable();
         smp_send_stop();
-
-       local_irq_disable();
         while (1);
  }
  
@@ -150,6 +148,5 @@ void machine_restart(char *cmd)
  
         /* Whoops - the platform was unable to reboot. Tell the user! */
         printk("Reboot failed -- System halted\n");
-       local_irq_disable();
         while (1);
  }
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c

index 7d4e2850910ce4cdbeb3e669c7846d782dfe6c8c..7b5350060612786c566d6f96cbffa2f444a91453 100644 (file)
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -941,6 +941,12 @@ static int __init init_machine_late(void)
  late_initcall(init_machine_late);
  
  #ifdef CONFIG_KEXEC
+/*
+ * The crash region must be aligned to 128MB to avoid
+ * zImage relocating below the reserved region.
+ */
+#define CRASH_ALIGN    (128 << 20)
+
  static inline unsigned long long get_total_mem(void)
  {
         unsigned long total;
@@ -968,6 +974,26 @@ static void __init reserve_crashkernel(void)
         if (ret)
                 return;
  
+       if (crash_base <= 0) {
+               unsigned long long crash_max = idmap_to_phys((u32)~0);
+               crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max,
+                                                   crash_size, CRASH_ALIGN);
+               if (!crash_base) {
+                       pr_err("crashkernel reservation failed - No suitable area found.\n");
+                       return;
+               }
+       } else {
+               unsigned long long start;
+
+               start = memblock_find_in_range(crash_base,
+                                              crash_base + crash_size,
+                                              crash_size, SECTION_SIZE);
+               if (start != crash_base) {
+                       pr_err("crashkernel reservation failed - memory is in use.\n");
+                       return;
+               }
+       }
+
         ret = memblock_reserve(crash_base, crash_size);
         if (ret < 0) {
                 pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n",
diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c

index e6b9cb1e6709753b6e25166d8b9cac395d9a2c42..a33a296b00dce8f1f65331fa6dad71745f116d9c 100644 (file)
--- a/arch/arm/mach-keystone/keystone.c
+++ b/arch/arm/mach-keystone/keystone.c
@@ -63,11 +63,6 @@ static void __init keystone_init(void)
         of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL);
  }
  
-static unsigned long keystone_virt_to_idmap(unsigned long x)
-{
-       return (phys_addr_t)(x) - CONFIG_PAGE_OFFSET + KEYSTONE_LOW_PHYS_START;
-}
-
  static long long __init keystone_pv_fixup(void)
  {
         long long offset;
@@ -91,7 +86,7 @@ static long long __init keystone_pv_fixup(void)
         offset = KEYSTONE_HIGH_PHYS_START - KEYSTONE_LOW_PHYS_START;
  
         /* Populate the arch idmap hook */
-       arch_virt_to_idmap = keystone_virt_to_idmap;
+       arch_phys_to_idmap_offset = -offset;
  
         return offset;
  }
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c

index 9f9d54271aada77708a954622269eec73000dc92..c61996c256cc4fa8c84609f5d4c5944b0ce337ce 100644 (file)
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -647,11 +647,6 @@ static void __init l2c310_enable(void __iomem *base, unsigned num_lock)
                 aux &= ~(L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP);
         }
  
-       /* r3p0 or later has power control register */
-       if (rev >= L310_CACHE_ID_RTL_R3P0)
-               l2x0_saved_regs.pwr_ctrl = L310_DYNAMIC_CLK_GATING_EN |
-                                               L310_STNDBY_MODE_EN;
-
         /*
          * Always enable non-secure access to the lockdown registers -
          * we write to them as part of the L2C enable sequence so they
@@ -1141,6 +1136,7 @@ static void __init l2c310_of_parse(const struct device_node *np,
         u32 filter[2] = { 0, 0 };
         u32 assoc;
         u32 prefetch;
+       u32 power;
         u32 val;
         int ret;
  
@@ -1271,6 +1267,26 @@ static void __init l2c310_of_parse(const struct device_node *np,
         }
  
         l2x0_saved_regs.prefetch_ctrl = prefetch;
+
+       power = l2x0_saved_regs.pwr_ctrl |
+               L310_DYNAMIC_CLK_GATING_EN | L310_STNDBY_MODE_EN;
+
+       ret = of_property_read_u32(np, "arm,dynamic-clock-gating", &val);
+       if (!ret) {
+               if (!val)
+                       power &= ~L310_DYNAMIC_CLK_GATING_EN;
+       } else if (ret != -EINVAL) {
+               pr_err("L2C-310 OF dynamic-clock-gating property value is missing or invalid\n");
+       }
+       ret = of_property_read_u32(np, "arm,standby-mode", &val);
+       if (!ret) {
+               if (!val)
+                       power &= ~L310_STNDBY_MODE_EN;
+       } else if (ret != -EINVAL) {
+               pr_err("L2C-310 OF standby-mode property value is missing or invalid\n");
+       }
+
+       l2x0_saved_regs.pwr_ctrl = power;
  }
  
  static const struct l2c_init_data of_l2c310_data __initconst = {
diff --git a/arch/arm/mm/cache-uniphier.c b/arch/arm/mm/cache-uniphier.c

index a6fa7b73fbe04a30c2902e8a54fb4019352fb6fc..c8e2f49472237b5fb93d0475f7668bba7b1f6293 100644 (file)
--- a/arch/arm/mm/cache-uniphier.c
+++ b/arch/arm/mm/cache-uniphier.c
@@ -96,6 +96,7 @@ struct uniphier_cache_data {
         void __iomem *ctrl_base;
         void __iomem *rev_base;
         void __iomem *op_base;
+       void __iomem *way_ctrl_base;
         u32 way_present_mask;
         u32 way_locked_mask;
         u32 nsets;
@@ -256,10 +257,13 @@ static void __init __uniphier_cache_set_locked_ways(
                                         struct uniphier_cache_data *data,
                                         u32 way_mask)
  {
+       unsigned int cpu;
+
         data->way_locked_mask = way_mask & data->way_present_mask;
  
-       writel_relaxed(~data->way_locked_mask & data->way_present_mask,
-                      data->ctrl_base + UNIPHIER_SSCLPDAWCR);
+       for_each_possible_cpu(cpu)
+               writel_relaxed(~data->way_locked_mask & data->way_present_mask,
+                              data->way_ctrl_base + 4 * cpu);
  }
  
  static void uniphier_cache_maint_range(unsigned long start, unsigned long end,
@@ -459,6 +463,8 @@ static int __init __uniphier_cache_init(struct device_node *np,
                 goto err;
         }
  
+       data->way_ctrl_base = data->ctrl_base + 0xc00;
+
         if (*cache_level == 2) {
                 u32 revision = readl(data->rev_base + UNIPHIER_SSCID);
                 /*
@@ -467,6 +473,22 @@ static int __init __uniphier_cache_init(struct device_node *np,
                  */
                 if (revision <= 0x16)
                         data->range_op_max_size = (u32)1 << 22;
+
+               /*
+                * Unfortunately, the offset address of active way control base
+                * varies from SoC to SoC.
+                */
+               switch (revision) {
+               case 0x11:      /* sLD3 */
+                       data->way_ctrl_base = data->ctrl_base + 0x870;
+                       break;
+               case 0x12:      /* LD4 */
+               case 0x16:      /* sld8 */
+                       data->way_ctrl_base = data->ctrl_base + 0x840;
+                       break;
+               default:
+                       break;
+               }
         }
  
         data->range_op_max_size -= data->line_size;
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c

index 5c2ca062c3fa5b43497e422131aac2fec4d4146e..ff7ed5697d3e4b67ba1691dded0f79029ec8f921 100644 (file)
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -190,7 +190,6 @@ struct dma_map_ops arm_dma_ops = {
         .sync_single_for_device = arm_dma_sync_single_for_device,
         .sync_sg_for_cpu        = arm_dma_sync_sg_for_cpu,
         .sync_sg_for_device     = arm_dma_sync_sg_for_device,
-       .set_dma_mask           = arm_dma_set_mask,
  };
  EXPORT_SYMBOL(arm_dma_ops);
  
@@ -209,7 +208,6 @@ struct dma_map_ops arm_coherent_dma_ops = {
         .get_sgtable            = arm_dma_get_sgtable,
         .map_page               = arm_coherent_dma_map_page,
         .map_sg                 = arm_dma_map_sg,
-       .set_dma_mask           = arm_dma_set_mask,
  };
  EXPORT_SYMBOL(arm_coherent_dma_ops);
  
@@ -1143,16 +1141,6 @@ int dma_supported(struct device *dev, u64 mask)
  }
  EXPORT_SYMBOL(dma_supported);
  
-int arm_dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
-
-       return 0;
-}
-
  #define PREALLOC_DMA_DEBUG_ENTRIES     4096
  
  static int __init dma_debug_do_init(void)
@@ -2006,8 +1994,6 @@ struct dma_map_ops iommu_ops = {
         .unmap_sg               = arm_iommu_unmap_sg,
         .sync_sg_for_cpu        = arm_iommu_sync_sg_for_cpu,
         .sync_sg_for_device     = arm_iommu_sync_sg_for_device,
-
-       .set_dma_mask           = arm_dma_set_mask,
  };
  
  struct dma_map_ops iommu_coherent_ops = {
@@ -2021,8 +2007,6 @@ struct dma_map_ops iommu_coherent_ops = {
  
         .map_sg         = arm_coherent_iommu_map_sg,
         .unmap_sg       = arm_coherent_iommu_unmap_sg,
-
-       .set_dma_mask   = arm_dma_set_mask,
  };
  
  /**
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c

index bd274a05b8ffa9446160d1de69f777d9431c629b..c1a48f88764ea8da4ec3b0abd9dcd197678841ea 100644 (file)
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -15,7 +15,7 @@
   * page tables.
   */
  pgd_t *idmap_pgd;
-unsigned long (*arch_virt_to_idmap)(unsigned long x);
+long long arch_phys_to_idmap_offset;
  
  #ifdef CONFIG_ARM_LPAE
  static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c

index 66a978d059585f671d1d68277a200b661ef9637b..ff0eed23ddf1354afd26ee0711f34680380227a1 100644 (file)
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -297,9 +297,10 @@ static void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
         }
  
         /*
-        * Don't allow RAM to be mapped - this causes problems with ARMv6+
+        * Don't allow RAM to be mapped with mismatched attributes - this
+        * causes problems with ARMv6+
          */
-       if (WARN_ON(pfn_valid(pfn)))
+       if (WARN_ON(pfn_valid(pfn) && mtype != MT_MEMORY_RW))
                 return NULL;
  
         area = get_vm_area_caller(size, VM_IOREMAP, caller);
@@ -380,11 +381,15 @@ void __iomem *ioremap(resource_size_t res_cookie, size_t size)
  EXPORT_SYMBOL(ioremap);
  
  void __iomem *ioremap_cache(resource_size_t res_cookie, size_t size)
+       __alias(ioremap_cached);
+
+void __iomem *ioremap_cached(resource_size_t res_cookie, size_t size)
  {
         return arch_ioremap_caller(res_cookie, size, MT_DEVICE_CACHED,
                                    __builtin_return_address(0));
  }
  EXPORT_SYMBOL(ioremap_cache);
+EXPORT_SYMBOL(ioremap_cached);
  
  void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size)
  {
@@ -414,6 +419,13 @@ __arm_ioremap_exec(phys_addr_t phys_addr, size_t size, bool cached)
                         __builtin_return_address(0));
  }
  
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
+{
+       return (__force void *)arch_ioremap_caller(phys_addr, size,
+                                                  MT_MEMORY_RW,
+                                                  __builtin_return_address(0));
+}
+
  void __iounmap(volatile void __iomem *io_addr)
  {
         void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c

index d5805e4bf2fc71a0db019c198daedccb847a8376..2740967727e2057ef8897e4d4335d38acaeaac2c 100644 (file)
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -368,11 +368,15 @@ void __iomem *ioremap(resource_size_t res_cookie, size_t size)
  EXPORT_SYMBOL(ioremap);
  
  void __iomem *ioremap_cache(resource_size_t res_cookie, size_t size)
+       __alias(ioremap_cached);
+
+void __iomem *ioremap_cached(resource_size_t res_cookie, size_t size)
  {
         return __arm_ioremap_caller(res_cookie, size, MT_DEVICE_CACHED,
                                     __builtin_return_address(0));
  }
  EXPORT_SYMBOL(ioremap_cache);
+EXPORT_SYMBOL(ioremap_cached);
  
  void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size)
  {
@@ -381,6 +385,11 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size)
  }
  EXPORT_SYMBOL(ioremap_wc);
  
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
+{
+       return (void *)phys_addr;
+}
+
  void __iounmap(volatile void __iomem *addr)
  {
  }
diff --git a/arch/arm/tools/Makefile b/arch/arm/tools/Makefile

index 32d05c8219dc35acf9db46a49e521ca312fc336e..6e4cd1867a9f5e915e31bc10395655931fcf18b6 100644 (file)
--- a/arch/arm/tools/Makefile
+++ b/arch/arm/tools/Makefile
@@ -4,7 +4,10 @@
  # Copyright (C) 2001 Russell King
  #
  
-include/generated/mach-types.h: $(src)/gen-mach-types $(src)/mach-types
-       @$(kecho) '  Generating $@'
-       @mkdir -p $(dir $@)
-       $(Q)$(AWK) -f $^ > $@ || { rm -f $@; /bin/false; }
+quiet_cmd_gen_mach = GEN     $@
+      cmd_gen_mach = mkdir -p $(dir $@) && \
+                    $(AWK) -f $(filter-out $(PHONY),$^) > $@ || \
+                    { rm -f $@; /bin/false; }
+
+include/generated/mach-types.h: $(src)/gen-mach-types $(src)/mach-types FORCE
+       $(call if_changed,gen_mach)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h

index 1910bf47d4a316af5c66af46f3a04555cdc10211..46472a91b6dfe9bed53276b8d7fdfe8aa4da62b3 100644 (file)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -316,11 +316,6 @@ static inline int pmd_protnone(pmd_t pmd)
  
  #define set_pmd_at(mm, addr, pmdp, pmd)        set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
  
-static inline int has_transparent_hugepage(void)
-{
-       return 1;
-}
-
  #define __pgprot_modify(prot,mask,bits) \
         __pgprot((pgprot_val(prot) & ~(mask)) | (bits))
  
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c

index 589fd28e1fb5de8b923fc99ee0fa35d79b1cd6af..aa8aee7d69293ef8004b4883bebde05493ece111 100644 (file)
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -307,6 +307,7 @@ static __init int setup_hugepagesz(char *opt)
         } else if (ps == PUD_SIZE) {
                 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
         } else {
+               hugetlb_bad_size();
                 pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
                 return 0;
         }
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c

index b38700ae4e8429cdd6b56226ed82d523f4153d44..db1b7da91e4f496b332335c2c69adf73c13fbe70 100644 (file)
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -239,6 +239,7 @@ static __init int setup_hugepagesz(char *opt)
         if (ps == (1 << HPAGE_SHIFT)) {
                 hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
         } else {
+               hugetlb_bad_size();
                 pr_err("hugepagesz: Unsupported page size %lu M\n",
                        ps >> 20);
                 return 0;
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h

index e07a105cafc26fca81d3f1d1fb80d91201a40559..a6b611f1da4390356cd3412672cc0b35398bb0e8 100644 (file)
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -533,6 +533,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  
+#define has_transparent_hugepage has_transparent_hugepage
  extern int has_transparent_hugepage(void);
  
  static inline int pmd_trans_huge(pmd_t pmd)
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c

index 5a5c7fec645e8f328fe0b1cd5f8f810be86049ce..e8b335c162958030adb7a201f6a92a8b05018153 100644 (file)
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -405,19 +405,20 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  
-int __init has_transparent_hugepage(void)
+int has_transparent_hugepage(void)
  {
-       unsigned int mask;
-       unsigned long flags;
-
-       local_irq_save(flags);
-       write_c0_pagemask(PM_HUGE_MASK);
-       back_to_back_c0_hazard();
-       mask = read_c0_pagemask();
-       write_c0_pagemask(PM_DEFAULT_MASK);
+       static unsigned int mask = -1;
  
-       local_irq_restore(flags);
+       if (mask == -1) {       /* first call comes during __init */
+               unsigned long flags;
  
+               local_irq_save(flags);
+               write_c0_pagemask(PM_HUGE_MASK);
+               back_to_back_c0_hazard();
+               mask = read_c0_pagemask();
+               write_c0_pagemask(PM_DEFAULT_MASK);
+               local_irq_restore(flags);
+       }
         return mask == PM_HUGE_MASK;
  }
  
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig

index a18a0dcd57b7402cfb54490f2892297d0375c5c9..f0403b58ae8b131dda59e051d9c31e809648a0b3 100644 (file)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,6 +116,8 @@ config PPC
         select GENERIC_ATOMIC64 if PPC32
         select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
         select HAVE_PERF_EVENTS
+       select HAVE_PERF_REGS
+       select HAVE_PERF_USER_STACK_DUMP
         select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
         select ARCH_WANT_IPC_PARSE_VERSION
@@ -606,9 +608,9 @@ endchoice
  
  config FORCE_MAX_ZONEORDER
         int "Maximum zone order"
-       range 9 64 if PPC64 && PPC_64K_PAGES
+       range 8 9 if PPC64 && PPC_64K_PAGES
         default "9" if PPC64 && PPC_64K_PAGES
-       range 13 64 if PPC64 && !PPC_64K_PAGES
+       range 9 13 if PPC64 && !PPC_64K_PAGES
         default "13" if PPC64 && !PPC_64K_PAGES
         range 9 64 if PPC32 && PPC_16K_PAGES
         default "9" if PPC32 && PPC_16K_PAGES
@@ -795,7 +797,6 @@ config 4xx_SOC
  
  config FSL_LBC
         bool "Freescale Local Bus support"
-       depends on FSL_SOC
         help
           Enables reporting of errors from the Freescale local bus
           controller.  Also contains some common code used by
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug

index 638f9ce740f52b9f3a38247dc0b51dd922efac13..d3fcf7e64e3a4a68980fca0c1067794cddadf977 100644 (file)
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -19,14 +19,6 @@ config PPC_WERROR
         depends on !PPC_DISABLE_WERROR
         default y
  
-config STRICT_MM_TYPECHECKS
-       bool "Do extra type checking on mm types"
-       default n
-       help
-         This option turns on extra type checking for some mm related types.
-
-         If you don't know what this means, say N.
-
  config PRINT_STACK_DEPTH
         int "Stack depth to print" if DEBUG_KERNEL
         default 64
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile

index 61165101342c896445165d829eeac9241dfdf412..8fe78a3efc92e2767e090f0e8b2b1509ff17cb20 100644 (file)
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -362,9 +362,6 @@ $(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
  $(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
         $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb)
  
-$(obj)/cuImage.%: vmlinux $(obj)/fsl/%.dtb $(wrapperbits)
-       $(call if_changed,wrap,cuboot-$*,,$(obj)/fsl/$*.dtb)
-
  $(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
         $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
  
@@ -381,6 +378,9 @@ $(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
  $(obj)/%.dtb: $(src)/dts/%.dts FORCE
         $(call if_changed_dep,dtc)
  
+$(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE
+       $(call if_changed_dep,dtc)
+
  # If there isn't a platform selected then just strip the vmlinux.
  ifeq (,$(image-y))
  image-y := vmlinux.strip
diff --git a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts

index 0424fc2bd0e0e9cbc18f6bbbd033d9d11df356c0..c88d4ef9e4f794c41872bcb6d58757d3173a5ef8 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts
+++ b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts
@@ -211,6 +211,10 @@
                                   0x0 0x00400000>;
                 };
         };
+
+       pci1: pcie@fef09000 {
+               status = "disabled";
+       };
  };
  
  /include/ "mpc8641si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts

index 84b3d38f880edb37a52fd03d35062de31ce23ac5..838515798cce4d2275da02ca4d8ace52f5f8c1e9 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts
+++ b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts
@@ -24,10 +24,6 @@
         model = "GEF_SBC310";
         compatible = "gef,sbc310";
  
-       aliases {
-               pci1 = &pci1;
-       };
-
         memory {
                 device_type = "memory";
                 reg = <0x0 0x40000000>; // set by uboot
@@ -223,29 +219,11 @@
         };
  
         pci1: pcie@fef09000 {
-               compatible = "fsl,mpc8641-pcie";
-               device_type = "pci";
-               #size-cells = <2>;
-               #address-cells = <3>;
                 reg = <0xfef09000 0x1000>;
-               bus-range = <0x0 0xff>;
                 ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000
                           0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>;
-               clock-frequency = <100000000>;
-               interrupts = <0x19 0x2 0 0>;
-               interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
-               interrupt-map = <
-                       0x0000 0x0 0x0 0x1 &mpic 0x4 0x2
-                       0x0000 0x0 0x0 0x2 &mpic 0x5 0x2
-                       0x0000 0x0 0x0 0x3 &mpic 0x6 0x2
-                       0x0000 0x0 0x0 0x4 &mpic 0x7 0x2
-                       >;
  
                 pcie@0 {
-                       reg = <0 0 0 0 0>;
-                       #size-cells = <2>;
-                       #address-cells = <3>;
-                       device_type = "pci";
                         ranges = <0x02000000 0x0 0xc0000000
                                   0x02000000 0x0 0xc0000000
                                   0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts

index 974446acce23c13b5e67d19636bb251e763444f0..ff423ab424f2d3bd597ef21af5e7ff64e2b4c901 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts
+++ b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts
@@ -209,6 +209,10 @@
                                   0x0 0x00400000>;
                 };
         };
+
+       pci1: pcie@fef09000 {
+               status = "disabled";
+       };
  };
  
  /include/ "mpc8641si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts

index 554001f2e96a3b8998022a0f83b3096a0ff96325..11bea3e6a43f4eaecd019d3970a3155d549503bd 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts
+++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts
@@ -15,10 +15,6 @@
         model = "MPC8641HPCN";
         compatible = "fsl,mpc8641hpcn";
  
-       aliases {
-               pci1 = &pci1;
-       };
-
         memory {
                 device_type = "memory";
                 reg = <0x00000000 0x40000000>;  // 1G at 0x0
@@ -359,29 +355,11 @@
         };
  
         pci1: pcie@ffe09000 {
-               compatible = "fsl,mpc8641-pcie";
-               device_type = "pci";
-               #size-cells = <2>;
-               #address-cells = <3>;
                 reg = <0xffe09000 0x1000>;
-               bus-range = <0 0xff>;
                 ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
                           0x01000000 0x0 0x00000000 0xffc10000 0x0 0x00010000>;
-               clock-frequency = <100000000>;
-               interrupts = <25 2 0 0>;
-               interrupt-map-mask = <0xf800 0 0 7>;
-               interrupt-map = <
-                       /* IDSEL 0x0 */
-                       0x0000 0 0 1 &mpic 4 1
-                       0x0000 0 0 2 &mpic 5 1
-                       0x0000 0 0 3 &mpic 6 1
-                       0x0000 0 0 4 &mpic 7 1
-                       >;
+
                 pcie@0 {
-                       reg = <0 0 0 0 0>;
-                       #size-cells = <2>;
-                       #address-cells = <3>;
-                       device_type = "pci";
                         ranges = <0x02000000 0x0 0xa0000000
                                   0x02000000 0x0 0xa0000000
                                   0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts

index fec58671a6d6ece0b8cda1e6338ed700689788a7..7ff62046a9ea808dc2ce1373b38b1038d7a02762 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts
+++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts
@@ -17,10 +17,6 @@
         #address-cells = <2>;
         #size-cells = <2>;
  
-       aliases {
-               pci1 = &pci1;
-       };
-
         memory {
                 device_type = "memory";
                 reg = <0x0 0x00000000 0x0 0x40000000>;  // 1G at 0x0
@@ -326,29 +322,11 @@
         };
  
         pci1: pcie@fffe09000 {
-               compatible = "fsl,mpc8641-pcie";
-               device_type = "pci";
-               #size-cells = <2>;
-               #address-cells = <3>;
                 reg = <0x0f 0xffe09000 0x0 0x1000>;
-               bus-range = <0x0 0xff>;
                 ranges = <0x02000000 0x0 0xe0000000 0x0c 0x20000000 0x0 0x20000000
                           0x01000000 0x0 0x00000000 0x0f 0xffc10000 0x0 0x00010000>;
-               clock-frequency = <100000000>;
-               interrupts = <25 2 0 0>;
-               interrupt-map-mask = <0xf800 0 0 7>;
-               interrupt-map = <
-                       /* IDSEL 0x0 */
-                       0x0000 0 0 1 &mpic 4 1
-                       0x0000 0 0 2 &mpic 5 1
-                       0x0000 0 0 3 &mpic 6 1
-                       0x0000 0 0 4 &mpic 7 1
-                       >;
+
                 pcie@0 {
-                       reg = <0 0 0 0 0>;
-                       #size-cells = <2>;
-                       #address-cells = <3>;
-                       device_type = "pci";
                         ranges = <0x02000000 0x0 0xe0000000
                                   0x02000000 0x0 0xe0000000
                                   0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi

index 70889d8e885009b1e4c20dfb14113871722c09fa..eeb7c65d5f223ce62601d97c91e1584cc0485617 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi
@@ -102,19 +102,46 @@
         bus-range = <0x0 0xff>;
         clock-frequency = <100000000>;
         interrupts = <24 2 0 0>;
-       interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
  
-       interrupt-map = <
-               0x0000 0x0 0x0 0x1 &mpic 0x0 0x1
-               0x0000 0x0 0x0 0x2 &mpic 0x1 0x1
-               0x0000 0x0 0x0 0x3 &mpic 0x2 0x1
-               0x0000 0x0 0x0 0x4 &mpic 0x3 0x1
-               >;
+       pcie@0 {
+               reg = <0 0 0 0 0>;
+               #interrupt-cells = <1>;
+               #size-cells = <2>;
+               #address-cells = <3>;
+               device_type = "pci";
+               interrupts = <24 2 0 0>;
+               interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+               interrupt-map = <
+                       0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 0x0 0x0
+                       0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 0x0 0x0
+                       0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 0x0 0x0
+                       0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 0x0 0x0
+                       >;
+       };
+};
+
+&pci1 {
+       compatible = "fsl,mpc8641-pcie";
+       device_type = "pci";
+       #size-cells = <2>;
+       #address-cells = <3>;
+       bus-range = <0x0 0xff>;
+       clock-frequency = <100000000>;
+       interrupts = <25 2 0 0>;
  
         pcie@0 {
                 reg = <0 0 0 0 0>;
+               #interrupt-cells = <1>;
                 #size-cells = <2>;
                 #address-cells = <3>;
                 device_type = "pci";
+               interrupts = <25 2 0 0>;
+               interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+               interrupt-map = <
+                       0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 0x0 0x0
+                       0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 0x0 0x0
+                       0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0
+                       0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0
+                       >;
         };
  };
diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi

index 9e03328561d342d7abf7b4e396280ef500cc32b2..7c6db6f7c12e66e1e2c128270adb870568faad70 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi
@@ -25,6 +25,7 @@
                 serial0 = &serial0;
                 serial1 = &serial1;
                 pci0 = &pci0;
+               pci1 = &pci1;
         };
  
         cpus {
diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts

index 0a9733cd418db8540deaf4af5d4856c94e604d31..75870a12490360848fcc7538f238eab7c03220f1 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts
+++ b/arch/powerpc/boot/dts/fsl/sbc8641d.dts
@@ -19,10 +19,6 @@
         model = "SBC8641D";
         compatible = "wind,sbc8641";
  
-       aliases {
-               pci1 = &pci1;
-       };
-
         memory {
                 device_type = "memory";
                 reg = <0x00000000 0x20000000>;  // 512M at 0x0
@@ -165,30 +161,11 @@
         };
  
         pci1: pcie@f8009000 {
-               compatible = "fsl,mpc8641-pcie";
-               device_type = "pci";
-               #size-cells = <2>;
-               #address-cells = <3>;
                 reg = <0xf8009000 0x1000>;
-               bus-range = <0 0xff>;
                 ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
                           0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>;
-               clock-frequency = <100000000>;
-               interrupts = <25 2 0 0>;
-               interrupt-map-mask = <0xf800 0 0 7>;
-               interrupt-map = <
-                       /* IDSEL 0x0 */
-                       0x0000 0 0 1 &mpic 4 1
-                       0x0000 0 0 2 &mpic 5 1
-                       0x0000 0 0 3 &mpic 6 1
-                       0x0000 0 0 4 &mpic 7 1
-                       >;
  
                 pcie@0 {
-                       reg = <0 0 0 0 0>;
-                       #size-cells = <2>;
-                       #address-cells = <3>;
-                       device_type = "pci";
                         ranges = <0x02000000 0x0 0xa0000000
                                   0x02000000 0x0 0xa0000000
                                   0x0 0x20000000
diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi

index 99e421df79d4c87016ce26ebe0a473f032ea0038..6e0b4892a7406b75ac7de9abf0cee730dbe5c102 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
@@ -263,7 +263,7 @@
         };
  
         rcpm: global-utilities@e2000 {
-               compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0";
+               compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.1";
                 reg = <0xe2000 0x1000>;
         };
  
diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi

index e0f4da55477405daa5f9eb8f1b065187c0af0574..507649ece0a195e0eca9a3e9899f1a11a184987f 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
@@ -472,7 +472,7 @@
         };
  
         rcpm: global-utilities@e2000 {
-               compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.0";
+               compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.1";
                 reg = <0xe2000 0x1000>;
         };
  
diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi

index 72691ef102ee75141d45bb6fee300127b4cf2e74..7c4afdb44b4628104d06977daa02f5d7fb3a6e8b 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
@@ -109,7 +109,7 @@
                         flash@0 {
                                 #address-cells = <1>;
                                 #size-cells = <1>;
-                               compatible = "micron,n25q512a", "jedec,spi-nor";
+                               compatible = "micron,n25q512ax3", "jedec,spi-nor";
                                 reg = <0>;
                                 spi-max-frequency = <10000000>; /* input clock */
                         };
diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi

index dc9326875778c19fb2dd75f56744a2ae2f43db0f..ff87e67c70da4a292e353b57b824c5319ba3f5f1 100644 (file)
--- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
@@ -113,7 +113,7 @@
                         flash@0 {
                                 #address-cells = <1>;
                                 #size-cells = <1>;
-                               compatible = "micron,n25q512a", "jedec,spi-nor";
+                               compatible = "micron,n25q512ax3", "jedec,spi-nor";
                                 reg = <0>;
                                 spi-max-frequency = <10000000>; /* input clock */
                         };
diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h

index 264b754d65b0b60e554a7c22c18b1b87b996ba8b..880db13a2e9ffcb63f48038674fc313dccdc6e47 100644 (file)
--- a/arch/powerpc/include/asm/book3s/32/hash.h
+++ b/arch/powerpc/include/asm/book3s/32/hash.h
@@ -39,8 +39,5 @@
  #define _PMD_PRESENT_MASK (PAGE_MASK)
  #define _PMD_BAD       (~PAGE_MASK)
  
-/* Hash table based platforms need atomic updates of the linux PTE */
-#define PTE_ATOMIC_UPDATES     1
-
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h

index 16f513e5cbd74f2cc42460ba3c7d28524bb86e81..b82e063494dd8fdc3d78e151bc949e4ce8c3c1c6 100644 (file)
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_MMU_HASH32_H_
-#define _ASM_POWERPC_MMU_HASH32_H_
+#ifndef _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_
+#define _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_
  /*
   * 32-bit hash table MMU support
   */
@@ -90,4 +90,4 @@ typedef struct {
  #define mmu_virtual_psize      MMU_PAGE_4K
  #define mmu_linear_psize       MMU_PAGE_256M
  
-#endif /* _ASM_POWERPC_MMU_HASH32_H_ */
+#endif /* _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ */
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h

new file mode 100644 (file)

index 0000000..a235019
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -0,0 +1,109 @@
+#ifndef _ASM_POWERPC_BOOK3S_32_PGALLOC_H
+#define _ASM_POWERPC_BOOK3S_32_PGALLOC_H
+
+#include <linux/threads.h>
+
+/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
+#define MAX_PGTABLE_INDEX_SIZE 0
+
+extern void __bad_pte(pmd_t *pmd);
+
+extern pgd_t *pgd_alloc(struct mm_struct *mm);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+/*
+ * We don't have any real pmd's, and this code never triggers because
+ * the pgd will always be present..
+ */
+/* #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); }) */
+#define pmd_free(mm, x)                do { } while (0)
+#define __pmd_free_tlb(tlb,x,a)                do { } while (0)
+/* #define pgd_populate(mm, pmd, pte)      BUG() */
+
+#ifndef CONFIG_BOOKE
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+                                      pte_t *pte)
+{
+       *pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pte_page)
+{
+       *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT);
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#else
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+                                      pte_t *pte)
+{
+       *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pte_page)
+{
+       *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#endif
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
+extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+       free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+       pgtable_page_dtor(ptepage);
+       __free_page(ptepage);
+}
+
+static inline void pgtable_free(void *table, unsigned index_size)
+{
+       BUG_ON(index_size); /* 32-bit doesn't use this */
+       free_page((unsigned long)table);
+}
+
+#define check_pgt_cache()      do { } while (0)
+
+#ifdef CONFIG_SMP
+static inline void pgtable_free_tlb(struct mmu_gather *tlb,
+                                   void *table, int shift)
+{
+       unsigned long pgf = (unsigned long)table;
+       BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+       pgf |= shift;
+       tlb_remove_table(tlb, (void *)pgf);
+}
+
+static inline void __tlb_remove_table(void *_table)
+{
+       void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+       unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+       pgtable_free(table, shift);
+}
+#else
+static inline void pgtable_free_tlb(struct mmu_gather *tlb,
+                                   void *table, int shift)
+{
+       pgtable_free(table, shift);
+}
+#endif
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+                                 unsigned long address)
+{
+       tlb_flush_pgtable(tlb, address);
+       pgtable_page_dtor(table);
+       pgtable_free_tlb(tlb, page_address(table), 0);
+}
+#endif /* _ASM_POWERPC_BOOK3S_32_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h

index 5f08a08322385d8285851d07e82ff0917a72ebc8..1af837c561bae9d74152a048fc068fac299f8a37 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -5,58 +5,31 @@
   * for each page table entry.  The PMD and PGD level use a 32b record for
   * each entry by assuming that each entry is page aligned.
   */
-#define PTE_INDEX_SIZE  9
-#define PMD_INDEX_SIZE  7
-#define PUD_INDEX_SIZE  9
-#define PGD_INDEX_SIZE  9
+#define H_PTE_INDEX_SIZE  9
+#define H_PMD_INDEX_SIZE  7
+#define H_PUD_INDEX_SIZE  9
+#define H_PGD_INDEX_SIZE  9
  
  #ifndef __ASSEMBLY__
-#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE)
-#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
-#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
-#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
-#endif /* __ASSEMBLY__ */
-
-#define PTRS_PER_PTE   (1 << PTE_INDEX_SIZE)
-#define PTRS_PER_PMD   (1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PUD   (1 << PUD_INDEX_SIZE)
-#define PTRS_PER_PGD   (1 << PGD_INDEX_SIZE)
-
-/* PMD_SHIFT determines what a second-level page table entry can map */
-#define PMD_SHIFT      (PAGE_SHIFT + PTE_INDEX_SIZE)
-#define PMD_SIZE       (1UL << PMD_SHIFT)
-#define PMD_MASK       (~(PMD_SIZE-1))
+#define H_PTE_TABLE_SIZE       (sizeof(pte_t) << H_PTE_INDEX_SIZE)
+#define H_PMD_TABLE_SIZE       (sizeof(pmd_t) << H_PMD_INDEX_SIZE)
+#define H_PUD_TABLE_SIZE       (sizeof(pud_t) << H_PUD_INDEX_SIZE)
+#define H_PGD_TABLE_SIZE       (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
  
  /* With 4k base page size, hugepage PTEs go at the PMD level */
  #define MIN_HUGEPTE_SHIFT      PMD_SHIFT
  
-/* PUD_SHIFT determines what a third-level page table entry can map */
-#define PUD_SHIFT      (PMD_SHIFT + PMD_INDEX_SIZE)
-#define PUD_SIZE       (1UL << PUD_SHIFT)
-#define PUD_MASK       (~(PUD_SIZE-1))
-
-/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
-#define PGDIR_SHIFT    (PUD_SHIFT + PUD_INDEX_SIZE)
-#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK     (~(PGDIR_SIZE-1))
-
-/* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS                0
-/* Bits to mask out from a PUD to get to the PMD page */
-#define PUD_MASKED_BITS                0
-/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS                0
-
  /* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \
-                        _PAGE_F_SECOND | _PAGE_F_GIX)
-
-/* shift to put page number into pte */
-#define PTE_RPN_SHIFT  (12)
-#define PTE_RPN_SIZE   (45)    /* gives 57-bit real addresses */
-
-#define _PAGE_4K_PFN           0
-#ifndef __ASSEMBLY__
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
+                        H_PAGE_F_SECOND | H_PAGE_F_GIX)
+/*
+ * Not supported by 4k linux page size
+ */
+#define H_PAGE_4K_PFN  0x0
+#define H_PAGE_THP_HUGE 0x0
+#define H_PAGE_COMBO   0x0
+#define H_PTE_FRAG_NR  0
+#define H_PTE_FRAG_SIZE_SHIFT  0
  /*
   * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
   */
@@ -64,37 +37,76 @@
         remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
  
  #ifdef CONFIG_HUGETLB_PAGE
-/*
- * For 4k page size, we support explicit hugepage via hugepd
- */
-static inline int pmd_huge(pmd_t pmd)
+static inline int hash__hugepd_ok(hugepd_t hpd)
+{
+       /*
+        * if it is not a pte and have hugepd shift mask
+        * set, then it is a hugepd directory pointer
+        */
+       if (!(hpd.pd & _PAGE_PTE) &&
+           ((hpd.pd & HUGEPD_SHIFT_MASK) != 0))
+               return true;
+       return false;
+}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+       BUG();
+       return NULL;
+}
+
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
  {
+       BUG();
         return 0;
  }
  
-static inline int pud_huge(pud_t pud)
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+                                          int index)
  {
+       BUG();
         return 0;
  }
  
-static inline int pgd_huge(pgd_t pgd)
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+                                       unsigned int index, unsigned int hidx)
+{
+       BUG();
+}
+
+static inline int hash__pmd_trans_huge(pmd_t pmd)
  {
         return 0;
  }
-#define pgd_huge pgd_huge
  
-static inline int hugepd_ok(hugepd_t hpd)
+static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
  {
-       /*
-        * if it is not a pte and have hugepd shift mask
-        * set, then it is a hugepd directory pointer
-        */
-       if (!(hpd.pd & _PAGE_PTE) &&
-           ((hpd.pd & HUGEPD_SHIFT_MASK) != 0))
-               return true;
-       return false;
+       BUG();
+       return 0;
  }
-#define is_hugepd(hpd)         (hugepd_ok(hpd))
+
+static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
+{
+       BUG();
+       return pmd;
+}
+
+extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm,
+                                          unsigned long addr, pmd_t *pmdp,
+                                          unsigned long clr, unsigned long set);
+extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
+                                  unsigned long address, pmd_t *pmdp);
+extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                        pgtable_t pgtable);
+extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                                     unsigned long address, pmd_t *pmdp);
+extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                      unsigned long addr, pmd_t *pmdp);
+extern int hash__has_transparent_hugepage(void);
  #endif
  
  #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h

index 0a7956a80a08196433471f4d8703ebf5a2ee6c6f..5aae4f530c21d7ceb2ae8c190d26030d4da0172d 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -1,73 +1,44 @@
  #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
  #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
  
-#define PTE_INDEX_SIZE  8
-#define PMD_INDEX_SIZE  5
-#define PUD_INDEX_SIZE 5
-#define PGD_INDEX_SIZE  12
-
-#define PTRS_PER_PTE   (1 << PTE_INDEX_SIZE)
-#define PTRS_PER_PMD   (1 << PMD_INDEX_SIZE)
-#define PTRS_PER_PUD   (1 << PUD_INDEX_SIZE)
-#define PTRS_PER_PGD   (1 << PGD_INDEX_SIZE)
+#define H_PTE_INDEX_SIZE  8
+#define H_PMD_INDEX_SIZE  5
+#define H_PUD_INDEX_SIZE  5
+#define H_PGD_INDEX_SIZE  12
  
  /* With 4k base page size, hugepage PTEs go at the PMD level */
  #define MIN_HUGEPTE_SHIFT      PAGE_SHIFT
  
-/* PMD_SHIFT determines what a second-level page table entry can map */
-#define PMD_SHIFT      (PAGE_SHIFT + PTE_INDEX_SIZE)
-#define PMD_SIZE       (1UL << PMD_SHIFT)
-#define PMD_MASK       (~(PMD_SIZE-1))
-
-/* PUD_SHIFT determines what a third-level page table entry can map */
-#define PUD_SHIFT      (PMD_SHIFT + PMD_INDEX_SIZE)
-#define PUD_SIZE       (1UL << PUD_SHIFT)
-#define PUD_MASK       (~(PUD_SIZE-1))
-
-/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
-#define PGDIR_SHIFT    (PUD_SHIFT + PUD_INDEX_SIZE)
-#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK     (~(PGDIR_SIZE-1))
-
-#define _PAGE_COMBO    0x00001000 /* this is a combo 4k page */
-#define _PAGE_4K_PFN   0x00002000 /* PFN is for a single 4k page */
+#define H_PAGE_COMBO   0x00001000 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN  0x00002000 /* PFN is for a single 4k page */
  /*
- * Used to track subpage group valid if _PAGE_COMBO is set
- * This overloads _PAGE_F_GIX and _PAGE_F_SECOND
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
   */
-#define _PAGE_COMBO_VALID      (_PAGE_F_GIX | _PAGE_F_SECOND)
+#define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
  
-/* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \
-                        _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO)
-
-/* Shift to put page number into pte.
- *
- * That gives us a max RPN of 41 bits, which means a max of 57 bits
- * of addressable physical space, or 53 bits for the special 4k PFNs.
+/*
+ * Used to track subpage group valid if H_PAGE_COMBO is set
+ * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND
   */
-#define PTE_RPN_SHIFT  (16)
-#define PTE_RPN_SIZE   (41)
+#define H_PAGE_COMBO_VALID     (H_PAGE_F_GIX | H_PAGE_F_SECOND)
  
+/* PTE flags to conserve for HPTE identification */
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
+                        H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
  /*
   * we support 16 fragments per PTE page of 64K size.
   */
-#define PTE_FRAG_NR    16
+#define H_PTE_FRAG_NR  16
  /*
   * We use a 2K PTE page fragment and another 2K for storing
   * real_pte_t hash index
   */
-#define PTE_FRAG_SIZE_SHIFT  12
+#define H_PTE_FRAG_SIZE_SHIFT  12
  #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
  
-/* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS                0xc0000000000000ffUL
-/* Bits to mask out from a PUD to get to the PMD page */
-#define PUD_MASKED_BITS                0xc0000000000000ffUL
-/* Bits to mask out from a PGD to get to the PUD page */
-#define PGD_MASKED_BITS                0xc0000000000000ffUL
-
  #ifndef __ASSEMBLY__
+#include <asm/errno.h>
  
  /*
   * With 64K pages on hash table, we have a special PTE format that
@@ -83,9 +54,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
  
         rpte.pte = pte;
         rpte.hidx = 0;
-       if (pte_val(pte) & _PAGE_COMBO) {
+       if (pte_val(pte) & H_PAGE_COMBO) {
                 /*
-                * Make sure we order the hidx load against the _PAGE_COMBO
+                * Make sure we order the hidx load against the H_PAGE_COMBO
                  * check. The store side ordering is done in __hash_page_4K
                  */
                 smp_rmb();
@@ -97,9 +68,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
  
  static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
  {
-       if ((pte_val(rpte.pte) & _PAGE_COMBO))
+       if ((pte_val(rpte.pte) & H_PAGE_COMBO))
                 return (rpte.hidx >> (index<<2)) & 0xf;
-       return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf;
+       return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
  }
  
  #define __rpte_to_pte(r)       ((r).pte)
@@ -122,79 +93,32 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
  #define pte_iterate_hashed_end() } while(0); } } while(0)
  
  #define pte_pagesize_index(mm, addr, pte)      \
-       (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
-
-#define remap_4k_pfn(vma, addr, pfn, prot)                             \
-       (WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL :  \
-               remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,        \
-                       __pgprot(pgprot_val((prot)) | _PAGE_4K_PFN)))
-
-#define PTE_TABLE_SIZE PTE_FRAG_SIZE
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE))
-#else
-#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
-#endif
-#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
-#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
-
-#ifdef CONFIG_HUGETLB_PAGE
-/*
- * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
- * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
- *
- * Defined in such a way that we can optimize away code block at build time
- * if CONFIG_HUGETLB_PAGE=n.
- */
-static inline int pmd_huge(pmd_t pmd)
-{
-       /*
-        * leaf pte for huge page
-        */
-       return !!(pmd_val(pmd) & _PAGE_PTE);
-}
-
-static inline int pud_huge(pud_t pud)
-{
-       /*
-        * leaf pte for huge page
-        */
-       return !!(pud_val(pud) & _PAGE_PTE);
-}
+       (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
  
-static inline int pgd_huge(pgd_t pgd)
+extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                          unsigned long pfn, unsigned long size, pgprot_t);
+static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
+                                unsigned long pfn, pgprot_t prot)
  {
-       /*
-        * leaf pte for huge page
-        */
-       return !!(pgd_val(pgd) & _PAGE_PTE);
+       if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) {
+               WARN(1, "remap_4k_pfn called with wrong pfn value\n");
+               return -EINVAL;
+       }
+       return remap_pfn_range(vma, addr, pfn, PAGE_SIZE,
+                              __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN));
  }
-#define pgd_huge pgd_huge
  
-#ifdef CONFIG_DEBUG_VM
-extern int hugepd_ok(hugepd_t hpd);
-#define is_hugepd(hpd)               (hugepd_ok(hpd))
+#define H_PTE_TABLE_SIZE       PTE_FRAG_SIZE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define H_PMD_TABLE_SIZE       ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \
+                                (sizeof(unsigned long) << PMD_INDEX_SIZE))
  #else
-/*
- * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
- * need to setup hugepage directory for them. Our pte and page directory format
- * enable us to have this enabled.
- */
-static inline int hugepd_ok(hugepd_t hpd)
-{
-       return 0;
-}
-#define is_hugepd(pdep)                        0
-#endif /* CONFIG_DEBUG_VM */
-
-#endif /* CONFIG_HUGETLB_PAGE */
+#define H_PMD_TABLE_SIZE       (sizeof(pmd_t) << PMD_INDEX_SIZE)
+#endif
+#define H_PUD_TABLE_SIZE       (sizeof(pud_t) << PUD_INDEX_SIZE)
+#define H_PGD_TABLE_SIZE       (sizeof(pgd_t) << PGD_INDEX_SIZE)
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
-                                        unsigned long addr,
-                                        pmd_t *pmdp,
-                                        unsigned long clr,
-                                        unsigned long set);
  static inline char *get_hpte_slot_array(pmd_t *pmdp)
  {
         /*
@@ -253,50 +177,35 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
   * that for explicit huge pages.
   *
   */
-static inline int pmd_trans_huge(pmd_t pmd)
+static inline int hash__pmd_trans_huge(pmd_t pmd)
  {
-       return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) ==
-                 (_PAGE_PTE | _PAGE_THP_HUGE));
+       return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
+                 (_PAGE_PTE | H_PAGE_THP_HUGE));
  }
  
-static inline int pmd_large(pmd_t pmd)
+static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
  {
-       return !!(pmd_val(pmd) & _PAGE_PTE);
+       return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
  }
  
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
  {
-       return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-}
-
-#define __HAVE_ARCH_PMD_SAME
-static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
-{
-       return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
-}
-
-static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
-                                             unsigned long addr, pmd_t *pmdp)
-{
-       unsigned long old;
-
-       if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
-               return 0;
-       old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
-       return ((old & _PAGE_ACCESSED) != 0);
-}
-
-#define __HAVE_ARCH_PMDP_SET_WRPROTECT
-static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
-                                     pmd_t *pmdp)
-{
-
-       if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
-               return;
-
-       pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
+       return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE));
  }
  
+extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm,
+                                          unsigned long addr, pmd_t *pmdp,
+                                          unsigned long clr, unsigned long set);
+extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
+                                  unsigned long address, pmd_t *pmdp);
+extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                        pgtable_t pgtable);
+extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                                     unsigned long address, pmd_t *pmdp);
+extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                      unsigned long addr, pmd_t *pmdp);
+extern int hash__has_transparent_hugepage(void);
  #endif /*  CONFIG_TRANSPARENT_HUGEPAGE */
  #endif /* __ASSEMBLY__ */
  
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h

index d0ee6fcef823045dd5e98340d65cdbfcdb298243..f61cad3de4e69ec093674355332f7d3ee093cf2b 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -13,48 +13,12 @@
   * We could create separate kernel read-only if we used the 3 PP bits
   * combinations that newer processors provide but we currently don't.
   */
-#define _PAGE_BIT_SWAP_TYPE    0
-
-#define _PAGE_EXEC             0x00001 /* execute permission */
-#define _PAGE_RW               0x00002 /* read & write access allowed */
-#define _PAGE_READ             0x00004 /* read access allowed */
-#define _PAGE_USER             0x00008 /* page may be accessed by userspace */
-#define _PAGE_GUARDED          0x00010 /* G: guarded (side-effect) page */
-/* M (memory coherence) is always set in the HPTE, so we don't need it here */
-#define _PAGE_COHERENT         0x0
-#define _PAGE_NO_CACHE         0x00020 /* I: cache inhibit */
-#define _PAGE_WRITETHRU                0x00040 /* W: cache write-through */
-#define _PAGE_DIRTY            0x00080 /* C: page changed */
-#define _PAGE_ACCESSED         0x00100 /* R: page referenced */
-#define _PAGE_SPECIAL          0x00400 /* software: special page */
-#define _PAGE_BUSY             0x00800 /* software: PTE & hash are busy */
-
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SOFT_DIRTY       0x200 /* software: software dirty tracking */
-#else
-#define _PAGE_SOFT_DIRTY       0x000
-#endif
-
-#define _PAGE_F_GIX_SHIFT      57
-#define _PAGE_F_GIX            (7ul << 57)     /* HPTE index within HPTEG */
-#define _PAGE_F_SECOND         (1ul << 60)     /* HPTE is in 2ndary HPTEG */
-#define _PAGE_HASHPTE          (1ul << 61)     /* PTE has associated HPTE */
-#define _PAGE_PTE              (1ul << 62)     /* distinguishes PTEs from pointers */
-#define _PAGE_PRESENT          (1ul << 63)     /* pte contains a translation */
-
-/*
- * We need to differentiate between explicit huge page and THP huge
- * page, since THP huge page also need to track real subpage details
- */
-#define _PAGE_THP_HUGE  _PAGE_4K_PFN
-
-/*
- * set of bits not changed in pmd_modify.
- */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
-                        _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \
-                        _PAGE_SOFT_DIRTY)
-
+#define H_PAGE_BUSY            0x00800 /* software: PTE & hash are busy */
+#define H_PTE_NONE_MASK                _PAGE_HPTEFLAGS
+#define H_PAGE_F_GIX_SHIFT     57
+#define H_PAGE_F_GIX           (7ul << 57)     /* HPTE index within HPTEG */
+#define H_PAGE_F_SECOND                (1ul << 60)     /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_HASHPTE         (1ul << 61)     /* PTE has associated HPTE */
  
  #ifdef CONFIG_PPC_64K_PAGES
  #include <asm/book3s/64/hash-64k.h>
@@ -65,29 +29,33 @@
  /*
   * Size of EA range mapped by our pagetables.
   */
-#define PGTABLE_EADDR_SIZE     (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
-                                PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
-#define PGTABLE_RANGE          (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
+#define H_PGTABLE_EADDR_SIZE   (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \
+                                H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT)
+#define H_PGTABLE_RANGE                (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE)
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PMD_CACHE_INDEX        (PMD_INDEX_SIZE + 1)
+/*
+ * only with hash we need to use the second half of pmd page table
+ * to store pointer to deposited pgtable_t
+ */
+#define H_PMD_CACHE_INDEX      (H_PMD_INDEX_SIZE + 1)
  #else
-#define PMD_CACHE_INDEX        PMD_INDEX_SIZE
+#define H_PMD_CACHE_INDEX      H_PMD_INDEX_SIZE
  #endif
  /*
   * Define the address range of the kernel non-linear virtual area
   */
-#define KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000)
+#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000)
+#define H_KERN_VIRT_SIZE       ASM_CONST(0x0000100000000000)
  
  /*
   * The vmalloc space starts at the beginning of that region, and
   * occupies half of it on hash CPUs and a quarter of it on Book3E
   * (we keep a quarter for the virtual memmap)
   */
-#define VMALLOC_START  KERN_VIRT_START
-#define VMALLOC_SIZE   (KERN_VIRT_SIZE >> 1)
-#define VMALLOC_END    (VMALLOC_START + VMALLOC_SIZE)
+#define H_VMALLOC_START        H_KERN_VIRT_START
+#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1)
+#define H_VMALLOC_END  (H_VMALLOC_START + H_VMALLOC_SIZE)
  
  /*
   * Region IDs
@@ -96,7 +64,7 @@
  #define REGION_MASK            (0xfUL << REGION_SHIFT)
  #define REGION_ID(ea)          (((unsigned long)(ea)) >> REGION_SHIFT)
  
-#define VMALLOC_REGION_ID      (REGION_ID(VMALLOC_START))
+#define VMALLOC_REGION_ID      (REGION_ID(H_VMALLOC_START))
  #define KERNEL_REGION_ID       (REGION_ID(PAGE_OFFSET))
  #define VMEMMAP_REGION_ID      (0xfUL) /* Server only */
  #define USER_REGION_ID         (0UL)
@@ -105,381 +73,97 @@
   * Defines the address of the vmemap area, in its own region on
   * hash table CPUs.
   */
-#define VMEMMAP_BASE           (VMEMMAP_REGION_ID << REGION_SHIFT)
+#define H_VMEMMAP_BASE         (VMEMMAP_REGION_ID << REGION_SHIFT)
  
  #ifdef CONFIG_PPC_MM_SLICES
  #define HAVE_ARCH_UNMAPPED_AREA
  #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
  #endif /* CONFIG_PPC_MM_SLICES */
  
-/* No separate kernel read-only */
-#define _PAGE_KERNEL_RW                (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
-#define _PAGE_KERNEL_RO                 _PAGE_KERNEL_RW
-#define _PAGE_KERNEL_RWX       (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC)
-
-/* Strong Access Ordering */
-#define _PAGE_SAO              (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT)
-
-/* No page size encoding in the linux PTE */
-#define _PAGE_PSIZE            0
  
  /* PTEIDX nibble */
  #define _PTEIDX_SECONDARY      0x8
  #define _PTEIDX_GROUP_IX       0x7
  
-/* Hash table based platforms need atomic updates of the linux PTE */
-#define PTE_ATOMIC_UPDATES     1
-#define _PTE_NONE_MASK _PAGE_HPTEFLAGS
-/*
- * The mask convered by the RPN must be a ULL on 32-bit platforms with
- * 64-bit PTEs
- */
-#define PTE_RPN_MASK   (((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT)
-/*
- * _PAGE_CHG_MASK masks of bits that are to be preserved across
- * pgprot changes
- */
-#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
-                        _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \
-                        _PAGE_SOFT_DIRTY)
-/*
- * Mask of bits returned by pte_pgprot()
- */
-#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
-                        _PAGE_WRITETHRU | _PAGE_4K_PFN | \
-                        _PAGE_USER | _PAGE_ACCESSED |  \
-                        _PAGE_RW |  _PAGE_DIRTY | _PAGE_EXEC | \
-                        _PAGE_SOFT_DIRTY)
-/*
- * We define 2 sets of base prot bits, one for basic pages (ie,
- * cacheable kernel and user pages) and one for non cacheable
- * pages. We always set _PAGE_COHERENT when SMP is enabled or
- * the processor might need it for DMA coherency.
- */
-#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
-#define _PAGE_BASE     (_PAGE_BASE_NC | _PAGE_COHERENT)
-
-/* Permission masks used to generate the __P and __S table,
- *
- * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
- *
- * Write permissions imply read permissions for now (we could make write-only
- * pages on BookE but we don't bother for now). Execute permission control is
- * possible on platforms that define _PAGE_EXEC
- *
- * Note due to the way vm flags are laid out, the bits are XWR
- */
-#define PAGE_NONE      __pgprot(_PAGE_BASE)
-#define PAGE_SHARED    __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
-#define PAGE_SHARED_X  __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \
-                                _PAGE_EXEC)
-#define PAGE_COPY      __pgprot(_PAGE_BASE | _PAGE_USER )
-#define PAGE_COPY_X    __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-#define PAGE_READONLY  __pgprot(_PAGE_BASE | _PAGE_USER )
-#define PAGE_READONLY_X        __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC)
-
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_X
-#define __P101 PAGE_READONLY_X
-#define __P110 PAGE_COPY_X
-#define __P111 PAGE_COPY_X
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_X
-#define __S101 PAGE_READONLY_X
-#define __S110 PAGE_SHARED_X
-#define __S111 PAGE_SHARED_X
-
-/* Permission masks used for kernel mappings */
-#define PAGE_KERNEL    __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
-#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
-                                _PAGE_NO_CACHE)
-#define PAGE_KERNEL_NCG        __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
-                                _PAGE_NO_CACHE | _PAGE_GUARDED)
-#define PAGE_KERNEL_X  __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
-#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
-#define PAGE_KERNEL_ROX        __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-
-/* Protection used for kernel text. We want the debuggers to be able to
- * set breakpoints anywhere, so don't write protect the kernel text
- * on platforms where such control is possible.
- */
-#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
-       defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
-#define PAGE_KERNEL_TEXT       PAGE_KERNEL_X
-#else
-#define PAGE_KERNEL_TEXT       PAGE_KERNEL_ROX
-#endif
-
-/* Make modules code happy. We don't set RO yet */
-#define PAGE_KERNEL_EXEC       PAGE_KERNEL_X
-#define PAGE_AGP               (PAGE_KERNEL_NC)
-
-#define PMD_BAD_BITS           (PTE_TABLE_SIZE-1)
-#define PUD_BAD_BITS           (PMD_TABLE_SIZE-1)
+#define H_PMD_BAD_BITS         (PTE_TABLE_SIZE-1)
+#define H_PUD_BAD_BITS         (PMD_TABLE_SIZE-1)
  
  #ifndef __ASSEMBLY__
-#define        pmd_bad(pmd)            (pmd_val(pmd) & PMD_BAD_BITS)
-#define pmd_page_vaddr(pmd)    __va(pmd_val(pmd) & ~PMD_MASKED_BITS)
-
-#define        pud_bad(pud)            (pud_val(pud) & PUD_BAD_BITS)
-#define pud_page_vaddr(pud)    __va(pud_val(pud) & ~PUD_MASKED_BITS)
-
-/* Pointers in the page table tree are physical addresses */
-#define __pgtable_ptr_val(ptr) __pa(ptr)
-
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
-#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
-#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
-#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
+#define        hash__pmd_bad(pmd)              (pmd_val(pmd) & H_PMD_BAD_BITS)
+#define        hash__pud_bad(pud)              (pud_val(pud) & H_PUD_BAD_BITS)
+static inline int hash__pgd_bad(pgd_t pgd)
+{
+       return (pgd_val(pgd) == 0);
+}
  
  extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep, unsigned long pte, int huge);
  extern unsigned long htab_convert_pte_flags(unsigned long pteflags);
  /* Atomic PTE updates */
-static inline unsigned long pte_update(struct mm_struct *mm,
-                                      unsigned long addr,
-                                      pte_t *ptep, unsigned long clr,
-                                      unsigned long set,
-                                      int huge)
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+                                        unsigned long addr,
+                                        pte_t *ptep, unsigned long clr,
+                                        unsigned long set,
+                                        int huge)
  {
-       unsigned long old, tmp;
+       __be64 old_be, tmp_be;
+       unsigned long old;
  
         __asm__ __volatile__(
         "1:     ldarx   %0,0,%3         # pte_update\n\
-       andi.   %1,%0,%6\n\
+       and.    %1,%0,%6\n\
         bne-    1b \n\
         andc    %1,%0,%4 \n\
         or      %1,%1,%7\n\
         stdcx.  %1,0,%3 \n\
         bne-    1b"
-       : "=&r" (old), "=&r" (tmp), "=m" (*ptep)
-       : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
+       : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep)
+       : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
+         "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
         : "cc" );
         /* huge pages use the old page table lock */
         if (!huge)
                 assert_pte_locked(mm, addr);
  
-       if (old & _PAGE_HASHPTE)
+       old = be64_to_cpu(old_be);
+       if (old & H_PAGE_HASHPTE)
                 hpte_need_flush(mm, addr, ptep, old, huge);
  
         return old;
  }
  
-static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
-                                             unsigned long addr, pte_t *ptep)
-{
-       unsigned long old;
-
-       if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
-               return 0;
-       old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
-       return (old & _PAGE_ACCESSED) != 0;
-}
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(__vma, __addr, __ptep)                  \
-({                                                                        \
-       int __r;                                                           \
-       __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
-       __r;                                                               \
-})
-
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
-                                     pte_t *ptep)
-{
-
-       if ((pte_val(*ptep) & _PAGE_RW) == 0)
-               return;
-
-       pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
-                                          unsigned long addr, pte_t *ptep)
-{
-       if ((pte_val(*ptep) & _PAGE_RW) == 0)
-               return;
-
-       pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(__vma, __address, __ptep)               \
-({                                                                     \
-       int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \
-                                                 __ptep);              \
-       __young;                                                        \
-})
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
-                                      unsigned long addr, pte_t *ptep)
-{
-       unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
-       return __pte(old);
-}
-
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
-                            pte_t * ptep)
-{
-       pte_update(mm, addr, ptep, ~0UL, 0, 0);
-}
-
-
  /* Set the dirty and/or accessed bits atomically in a linux PTE, this
   * function doesn't need to flush the hash entry
   */
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void hash__ptep_set_access_flags(pte_t *ptep, pte_t entry)
  {
-       unsigned long bits = pte_val(entry) &
-               (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC |
-                _PAGE_SOFT_DIRTY);
+       __be64 old, tmp, val, mask;
  
-       unsigned long old, tmp;
+       mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE |
+                          _PAGE_EXEC | _PAGE_SOFT_DIRTY);
+
+       val = pte_raw(entry) & mask;
  
         __asm__ __volatile__(
         "1:     ldarx   %0,0,%4\n\
-               andi.   %1,%0,%6\n\
+               and.    %1,%0,%6\n\
                 bne-    1b \n\
                 or      %0,%3,%0\n\
                 stdcx.  %0,0,%4\n\
                 bne-    1b"
         :"=&r" (old), "=&r" (tmp), "=m" (*ptep)
-       :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY)
+       :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY))
         :"cc");
  }
  
-static inline int pgd_bad(pgd_t pgd)
-{
-       return (pgd_val(pgd) == 0);
-}
-
-#define __HAVE_ARCH_PTE_SAME
-#define pte_same(A,B)  (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
-static inline unsigned long pgd_page_vaddr(pgd_t pgd)
-{
-       return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS);
-}
-
-
-/* Generic accessors to PTE bits */
-static inline int pte_write(pte_t pte)         { return !!(pte_val(pte) & _PAGE_RW);}
-static inline int pte_dirty(pte_t pte)         { return !!(pte_val(pte) & _PAGE_DIRTY); }
-static inline int pte_young(pte_t pte)         { return !!(pte_val(pte) & _PAGE_ACCESSED); }
-static inline int pte_special(pte_t pte)       { return !!(pte_val(pte) & _PAGE_SPECIAL); }
-static inline int pte_none(pte_t pte)          { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
-static inline pgprot_t pte_pgprot(pte_t pte)   { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
-
-#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
-static inline bool pte_soft_dirty(pte_t pte)
-{
-       return !!(pte_val(pte) & _PAGE_SOFT_DIRTY);
-}
-static inline pte_t pte_mksoft_dirty(pte_t pte)
-{
-       return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_clear_soft_dirty(pte_t pte)
-{
-       return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY);
-}
-#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
-
-#ifdef CONFIG_NUMA_BALANCING
-/*
- * These work without NUMA balancing but the kernel does not care. See the
- * comment in include/asm-generic/pgtable.h . On powerpc, this will only
- * work for user pages and always return true for kernel pages.
- */
-static inline int pte_protnone(pte_t pte)
-{
-       return (pte_val(pte) &
-               (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
-static inline int pte_present(pte_t pte)
-{
-       return !!(pte_val(pte) & _PAGE_PRESENT);
-}
-
-/* Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * Even if PTEs can be unsigned long long, a PFN is always an unsigned
- * long for now.
- */
-static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
-{
-       return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) |
-                    pgprot_val(pgprot));
-}
-
-static inline unsigned long pte_pfn(pte_t pte)
-{
-       return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT;
-}
-
-/* Generic modifiers for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte)
-{
-       return __pte(pte_val(pte) & ~_PAGE_RW);
-}
-
-static inline pte_t pte_mkclean(pte_t pte)
-{
-       return __pte(pte_val(pte) & ~_PAGE_DIRTY);
-}
-
-static inline pte_t pte_mkold(pte_t pte)
-{
-       return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
-}
-
-static inline pte_t pte_mkwrite(pte_t pte)
-{
-       return __pte(pte_val(pte) | _PAGE_RW);
-}
-
-static inline pte_t pte_mkdirty(pte_t pte)
+static inline int hash__pte_same(pte_t pte_a, pte_t pte_b)
  {
-       return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+       return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
  }
  
-static inline pte_t pte_mkyoung(pte_t pte)
+static inline int hash__pte_none(pte_t pte)
  {
-       return __pte(pte_val(pte) | _PAGE_ACCESSED);
-}
-
-static inline pte_t pte_mkspecial(pte_t pte)
-{
-       return __pte(pte_val(pte) | _PAGE_SPECIAL);
-}
-
-static inline pte_t pte_mkhuge(pte_t pte)
-{
-       return pte;
-}
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-       return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+       return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0;
  }
  
  /* This low level function performs the actual PTE insertion
@@ -487,8 +171,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
   * an horrible mess that I'm not going to try to clean up now but
   * I'm keeping it in one place rather than spread around
   */
-static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
-                               pte_t *ptep, pte_t pte, int percpu)
+static inline void hash__set_pte_at(struct mm_struct *mm, unsigned long addr,
+                                 pte_t *ptep, pte_t pte, int percpu)
  {
         /*
          * Anything else just stores the PTE normally. That covers all 64-bit
@@ -497,53 +181,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
         *ptep = pte;
  }
  
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-
-#define _PAGE_CACHE_CTL        (_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \
-                        _PAGE_WRITETHRU)
-
-#define pgprot_noncached pgprot_noncached
-static inline pgprot_t pgprot_noncached(pgprot_t prot)
-{
-       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-                       _PAGE_NO_CACHE | _PAGE_GUARDED);
-}
-
-#define pgprot_noncached_wc pgprot_noncached_wc
-static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
-{
-       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-                       _PAGE_NO_CACHE);
-}
-
-#define pgprot_cached pgprot_cached
-static inline pgprot_t pgprot_cached(pgprot_t prot)
-{
-       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-                       _PAGE_COHERENT);
-}
-
-#define pgprot_cached_wthru pgprot_cached_wthru
-static inline pgprot_t pgprot_cached_wthru(pgprot_t prot)
-{
-       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
-                       _PAGE_COHERENT | _PAGE_WRITETHRU);
-}
-
-#define pgprot_cached_noncoherent pgprot_cached_noncoherent
-static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot)
-{
-       return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL);
-}
-
-#define pgprot_writecombine pgprot_writecombine
-static inline pgprot_t pgprot_writecombine(pgprot_t prot)
-{
-       return pgprot_noncached_wc(prot);
-}
-
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                                    pmd_t *pmdp, unsigned long old_pmd);
@@ -556,6 +193,14 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
+
+extern int hash__map_kernel_page(unsigned long ea, unsigned long pa,
+                            unsigned long flags);
+extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
+                                             unsigned long page_size,
+                                             unsigned long phys);
+extern void hash__vmemmap_remove_mapping(unsigned long start,
+                                    unsigned long page_size);
  #endif /* !__ASSEMBLY__ */
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h

new file mode 100644 (file)

index 0000000..60f4764
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+/*
+ * For radix we want generic code to handle hugetlb. But then if we want
+ * both hash and radix to be enabled together we need to work around the
+ * limitations.
+ */
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+                               unsigned long len, unsigned long pgoff,
+                               unsigned long flags);
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h

index 0cea4807e26fd2fe5a7c4a834c50637bd70ec157..290157e8d5b2498bce2d6ffb14261124d47a25a7 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_MMU_HASH64_H_
-#define _ASM_POWERPC_MMU_HASH64_H_
+#ifndef _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_
+#define _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_
  /*
   * PowerPC64 memory management structures
   *
@@ -78,6 +78,10 @@
  #define HPTE_V_SECONDARY       ASM_CONST(0x0000000000000002)
  #define HPTE_V_VALID           ASM_CONST(0x0000000000000001)
  
+/*
+ * ISA 3.0 has a different HPTE format.
+ */
+#define HPTE_R_3_0_SSIZE_SHIFT 58
  #define HPTE_R_PP0             ASM_CONST(0x8000000000000000)
  #define HPTE_R_TS              ASM_CONST(0x4000000000000000)
  #define HPTE_R_KEY_HI          ASM_CONST(0x3000000000000000)
@@ -115,6 +119,7 @@
  #define POWER7_TLB_SETS                128     /* # sets in POWER7 TLB */
  #define POWER8_TLB_SETS                512     /* # sets in POWER8 TLB */
  #define POWER9_TLB_SETS_HASH   256     /* # sets in POWER9 TLB Hash mode */
+#define POWER9_TLB_SETS_RADIX  128     /* # sets in POWER9 TLB Radix mode */
  
  #ifndef __ASSEMBLY__
  
@@ -127,24 +132,6 @@ extern struct hash_pte *htab_address;
  extern unsigned long htab_size_bytes;
  extern unsigned long htab_hash_mask;
  
-/*
- * Page size definition
- *
- *    shift : is the "PAGE_SHIFT" value for that page size
- *    sllp  : is a bit mask with the value of SLB L || LP to be or'ed
- *            directly to a slbmte "vsid" value
- *    penc  : is the HPTE encoding mask for the "LP" field:
- *
- */
-struct mmu_psize_def
-{
-       unsigned int    shift;  /* number of bits */
-       int             penc[MMU_PAGE_COUNT];   /* HPTE encoding */
-       unsigned int    tlbiel; /* tlbiel supported for that page size */
-       unsigned long   avpnm;  /* bits to mask out in AVPN in the HPTE */
-       unsigned long   sllp;   /* SLB L||LP (exact mask to use in slbmte) */
-};
-extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
  
  static inline int shift_to_mmu_psize(unsigned int shift)
  {
@@ -210,11 +197,6 @@ static inline int segment_shift(int ssize)
  /*
   * The current system page and segment sizes
   */
-extern int mmu_linear_psize;
-extern int mmu_virtual_psize;
-extern int mmu_vmalloc_psize;
-extern int mmu_vmemmap_psize;
-extern int mmu_io_psize;
  extern int mmu_kernel_ssize;
  extern int mmu_highuser_ssize;
  extern u16 mmu_slb_size;
@@ -247,7 +229,8 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
          */
         v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
         v <<= HPTE_V_AVPN_SHIFT;
-       v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+               v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
         return v;
  }
  
@@ -271,8 +254,12 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
   * aligned for the requested page size
   */
  static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
-                                         int actual_psize)
+                                         int actual_psize, int ssize)
  {
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT;
+
         /* A 4K page needs no special encoding */
         if (actual_psize == MMU_PAGE_4K)
                 return pa & HPTE_R_RPN;
@@ -476,7 +463,7 @@ extern void slb_set_size(u16 size);
         add     rt,rt,rx
  
  /* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE  (PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE  (H_PGTABLE_RANGE >> 41)
  
  #ifndef __ASSEMBLY__
  
@@ -512,38 +499,6 @@ static inline void subpage_prot_free(struct mm_struct *mm) {}
  static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
  #endif /* CONFIG_PPC_SUBPAGE_PROT */
  
-typedef unsigned long mm_context_id_t;
-struct spinlock;
-
-typedef struct {
-       mm_context_id_t id;
-       u16 user_psize;         /* page size index */
-
-#ifdef CONFIG_PPC_MM_SLICES
-       u64 low_slices_psize;   /* SLB page size encodings */
-       unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
-#else
-       u16 sllp;               /* SLB page size encoding */
-#endif
-       unsigned long vdso_base;
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-       struct subpage_prot_table spt;
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
-#ifdef CONFIG_PPC_ICSWX
-       struct spinlock *cop_lockp; /* guard acop and cop_pid */
-       unsigned long acop;     /* mask of enabled coprocessor types */
-       unsigned int cop_pid;   /* pid value used with coprocessors */
-#endif /* CONFIG_PPC_ICSWX */
-#ifdef CONFIG_PPC_64K_PAGES
-       /* for 4K PTE fragment support */
-       void *pte_frag;
-#endif
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-       struct list_head iommu_group_mem_list;
-#endif
-} mm_context_t;
-
-
  #if 0
  /*
   * The code below is equivalent to this function for arguments
@@ -579,7 +534,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
         /*
          * Bad address. We return VSID 0 for that
          */
-       if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
+       if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
                 return 0;
  
         if (ssize == MMU_SEGSIZE_256M)
@@ -613,4 +568,4 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size);
  
  #endif /* __ASSEMBLY__ */
  
-#endif /* _ASM_POWERPC_MMU_HASH64_H_ */
+#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h

new file mode 100644 (file)

index 0000000..5854263
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_
+#define _ASM_POWERPC_BOOK3S_64_MMU_H_
+
+#ifndef __ASSEMBLY__
+/*
+ * Page size definition
+ *
+ *    shift : is the "PAGE_SHIFT" value for that page size
+ *    sllp  : is a bit mask with the value of SLB L || LP to be or'ed
+ *            directly to a slbmte "vsid" value
+ *    penc  : is the HPTE encoding mask for the "LP" field:
+ *
+ */
+struct mmu_psize_def {
+       unsigned int    shift;  /* number of bits */
+       int             penc[MMU_PAGE_COUNT];   /* HPTE encoding */
+       unsigned int    tlbiel; /* tlbiel supported for that page size */
+       unsigned long   avpnm;  /* bits to mask out in AVPN in the HPTE */
+       union {
+               unsigned long   sllp;   /* SLB L||LP (exact mask to use in slbmte) */
+               unsigned long ap;       /* Ap encoding used by PowerISA 3.0 */
+       };
+};
+extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+
+#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX)
+
+#endif /* __ASSEMBLY__ */
+
+/* 64-bit classic hash table MMU */
+#include <asm/book3s/64/mmu-hash.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * ISA 3.0 partition and process table entry format
+ */
+struct prtb_entry {
+       __be64 prtb0;
+       __be64 prtb1;
+};
+extern struct prtb_entry *process_tb;
+
+struct patb_entry {
+       __be64 patb0;
+       __be64 patb1;
+};
+extern struct patb_entry *partition_tb;
+
+#define PATB_HR                (1UL << 63)
+#define PATB_GR                (1UL << 63)
+#define RPDB_MASK      0x0ffffffffffff00fUL
+#define RPDB_SHIFT     (1UL << 8)
+/*
+ * Limit the process table to a PAGE_SIZE table. This
+ * also limits the max pid we can support.
+ * MAX_USER_CONTEXT * 16 bytes of space.
+ */
+#define PRTB_SIZE_SHIFT        (CONTEXT_BITS + 4)
+/*
+ * Power9 currently only supports a 64K partition table size.
+ */
+#define PATB_SIZE_SHIFT        16
+
+typedef unsigned long mm_context_id_t;
+struct spinlock;
+
+typedef struct {
+       mm_context_id_t id;
+       u16 user_psize;         /* page size index */
+
+#ifdef CONFIG_PPC_MM_SLICES
+       u64 low_slices_psize;   /* SLB page size encodings */
+       unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+#else
+       u16 sllp;               /* SLB page size encoding */
+#endif
+       unsigned long vdso_base;
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+       struct subpage_prot_table spt;
+#endif /* CONFIG_PPC_SUBPAGE_PROT */
+#ifdef CONFIG_PPC_ICSWX
+       struct spinlock *cop_lockp; /* guard acop and cop_pid */
+       unsigned long acop;     /* mask of enabled coprocessor types */
+       unsigned int cop_pid;   /* pid value used with coprocessors */
+#endif /* CONFIG_PPC_ICSWX */
+#ifdef CONFIG_PPC_64K_PAGES
+       /* for 4K PTE fragment support */
+       void *pte_frag;
+#endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       struct list_head iommu_group_mem_list;
+#endif
+} mm_context_t;
+
+/*
+ * The current system page and segment sizes
+ */
+extern int mmu_linear_psize;
+extern int mmu_virtual_psize;
+extern int mmu_vmalloc_psize;
+extern int mmu_vmemmap_psize;
+extern int mmu_io_psize;
+
+/* MMU initialization */
+extern void radix_init_native(void);
+extern void hash__early_init_mmu(void);
+extern void radix__early_init_mmu(void);
+static inline void early_init_mmu(void)
+{
+       if (radix_enabled())
+               return radix__early_init_mmu();
+       return hash__early_init_mmu();
+}
+extern void hash__early_init_mmu_secondary(void);
+extern void radix__early_init_mmu_secondary(void);
+static inline void early_init_mmu_secondary(void)
+{
+       if (radix_enabled())
+               return radix__early_init_mmu_secondary();
+       return hash__early_init_mmu_secondary();
+}
+
+extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                                        phys_addr_t first_memblock_size);
+extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                                        phys_addr_t first_memblock_size);
+static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                                             phys_addr_t first_memblock_size)
+{
+       if (radix_enabled())
+               return radix__setup_initial_memory_limit(first_memblock_base,
+                                                  first_memblock_size);
+       return hash__setup_initial_memory_limit(first_memblock_base,
+                                          first_memblock_size);
+}
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h

new file mode 100644 (file)

index 0000000..488279e
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -0,0 +1,207 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H
+#define _ASM_POWERPC_BOOK3S_64_PGALLOC_H
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+
+struct vmemmap_backing {
+       struct vmemmap_backing *list;
+       unsigned long phys;
+       unsigned long virt_addr;
+};
+extern struct vmemmap_backing *vmemmap_list;
+
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE 0xf
+
+extern struct kmem_cache *pgtable_cache[];
+#define PGT_CACHE(shift) ({                            \
+                       BUG_ON(!(shift));               \
+                       pgtable_cache[(shift) - 1];     \
+               })
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int);
+extern void pte_fragment_free(unsigned long *, int);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
+#ifdef CONFIG_SMP
+extern void __tlb_remove_table(void *_table);
+#endif
+
+static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_64K_PAGES
+       return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#else
+       struct page *page;
+       page = alloc_pages(PGALLOC_GFP, 4);
+       if (!page)
+               return NULL;
+       return (pgd_t *) page_address(page);
+#endif
+}
+
+static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_PPC_64K_PAGES
+       free_page((unsigned long)pgd);
+#else
+       free_pages((unsigned long)pgd, 4);
+#endif
+}
+
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       if (radix_enabled())
+               return radix__pgd_alloc(mm);
+       return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       if (radix_enabled())
+               return radix__pgd_free(mm, pgd);
+       kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+       pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS);
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
+                               GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+       kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+}
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+       pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS);
+}
+
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+                                  unsigned long address)
+{
+        pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE);
+}
+
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
+                               GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+       kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
+}
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+                                  unsigned long address)
+{
+        pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX);
+}
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
+                                      pte_t *pte)
+{
+       pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+                               pgtable_t pte_page)
+{
+       pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
+}
+
+static inline pgtable_t pmd_pgtable(pmd_t pmd)
+{
+       return (pgtable_t)pmd_page_vaddr(pmd);
+}
+
+#ifdef CONFIG_PPC_4K_PAGES
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+                                     unsigned long address)
+{
+       struct page *page;
+       pte_t *pte;
+
+       pte = pte_alloc_one_kernel(mm, address);
+       if (!pte)
+               return NULL;
+       page = virt_to_page(pte);
+       if (!pgtable_page_ctor(page)) {
+               __free_page(page);
+               return NULL;
+       }
+       return pte;
+}
+#else /* if CONFIG_PPC_64K_PAGES */
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       return (pte_t *)pte_fragment_alloc(mm, address, 1);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+                                     unsigned long address)
+{
+       return (pgtable_t)pte_fragment_alloc(mm, address, 0);
+}
+#endif
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+       pte_fragment_free((unsigned long *)pte, 1);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+       pte_fragment_free((unsigned long *)ptepage, 0);
+}
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+                                 unsigned long address)
+{
+       tlb_flush_pgtable(tlb, address);
+       pgtable_free_tlb(tlb, table, 0);
+}
+
+#define check_pgt_cache()      do { } while (0)
+
+#endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h

new file mode 100644 (file)

index 0000000..71e9abc
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H
+#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H
+/*
+ * hash 4k can't share hugetlb and also doesn't support THP
+ */
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_HUGETLB_PAGE
+static inline int pmd_huge(pmd_t pmd)
+{
+       /*
+        * leaf pte for huge page
+        */
+       if (radix_enabled())
+               return !!(pmd_val(pmd) & _PAGE_PTE);
+       return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+       /*
+        * leaf pte for huge page
+        */
+       if (radix_enabled())
+               return !!(pud_val(pud) & _PAGE_PTE);
+       return 0;
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+       /*
+        * leaf pte for huge page
+        */
+       if (radix_enabled())
+               return !!(pgd_val(pgd) & _PAGE_PTE);
+       return 0;
+}
+#define pgd_huge pgd_huge
+/*
+ * With radix, we have hugepage ptes in the pud and pmd entries. We don't
+ * need to setup hugepage directory for them. Our pte and page directory format
+ * enable us to have this enabled.
+ */
+static inline int hugepd_ok(hugepd_t hpd)
+{
+       if (radix_enabled())
+               return 0;
+       return hash__hugepd_ok(hpd);
+}
+#define is_hugepd(hpd)         (hugepd_ok(hpd))
+#endif /* CONFIG_HUGETLB_PAGE */
+#endif /* __ASSEMBLY__ */
+
+#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h

new file mode 100644 (file)

index 0000000..cb2d0a5
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -0,0 +1,64 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H
+#define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * We have PGD_INDEX_SIZE = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ *
+ * Defined in such a way that we can optimize away code block at build time
+ * if CONFIG_HUGETLB_PAGE=n.
+ */
+static inline int pmd_huge(pmd_t pmd)
+{
+       /*
+        * leaf pte for huge page
+        */
+       return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
+static inline int pud_huge(pud_t pud)
+{
+       /*
+        * leaf pte for huge page
+        */
+       return !!(pud_val(pud) & _PAGE_PTE);
+}
+
+static inline int pgd_huge(pgd_t pgd)
+{
+       /*
+        * leaf pte for huge page
+        */
+       return !!(pgd_val(pgd) & _PAGE_PTE);
+}
+#define pgd_huge pgd_huge
+
+#ifdef CONFIG_DEBUG_VM
+extern int hugepd_ok(hugepd_t hpd);
+#define is_hugepd(hpd)               (hugepd_ok(hpd))
+#else
+/*
+ * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
+ * need to setup hugepage directory for them. Our pte and page directory format
+ * enable us to have this enabled.
+ */
+static inline int hugepd_ok(hugepd_t hpd)
+{
+       return 0;
+}
+#define is_hugepd(pdep)                        0
+#endif /* CONFIG_DEBUG_VM */
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
+                              unsigned long pfn, pgprot_t prot)
+{
+       if (radix_enabled())
+               BUG();
+       return hash__remap_4k_pfn(vma, addr, pfn, prot);
+}
+#endif /* __ASSEMBLY__ */
+#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h

index 77d3ce05798e34e9a9e7ab8bdbca1d90ae5174a7..88a5ecaa157b5a2fd474f23759e11e2764f973e5 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1,13 +1,247 @@
  #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
  #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_
+
+/*
+ * Common bits between hash and Radix page table
+ */
+#define _PAGE_BIT_SWAP_TYPE    0
+
+#define _PAGE_EXEC             0x00001 /* execute permission */
+#define _PAGE_WRITE            0x00002 /* write access allowed */
+#define _PAGE_READ             0x00004 /* read access allowed */
+#define _PAGE_RW               (_PAGE_READ | _PAGE_WRITE)
+#define _PAGE_RWX              (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
+#define _PAGE_PRIVILEGED       0x00008 /* kernel access only */
+#define _PAGE_SAO              0x00010 /* Strong access order */
+#define _PAGE_NON_IDEMPOTENT   0x00020 /* non idempotent memory */
+#define _PAGE_TOLERANT         0x00030 /* tolerant memory, cache inhibited */
+#define _PAGE_DIRTY            0x00080 /* C: page changed */
+#define _PAGE_ACCESSED         0x00100 /* R: page referenced */
  /*
- * This file contains the functions and defines necessary to modify and use
- * the ppc64 hashed page table.
+ * Software bits
   */
+#define _RPAGE_SW0             0x2000000000000000UL
+#define _RPAGE_SW1             0x00800
+#define _RPAGE_SW2             0x00400
+#define _RPAGE_SW3             0x00200
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY       _RPAGE_SW3 /* software: software dirty tracking */
+#else
+#define _PAGE_SOFT_DIRTY       0x00000
+#endif
+#define _PAGE_SPECIAL          _RPAGE_SW2 /* software: special page */
+
+
+#define _PAGE_PTE              (1ul << 62)     /* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT          (1ul << 63)     /* pte contains a translation */
+/*
+ * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
+ * Instead of fixing all of them, add an alternate define which
+ * maps CI pte mapping.
+ */
+#define _PAGE_NO_CACHE         _PAGE_TOLERANT
+/*
+ * We support 57 bit real address in pte. Clear everything above 57, and
+ * everything below PAGE_SHIFT.
+ */
+#define PTE_RPN_MASK   (((1UL << 57) - 1) & (PAGE_MASK))
+/*
+ * set of bits not changed in pmd_modify. Even though we have hash specific bits
+ * in here, on radix we expect them to be zero.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                        _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \
+                        _PAGE_SOFT_DIRTY)
+/*
+ * user access blocked by key
+ */
+#define _PAGE_KERNEL_RW                (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY)
+#define _PAGE_KERNEL_RO                 (_PAGE_PRIVILEGED | _PAGE_READ)
+#define _PAGE_KERNEL_RWX       (_PAGE_PRIVILEGED | _PAGE_DIRTY |       \
+                                _PAGE_RW | _PAGE_EXEC)
+/*
+ * No page size encoding in the linux PTE
+ */
+#define _PAGE_PSIZE            0
+/*
+ * _PAGE_CHG_MASK is the mask of bits that are to be preserved across
+ * pgprot changes
+ */
+#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                        _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE |   \
+                        _PAGE_SOFT_DIRTY)
+/*
+ * Mask of bits returned by pte_pgprot()
+ */
+#define PAGE_PROT_BITS  (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
+                        H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
+                        _PAGE_READ | _PAGE_WRITE |  _PAGE_DIRTY | _PAGE_EXEC | \
+                        _PAGE_SOFT_DIRTY)
+/*
+ * We define 2 sets of base prot bits, one for basic pages (ie,
+ * cacheable kernel and user pages) and one for non cacheable
+ * pages. We always set _PAGE_COHERENT when SMP is enabled or
+ * the processor might need it for DMA coherency.
+ */
+#define _PAGE_BASE_NC  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE     (_PAGE_BASE_NC)
+
+/* Permission masks used to generate the __P and __S table,
+ *
+ * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
+ *
+ * Write permissions imply read permissions for now (we could make write-only
+ * pages on BookE but we don't bother for now). Execute permission control is
+ * possible on platforms that define _PAGE_EXEC
+ *
+ * Note due to the way vm flags are laid out, the bits are XWR
+ */
+#define PAGE_NONE      __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED)
+#define PAGE_SHARED    __pgprot(_PAGE_BASE | _PAGE_RW)
+#define PAGE_SHARED_X  __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC)
+#define PAGE_COPY      __pgprot(_PAGE_BASE | _PAGE_READ)
+#define PAGE_COPY_X    __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
+#define PAGE_READONLY  __pgprot(_PAGE_BASE | _PAGE_READ)
+#define PAGE_READONLY_X        __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
+
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_X
+#define __P101 PAGE_READONLY_X
+#define __P110 PAGE_COPY_X
+#define __P111 PAGE_COPY_X
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_X
+#define __S101 PAGE_READONLY_X
+#define __S110 PAGE_SHARED_X
+#define __S111 PAGE_SHARED_X
+
+/* Permission masks used for kernel mappings */
+#define PAGE_KERNEL    __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
+#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+                                _PAGE_TOLERANT)
+#define PAGE_KERNEL_NCG        __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+                                _PAGE_NON_IDEMPOTENT)
+#define PAGE_KERNEL_X  __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
+#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
+#define PAGE_KERNEL_ROX        __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
+
+/*
+ * Protection used for kernel text. We want the debuggers to be able to
+ * set breakpoints anywhere, so don't write protect the kernel text
+ * on platforms where such control is possible.
+ */
+#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \
+       defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
+#define PAGE_KERNEL_TEXT       PAGE_KERNEL_X
+#else
+#define PAGE_KERNEL_TEXT       PAGE_KERNEL_ROX
+#endif
+
+/* Make modules code happy. We don't set RO yet */
+#define PAGE_KERNEL_EXEC       PAGE_KERNEL_X
+#define PAGE_AGP               (PAGE_KERNEL_NC)
+
+#ifndef __ASSEMBLY__
+/*
+ * page table defines
+ */
+extern unsigned long __pte_index_size;
+extern unsigned long __pmd_index_size;
+extern unsigned long __pud_index_size;
+extern unsigned long __pgd_index_size;
+extern unsigned long __pmd_cache_index;
+#define PTE_INDEX_SIZE  __pte_index_size
+#define PMD_INDEX_SIZE  __pmd_index_size
+#define PUD_INDEX_SIZE  __pud_index_size
+#define PGD_INDEX_SIZE  __pgd_index_size
+#define PMD_CACHE_INDEX __pmd_cache_index
+/*
+ * Because of the use of pte fragments and THP, the sizes of the page
+ * tables are not always derived from the index sizes above.
+ */
+extern unsigned long __pte_table_size;
+extern unsigned long __pmd_table_size;
+extern unsigned long __pud_table_size;
+extern unsigned long __pgd_table_size;
+#define PTE_TABLE_SIZE __pte_table_size
+#define PMD_TABLE_SIZE __pmd_table_size
+#define PUD_TABLE_SIZE __pud_table_size
+#define PGD_TABLE_SIZE __pgd_table_size
+
+extern unsigned long __pmd_val_bits;
+extern unsigned long __pud_val_bits;
+extern unsigned long __pgd_val_bits;
+#define PMD_VAL_BITS   __pmd_val_bits
+#define PUD_VAL_BITS   __pud_val_bits
+#define PGD_VAL_BITS   __pgd_val_bits
+
+extern unsigned long __pte_frag_nr;
+#define PTE_FRAG_NR __pte_frag_nr
+extern unsigned long __pte_frag_size_shift;
+#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
+#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
+/*
+ * Pgtable size used by swapper, init in asm code
+ */
+#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
+
+#define PTRS_PER_PTE   (1 << PTE_INDEX_SIZE)
+#define PTRS_PER_PMD   (1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD   (1 << PUD_INDEX_SIZE)
+#define PTRS_PER_PGD   (1 << PGD_INDEX_SIZE)
+
+/* PMD_SHIFT determines what a second-level page table entry can map */
+#define PMD_SHIFT      (PAGE_SHIFT + PTE_INDEX_SIZE)
+#define PMD_SIZE       (1UL << PMD_SHIFT)
+#define PMD_MASK       (~(PMD_SIZE-1))
+
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT      (PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE       (1UL << PUD_SHIFT)
+#define PUD_MASK       (~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT    (PUD_SHIFT + PUD_INDEX_SIZE)
+#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
+#define PGDIR_MASK     (~(PGDIR_SIZE-1))
+
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS                0xc0000000000000ffUL
+/* Bits to mask out from a PUD to get to the PMD page */
+#define PUD_MASKED_BITS                0xc0000000000000ffUL
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS                0xc0000000000000ffUL
+
+extern unsigned long __vmalloc_start;
+extern unsigned long __vmalloc_end;
+#define VMALLOC_START  __vmalloc_start
+#define VMALLOC_END    __vmalloc_end
+
+extern unsigned long __kernel_virt_start;
+extern unsigned long __kernel_virt_size;
+#define KERN_VIRT_START __kernel_virt_start
+#define KERN_VIRT_SIZE  __kernel_virt_size
+extern struct page *vmemmap;
+extern unsigned long ioremap_bot;
+#endif /* __ASSEMBLY__ */
  
  #include <asm/book3s/64/hash.h>
-#include <asm/barrier.h>
+#include <asm/book3s/64/radix.h>
  
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/book3s/64/pgtable-64k.h>
+#else
+#include <asm/book3s/64/pgtable-4k.h>
+#endif
+
+#include <asm/barrier.h>
  /*
   * The second half of the kernel virtual space is used for IO mappings,
   * it's itself carved into the PIO region (ISA and PHB IO space) and
@@ -26,8 +260,6 @@
  #define IOREMAP_BASE   (PHB_IO_END)
  #define IOREMAP_END    (KERN_VIRT_START + KERN_VIRT_SIZE)
  
-#define vmemmap                        ((struct page *)VMEMMAP_BASE)
-
  /* Advertise special mapping type for AGP */
  #define HAVE_PAGE_AGP
  
@@ -45,7 +277,7 @@
  
  #define __real_pte(e,p)                ((real_pte_t){(e)})
  #define __rpte_to_pte(r)       ((r).pte)
-#define __rpte_to_hidx(r,index)        (pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT)
+#define __rpte_to_hidx(r,index)        (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT)
  
  #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
         do {                                                             \
@@ -62,6 +294,327 @@
  
  #endif /* __real_pte */
  
+static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr,
+                                      pte_t *ptep, unsigned long clr,
+                                      unsigned long set, int huge)
+{
+       if (radix_enabled())
+               return radix__pte_update(mm, addr, ptep, clr, set, huge);
+       return hash__pte_update(mm, addr, ptep, clr, set, huge);
+}
+/*
+ * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update.
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same
+ * function for both hash and radix.
+ */
+static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
+                                             unsigned long addr, pte_t *ptep)
+{
+       unsigned long old;
+
+       if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
+               return 0;
+       old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
+       return (old & _PAGE_ACCESSED) != 0;
+}
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young(__vma, __addr, __ptep)       \
+({                                                             \
+       int __r;                                                \
+       __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \
+       __r;                                                    \
+})
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+                                     pte_t *ptep)
+{
+
+       if ((pte_val(*ptep) & _PAGE_WRITE) == 0)
+               return;
+
+       pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+                                          unsigned long addr, pte_t *ptep)
+{
+       if ((pte_val(*ptep) & _PAGE_WRITE) == 0)
+               return;
+
+       pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+                                      unsigned long addr, pte_t *ptep)
+{
+       unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
+       return __pte(old);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+                            pte_t * ptep)
+{
+       pte_update(mm, addr, ptep, ~0UL, 0, 0);
+}
+static inline int pte_write(pte_t pte)         { return !!(pte_val(pte) & _PAGE_WRITE);}
+static inline int pte_dirty(pte_t pte)         { return !!(pte_val(pte) & _PAGE_DIRTY); }
+static inline int pte_young(pte_t pte)         { return !!(pte_val(pte) & _PAGE_ACCESSED); }
+static inline int pte_special(pte_t pte)       { return !!(pte_val(pte) & _PAGE_SPECIAL); }
+static inline pgprot_t pte_pgprot(pte_t pte)   { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline bool pte_soft_dirty(pte_t pte)
+{
+       return !!(pte_val(pte) & _PAGE_SOFT_DIRTY);
+}
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+       return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+       return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY);
+}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h . On powerpc, this will only
+ * work for user pages and always return true for kernel pages.
+ */
+static inline int pte_protnone(pte_t pte)
+{
+       return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) ==
+               (_PAGE_PRESENT | _PAGE_PRIVILEGED);
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline int pte_present(pte_t pte)
+{
+       return !!(pte_val(pte) & _PAGE_PRESENT);
+}
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * Even if PTEs can be unsigned long long, a PFN is always an unsigned
+ * long for now.
+ */
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+       return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) |
+                    pgprot_val(pgprot));
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+       return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT;
+}
+
+/* Generic modifiers for PTE bits */
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+       return __pte(pte_val(pte) & ~_PAGE_WRITE);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+       return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+       return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+       /*
+        * write implies read, hence set both
+        */
+       return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+       return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+       return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+       return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+       return pte;
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+       /* FIXME!! check whether this need to be a conditional */
+       return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
+}
+
+static inline bool pte_user(pte_t pte)
+{
+       return !(pte_val(pte) & _PAGE_PRIVILEGED);
+}
+
+/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() do { \
+       BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
+       /*                                                      \
+        * Don't have overlapping bits with _PAGE_HPTEFLAGS     \
+        * We filter HPTEFLAGS on set_pte.                      \
+        */                                                     \
+       BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+       BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY);   \
+       } while (0)
+/*
+ * On a pte we don't need to handle RADIX_TREE_EXCEPTIONAL_SHIFT.
+ */
+#define SWP_TYPE_BITS 5
+#define __swp_type(x)          (((x).val >> _PAGE_BIT_SWAP_TYPE) \
+                               & ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x)                (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT)
+#define __swp_entry(type, offset)      ((swp_entry_t) { \
+                               ((type) << _PAGE_BIT_SWAP_TYPE) \
+                               | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)})
+/*
+ * swp_entry_t must be independent of pte bits. We build a swp_entry_t from
+ * swap type and offset we get from swap and convert that to pte to find a
+ * matching pte in linux page table.
+ * Clear bits not found in swap entries here.
+ */
+#define __pte_to_swp_entry(pte)        ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
+#define __swp_entry_to_pte(x)  __pte((x).val | _PAGE_PTE)
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY   (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
+#else
+#define _PAGE_SWP_SOFT_DIRTY   0UL
+#endif /* CONFIG_MEM_SOFT_DIRTY */
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+       return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+}
+static inline bool pte_swp_soft_dirty(pte_t pte)
+{
+       return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+}
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+       return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+}
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
+static inline bool check_pte_access(unsigned long access, unsigned long ptev)
+{
+       /*
+        * Fail if access has any bit not set in ptev (_PAGE_RWX, _PAGE_PRESENT)
+        */
+       if (access & ~ptev)
+               return false;
+       /*
+        * Fail unless access and ptev agree on the _PAGE_PRIVILEGED bit
+        */
+       if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED))
+               return false;
+
+       return true;
+}
+/*
+ * Generic functions with hash/radix callbacks
+ */
+
+static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+       if (radix_enabled())
+               return radix__ptep_set_access_flags(ptep, entry);
+       return hash__ptep_set_access_flags(ptep, entry);
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t pte_a, pte_t pte_b)
+{
+       if (radix_enabled())
+               return radix__pte_same(pte_a, pte_b);
+       return hash__pte_same(pte_a, pte_b);
+}
+
+static inline int pte_none(pte_t pte)
+{
+       if (radix_enabled())
+               return radix__pte_none(pte);
+       return hash__pte_none(pte);
+}
+
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+                               pte_t *ptep, pte_t pte, int percpu)
+{
+       if (radix_enabled())
+               return radix__set_pte_at(mm, addr, ptep, pte, percpu);
+       return hash__set_pte_at(mm, addr, ptep, pte, percpu);
+}
+
+#define _PAGE_CACHE_CTL        (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
+
+#define pgprot_noncached pgprot_noncached
+static inline pgprot_t pgprot_noncached(pgprot_t prot)
+{
+       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+                       _PAGE_NON_IDEMPOTENT);
+}
+
+#define pgprot_noncached_wc pgprot_noncached_wc
+static inline pgprot_t pgprot_noncached_wc(pgprot_t prot)
+{
+       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) |
+                       _PAGE_TOLERANT);
+}
+
+#define pgprot_cached pgprot_cached
+static inline pgprot_t pgprot_cached(pgprot_t prot)
+{
+       return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL));
+}
+
+#define pgprot_writecombine pgprot_writecombine
+static inline pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+       return pgprot_noncached_wc(prot);
+}
+/*
+ * Check whether a pte mapping has the cache-inhibited property.
+ */
+static inline bool pte_ci(pte_t pte)
+{
+       unsigned long pte_v = pte_val(pte);
+
+       if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) ||
+           ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT))
+               return true;
+       return false;
+}
+
  static inline void pmd_set(pmd_t *pmdp, unsigned long val)
  {
         *pmdp = __pmd(val);
@@ -75,6 +628,13 @@ static inline void pmd_clear(pmd_t *pmdp)
  #define pmd_none(pmd)          (!pmd_val(pmd))
  #define        pmd_present(pmd)        (!pmd_none(pmd))
  
+static inline int pmd_bad(pmd_t pmd)
+{
+       if (radix_enabled())
+               return radix__pmd_bad(pmd);
+       return hash__pmd_bad(pmd);
+}
+
  static inline void pud_set(pud_t *pudp, unsigned long val)
  {
         *pudp = __pud(val);
@@ -100,6 +660,15 @@ static inline pud_t pte_pud(pte_t pte)
         return __pud(pte_val(pte));
  }
  #define pud_write(pud)         pte_write(pud_pte(pud))
+
+static inline int pud_bad(pud_t pud)
+{
+       if (radix_enabled())
+               return radix__pud_bad(pud);
+       return hash__pud_bad(pud);
+}
+
+
  #define pgd_write(pgd)         pte_write(pgd_pte(pgd))
  static inline void pgd_set(pgd_t *pgdp, unsigned long val)
  {
@@ -124,8 +693,27 @@ static inline pgd_t pte_pgd(pte_t pte)
         return __pgd(pte_val(pte));
  }
  
+static inline int pgd_bad(pgd_t pgd)
+{
+       if (radix_enabled())
+               return radix__pgd_bad(pgd);
+       return hash__pgd_bad(pgd);
+}
+
  extern struct page *pgd_page(pgd_t pgd);
  
+/* Pointers in the page table tree are physical addresses */
+#define __pgtable_ptr_val(ptr) __pa(ptr)
+
+#define pmd_page_vaddr(pmd)    __va(pmd_val(pmd) & ~PMD_MASKED_BITS)
+#define pud_page_vaddr(pud)    __va(pud_val(pud) & ~PUD_MASKED_BITS)
+#define pgd_page_vaddr(pgd)    __va(pgd_val(pgd) & ~PGD_MASKED_BITS)
+
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
+#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
+#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
+#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
+
  /*
   * Find an entry in a page-table-directory.  We combine the address region
   * (the high order N bits) and the pgd portion of the address.
@@ -156,73 +744,42 @@ extern struct page *pgd_page(pgd_t pgd);
  #define pgd_ERROR(e) \
         pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
  
-/* Encode and de-code a swap entry */
-#define MAX_SWAPFILES_CHECK() do { \
-       BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
-       /*                                                      \
-        * Don't have overlapping bits with _PAGE_HPTEFLAGS     \
-        * We filter HPTEFLAGS on set_pte.                      \
-        */                                                     \
-       BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
-       BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY);   \
-       } while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
-#define SWP_TYPE_BITS 5
-#define __swp_type(x)          (((x).val >> _PAGE_BIT_SWAP_TYPE) \
-                               & ((1UL << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x)                (((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT)
-#define __swp_entry(type, offset)      ((swp_entry_t) { \
-                               ((type) << _PAGE_BIT_SWAP_TYPE) \
-                               | (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)})
-/*
- * swp_entry_t must be independent of pte bits. We build a swp_entry_t from
- * swap type and offset we get from swap and convert that to pte to find a
- * matching pte in linux page table.
- * Clear bits not found in swap entries here.
- */
-#define __pte_to_swp_entry(pte)        ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE })
-#define __swp_entry_to_pte(x)  __pte((x).val | _PAGE_PTE)
-
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SWP_SOFT_DIRTY   (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
-#else
-#define _PAGE_SWP_SOFT_DIRTY   0UL
-#endif /* CONFIG_MEM_SOFT_DIRTY */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
+void pgtable_cache_init(void);
  
-#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
-static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+static inline int map_kernel_page(unsigned long ea, unsigned long pa,
+                                 unsigned long flags)
  {
-       return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
+       if (radix_enabled()) {
+#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM)
+               unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift;
+               WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE");
+#endif /* NOTE(review): guard tests DEBUG_VM, not CONFIG_DEBUG_VM - confirm intended */
+               return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE);
+       }
+       return hash__map_kernel_page(ea, pa, flags);
  }
-static inline bool pte_swp_soft_dirty(pte_t pte)
+
+static inline int __meminit vmemmap_create_mapping(unsigned long start,
+                                                  unsigned long page_size,
+                                                  unsigned long phys)
  {
-       return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
+       if (radix_enabled())
+               return radix__vmemmap_create_mapping(start, page_size, phys);
+       return hash__vmemmap_create_mapping(start, page_size, phys);
  }
-static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline void vmemmap_remove_mapping(unsigned long start,
+                                         unsigned long page_size)
  {
-       return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY);
+       if (radix_enabled())
+               return radix__vmemmap_remove_mapping(start, page_size);
+       return hash__vmemmap_remove_mapping(start, page_size);
  }
-#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
-
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
-void pgtable_cache_init(void);
-
+#endif
  struct page *realmode_pfn_to_page(unsigned long pfn);
  
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
-extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
-extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
-extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
-                      pmd_t *pmdp, pmd_t pmd);
-extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
-                                pmd_t *pmd);
-extern int has_transparent_hugepage(void);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-
  static inline pte_t pmd_pte(pmd_t pmd)
  {
         return __pte(pmd_val(pmd));
@@ -237,7 +794,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
  {
         return (pte_t *)pmd;
  }
-
  #define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
  #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
  #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
@@ -264,9 +820,87 @@ static inline int pmd_protnone(pmd_t pmd)
  #define __HAVE_ARCH_PMD_WRITE
  #define pmd_write(pmd)         pte_write(pmd_pte(pmd))
  
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                      pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                                pmd_t *pmd);
+extern int hash__has_transparent_hugepage(void);
+static inline int has_transparent_hugepage(void)
+{
+       if (radix_enabled())
+               return radix__has_transparent_hugepage();
+       return hash__has_transparent_hugepage();
+}
+#define has_transparent_hugepage has_transparent_hugepage
+
+static inline unsigned long
+pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
+                   unsigned long clr, unsigned long set)
+{
+       if (radix_enabled())
+               return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set);
+       return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+       return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+       return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
+}
+/*
+ * For radix we should always find H_PAGE_HASHPTE zero. Hence
+ * the below will work for radix too
+ */
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+                                             unsigned long addr, pmd_t *pmdp)
+{
+       unsigned long old;
+
+       if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0)
+               return 0;
+       old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
+       return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+                                     pmd_t *pmdp)
+{
+
+       if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0)
+               return;
+
+       pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+       if (radix_enabled())
+               return radix__pmd_trans_huge(pmd);
+       return hash__pmd_trans_huge(pmd);
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+       if (radix_enabled())
+               return radix__pmd_same(pmd_a, pmd_b);
+       return hash__pmd_same(pmd_a, pmd_b);
+}
+
  static inline pmd_t pmd_mkhuge(pmd_t pmd)
  {
-       return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE));
+       if (radix_enabled())
+               return radix__pmd_mkhuge(pmd);
+       return hash__pmd_mkhuge(pmd);
  }
  
  #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
@@ -277,37 +911,63 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
  #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
  extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                      unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
-extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
-                                 unsigned long address, pmd_t *pmdp);
  
  #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
-extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
-                                    unsigned long addr, pmd_t *pmdp);
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                           unsigned long addr, pmd_t *pmdp)
+{
+       if (radix_enabled())
+               return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
+       return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
+}
  
-extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmdp)
+{
+       if (radix_enabled())
+               return radix__pmdp_collapse_flush(vma, address, pmdp);
+       return hash__pmdp_collapse_flush(vma, address, pmdp);
+}
  #define pmdp_collapse_flush pmdp_collapse_flush
  
  #define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-                                      pgtable_t pgtable);
+static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
+                                             pmd_t *pmdp, pgtable_t pgtable)
+{
+       if (radix_enabled())
+               return radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+       return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+}
+
  #define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
+                                                   pmd_t *pmdp)
+{
+       if (radix_enabled())
+               return radix__pgtable_trans_huge_withdraw(mm, pmdp);
+       return hash__pgtable_trans_huge_withdraw(mm, pmdp);
+}
  
  #define __HAVE_ARCH_PMDP_INVALIDATE
  extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                             pmd_t *pmdp);
  
  #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-extern void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                                   unsigned long address, pmd_t *pmdp);
+static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                                          unsigned long address, pmd_t *pmdp)
+{
+       if (radix_enabled())
+               return radix__pmdp_huge_split_prepare(vma, address, pmdp);
+       return hash__pmdp_huge_split_prepare(vma, address, pmdp);
+}
  
  #define pmd_move_must_withdraw pmd_move_must_withdraw
  struct spinlock;
  static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
                                          struct spinlock *old_pmd_ptl)
  {
+       if (radix_enabled())
+               return false;
         /*
          * Archs like ppc64 use pgtable to store per pmd
          * specific information. So when we switch the pmd,
@@ -315,5 +975,6 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
          */
         return true;
  }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif /* __ASSEMBLY__ */
  #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h

new file mode 100644 (file)

index 0000000..7c3b1fe
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_POWERPC_PGTABLE_RADIX_4K_H
+#define _ASM_POWERPC_PGTABLE_RADIX_4K_H
+
+/*
+ * For 4K page size the supported index sizes (PGD/PUD/PMD/PTE) are 13/9/9/9
+ */
+#define RADIX_PTE_INDEX_SIZE  9  /* 2MB huge page */
+#define RADIX_PMD_INDEX_SIZE  9  /* 1G huge page */
+#define RADIX_PUD_INDEX_SIZE    9
+#define RADIX_PGD_INDEX_SIZE  13
+
+#endif /* _ASM_POWERPC_PGTABLE_RADIX_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h

new file mode 100644 (file)

index 0000000..82dc355
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_POWERPC_PGTABLE_RADIX_64K_H
+#define _ASM_POWERPC_PGTABLE_RADIX_64K_H
+
+/*
+ * For 64K page size the supported index sizes (PGD/PUD/PMD/PTE) are 13/9/9/5
+ */
+#define RADIX_PTE_INDEX_SIZE  5  /* 2MB huge page */
+#define RADIX_PMD_INDEX_SIZE  9  /* 1G huge page */
+#define RADIX_PUD_INDEX_SIZE    9
+#define RADIX_PGD_INDEX_SIZE  13
+
+#endif /* _ASM_POWERPC_PGTABLE_RADIX_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h

new file mode 100644 (file)

index 0000000..937d4e2
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -0,0 +1,232 @@
+#ifndef _ASM_POWERPC_PGTABLE_RADIX_H
+#define _ASM_POWERPC_PGTABLE_RADIX_H
+
+#ifndef __ASSEMBLY__
+#include <asm/cmpxchg.h>
+#endif
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/book3s/64/radix-64k.h>
+#else
+#include <asm/book3s/64/radix-4k.h>
+#endif
+
+/* An empty PTE can still have a R or C writeback */
+#define RADIX_PTE_NONE_MASK            (_PAGE_DIRTY | _PAGE_ACCESSED)
+
+/* Bits to set in a RPMD/RPUD/RPGD */
+#define RADIX_PMD_VAL_BITS             (0x8000000000000000UL | RADIX_PTE_INDEX_SIZE)
+#define RADIX_PUD_VAL_BITS             (0x8000000000000000UL | RADIX_PMD_INDEX_SIZE)
+#define RADIX_PGD_VAL_BITS             (0x8000000000000000UL | RADIX_PUD_INDEX_SIZE)
+
+/* Don't have anything in the reserved bits and leaf bits */
+#define RADIX_PMD_BAD_BITS             0x60000000000000e0UL
+#define RADIX_PUD_BAD_BITS             0x60000000000000e0UL
+#define RADIX_PGD_BAD_BITS             0x60000000000000e0UL
+
+/*
+ * Size of EA range mapped by our pagetables.
+ */
+#define RADIX_PGTABLE_EADDR_SIZE (RADIX_PTE_INDEX_SIZE + RADIX_PMD_INDEX_SIZE +        \
+                             RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT)
+#define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE)
+
+/*
+ * We support a 52-bit address space. Use the top bit for the kernel
+ * virtual mapping. Also make sure the kernel fits in the top
+ * quadrant.
+ *
+ *           +------------------+
+ *           +------------------+  Kernel virtual map (0xc008000000000000)
+ *           |                  |
+ *           |                  |
+ *           |                  |
+ * 0b11......+------------------+  Kernel linear map (0xc....)
+ *           |                  |
+ *           |     2 quadrant   |
+ *           |                  |
+ * 0b10......+------------------+
+ *           |                  |
+ *           |    1 quadrant    |
+ *           |                  |
+ * 0b01......+------------------+
+ *           |                  |
+ *           |    0 quadrant    |
+ *           |                  |
+ * 0b00......+------------------+
+ *
+ *
+ * 3rd quadrant expanded:
+ * +------------------------------+
+ * |                              |
+ * |                              |
+ * |                              |
+ * +------------------------------+  Kernel IO map end (0xc010000000000000)
+ * |                              |
+ * |                              |
+ * |      1/2 of virtual map      |
+ * |                              |
+ * |                              |
+ * +------------------------------+  Kernel IO map start
+ * |                              |
+ * |      1/4 of virtual map      |
+ * |                              |
+ * +------------------------------+  Kernel vmemmap start
+ * |                              |
+ * |     1/4 of virtual map       |
+ * |                              |
+ * +------------------------------+  Kernel virt start (0xc008000000000000)
+ * |                              |
+ * |                              |
+ * |                              |
+ * +------------------------------+  Kernel linear (0xc.....)
+ */
+
+#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000)
+#define RADIX_KERN_VIRT_SIZE  ASM_CONST(0x0008000000000000)
+
+/*
+ * The vmalloc space starts at the beginning of that region, and
+ * occupies a quarter of it on radix config.
+ * (we keep a quarter for the virtual memmap)
+ */
+#define RADIX_VMALLOC_START    RADIX_KERN_VIRT_START
+#define RADIX_VMALLOC_SIZE     (RADIX_KERN_VIRT_SIZE >> 2)
+#define RADIX_VMALLOC_END      (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE)
+/*
+ * Defines the address of the vmemmap area; on radix it starts right
+ * at the end of the vmalloc space.
+ */
+#define RADIX_VMEMMAP_BASE             (RADIX_VMALLOC_END)
+
+#ifndef __ASSEMBLY__
+#define RADIX_PTE_TABLE_SIZE   (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE)
+#define RADIX_PMD_TABLE_SIZE   (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE)
+#define RADIX_PUD_TABLE_SIZE   (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
+#define RADIX_PGD_TABLE_SIZE   (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
+
+static inline unsigned long radix__pte_update(struct mm_struct *mm,
+                                       unsigned long addr,
+                                       pte_t *ptep, unsigned long clr,
+                                       unsigned long set,
+                                       int huge)
+{
+       pte_t pte;
+       unsigned long old_pte, new_pte;
+
+       do {
+               pte = READ_ONCE(*ptep);
+               old_pte = pte_val(pte);
+               new_pte = (old_pte | set) & ~clr;
+
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+       /* We already do a sync in cmpxchg; is the ptesync still needed? */
+       asm volatile("ptesync" : : : "memory");
+       /* Huge pages use the old page table lock, so only assert for !huge */
+       if (!huge)
+               assert_pte_locked(mm, addr);
+
+       return old_pte;
+}
+
+/*
+ * Atomically OR the dirty, accessed, RW and exec bits of entry into a
+ * linux PTE; this function doesn't need to invalidate the TLB.
+ */
+static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry)
+{
+       pte_t pte;
+       unsigned long old_pte, new_pte;
+       unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+                                             _PAGE_RW | _PAGE_EXEC);
+       do {
+               pte = READ_ONCE(*ptep);
+               old_pte = pte_val(pte);
+               new_pte = old_pte | set;
+
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+       /* We already do a sync in cmpxchg; is the ptesync still needed? */
+       asm volatile("ptesync" : : : "memory");
+}
+
+static inline int radix__pte_same(pte_t pte_a, pte_t pte_b)
+{
+       return ((pte_raw(pte_a) ^ pte_raw(pte_b)) == 0);
+}
+
+static inline int radix__pte_none(pte_t pte)
+{
+       return (pte_val(pte) & ~RADIX_PTE_NONE_MASK) == 0;
+}
+
+static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr,
+                                pte_t *ptep, pte_t pte, int percpu)
+{
+       *ptep = pte;
+       asm volatile("ptesync" : : : "memory");
+}
+
+static inline int radix__pmd_bad(pmd_t pmd)
+{
+       return !!(pmd_val(pmd) & RADIX_PMD_BAD_BITS);
+}
+
+static inline int radix__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+       return ((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) == 0);
+}
+
+static inline int radix__pud_bad(pud_t pud)
+{
+       return !!(pud_val(pud) & RADIX_PUD_BAD_BITS);
+}
+
+
+static inline int radix__pgd_bad(pgd_t pgd)
+{
+       return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+static inline int radix__pmd_trans_huge(pmd_t pmd)
+{
+       return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
+static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
+{
+       return __pmd(pmd_val(pmd) | _PAGE_PTE);
+}
+static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                                           unsigned long address, pmd_t *pmdp)
+{
+       /* Nothing to do for radix. */
+       return;
+}
+
+extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                         pmd_t *pmdp, unsigned long clr,
+                                         unsigned long set);
+extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmdp);
+extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                       pgtable_t pgtable);
+extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                     unsigned long addr, pmd_t *pmdp);
+extern int radix__has_transparent_hugepage(void);
+#endif
+
+extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
+                                            unsigned long page_size,
+                                            unsigned long phys);
+extern void radix__vmemmap_remove_mapping(unsigned long start,
+                                   unsigned long page_size);
+
+extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+                                pgprot_t flags, unsigned int psz);
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h

index 1b753f96b3744c85efe0947a4f40d3612f7966b8..f12ddf5e8de51f91eb4c49ccb0283b0994bb4415 100644 (file)
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -1,8 +1,6 @@
  #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
  #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
  
-#define MMU_NO_CONTEXT         0
-
  /*
   * TLB flushing for 64-bit hash-MMU CPUs
   */
@@ -29,14 +27,21 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
  
  static inline void arch_enter_lazy_mmu_mode(void)
  {
-       struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+       struct ppc64_tlb_batch *batch;
  
+       if (radix_enabled())
+               return;
+       batch = this_cpu_ptr(&ppc64_tlb_batch);
         batch->active = 1;
  }
  
  static inline void arch_leave_lazy_mmu_mode(void)
  {
-       struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+       struct ppc64_tlb_batch *batch;
+
+       if (radix_enabled())
+               return;
+       batch = this_cpu_ptr(&ppc64_tlb_batch);
  
         if (batch->index)
                 __flush_tlb_pending(batch);
@@ -52,40 +57,42 @@ extern void flush_hash_range(unsigned long number, int local);
  extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
                                 pmd_t *pmdp, unsigned int psize, int ssize,
                                 unsigned long flags);
-
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
+static inline void hash__local_flush_tlb_mm(struct mm_struct *mm)
  {
  }
  
-static inline void flush_tlb_mm(struct mm_struct *mm)
+static inline void hash__flush_tlb_mm(struct mm_struct *mm)
  {
  }
  
-static inline void local_flush_tlb_page(struct vm_area_struct *vma,
-                                       unsigned long vmaddr)
+static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
+                                         unsigned long vmaddr)
  {
  }
  
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-                                 unsigned long vmaddr)
+static inline void hash__flush_tlb_page(struct vm_area_struct *vma,
+                                   unsigned long vmaddr)
  {
  }
  
-static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
-                                        unsigned long vmaddr)
+static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma,
+                                          unsigned long vmaddr)
  {
  }
  
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-                                  unsigned long start, unsigned long end)
+static inline void hash__flush_tlb_range(struct vm_area_struct *vma,
+                                    unsigned long start, unsigned long end)
  {
  }
  
-static inline void flush_tlb_kernel_range(unsigned long start,
-                                         unsigned long end)
+static inline void hash__flush_tlb_kernel_range(unsigned long start,
+                                           unsigned long end)
  {
  }
  
+
+struct mmu_gather;
+extern void hash__tlb_flush(struct mmu_gather *tlb);
  /* Private function for use by PCI IO mapping code */
  extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
                                      unsigned long end);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h

new file mode 100644 (file)

index 0000000..13ef388
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H
+#define _ASM_POWERPC_TLBFLUSH_RADIX_H
+
+struct vm_area_struct;
+struct mm_struct;
+struct mmu_gather;
+
+/* Return the .ap field stored in mmu_psize_defs[] for page-size index @psize. */
+static inline int mmu_get_ap(int psize)
+{
+       int ap = mmu_psize_defs[psize].ap;
+
+       return ap;
+}
+
+extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+                           unsigned long end);
+extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
+extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+                                   unsigned long ap, int nid);
+extern void radix__tlb_flush(struct mmu_gather *tlb);
+#ifdef CONFIG_SMP
+extern void radix__flush_tlb_mm(struct mm_struct *mm);
+extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+                             unsigned long ap, int nid);
+#else
+#define radix__flush_tlb_mm(mm)                radix__local_flush_tlb_mm(mm)
+#define radix__flush_tlb_page(vma,addr)        radix__local_flush_tlb_page(vma,addr)
+#define radix___flush_tlb_page(mm,addr,p,i)    radix___local_flush_tlb_page(mm,addr,p,i)
+#endif
+
+#endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h

new file mode 100644 (file)

index 0000000..d98424a
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -0,0 +1,76 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H
+#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H
+
+#define MMU_NO_CONTEXT (~0UL)  /* all-ones unsigned long */
+
+
+#include <asm/book3s/64/tlbflush-hash.h>
+#include <asm/book3s/64/tlbflush-radix.h>
+
+/* Range flush for @vma, routed to the radix or hash backend by radix_enabled(). */
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+       if (radix_enabled())
+               radix__flush_tlb_range(vma, start, end);
+       else
+               hash__flush_tlb_range(vma, start, end);
+}
+
+/* Kernel range flush, routed to the radix or hash backend by radix_enabled(). */
+static inline void flush_tlb_kernel_range(unsigned long start,
+                                         unsigned long end)
+{
+       if (radix_enabled())
+               radix__flush_tlb_kernel_range(start, end);
+       else
+               hash__flush_tlb_kernel_range(start, end);
+}
+
+/* Local flush for @mm, routed to the radix or hash backend by radix_enabled(). */
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+       if (radix_enabled())
+               radix__local_flush_tlb_mm(mm);
+       else
+               hash__local_flush_tlb_mm(mm);
+}
+
+/* Local flush of @vmaddr in @vma, routed to radix or hash by radix_enabled(). */
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
+                                       unsigned long vmaddr)
+{
+       if (radix_enabled())
+               radix__local_flush_tlb_page(vma, vmaddr);
+       else
+               hash__local_flush_tlb_page(vma, vmaddr);
+}
+
+/*
+ * With radix enabled this calls radix__flush_tlb_page(); otherwise it
+ * calls hash__flush_tlb_page_nohash().
+ */
+static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
+                                        unsigned long vmaddr)
+{
+       if (radix_enabled())
+               radix__flush_tlb_page(vma, vmaddr);
+       else
+               hash__flush_tlb_page_nohash(vma, vmaddr);
+}
+
+/* mmu_gather flush, routed to radix__tlb_flush() or hash__tlb_flush(). */
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+       if (radix_enabled())
+               radix__tlb_flush(tlb);
+       else
+               hash__tlb_flush(tlb);
+}
+
+#ifdef CONFIG_SMP
+/* SMP build: flush for @mm, routed to the radix or hash backend. */
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+       if (radix_enabled())
+               radix__flush_tlb_mm(mm);
+       else
+               hash__flush_tlb_mm(mm);
+}
+
+/* SMP build: flush of @vmaddr in @vma, routed to the radix or hash backend. */
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+                                 unsigned long vmaddr)
+{
+       if (radix_enabled())
+               radix__flush_tlb_page(vma, vmaddr);
+       else
+               hash__flush_tlb_page(vma, vmaddr);
+}
+#else
+#define flush_tlb_mm(mm)               local_flush_tlb_mm(mm)
+#define flush_tlb_page(vma, addr)      local_flush_tlb_page(vma, addr)
+#endif /* CONFIG_SMP */
+
+#endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
diff --git a/arch/powerpc/include/asm/book3s/pgalloc.h b/arch/powerpc/include/asm/book3s/pgalloc.h

new file mode 100644 (file)

index 0000000..54f591e
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/pgalloc.h
@@ -0,0 +1,19 @@
+#ifndef _ASM_POWERPC_BOOK3S_PGALLOC_H
+#define _ASM_POWERPC_BOOK3S_PGALLOC_H
+
+#include <linux/mm.h>
+
+extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
+/* No-op stub: the body is empty, so @tlb and @address are ignored. */
+static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
+                                    unsigned long address)
+{
+}
+
+#ifdef CONFIG_PPC64
+#include <asm/book3s/64/pgalloc.h>
+#else
+#include <asm/book3s/32/pgalloc.h>
+#endif
+
+#endif /* _ASM_POWERPC_BOOK3S_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h

index 42814f0567cc49db7328258676801f8c739da12c..e2d9f4996e5ca030257c008449128283be2bad33 100644 (file)
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -8,6 +8,8 @@
  extern struct kmem_cache *hugepte_cache;
  
  #ifdef CONFIG_PPC_BOOK3S_64
+
+#include <asm/book3s/64/hugetlb-radix.h>
  /*
   * This should work for other subarchs too. But right now we use the
   * new format only for 64bit book3s
@@ -31,7 +33,19 @@ static inline unsigned int hugepd_shift(hugepd_t hpd)
  {
         return mmu_psize_to_shift(hugepd_mmu_psize(hpd));
  }
+/* Calls radix__flush_hugetlb_page() when radix_enabled(); otherwise does nothing. */
+static inline void flush_hugetlb_page(struct vm_area_struct *vma,
+                                     unsigned long vmaddr)
+{
+       if (!radix_enabled())
+               return;
+
+       radix__flush_hugetlb_page(vma, vmaddr);
+}
  
+/* Calls radix__local_flush_hugetlb_page() when radix_enabled(); otherwise does nothing. */
+static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma,
+                                             unsigned long vmaddr)
+{
+       if (!radix_enabled())
+               return;
+
+       radix__local_flush_hugetlb_page(vma, vmaddr);
+}
  #else
  
  static inline pte_t *hugepd_page(hugepd_t hpd)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h

index 7529aab068f5968a8a093ba2ddb5f42e9ab5b61c..1f4497fb5b83adb5797e258509f2cf8edeaf21c4 100644 (file)
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -276,19 +276,24 @@ static inline unsigned long hpte_make_readonly(unsigned long ptel)
         return ptel;
  }
  
-static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
+static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci)
  {
-       unsigned int wimg = ptel & HPTE_R_WIMG;
+       unsigned int wimg = hptel & HPTE_R_WIMG;
  
         /* Handle SAO */
         if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
             cpu_has_feature(CPU_FTR_ARCH_206))
                 wimg = HPTE_R_M;
  
-       if (!io_type)
+       if (!is_ci)
                 return wimg == HPTE_R_M;
-
-       return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type;
+       /*
+        * If the host mapping is cache inhibited, make sure the hptel is
+        * cache inhibited as well.
+        */
+       if (wimg & HPTE_R_W) /* FIXME!! is this ok for all guest. ? */
+               return false;
+       return !!(wimg & HPTE_R_I);
  }
  
  /*
@@ -305,9 +310,9 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
                  */
                 old_pte = READ_ONCE(*ptep);
                 /*
-                * wait until _PAGE_BUSY is clear then set it atomically
+                * wait until H_PAGE_BUSY is clear then set it atomically
                  */
-               if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) {
+               if (unlikely(pte_val(old_pte) & H_PAGE_BUSY)) {
                         cpu_relax();
                         continue;
                 }
@@ -319,27 +324,12 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
                 if (writing && pte_write(old_pte))
                         new_pte = pte_mkdirty(new_pte);
  
-               if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep,
-                                                     pte_val(old_pte),
-                                                     pte_val(new_pte))) {
+               if (pte_xchg(ptep, old_pte, new_pte))
                         break;
-               }
         }
         return new_pte;
  }
  
-
-/* Return HPTE cache control bits corresponding to Linux pte bits */
-static inline unsigned long hpte_cache_bits(unsigned long pte_val)
-{
-#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W
-       return pte_val & (HPTE_R_W | HPTE_R_I);
-#else
-       return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) +
-               ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0);
-#endif
-}
-
  static inline bool hpte_read_permission(unsigned long pp, unsigned long key)
  {
         if (key)
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h

index fd22442d30a9732eae0489c4e062cfe41628884c..6bdcd0da9e2148565522bd12a4dcece725c9fa70 100644 (file)
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -256,6 +256,7 @@ struct machdep_calls {
  #ifdef CONFIG_ARCH_RANDOM
         int (*get_random_seed)(unsigned long *v);
  #endif
+       int (*update_partition_table)(u64);
  };
  
  extern void e500_idle(void);
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h

index 8ca1c983bf6c3443c9d1a47cec058833383986b6..e53ebebff4744b7b3181d070e0a7c1d2cb6ce221 100644 (file)
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -88,6 +88,11 @@
   */
  #define MMU_FTR_1T_SEGMENT             ASM_CONST(0x40000000)
  
+/*
+ * Radix page table available
+ */
+#define MMU_FTR_RADIX                  ASM_CONST(0x80000000)
+
  /* MMU feature bit sets for various CPUs */
  #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2  \
         MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2
@@ -110,9 +115,25 @@
  DECLARE_PER_CPU(int, next_tlbcam_idx);
  #endif
  
+/*
+ * MMU_FTRS_POSSIBLE is the OR of the MMU feature bits listed below;
+ * mmu_has_feature() ANDs it into every feature test. MMU_FTR_RADIX is
+ * only part of the set when CONFIG_PPC_RADIX_MMU is defined. The
+ * trailing 0 terminates the '|' chain so the conditional entry can end
+ * with an operator.
+ */
+enum {
+       MMU_FTRS_POSSIBLE = MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_8xx |
+               MMU_FTR_TYPE_40x | MMU_FTR_TYPE_44x | MMU_FTR_TYPE_FSL_E |
+               MMU_FTR_TYPE_47x | MMU_FTR_USE_HIGH_BATS | MMU_FTR_BIG_PHYS |
+               MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_USE_TLBILX |
+               MMU_FTR_LOCK_BCAST_INVAL | MMU_FTR_NEED_DTLB_SW_LRU |
+               MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS |
+               MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
+               MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
+               MMU_FTR_1T_SEGMENT |
+#ifdef CONFIG_PPC_RADIX_MMU
+               MMU_FTR_RADIX |
+#endif
+               0,
+};
+
  static inline int mmu_has_feature(unsigned long feature)
  {
-       return (cur_cpu_spec->mmu_features & feature);
+       return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature);
  }
  
  static inline void mmu_clear_feature(unsigned long feature)
@@ -122,13 +143,6 @@ static inline void mmu_clear_feature(unsigned long feature)
  
  extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup;
  
-/* MMU initialization */
-extern void early_init_mmu(void);
-extern void early_init_mmu_secondary(void);
-
-extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
-                                      phys_addr_t first_memblock_size);
-
  #ifdef CONFIG_PPC64
  /* This is our real memory area size on ppc64 server, on embedded, we
   * make it match the size our of bolted TLB area
@@ -181,10 +195,20 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
  
  #define MMU_PAGE_COUNT 15
  
-#if defined(CONFIG_PPC_STD_MMU_64)
-/* 64-bit classic hash table MMU */
-#include <asm/book3s/64/mmu-hash.h>
-#elif defined(CONFIG_PPC_STD_MMU_32)
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/book3s/64/mmu.h>
+#else /* CONFIG_PPC_BOOK3S_64 */
+
+#ifndef __ASSEMBLY__
+/* MMU initialization */
+extern void early_init_mmu(void);
+extern void early_init_mmu_secondary(void);
+extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                                      phys_addr_t first_memblock_size);
+#endif /* __ASSEMBLY__ */
+#endif
+
+#if defined(CONFIG_PPC_STD_MMU_32)
  /* 32-bit classic hash table MMU */
  #include <asm/book3s/32/mmu-hash.h>
  #elif defined(CONFIG_40x)
@@ -201,6 +225,9 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
  #  include <asm/mmu-8xx.h>
  #endif
  
+#ifndef radix_enabled
+#define radix_enabled() (0)
+#endif
  
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_MMU_H_ */
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h

index 4eaab40e3ade3663b13dcd155ed9258d276534a0..9d2cd0c36ec27645eb82aa53a5d39b04ae422778 100644 (file)
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -33,16 +33,27 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
  extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
  extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
  #endif
-
-extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
  extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
  extern void set_context(unsigned long id, pgd_t *pgd);
  
  #ifdef CONFIG_PPC_BOOK3S_64
+extern void radix__switch_mmu_context(struct mm_struct *prev,
+                                    struct mm_struct *next);
+/*
+ * Radix: hand off to radix__switch_mmu_context(prev, next).
+ * Otherwise: call switch_slb(tsk, next); @prev is not used on that path.
+ */
+static inline void switch_mmu_context(struct mm_struct *prev,
+                                     struct mm_struct *next,
+                                     struct task_struct *tsk)
+{
+       if (radix_enabled())
+               radix__switch_mmu_context(prev, next);
+       else
+               switch_slb(tsk, next);
+}
+
  extern int __init_new_context(void);
  extern void __destroy_context(int context_id);
  static inline void mmu_context_init(void) { }
  #else
+extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
+                              struct task_struct *tsk);
  extern unsigned long __init_new_context(void);
  extern void __destroy_context(unsigned long context_id);
  extern void mmu_context_init(void);
@@ -88,17 +99,11 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
         if (cpu_has_feature(CPU_FTR_ALTIVEC))
                 asm volatile ("dssall");
  #endif /* CONFIG_ALTIVEC */
-
-       /* The actual HW switching method differs between the various
-        * sub architectures.
+       /*
+        * The actual HW switching method differs between the various
+        * sub architectures. Out of line for now
          */
-#ifdef CONFIG_PPC_STD_MMU_64
-       switch_slb(tsk, next);
-#else
-       /* Out of line for now */
-       switch_mmu_context(prev, next);
-#endif
-
+       switch_mmu_context(prev, next, tsk);
  }
  
  #define deactivate_mm(tsk,mm)  do { } while (0)
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h

new file mode 100644 (file)

index 0000000..76d6b9e
--- /dev/null
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -0,0 +1,109 @@
+#ifndef _ASM_POWERPC_PGALLOC_32_H
+#define _ASM_POWERPC_PGALLOC_32_H
+
+#include <linux/threads.h>
+
+/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
+#define MAX_PGTABLE_INDEX_SIZE 0
+
+extern void __bad_pte(pmd_t *pmd);
+
+extern pgd_t *pgd_alloc(struct mm_struct *mm);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+/*
+ * We don't have any real pmd's, and this code never triggers because
+ * the pgd will always be present..
+ */
+/* #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); }) */
+#define pmd_free(mm, x)                do { } while (0)
+#define __pmd_free_tlb(tlb,x,a)                do { } while (0)
+/* #define pgd_populate(mm, pmd, pte)      BUG() */
+
+#ifndef CONFIG_BOOKE
+
+/* Store __pa(@pte) ORed with _PMD_PRESENT into *@pmdp; @mm is unused. */
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+                                      pte_t *pte)
+{
+       pmd_t entry = __pmd(__pa(pte) | _PMD_PRESENT);
+
+       *pmdp = entry;
+}
+
+/*
+ * Store (pfn of @pte_page << PAGE_SHIFT) ORed with _PMD_PRESENT into
+ * *@pmdp; @mm is unused.
+ */
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pte_page)
+{
+       pmd_t entry = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT);
+
+       *pmdp = entry;
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#else
+
+/* CONFIG_BOOKE variant: store the address value of @pte ORed with _PMD_PRESENT. */
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
+                                      pte_t *pte)
+{
+       pmd_t entry = __pmd((unsigned long)pte | _PMD_PRESENT);
+
+       *pmdp = entry;
+}
+
+/*
+ * CONFIG_BOOKE variant: store lowmem_page_address(@pte_page) ORed with
+ * _PMD_PRESENT into *@pmdp.
+ */
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
+                               pgtable_t pte_page)
+{
+       pmd_t entry = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
+
+       *pmdp = entry;
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#endif
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
+extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+       free_page((unsigned long)pte);
+}
+
+/* Run pgtable_page_dtor() on @ptepage, then release it with __free_page(); @mm is unused. */
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+       pgtable_page_dtor(ptepage);
+       __free_page(ptepage);
+}
+
+/*
+ * Free the page at @table with free_page(). A non-zero @index_size
+ * triggers BUG_ON().
+ */
+static inline void pgtable_free(void *table, unsigned index_size)
+{
+       BUG_ON(index_size); /* 32-bit doesn't use this */
+       free_page((unsigned long)table);
+}
+
+#define check_pgt_cache()      do { } while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * SMP variant: OR @shift into the address value of @table and pass the
+ * result to tlb_remove_table(). BUG_ON() fires if @shift exceeds
+ * MAX_PGTABLE_INDEX_SIZE.
+ */
+static inline void pgtable_free_tlb(struct mmu_gather *tlb,
+                                   void *table, int shift)
+{
+       unsigned long pgf;
+
+       BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+       pgf = (unsigned long)table | shift;
+       tlb_remove_table(tlb, (void *)pgf);
+}
+
+/*
+ * Split @_table into the bits under MAX_PGTABLE_INDEX_SIZE (the shift)
+ * and the remaining bits (the table pointer), then call pgtable_free().
+ */
+static inline void __tlb_remove_table(void *_table)
+{
+       unsigned long packed = (unsigned long)_table;
+       unsigned shift = packed & MAX_PGTABLE_INDEX_SIZE;
+       void *table = (void *)(packed & ~MAX_PGTABLE_INDEX_SIZE);
+
+       pgtable_free(table, shift);
+}
+#else
+static inline void pgtable_free_tlb(struct mmu_gather *tlb,
+                                   void *table, int shift)
+{
+       pgtable_free(table, shift);
+}
+#endif
+
+/*
+ * Call tlb_flush_pgtable(tlb, address), run pgtable_page_dtor() on
+ * @table, then pass page_address(table) to pgtable_free_tlb() with
+ * shift 0.
+ */
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+                                 unsigned long address)
+{
+       tlb_flush_pgtable(tlb, address);
+       pgtable_page_dtor(table);
+       pgtable_free_tlb(tlb, page_address(table), 0);
+}
+#endif /* _ASM_POWERPC_PGALLOC_32_H */
diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h

new file mode 100644 (file)

index 0000000..0c12a3b
--- /dev/null
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -0,0 +1,212 @@
+#ifndef _ASM_POWERPC_PGALLOC_64_H
+#define _ASM_POWERPC_PGALLOC_64_H
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+
+struct vmemmap_backing {
+       struct vmemmap_backing *list;
+       unsigned long phys;
+       unsigned long virt_addr;
+};
+extern struct vmemmap_backing *vmemmap_list;
+
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE 0xf
+
+extern struct kmem_cache *pgtable_cache[];
+#define PGT_CACHE(shift) ({                            \
+                       BUG_ON(!(shift));               \
+                       pgtable_cache[(shift) - 1];     \
+               })
+
+/* Allocate a PGD from the PGT_CACHE(PGD_INDEX_SIZE) cache with GFP_KERNEL; @mm is unused. */
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       struct kmem_cache *pgd_cache = PGT_CACHE(PGD_INDEX_SIZE);
+
+       return kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+}
+
+/* Return @pgd to the PGT_CACHE(PGD_INDEX_SIZE) cache; @mm is unused. */
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       struct kmem_cache *pgd_cache = PGT_CACHE(PGD_INDEX_SIZE);
+
+       kmem_cache_free(pgd_cache, pgd);
+}
+
+#ifndef CONFIG_PPC_64K_PAGES
+
+#define pgd_populate(MM, PGD, PUD)     pgd_set(PGD, (unsigned long)PUD)
+
+/*
+ * Allocate a PUD from the PGT_CACHE(PUD_INDEX_SIZE) cache with
+ * GFP_KERNEL|__GFP_REPEAT; @mm and @addr are unused.
+ */
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       struct kmem_cache *pud_cache = PGT_CACHE(PUD_INDEX_SIZE);
+
+       return kmem_cache_alloc(pud_cache, GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+       kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+}
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+       pud_set(pud, (unsigned long)pmd);
+}
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
+                                      pte_t *pte)
+{
+       pmd_set(pmd, (unsigned long)pte);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+                               pgtable_t pte_page)
+{
+       pmd_set(pmd, (unsigned long)page_address(pte_page));
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
+}
+
+/*
+ * Allocate a PTE page through pte_alloc_one_kernel() and run
+ * pgtable_page_ctor() on its struct page. Returns the page, or NULL if
+ * the allocation fails or the ctor fails (the page is then freed).
+ */
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+                                     unsigned long address)
+{
+       pte_t *pte = pte_alloc_one_kernel(mm, address);
+       struct page *page;
+
+       if (!pte)
+               return NULL;
+
+       page = virt_to_page(pte);
+       if (pgtable_page_ctor(page))
+               return page;
+
+       /* ctor failed: give the page back */
+       __free_page(page);
+       return NULL;
+}
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+       free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+       pgtable_page_dtor(ptepage);
+       __free_page(ptepage);
+}
+
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
+#ifdef CONFIG_SMP
+extern void __tlb_remove_table(void *_table);
+#endif
+/*
+ * Call tlb_flush_pgtable(tlb, address), then pass page_address(table)
+ * to pgtable_free_tlb() with shift 0.
+ *
+ * NOTE(review): pte_alloc_one() above runs pgtable_page_ctor() and
+ * pte_free() runs pgtable_page_dtor(), but no dtor call appears here.
+ * Confirm that pgtable_free_tlb() (defined elsewhere) takes care of it.
+ */
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+                                 unsigned long address)
+{
+       tlb_flush_pgtable(tlb, address);
+       pgtable_free_tlb(tlb, page_address(table), 0);
+}
+
+#else /* if CONFIG_PPC_64K_PAGES */
+
+extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int);
+extern void pte_fragment_free(unsigned long *, int);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
+#ifdef CONFIG_SMP
+extern void __tlb_remove_table(void *_table);
+#endif
+
+#define pud_populate(mm, pud, pmd)     pud_set(pud, (unsigned long)pmd)
+
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
+                                      pte_t *pte)
+{
+       pmd_set(pmd, (unsigned long)pte);
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+                               pgtable_t pte_page)
+{
+       pmd_set(pmd, (unsigned long)pte_page);
+}
+
+static inline pgtable_t pmd_pgtable(pmd_t pmd)
+{
+       return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS);
+}
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       return (pte_t *)pte_fragment_alloc(mm, address, 1);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+                                       unsigned long address)
+{
+       return (pgtable_t)pte_fragment_alloc(mm, address, 0);
+}
+
+/*
+ * Release a kernel PTE fragment via pte_fragment_free(). The final
+ * argument 1 mirrors the value pte_alloc_one_kernel() passes to
+ * pte_fragment_alloc(); @mm is unused.
+ */
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+       pte_fragment_free((unsigned long *)pte, 1);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+       pte_fragment_free((unsigned long *)ptepage, 0);
+}
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
+                                 unsigned long address)
+{
+       tlb_flush_pgtable(tlb, address);
+       pgtable_free_tlb(tlb, table, 0);
+}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+/*
+ * Allocate a PMD from the PGT_CACHE(PMD_CACHE_INDEX) cache with
+ * GFP_KERNEL|__GFP_REPEAT; @mm and @addr are unused.
+ */
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       struct kmem_cache *pmd_cache = PGT_CACHE(PMD_CACHE_INDEX);
+
+       return kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT);
+}
+
+/* Return @pmd to the PGT_CACHE(PMD_CACHE_INDEX) cache; @mm is unused. */
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+       struct kmem_cache *pmd_cache = PGT_CACHE(PMD_CACHE_INDEX);
+
+       kmem_cache_free(pmd_cache, pmd);
+}
+
+#define __pmd_free_tlb(tlb, pmd, addr)               \
+       pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
+#ifndef CONFIG_PPC_64K_PAGES
+#define __pud_free_tlb(tlb, pud, addr)               \
+       pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
+
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#define check_pgt_cache()      do { } while (0)
+
+#endif /* _ASM_POWERPC_PGALLOC_64_H */
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h

index 10debb93c4a4835a8ccaa758cb850237d7a6f200..d4d808cf905ee25554e49dcf2e7e3f4ed73c4fd7 100644 (file)
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -108,9 +108,6 @@
  #ifndef __ASSEMBLY__
  /* pte_clear moved to later in this file */
  
-/* Pointers in the page table tree are virtual addresses */
-#define __pgtable_ptr_val(ptr) ((unsigned long)(ptr))
-
  #define PMD_BAD_BITS           (PTE_TABLE_SIZE-1)
  #define PUD_BAD_BITS           (PMD_TABLE_SIZE-1)
  
@@ -362,6 +359,13 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
  
  void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
  void pgtable_cache_init(void);
+extern int map_kernel_page(unsigned long ea, unsigned long pa,
+                          unsigned long flags);
+extern int __meminit vmemmap_create_mapping(unsigned long start,
+                                           unsigned long page_size,
+                                           unsigned long phys);
+extern void vmemmap_remove_mapping(unsigned long start,
+                                  unsigned long page_size);
  #endif /* __ASSEMBLY__ */
  
  #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h

new file mode 100644 (file)

index 0000000..b39ec95
--- /dev/null
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_POWERPC_NOHASH_PGALLOC_H
+#define _ASM_POWERPC_NOHASH_PGALLOC_H
+
+#include <linux/mm.h>
+
+extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
+/*
+ * tlb_flush_pgtable(): declared extern when CONFIG_PPC64 is set,
+ * otherwise supplied here as an empty inline stub.
+ */
+#ifdef CONFIG_PPC64
+extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address);
+#else
+/* 44x etc which is BOOKE not BOOK3E */
+static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
+                                    unsigned long address)
+{
+
+}
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC64
+#include <asm/nohash/64/pgalloc.h>
+#else
+#include <asm/nohash/32/pgalloc.h>
+#endif
+#endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h

index f8faaaeeca1e16d57dd0a4d9dfff3cb73c7e055d..9bb8ddf0be37b94d08b4955bc794c30418361852 100644 (file)
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -368,16 +368,16 @@ enum OpalLPCAddressType {
  };
  
  enum opal_msg_type {
-       OPAL_MSG_ASYNC_COMP = 0,        /* params[0] = token, params[1] = rc,
+       OPAL_MSG_ASYNC_COMP     = 0,    /* params[0] = token, params[1] = rc,
                                          * additional params function-specific
                                          */
-       OPAL_MSG_MEM_ERR,
-       OPAL_MSG_EPOW,
-       OPAL_MSG_SHUTDOWN,              /* params[0] = 1 reboot, 0 shutdown */
-       OPAL_MSG_HMI_EVT,
-       OPAL_MSG_DPO,
-       OPAL_MSG_PRD,
-       OPAL_MSG_OCC,
+       OPAL_MSG_MEM_ERR        = 1,
+       OPAL_MSG_EPOW           = 2,
+       OPAL_MSG_SHUTDOWN       = 3,    /* params[0] = 1 reboot, 0 shutdown */
+       OPAL_MSG_HMI_EVT        = 4,
+       OPAL_MSG_DPO            = 5,
+       OPAL_MSG_PRD            = 6,
+       OPAL_MSG_OCC            = 7,
         OPAL_MSG_TYPE_MAX,
  };
  
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h

index ab3d8977bacd6488fafe662c61d893be1c1bd22b..51db3a37bced619f3b4a34742157ee112e3dafca 100644 (file)
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -288,7 +288,11 @@ extern long long virt_phys_offset;
  
  #ifndef __ASSEMBLY__
  
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/pgtable-be-types.h>
+#else
  #include <asm/pgtable-types.h>
+#endif
  
  typedef struct { signed long pd; } hugepd_t;
  
@@ -312,12 +316,20 @@ void arch_free_page(struct page *page, int order);
  #endif
  
  struct vm_area_struct;
-
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * For Book3S 64 with both 4K and 64K Linux page sizes
+ * we want to use pointers, because the page table
+ * actually stores the pfn.
+ */
+typedef pte_t *pgtable_t;
+#else
  #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC64)
  typedef pte_t *pgtable_t;
  #else
  typedef struct page *pgtable_t;
  #endif
+#endif
  
  #include <asm-generic/memory_model.h>
  #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h

index d908a46d05c0b1be8bbb5b35a90425465fd5d3aa..dd5f0712afa2539b7e484101839390fb804c9f26 100644 (file)
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -93,7 +93,7 @@ extern u64 ppc64_pft_size;
  
  #define SLICE_LOW_TOP          (0x100000000ul)
  #define SLICE_NUM_LOW          (SLICE_LOW_TOP >> SLICE_LOW_SHIFT)
-#define SLICE_NUM_HIGH         (PGTABLE_RANGE >> SLICE_HIGH_SHIFT)
+#define SLICE_NUM_HIGH         (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT)
  
  #define GET_LOW_SLICE_INDEX(addr)      ((addr) >> SLICE_LOW_SHIFT)
  #define GET_HIGH_SLICE_INDEX(addr)     ((addr) >> SLICE_HIGH_SHIFT)
@@ -128,8 +128,6 @@ extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize);
  extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
                                   unsigned long len, unsigned int psize);
  
-#define slice_mm_new_context(mm)       ((mm)->context.id == MMU_NO_CONTEXT)
-
  #endif /* __ASSEMBLY__ */
  #else
  #define slice_init()
@@ -151,7 +149,6 @@ do {                                                \
  
  #define slice_set_range_psize(mm, start, len, psize)   \
         slice_set_user_psize((mm), (psize))
-#define slice_mm_new_context(mm)       1
  #endif /* CONFIG_PPC_MM_SLICES */
  
  #ifdef CONFIG_HUGETLB_PAGE
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h

index f5056e3394b466e20544818d1c431ee8e18cc5f8..467c0b05b6fb9df752826afef91eafde2fcac560 100644 (file)
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -17,33 +17,34 @@ struct device_node;
   * PCI controller operations
   */
  struct pci_controller_ops {
-       void            (*dma_dev_setup)(struct pci_dev *dev);
+       void            (*dma_dev_setup)(struct pci_dev *pdev);
         void            (*dma_bus_setup)(struct pci_bus *bus);
  
-       int             (*probe_mode)(struct pci_bus *);
+       int             (*probe_mode)(struct pci_bus *bus);
  
         /* Called when pci_enable_device() is called. Returns true to
          * allow assignment/enabling of the device. */
-       bool            (*enable_device_hook)(struct pci_dev *);
+       bool            (*enable_device_hook)(struct pci_dev *pdev);
  
-       void            (*disable_device)(struct pci_dev *);
+       void            (*disable_device)(struct pci_dev *pdev);
  
-       void            (*release_device)(struct pci_dev *);
+       void            (*release_device)(struct pci_dev *pdev);
  
         /* Called during PCI resource reassignment */
-       resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
-       void            (*reset_secondary_bus)(struct pci_dev *dev);
+       resource_size_t (*window_alignment)(struct pci_bus *bus,
+                                           unsigned long type);
+       void            (*reset_secondary_bus)(struct pci_dev *pdev);
  
  #ifdef CONFIG_PCI_MSI
-       int             (*setup_msi_irqs)(struct pci_dev *dev,
+       int             (*setup_msi_irqs)(struct pci_dev *pdev,
                                           int nvec, int type);
-       void            (*teardown_msi_irqs)(struct pci_dev *dev);
+       void            (*teardown_msi_irqs)(struct pci_dev *pdev);
  #endif
  
-       int             (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask);
-       u64             (*dma_get_required_mask)(struct pci_dev *dev);
+       int             (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
+       u64             (*dma_get_required_mask)(struct pci_dev *pdev);
  
-       void            (*shutdown)(struct pci_controller *);
+       void            (*shutdown)(struct pci_controller *hose);
  };
  
  /*
@@ -208,14 +209,14 @@ struct pci_dn {
  #ifdef CONFIG_EEH
         struct eeh_dev *edev;           /* eeh device */
  #endif
-#define IODA_INVALID_PE                (-1)
+#define IODA_INVALID_PE                0xFFFFFFFF
  #ifdef CONFIG_PPC_POWERNV
-       int     pe_number;
+       unsigned int pe_number;
         int     vf_index;               /* VF index in the PF */
  #ifdef CONFIG_PCI_IOV
         u16     vfs_expanded;           /* number of VFs IOV BAR expanded */
         u16     num_vfs;                /* number of VFs enabled*/
-       int     *pe_num_map;            /* PE# for the first VF PE or array */
+       unsigned int *pe_num_map;       /* PE# for the first VF PE or array */
         bool    m64_single_mode;        /* Use M64 BAR in Single Mode */
  #define IODA_INVALID_M64        (-1)
         int     (*m64_map)[PCI_SRIOV_NUM_BARS];
@@ -234,7 +235,9 @@ extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
  extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
  extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
  extern void remove_dev_pci_data(struct pci_dev *pdev);
-extern void *update_dn_pci_info(struct device_node *dn, void *data);
+extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
+                                              struct device_node *dn);
+extern void pci_remove_device_node_info(struct device_node *dn);
  
  static inline int pci_device_from_OF_node(struct device_node *np,
                                           u8 *bus, u8 *devfn)
@@ -256,13 +259,13 @@ static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn)
  #endif
  
  /** Find the bus corresponding to the indicated device node */
-extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn);
+extern struct pci_bus *pci_find_bus_by_node(struct device_node *dn);
  
  /** Remove all of the PCI devices under this bus */
-extern void pcibios_remove_pci_devices(struct pci_bus *bus);
+extern void pci_hp_remove_devices(struct pci_bus *bus);
  
  /** Discover new pci devices under this bus, and add them */
-extern void pcibios_add_pci_devices(struct pci_bus *bus);
+extern void pci_hp_add_devices(struct pci_bus *bus);
  
  
  extern void isa_bridge_find_early(struct pci_controller *hose);
diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h

deleted file mode 100644 (file)

index 76d6b9e..0000000
--- a/arch/powerpc/include/asm/pgalloc-32.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef _ASM_POWERPC_PGALLOC_32_H
-#define _ASM_POWERPC_PGALLOC_32_H
-
-#include <linux/threads.h>
-
-/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
-#define MAX_PGTABLE_INDEX_SIZE 0
-
-extern void __bad_pte(pmd_t *pmd);
-
-extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
-
-/*
- * We don't have any real pmd's, and this code never triggers because
- * the pgd will always be present..
- */
-/* #define pmd_alloc_one(mm,address)       ({ BUG(); ((pmd_t *)2); }) */
-#define pmd_free(mm, x)                do { } while (0)
-#define __pmd_free_tlb(tlb,x,a)                do { } while (0)
-/* #define pgd_populate(mm, pmd, pte)      BUG() */
-
-#ifndef CONFIG_BOOKE
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
-                                      pte_t *pte)
-{
-       *pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
-                               pgtable_t pte_page)
-{
-       *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT);
-}
-
-#define pmd_pgtable(pmd) pmd_page(pmd)
-#else
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
-                                      pte_t *pte)
-{
-       *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT);
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
-                               pgtable_t pte_page)
-{
-       *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
-}
-
-#define pmd_pgtable(pmd) pmd_page(pmd)
-#endif
-
-extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-       free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-       pgtable_page_dtor(ptepage);
-       __free_page(ptepage);
-}
-
-static inline void pgtable_free(void *table, unsigned index_size)
-{
-       BUG_ON(index_size); /* 32-bit doesn't use this */
-       free_page((unsigned long)table);
-}
-
-#define check_pgt_cache()      do { } while (0)
-
-#ifdef CONFIG_SMP
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-                                   void *table, int shift)
-{
-       unsigned long pgf = (unsigned long)table;
-       BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
-       pgf |= shift;
-       tlb_remove_table(tlb, (void *)pgf);
-}
-
-static inline void __tlb_remove_table(void *_table)
-{
-       void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
-       unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
-       pgtable_free(table, shift);
-}
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-                                   void *table, int shift)
-{
-       pgtable_free(table, shift);
-}
-#endif
-
-static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
-                                 unsigned long address)
-{
-       tlb_flush_pgtable(tlb, address);
-       pgtable_page_dtor(table);
-       pgtable_free_tlb(tlb, page_address(table), 0);
-}
-#endif /* _ASM_POWERPC_PGALLOC_32_H */
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h

deleted file mode 100644 (file)

index 8d5fc3a..0000000
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ /dev/null
@@ -1,266 +0,0 @@
-#ifndef _ASM_POWERPC_PGALLOC_64_H
-#define _ASM_POWERPC_PGALLOC_64_H
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/percpu.h>
-
-struct vmemmap_backing {
-       struct vmemmap_backing *list;
-       unsigned long phys;
-       unsigned long virt_addr;
-};
-extern struct vmemmap_backing *vmemmap_list;
-
-/*
- * Functions that deal with pagetables that could be at any level of
- * the table need to be passed an "index_size" so they know how to
- * handle allocation.  For PTE pages (which are linked to a struct
- * page for now, and drawn from the main get_free_pages() pool), the
- * allocation size will be (2^index_size * sizeof(pointer)) and
- * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
- *
- * The maximum index size needs to be big enough to allow any
- * pagetable sizes we need, but small enough to fit in the low bits of
- * any page table pointer.  In other words all pagetables, even tiny
- * ones, must be aligned to allow at least enough low 0 bits to
- * contain this value.  This value is also used as a mask, so it must
- * be one less than a power of two.
- */
-#define MAX_PGTABLE_INDEX_SIZE 0xf
-
-extern struct kmem_cache *pgtable_cache[];
-#define PGT_CACHE(shift) ({                            \
-                       BUG_ON(!(shift));               \
-                       pgtable_cache[(shift) - 1];     \
-               })
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-       return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
-}
-
-#ifndef CONFIG_PPC_64K_PAGES
-
-#define pgd_populate(MM, PGD, PUD)     pgd_set(PGD, __pgtable_ptr_val(PUD))
-
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
-                               GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
-       kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
-}
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
-       pud_set(pud, __pgtable_ptr_val(pmd));
-}
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
-                                      pte_t *pte)
-{
-       pmd_set(pmd, __pgtable_ptr_val(pte));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
-                               pgtable_t pte_page)
-{
-       pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page)));
-}
-
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
-                                         unsigned long address)
-{
-       return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
-                                     unsigned long address)
-{
-       struct page *page;
-       pte_t *pte;
-
-       pte = pte_alloc_one_kernel(mm, address);
-       if (!pte)
-               return NULL;
-       page = virt_to_page(pte);
-       if (!pgtable_page_ctor(page)) {
-               __free_page(page);
-               return NULL;
-       }
-       return page;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-       free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-       pgtable_page_dtor(ptepage);
-       __free_page(ptepage);
-}
-
-static inline void pgtable_free(void *table, unsigned index_size)
-{
-       if (!index_size)
-               free_page((unsigned long)table);
-       else {
-               BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
-               kmem_cache_free(PGT_CACHE(index_size), table);
-       }
-}
-
-#ifdef CONFIG_SMP
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-                                   void *table, int shift)
-{
-       unsigned long pgf = (unsigned long)table;
-       BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
-       pgf |= shift;
-       tlb_remove_table(tlb, (void *)pgf);
-}
-
-static inline void __tlb_remove_table(void *_table)
-{
-       void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
-       unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
-       pgtable_free(table, shift);
-}
-#else /* !CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-                                   void *table, int shift)
-{
-       pgtable_free(table, shift);
-}
-#endif /* CONFIG_SMP */
-
-static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
-                                 unsigned long address)
-{
-       tlb_flush_pgtable(tlb, address);
-       pgtable_page_dtor(table);
-       pgtable_free_tlb(tlb, page_address(table), 0);
-}
-
-#else /* if CONFIG_PPC_64K_PAGES */
-
-extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
-extern void page_table_free(struct mm_struct *, unsigned long *, int);
-extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
-#ifdef CONFIG_SMP
-extern void __tlb_remove_table(void *_table);
-#endif
-
-#ifndef __PAGETABLE_PUD_FOLDED
-/* book3s 64 is 4 level page table */
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-{
-       pgd_set(pgd, __pgtable_ptr_val(pud));
-}
-
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
-                               GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
-       kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
-}
-#endif
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
-       pud_set(pud, __pgtable_ptr_val(pmd));
-}
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
-                                      pte_t *pte)
-{
-       pmd_set(pmd, __pgtable_ptr_val(pte));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
-                               pgtable_t pte_page)
-{
-       pmd_set(pmd, __pgtable_ptr_val(pte_page));
-}
-
-static inline pgtable_t pmd_pgtable(pmd_t pmd)
-{
-       return (pgtable_t)pmd_page_vaddr(pmd);
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
-                                         unsigned long address)
-{
-       return (pte_t *)page_table_alloc(mm, address, 1);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
-                                       unsigned long address)
-{
-       return (pgtable_t)page_table_alloc(mm, address, 0);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-       page_table_free(mm, (unsigned long *)pte, 1);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-       page_table_free(mm, (unsigned long *)ptepage, 0);
-}
-
-static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
-                                 unsigned long address)
-{
-       tlb_flush_pgtable(tlb, address);
-       pgtable_free_tlb(tlb, table, 0);
-}
-#endif /* CONFIG_PPC_64K_PAGES */
-
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-       return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
-                               GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
-       kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
-}
-
-#define __pmd_free_tlb(tlb, pmd, addr)               \
-       pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
-#ifndef __PAGETABLE_PUD_FOLDED
-#define __pud_free_tlb(tlb, pud, addr)               \
-       pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
-
-#endif /* __PAGETABLE_PUD_FOLDED */
-
-#define check_pgt_cache()      do { } while (0)
-
-#endif /* _ASM_POWERPC_PGALLOC_64_H */
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h

index fc3ee06eab876ec12b20150aeffafe4e045260ae..0413457ba11dc0a08f6d86349a35efb0cae38063 100644 (file)
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -1,25 +1,12 @@
  #ifndef _ASM_POWERPC_PGALLOC_H
  #define _ASM_POWERPC_PGALLOC_H
-#ifdef __KERNEL__
  
  #include <linux/mm.h>
  
-#ifdef CONFIG_PPC_BOOK3E
-extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address);
-#else /* CONFIG_PPC_BOOK3E */
-static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
-                                    unsigned long address)
-{
-}
-#endif /* !CONFIG_PPC_BOOK3E */
-
-extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
-
-#ifdef CONFIG_PPC64
-#include <asm/pgalloc-64.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/book3s/pgalloc.h>
  #else
-#include <asm/pgalloc-32.h>
+#include <asm/nohash/pgalloc.h>
  #endif
  
-#endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h

new file mode 100644 (file)

index 0000000..e2bf208
--- /dev/null
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -0,0 +1,92 @@
+#ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H
+#define _ASM_POWERPC_PGTABLE_BE_TYPES_H
+
+#include <asm/cmpxchg.h>
+
+/* PTE level */
+typedef struct { __be64 pte; } pte_t;
+#define __pte(x)       ((pte_t) { cpu_to_be64(x) })
+static inline unsigned long pte_val(pte_t x)
+{
+       return be64_to_cpu(x.pte);
+}
+
+static inline __be64 pte_raw(pte_t x)
+{
+       return x.pte;
+}
+
+/* PMD level */
+#ifdef CONFIG_PPC64
+typedef struct { __be64 pmd; } pmd_t;
+#define __pmd(x)       ((pmd_t) { cpu_to_be64(x) })
+static inline unsigned long pmd_val(pmd_t x)
+{
+       return be64_to_cpu(x.pmd);
+}
+
+static inline __be64 pmd_raw(pmd_t x)
+{
+       return x.pmd;
+}
+
+/*
+ * 64-bit hash always uses a 4-level table. Everybody else uses 4 levels
+ * only for the 4K page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
+typedef struct { __be64 pud; } pud_t;
+#define __pud(x)       ((pud_t) { cpu_to_be64(x) })
+static inline unsigned long pud_val(pud_t x)
+{
+       return be64_to_cpu(x.pud);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC64 */
+
+/* PGD level */
+typedef struct { __be64 pgd; } pgd_t;
+#define __pgd(x)       ((pgd_t) { cpu_to_be64(x) })
+static inline unsigned long pgd_val(pgd_t x)
+{
+       return be64_to_cpu(x.pgd);
+}
+
+/* Page protection bits */
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define pgprot_val(x)  ((x).pgprot)
+#define __pgprot(x)    ((pgprot_t) { (x) })
+
+/*
+ * With hash config 64k pages additionally define a bigger "real PTE" type that
+ * gathers the "second half" part of the PTE for pseudo 64k pages
+ */
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
+typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
+#else
+typedef struct { pte_t pte; } real_pte_t;
+#endif
+
+static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
+{
+       unsigned long *p = (unsigned long *)ptep;
+       __be64 prev;
+
+       prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old),
+                                            (__force unsigned long)pte_raw(new));
+
+       return pte_raw(old) == prev;
+}
+
+static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new)
+{
+       unsigned long *p = (unsigned long *)pmdp;
+       __be64 prev;
+
+       prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pmd_raw(old),
+                                            (__force unsigned long)pmd_raw(new));
+
+       return pmd_raw(old) == prev;
+}
+
+#endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h

index 43140f8b059236946a49b0f15635ddd6e8a3c235..e7f4f3e0fcde94ba237fa2a269c32d40458cc8f2 100644 (file)
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -1,9 +1,6 @@
  #ifndef _ASM_POWERPC_PGTABLE_TYPES_H
  #define _ASM_POWERPC_PGTABLE_TYPES_H
  
-#ifdef CONFIG_STRICT_MM_TYPECHECKS
-/* These are used to make use of C type-checking. */
-
  /* PTE level */
  typedef struct { pte_basic_t pte; } pte_t;
  #define __pte(x)       ((pte_t) { (x) })
@@ -48,49 +45,6 @@ typedef struct { unsigned long pgprot; } pgprot_t;
  #define pgprot_val(x)  ((x).pgprot)
  #define __pgprot(x)    ((pgprot_t) { (x) })
  
-#else
-
-/*
- * .. while these make it easier on the compiler
- */
-
-typedef pte_basic_t pte_t;
-#define __pte(x)       (x)
-static inline pte_basic_t pte_val(pte_t pte)
-{
-       return pte;
-}
-
-#ifdef CONFIG_PPC64
-typedef unsigned long pmd_t;
-#define __pmd(x)       (x)
-static inline unsigned long pmd_val(pmd_t pmd)
-{
-       return pmd;
-}
-
-#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
-typedef unsigned long pud_t;
-#define __pud(x)       (x)
-static inline unsigned long pud_val(pud_t pud)
-{
-       return pud;
-}
-#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
-#endif /* CONFIG_PPC64 */
-
-typedef unsigned long pgd_t;
-#define __pgd(x)       (x)
-static inline unsigned long pgd_val(pgd_t pgd)
-{
-       return pgd;
-}
-
-typedef unsigned long pgprot_t;
-#define pgprot_val(x)  (x)
-#define __pgprot(x)    (x)
-
-#endif /* CONFIG_STRICT_MM_TYPECHECKS */
  /*
   * With hash config 64k pages additionally define a bigger "real PTE" type that
   * gathers the "second half" part of the PTE for pseudo 64k pages
@@ -100,4 +54,16 @@ typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
  #else
  typedef struct { pte_t pte; } real_pte_t;
  #endif
+
+#ifdef CONFIG_PPC_STD_MMU_64
+#include <asm/cmpxchg.h>
+
+static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
+{
+       unsigned long *p = (unsigned long *)ptep;
+
+       return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new));
+}
+#endif
+
  #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h

index 47897a30982d6aee476c5be51c7e81ada53158a5..ee09e99097f026ed1a8c3242cf20b447504c109b 100644 (file)
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                        struct page **pages, int *nr);
  #ifndef CONFIG_TRANSPARENT_HUGEPAGE
  #define pmd_large(pmd)         0
-#define has_transparent_hugepage() 0
  #endif
  pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
                                    bool *is_thp, unsigned *shift);
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h

index 7ab04fc59e2462917501a7f6a1b8727be3685163..1d035c1cc8898baaf00cd1eb9c5a284bf6e159d0 100644 (file)
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -131,6 +131,7 @@
  /* sorted alphabetically */
  #define PPC_INST_BHRBE                 0x7c00025c
  #define PPC_INST_CLRBHRB               0x7c00035c
+#define PPC_INST_CP_ABORT              0x7c00068c
  #define PPC_INST_DCBA                  0x7c0005ec
  #define PPC_INST_DCBA_MASK             0xfc0007fe
  #define PPC_INST_DCBAL                 0x7c2005ec
@@ -285,6 +286,7 @@
  #endif
  
  /* Deal with instructions that older assemblers aren't aware of */
+#define        PPC_CP_ABORT            stringify_in_c(.long PPC_INST_CP_ABORT)
  #define        PPC_DCBAL(a, b)         stringify_in_c(.long PPC_INST_DCBAL | \
                                         __PPC_RA(a) | __PPC_RB(b))
  #define        PPC_DCBZL(a, b)         stringify_in_c(.long PPC_INST_DCBZL | \
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h

index ca0c5bff78490eef1b3efc87b5b8f0fc68f1e727..8753e4eb9ab5b27d2a7ca7dedf720d3c3b7d0502 100644 (file)
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -33,9 +33,9 @@ extern struct pci_dev *isa_bridge_pcidev;     /* may be NULL if no ISA bus */
  struct device_node;
  struct pci_dn;
  
-typedef void *(*traverse_func)(struct device_node *me, void *data);
-void *traverse_pci_devices(struct device_node *start, traverse_func pre,
-               void *data);
+void *pci_traverse_device_nodes(struct device_node *start,
+                               void *(*fn)(struct device_node *, void *),
+                               void *data);
  void *traverse_pci_dn(struct pci_dn *root,
                       void *(*fn)(struct pci_dn *, void *),
                       void *data);
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h

index 499d9f89435a2dbe1216a58047bb3c1135932746..2b31632376a5bec7c926d12f5506a670d4a3b739 100644 (file)
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -427,7 +427,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
         li      r4,1024;                        \
         mtctr   r4;                             \
         lis     r4,KERNELBASE@h;                \
+       .machine push;                          \
+       .machine "power4";                      \
  0:     tlbie   r4;                             \
+       .machine pop;                           \
         addi    r4,r4,0x1000;                   \
         bdnz    0b
  #endif
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h

index 1ec67b0430657c590fdbf2b230fc8ca926989e6e..2eeaf80d41b7f455889bf5859eee5cd106547616 100644 (file)
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -76,6 +76,16 @@
   */
  #ifndef __ASSEMBLY__
  extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
+
+/*
+ * Don't just check for any non-zero bits in _PAGE_USER, since for book3e
+ * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
+ * _PAGE_USER.  Need to explicitly match _PAGE_BAP_UR bit in that case too.
+ */
+static inline bool pte_user(pte_t pte)
+{
+       return (pte_val(pte) & _PAGE_USER) == _PAGE_USER;
+}
  #endif /* __ASSEMBLY__ */
  
  /* Location of the PFN in the PTE. Most 32-bit platforms use the same
@@ -184,13 +194,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
  /* Make modules code happy. We don't set RO yet */
  #define PAGE_KERNEL_EXEC       PAGE_KERNEL_X
  
-/*
- * Don't just check for any non zero bits in __PAGE_USER, since for book3e
- * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
- * _PAGE_USER.  Need to explicitly match _PAGE_BAP_UR bit in that case too.
- */
-#define pte_user(val)          ((val & _PAGE_USER) == _PAGE_USER)
-
  /* Advertise special mapping type for AGP */
  #define PAGE_AGP               (PAGE_KERNEL_NC)
  #define HAVE_PAGE_AGP
@@ -198,3 +201,12 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
  /* Advertise support for _PAGE_SPECIAL */
  #define __HAVE_ARCH_PTE_SPECIAL
  
+#ifndef _PAGE_READ
+/* If _PAGE_READ is not defined, _PAGE_WRITE should not be defined either. */
+#define _PAGE_READ 0
+#define _PAGE_WRITE _PAGE_RW
+#endif
+
+#ifndef H_PAGE_4K_PFN
+#define H_PAGE_4K_PFN 0
+#endif
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h

index f5f4c66bbbc91331cb82fc731f353eb01bb0a02f..c1e82e968506302dbbf81ddf4345fc4bf26b15c5 100644 (file)
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -347,6 +347,7 @@
  #define   LPCR_LPES_SH 2
  #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
  #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
+#define   LPCR_UPRT    0x00400000      /* Use Process Table (ISA 3) */
  #ifndef SPRN_LPID
  #define SPRN_LPID      0x13F   /* Logical Partition Identifier */
  #endif
@@ -587,6 +588,7 @@
  #define SPRN_PIR       0x3FF   /* Processor Identification Register */
  #endif
  #define SPRN_TIR       0x1BE   /* Thread Identification Register */
+#define SPRN_PTCR      0x1D0   /* Partition Table Control Register */
  #define SPRN_PSPB      0x09F   /* Problem State Priority Boost reg */
  #define SPRN_PTEHI     0x3D5   /* 981 7450 PTE HI word (S/W TLB load) */
  #define SPRN_PTELO     0x3D6   /* 982 7450 PTE LO word (S/W TLB load) */
@@ -1182,6 +1184,7 @@
  #define PVR_970GX      0x0045
  #define PVR_POWER7p    0x004A
  #define PVR_POWER8E    0x004B
+#define PVR_POWER8NVL  0x004C
  #define PVR_POWER8     0x004D
  #define PVR_BE         0x0070
  #define PVR_PA6T       0x0090
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h

index 9f77f85e3e99be60ff514ee21c3708c857c3dda0..1b38eea28e5aa488b8be93d9fda4d9767b81b846 100644 (file)
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -58,6 +58,7 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
  
  #elif defined(CONFIG_PPC_STD_MMU_32)
  
+#define MMU_NO_CONTEXT      (0)
  /*
   * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx
   */
@@ -78,7 +79,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
  }
  
  #elif defined(CONFIG_PPC_STD_MMU_64)
-#include <asm/book3s/64/tlbflush-hash.h>
+#include <asm/book3s/64/tlbflush.h>
  #else
  #error Unsupported MMU type
  #endif
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h

new file mode 100644 (file)

index 0000000..6a93209
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -0,0 +1,50 @@
+#ifndef _UAPI_ASM_POWERPC_PERF_REGS_H
+#define _UAPI_ASM_POWERPC_PERF_REGS_H
+
+enum perf_event_powerpc_regs {
+       PERF_REG_POWERPC_R0,
+       PERF_REG_POWERPC_R1,
+       PERF_REG_POWERPC_R2,
+       PERF_REG_POWERPC_R3,
+       PERF_REG_POWERPC_R4,
+       PERF_REG_POWERPC_R5,
+       PERF_REG_POWERPC_R6,
+       PERF_REG_POWERPC_R7,
+       PERF_REG_POWERPC_R8,
+       PERF_REG_POWERPC_R9,
+       PERF_REG_POWERPC_R10,
+       PERF_REG_POWERPC_R11,
+       PERF_REG_POWERPC_R12,
+       PERF_REG_POWERPC_R13,
+       PERF_REG_POWERPC_R14,
+       PERF_REG_POWERPC_R15,
+       PERF_REG_POWERPC_R16,
+       PERF_REG_POWERPC_R17,
+       PERF_REG_POWERPC_R18,
+       PERF_REG_POWERPC_R19,
+       PERF_REG_POWERPC_R20,
+       PERF_REG_POWERPC_R21,
+       PERF_REG_POWERPC_R22,
+       PERF_REG_POWERPC_R23,
+       PERF_REG_POWERPC_R24,
+       PERF_REG_POWERPC_R25,
+       PERF_REG_POWERPC_R26,
+       PERF_REG_POWERPC_R27,
+       PERF_REG_POWERPC_R28,
+       PERF_REG_POWERPC_R29,
+       PERF_REG_POWERPC_R30,
+       PERF_REG_POWERPC_R31,
+       PERF_REG_POWERPC_NIP,
+       PERF_REG_POWERPC_MSR,
+       PERF_REG_POWERPC_ORIG_R3,
+       PERF_REG_POWERPC_CTR,
+       PERF_REG_POWERPC_LINK,
+       PERF_REG_POWERPC_XER,
+       PERF_REG_POWERPC_CCR,
+       PERF_REG_POWERPC_SOFTE,
+       PERF_REG_POWERPC_TRAP,
+       PERF_REG_POWERPC_DAR,
+       PERF_REG_POWERPC_DSISR,
+       PERF_REG_POWERPC_MAX,
+};
+#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c

index c9370d4e36bdadf580f2d9fde751c550add3ffea..9ea09551a2cd4f67a5b04f7eea1f62f7de3a724c 100644 (file)
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -438,7 +438,11 @@ int main(void)
         DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
  #endif
  
+#ifdef MAX_PGD_TABLE_SIZE
+       DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE);
+#else
         DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
+#endif
         DEFINE(PTE_SIZE, sizeof(pte_t));
  
  #ifdef CONFIG_KVM
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c

index 41c011cb60706638145534514195741f27f65e1a..8275858a434d9fc2fe9ad8ba8700ac46b1e80125 100644 (file)
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -162,7 +162,7 @@ void btext_map(void)
         offset = ((unsigned long) dispDeviceBase) - base;
         size = dispDeviceRowBytes * dispDeviceRect[3] + offset
                 + dispDeviceRect[0];
-       vbase = __ioremap(base, size, _PAGE_NO_CACHE);
+       vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0))));
         if (vbase == 0)
                 return;
         logicalDisplayBase = vbase + offset;
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c

index 6c662b8de90d6430a15e4c7f903ccffe74727f58..eeeacf6235a36d50ab05d86f3cca18a936298df3 100644 (file)
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -63,7 +63,6 @@ extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec);
  extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec);
  extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec);
  extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec);
-extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec);
  extern void __restore_cpu_pa6t(void);
  extern void __restore_cpu_ppc970(void);
  extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec);
@@ -72,7 +71,6 @@ extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
  extern void __restore_cpu_power8(void);
  extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec);
  extern void __restore_cpu_power9(void);
-extern void __restore_cpu_a2(void);
  extern void __flush_tlb_power7(unsigned int action);
  extern void __flush_tlb_power8(unsigned int action);
  extern void __flush_tlb_power9(unsigned int action);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c

index 6544017eb90b837f99e78681b9c149d0c3a9f74d..c9bc78e9c6101b2ae5016efec858c5c38b0f4158 100644 (file)
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -48,7 +48,7 @@
  
  
  /** Overview:
- *  EEH, or "Extended Error Handling" is a PCI bridge technology for
+ *  EEH, or "Enhanced Error Handling" is a PCI bridge technology for
   *  dealing with PCI bus errors that can't be dealt with within the
   *  usual PCI framework, except by check-stopping the CPU.  Systems
   *  that are designed for high-availability/reliability cannot afford
@@ -1068,7 +1068,7 @@ void eeh_add_device_early(struct pci_dn *pdn)
         struct pci_controller *phb;
         struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
  
-       if (!edev || !eeh_enabled())
+       if (!edev)
                 return;
  
         if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))
@@ -1336,14 +1336,11 @@ static int eeh_pe_change_owner(struct eeh_pe *pe)
                             id->subdevice != pdev->subsystem_device)
                                 continue;
  
-                       goto reset;
+                       return eeh_pe_reset_and_recover(pe);
                 }
         }
  
         return eeh_unfreeze_pe(pe, true);
-
-reset:
-       return eeh_pe_reset_and_recover(pe);
  }
  
  /**
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c

index fb6207d2c604b4bd9b24d35c57454c080383775a..2714a3b81d24476ad07d651c64720f42ab96bcec 100644 (file)
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -171,6 +171,16 @@ static void *eeh_dev_save_state(void *data, void *userdata)
         if (!edev)
                 return NULL;
  
+       /*
+        * We cannot access the config space on some adapters.
+        * Otherwise, it will cause fenced PHB. We don't save
+        * the content in their config space and will restore
+        * from the initial config space saved when the EEH
+        * device is created.
+        */
+       if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
+               return NULL;
+
         pdev = eeh_dev_to_pci_dev(edev);
         if (!pdev)
                 return NULL;
@@ -312,6 +322,19 @@ static void *eeh_dev_restore_state(void *data, void *userdata)
         if (!edev)
                 return NULL;
  
+       /*
+        * The content in the config space isn't saved because
+        * of the blocked config space on some adapters. We have
+        * to restore the initial config space saved when the
+        * EEH device is created.
+        */
+       if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
+               if (list_is_last(&edev->list, &edev->pe->edevs))
+                       eeh_pe_restore_bars(edev->pe);
+
+               return NULL;
+       }
+
         pdev = eeh_dev_to_pci_dev(edev);
         if (!pdev)
                 return NULL;
@@ -552,7 +575,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
  
  int eeh_pe_reset_and_recover(struct eeh_pe *pe)
  {
-       int result, ret;
+       int ret;
  
         /* Bail if the PE is being recovered */
         if (pe->state & EEH_PE_RECOVERING)
@@ -564,9 +587,6 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
         /* Save states */
         eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
  
-       /* Report error */
-       eeh_pe_dev_traverse(pe, eeh_report_error, &result);
-
         /* Issue reset */
         ret = eeh_reset_pe(pe);
         if (ret) {
@@ -581,15 +601,9 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
                 return ret;
         }
  
-       /* Notify completion of reset */
-       eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
-
         /* Restore device state */
         eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
  
-       /* Resume */
-       eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
-
         /* Clear recovery mode */
         eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
  
@@ -621,7 +635,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
          * We don't remove the corresponding PE instances because
          * we need the information afterwords. The attached EEH
          * devices are expected to be attached soon when calling
-        * into pcibios_add_pci_devices().
+        * into pci_hp_add_devices().
          */
         eeh_pe_state_mark(pe, EEH_PE_KEEP);
         if (bus) {
@@ -630,7 +644,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
                 } else {
                         eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                         pci_lock_rescan_remove();
-                       pcibios_remove_pci_devices(bus);
+                       pci_hp_remove_devices(bus);
                         pci_unlock_rescan_remove();
                 }
         } else if (frozen_bus) {
@@ -681,7 +695,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
                 if (pe->type & EEH_PE_VF)
                         eeh_add_virt_device(edev, NULL);
                 else
-                       pcibios_add_pci_devices(bus);
+                       pci_hp_add_devices(bus);
         } else if (frozen_bus && rmv_data->removed) {
                 pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
                 ssleep(5);
@@ -691,7 +705,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
                 if (pe->type & EEH_PE_VF)
                         eeh_add_virt_device(edev, NULL);
                 else
-                       pcibios_add_pci_devices(frozen_bus);
+                       pci_hp_add_devices(frozen_bus);
         }
         eeh_pe_state_clear(pe, EEH_PE_KEEP);
  
@@ -896,7 +910,7 @@ perm_error:
                         eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
  
                         pci_lock_rescan_remove();
-                       pcibios_remove_pci_devices(frozen_bus);
+                       pci_hp_remove_devices(frozen_bus);
                         pci_unlock_rescan_remove();
                 }
         }
@@ -981,7 +995,7 @@ static void eeh_handle_special_event(void)
                                 bus = eeh_pe_bus_get(phb_pe);
                                 eeh_pe_dev_traverse(pe,
                                         eeh_report_failure, NULL);
-                               pcibios_remove_pci_devices(bus);
+                               pci_hp_remove_devices(bus);
                         }
                         pci_unlock_rescan_remove();
                 }
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c

index 4eefb6e34dbb2f6edbf4990349e9c11b0b5d07f8..82e7327e3cd0ec7cec0e0655dd75ce1b55465968 100644 (file)
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -36,7 +36,7 @@
  
  static DEFINE_SPINLOCK(eeh_eventlist_lock);
  static struct semaphore eeh_eventlist_sem;
-LIST_HEAD(eeh_eventlist);
+static LIST_HEAD(eeh_eventlist);
  
  /**
   * eeh_event_handler - Dispatch EEH events.
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c

index eea48d8baf4944ac053b0a34ba706afa6ad4b43c..f0520da857594cecf04a7da0a7c4e3875781e584 100644 (file)
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -249,7 +249,7 @@ static void *__eeh_pe_get(void *data, void *flag)
         } else {
                 if (edev->pe_config_addr &&
                     (edev->pe_config_addr == pe->addr))
-               return pe;
+                       return pe;
         }
  
         /* Try BDF address */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S

index 39a79c89a4b6e8edab082c8e35cda58ef2b00ddf..73e461a3dfbbd2926b191e6733739f6566563f28 100644 (file)
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -37,6 +37,7 @@
  #include <asm/hw_irq.h>
  #include <asm/context_tracking.h>
  #include <asm/tm.h>
+#include <asm/ppc-opcode.h>
  
  /*
   * System calls.
@@ -509,6 +510,14 @@ BEGIN_FTR_SECTION
         ldarx   r6,0,r1
  END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
  
+BEGIN_FTR_SECTION
+/*
+ * A cp_abort (copy paste abort) here ensures that when context switching, a
+ * copy from one process can't leak into the paste of another.
+ */
+       PPC_CP_ABORT
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
  #ifdef CONFIG_PPC_BOOK3S
  /* Cancel all explict user streams as they will have no use after context
   * switch and will stop the HW from creating streams itself
@@ -520,7 +529,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
         std     r6,PACACURRENT(r13)     /* Set new 'current' */
  
         ld      r8,KSP(r4)      /* new stack pointer */
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_PPC_STD_MMU_64
+BEGIN_MMU_FTR_SECTION
+       b       2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
  BEGIN_FTR_SECTION
         clrrdi  r6,r8,28        /* get its ESID */
         clrrdi  r9,r1,28        /* get current sp ESID */
@@ -566,7 +578,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
         slbmte  r7,r0
         isync
  2:
-#endif /* !CONFIG_PPC_BOOK3S */
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
         CURRENT_THREAD_INFO(r7, r8)  /* base of new stack */
         /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S

index 7716cebf4b8ea086171ab326ba72079a758e2e79..4c9440629128ccd94dbc35d5eb0b6e060d4d970f 100644 (file)
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -189,7 +189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
  #endif /* CONFIG_PPC_P7_NAP */
         EXCEPTION_PROLOG_0(PACA_EXMC)
  BEGIN_FTR_SECTION
-       b       machine_check_pSeries_early
+       b       machine_check_powernv_early
  FTR_SECTION_ELSE
         b       machine_check_pSeries_0
  ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
@@ -209,11 +209,6 @@ data_access_slb_pSeries:
         EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
         std     r3,PACA_EXSLB+EX_R3(r13)
         mfspr   r3,SPRN_DAR
-#ifdef __DISABLED__
-       /* Keep that around for when we re-implement dynamic VSIDs */
-       cmpdi   r3,0
-       bge     slb_miss_user_pseries
-#endif /* __DISABLED__ */
         mfspr   r12,SPRN_SRR1
  #ifndef CONFIG_RELOCATABLE
         b       slb_miss_realmode
@@ -240,11 +235,6 @@ instruction_access_slb_pSeries:
         EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
         std     r3,PACA_EXSLB+EX_R3(r13)
         mfspr   r3,SPRN_SRR0            /* SRR0 is faulting address */
-#ifdef __DISABLED__
-       /* Keep that around for when we re-implement dynamic VSIDs */
-       cmpdi   r3,0
-       bge     slb_miss_user_pseries
-#endif /* __DISABLED__ */
         mfspr   r12,SPRN_SRR1
  #ifndef CONFIG_RELOCATABLE
         b       slb_miss_realmode
@@ -443,7 +433,7 @@ denorm_exception_hv:
  
         .align  7
         /* moved from 0x200 */
-machine_check_pSeries_early:
+machine_check_powernv_early:
  BEGIN_FTR_SECTION
         EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
         /*
@@ -709,34 +699,6 @@ system_reset_fwnmi:
  
  #endif /* CONFIG_PPC_PSERIES */
  
-#ifdef __DISABLED__
-/*
- * This is used for when the SLB miss handler has to go virtual,
- * which doesn't happen for now anymore but will once we re-implement
- * dynamic VSIDs for shared page tables
- */
-slb_miss_user_pseries:
-       std     r10,PACA_EXGEN+EX_R10(r13)
-       std     r11,PACA_EXGEN+EX_R11(r13)
-       std     r12,PACA_EXGEN+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       ld      r11,PACA_EXSLB+EX_R9(r13)
-       ld      r12,PACA_EXSLB+EX_R3(r13)
-       std     r10,PACA_EXGEN+EX_R13(r13)
-       std     r11,PACA_EXGEN+EX_R9(r13)
-       std     r12,PACA_EXGEN+EX_R3(r13)
-       clrrdi  r12,r13,32
-       mfmsr   r10
-       mfspr   r11,SRR0                        /* save SRR0 */
-       ori     r12,r12,slb_miss_user_common@l  /* virt addr of handler */
-       ori     r10,r10,MSR_IR|MSR_DR|MSR_RI
-       mtspr   SRR0,r12
-       mfspr   r12,SRR1                        /* and SRR1 */
-       mtspr   SRR1,r10
-       rfid
-       b       .                               /* prevent spec. execution */
-#endif /* __DISABLED__ */
-
  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
  kvmppc_skip_interrupt:
         /*
@@ -764,11 +726,10 @@ kvmppc_skip_Hinterrupt:
  #endif
  
  /*
- * Code from here down to __end_handlers is invoked from the
- * exception prologs above.  Because the prologs assemble the
- * addresses of these handlers using the LOAD_HANDLER macro,
- * which uses an ori instruction, these handlers must be in
- * the first 64k of the kernel image.
+ * Ensure that any handlers that get invoked from the exception prologs
+ * above are within the first 64KB (0x10000) of the kernel image because
+ * the prologs assemble the addresses of these handlers using the
+ * LOAD_HANDLER macro, which uses an ori instruction.
   */
  
  /*** Common interrupt handlers ***/
@@ -953,11 +914,6 @@ hv_facility_unavailable_relon_trampoline:
  #endif
         STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
  
-       /* Other future vectors */
-       .align  7
-       .globl  __end_interrupts
-__end_interrupts:
-
         .align  7
  system_call_entry:
         b       system_call_common
@@ -983,7 +939,13 @@ data_access_common:
         ld      r3,PACA_EXGEN+EX_DAR(r13)
         lwz     r4,PACA_EXGEN+EX_DSISR(r13)
         li      r5,0x300
+       std     r3,_DAR(r1)
+       std     r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
         b       do_hash_page            /* Try to handle as hpte fault */
+MMU_FTR_SECTION_ELSE
+       b       handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
  
         .align  7
         .globl  h_data_storage_common
@@ -1008,73 +970,15 @@ instruction_access_common:
         ld      r3,_NIP(r1)
         andis.  r4,r12,0x5820
         li      r5,0x400
+       std     r3,_DAR(r1)
+       std     r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
         b       do_hash_page            /* Try to handle as hpte fault */
-
-       STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
-
-/*
- * Here is the common SLB miss user that is used when going to virtual
- * mode for SLB misses, that is currently not used
- */
-#ifdef __DISABLED__
-       .align  7
-       .globl  slb_miss_user_common
-slb_miss_user_common:
-       mflr    r10
-       std     r3,PACA_EXGEN+EX_DAR(r13)
-       stw     r9,PACA_EXGEN+EX_CCR(r13)
-       std     r10,PACA_EXGEN+EX_LR(r13)
-       std     r11,PACA_EXGEN+EX_SRR0(r13)
-       bl      slb_allocate_user
-
-       ld      r10,PACA_EXGEN+EX_LR(r13)
-       ld      r3,PACA_EXGEN+EX_R3(r13)
-       lwz     r9,PACA_EXGEN+EX_CCR(r13)
-       ld      r11,PACA_EXGEN+EX_SRR0(r13)
-       mtlr    r10
-       beq-    slb_miss_fault
-
-       andi.   r10,r12,MSR_RI          /* check for unrecoverable exception */
-       beq-    unrecov_user_slb
-       mfmsr   r10
-
-.machine push
-.machine "power4"
-       mtcrf   0x80,r9
-.machine pop
-
-       clrrdi  r10,r10,2               /* clear RI before setting SRR0/1 */
-       mtmsrd  r10,1
-
-       mtspr   SRR0,r11
-       mtspr   SRR1,r12
-
-       ld      r9,PACA_EXGEN+EX_R9(r13)
-       ld      r10,PACA_EXGEN+EX_R10(r13)
-       ld      r11,PACA_EXGEN+EX_R11(r13)
-       ld      r12,PACA_EXGEN+EX_R12(r13)
-       ld      r13,PACA_EXGEN+EX_R13(r13)
-       rfid
-       b       .
-
-slb_miss_fault:
-       EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN)
-       ld      r4,PACA_EXGEN+EX_DAR(r13)
-       li      r5,0
-       std     r4,_DAR(r1)
-       std     r5,_DSISR(r1)
+MMU_FTR_SECTION_ELSE
         b       handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
  
-unrecov_user_slb:
-       EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
-       RECONCILE_IRQ_STATE(r10, r11)
-       bl      save_nvgprs
-1:     addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      unrecoverable_exception
-       b       1b
-
-#endif /* __DISABLED__ */
-
+       STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
  
         /*
          * Machine check is different because we use a different
@@ -1230,10 +1134,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
         STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
         STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
  
-       .align  7
-       .globl  __end_handlers
-__end_handlers:
-
         /* Equivalents to the above handlers for relocation-on interrupt vectors */
         STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
         MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
@@ -1244,6 +1144,17 @@ __end_handlers:
         STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
         STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
  
+       /*
+        * The __end_interrupts marker must be past the out-of-line (OOL)
+        * handlers, so that they are copied to real address 0x100 when running
+        * a relocatable kernel. This ensures they can be reached from the short
+        * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch
+        * directly, without using LOAD_HANDLER().
+        */
+       .align  7
+       .globl  __end_interrupts
+__end_interrupts:
+
  #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
  /*
   * Data area reserved for FWNMI option.
@@ -1476,8 +1387,11 @@ slb_miss_realmode:
         stw     r9,PACA_EXSLB+EX_CCR(r13)       /* save CR in exc. frame */
         std     r10,PACA_EXSLB+EX_LR(r13)       /* save LR */
  
+#ifdef CONFIG_PPC_STD_MMU_64
+BEGIN_MMU_FTR_SECTION
         bl      slb_allocate_realmode
-
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX)
+#endif
         /* All done -- return from exception. */
  
         ld      r10,PACA_EXSLB+EX_LR(r13)
@@ -1485,7 +1399,9 @@ slb_miss_realmode:
         lwz     r9,PACA_EXSLB+EX_CCR(r13)       /* get saved CR */
  
         mtlr    r10
-
+BEGIN_MMU_FTR_SECTION
+       b       2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
         andi.   r10,r12,MSR_RI  /* check for unrecoverable exception */
         beq-    2f
  
@@ -1536,9 +1452,7 @@ power4_fixup_nap:
   */
         .align  7
  do_hash_page:
-       std     r3,_DAR(r1)
-       std     r4,_DSISR(r1)
-
+#ifdef CONFIG_PPC_STD_MMU_64
         andis.  r0,r4,0xa410            /* weird error? */
         bne-    handle_page_fault       /* if not, try to insert a HPTE */
         andis.  r0,r4,DSISR_DABRMATCH@h
@@ -1566,6 +1480,7 @@ do_hash_page:
  
         /* Error */
         blt-    13f
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
  /* Here we have a page fault that hash_page can't handle. */
  handle_page_fault:
@@ -1592,6 +1507,7 @@ handle_dabr_fault:
  12:    b       ret_from_except_lite
  
  
+#ifdef CONFIG_PPC_STD_MMU_64
  /* We have a page fault that hash_page could handle but HV refused
   * the PTE insertion
   */
@@ -1601,6 +1517,7 @@ handle_dabr_fault:
         ld      r4,_DAR(r1)
         bl      low_hash_fault
         b       ret_from_except
+#endif
  
  /*
   * We come here as a result of a DSI at a point where we don't want
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c

index 9dac18dabd03c5a5f2135aee190a463212393fee..1123a4d8d8dd60411c99342b607d9cda26e4be15 100644 (file)
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr)
         return sys_call_table[nr*2];
  }
  #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
+
+#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2)   /* NOTE(review): presumably the ELFv1 ABI, where symbol names carry a leading '.' -- confirm */
+char *arch_ftrace_match_adjust(char *str, const char *search)  /* returns str, advanced past a leading '.' when search does not start with one */
+{
+       if (str[0] == '.' && search[0] != '.')  /* only str begins with a dot */
+               return str + 1;                 /* skip that dot */
+       else
+               return str;                     /* otherwise hand str back unchanged */
+}
+#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S

index 4286775cbde9c36b67b6188615f8c93d41efdb5b..2d14774af6b41c6628b3b0fdb47875648e1179f0 100644 (file)
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -973,13 +973,16 @@ start_here_common:
   * This stuff goes at the beginning of the bss, which is page-aligned.
   */
         .section ".bss"
+/*
+ * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K.
+ * We will need to find a better way to fix this
+ */
+       .align  16
  
-       .align  PAGE_SHIFT
+       .globl  swapper_pg_dir
+swapper_pg_dir:
+       .space  PGD_TABLE_SIZE
  
         .globl  empty_zero_page
  empty_zero_page:
         .space  PAGE_SIZE
-
-       .globl  swapper_pg_dir
-swapper_pg_dir:
-       .space  PGD_TABLE_SIZE
diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c

index ac86c53e25428baaad22800ae1cedb2775680732..a89f4f7a66bdb351a59dea446996cf64a58c373b 100644 (file)
--- a/arch/powerpc/kernel/ibmebus.c
+++ b/arch/powerpc/kernel/ibmebus.c
@@ -408,7 +408,7 @@ static ssize_t modalias_show(struct device *dev,
         return len+1;
  }
  
-struct device_attribute ibmebus_bus_device_attrs[] = {
+static struct device_attribute ibmebus_bus_device_attrs[] = {
         __ATTR_RO(devspec),
         __ATTR_RO(name),
         __ATTR_RO(modalias),
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c

index 0f19970979608208b385aa0606c5461747673238..ae1316106e2b59ac7de6803982b3751b8afa8918 100644 (file)
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -109,14 +109,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
                 size = 0x10000;
  
         __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
-                    size, _PAGE_NO_CACHE|_PAGE_GUARDED);
+                    size, pgprot_val(pgprot_noncached(__pgprot(0))));
         return;
  
  inval_range:
         printk(KERN_ERR "no ISA IO ranges or unexpected isa range, "
                "mapping 64k\n");
         __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
-                    0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED);
+                    0x10000, pgprot_val(pgprot_noncached(__pgprot(0))));
  }
  
  
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c

index 015ae55c18686ffb794938531aa52692e0843b0e..2694d078741d08c9f32a05c7ea6323867ddde63d 100644 (file)
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -228,17 +228,12 @@ static struct property memory_limit_prop = {
  
  static void __init export_crashk_values(struct device_node *node)
  {
-       struct property *prop;
-
         /* There might be existing crash kernel properties, but we can't
          * be sure what's in them, so remove them. */
-       prop = of_find_property(node, "linux,crashkernel-base", NULL);
-       if (prop)
-               of_remove_property(node, prop);
-
-       prop = of_find_property(node, "linux,crashkernel-size", NULL);
-       if (prop)
-               of_remove_property(node, prop);
+       of_remove_property(node, of_find_property(node,
+                               "linux,crashkernel-base", NULL));
+       of_remove_property(node, of_find_property(node,
+                               "linux,crashkernel-size", NULL));
  
         if (crashk_res.start != 0) {
                 crashk_base = cpu_to_be_ulong(crashk_res.start),
@@ -258,16 +253,13 @@ static void __init export_crashk_values(struct device_node *node)
  static int __init kexec_setup(void)
  {
         struct device_node *node;
-       struct property *prop;
  
         node = of_find_node_by_path("/chosen");
         if (!node)
                 return -ENOENT;
  
         /* remove any stale properties so ours can be found */
-       prop = of_find_property(node, kernel_end_prop.name, NULL);
-       if (prop)
-               of_remove_property(node, prop);
+       of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL));
  
         /* information needed by userspace when using default_machine_kexec */
         kernel_end = cpu_to_be_ulong(__pa(_end));
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c

index 0fbd75d185d7e5dd315341bfaecaef9f01dc794f..b8c202d63ecb83d9a1afded686af1c268e2c4cb6 100644 (file)
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -76,6 +76,7 @@ int default_machine_kexec_prepare(struct kimage *image)
          * end of the blocked region (begin >= high).  Use the
          * boolean identity !(a || b)  === (!a && !b).
          */
+#ifdef CONFIG_PPC_STD_MMU_64
         if (htab_address) {
                 low = __pa(htab_address);
                 high = low + htab_size_bytes;
@@ -88,6 +89,7 @@ int default_machine_kexec_prepare(struct kimage *image)
                                 return -ETXTBSY;
                 }
         }
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
         /* We also should not overwrite the tce tables */
         for_each_node_by_type(node, "pci") {
@@ -381,7 +383,7 @@ void default_machine_kexec(struct kimage *image)
         /* NOTREACHED */
  }
  
-#ifndef CONFIG_PPC_BOOK3E
+#ifdef CONFIG_PPC_STD_MMU_64
  /* Values we need to export to the second kernel via the device tree. */
  static unsigned long htab_base;
  static unsigned long htab_size;
@@ -401,7 +403,6 @@ static struct property htab_size_prop = {
  static int __init export_htab_values(void)
  {
         struct device_node *node;
-       struct property *prop;
  
         /* On machines with no htab htab_address is NULL */
         if (!htab_address)
@@ -412,12 +413,8 @@ static int __init export_htab_values(void)
                 return -ENODEV;
  
         /* remove any stale propertys so ours can be found */
-       prop = of_find_property(node, htab_base_prop.name, NULL);
-       if (prop)
-               of_remove_property(node, prop);
-       prop = of_find_property(node, htab_size_prop.name, NULL);
-       if (prop)
-               of_remove_property(node, prop);
+       of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL));
+       of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL));
  
         htab_base = cpu_to_be64(__pa(htab_address));
         of_add_property(node, &htab_base_prop);
@@ -428,4 +425,4 @@ static int __init export_htab_values(void)
         return 0;
  }
  late_initcall(export_htab_values);
-#endif /* !CONFIG_PPC_BOOK3E */
+#endif /* CONFIG_PPC_STD_MMU_64 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c

index 671fd5122406fc8c13c94713408c6c70d6bb7761..ef267fd9dd225a3c7f3dfbcacc057a0bf28dca8e 100644 (file)
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -37,7 +37,7 @@ static DEFINE_PER_CPU(int, mce_queue_count);
  static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
  
  static void machine_check_process_queued_event(struct irq_work *work);
-struct irq_work mce_event_process_work = {
+static struct irq_work mce_event_process_work = {
          .func = machine_check_process_queued_event,
  };
  
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c

index ee62b197502d77e93ac860ab9e495db38eedc11e..7353991c4ecee6d8a6ecacdce4ab96815ffdfb12 100644 (file)
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -72,11 +72,15 @@ void __flush_tlb_power8(unsigned int action)
  
  void __flush_tlb_power9(unsigned int action)
  {
+       if (radix_enabled())    /* additional flush issued only when radix_enabled() is true */
+               flush_tlb_206(POWER9_TLB_SETS_RADIX, action);   /* NOTE(review): no else branch -- the HASH flush below still runs in this case; confirm that is intended */
+
         flush_tlb_206(POWER9_TLB_SETS_HASH, action);
  }
  
  
  /* flush SLBs and reload */
+#ifdef CONFIG_PPC_STD_MMU_64
  static void flush_and_reload_slb(void)
  {
         struct slb_shadow *slb;
@@ -110,6 +114,7 @@ static void flush_and_reload_slb(void)
                 asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
         }
  }
+#endif
  
  static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
  {
@@ -120,6 +125,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
          * reset the error bits whenever we handle them so that at the end
          * we can check whether we handled all of them or not.
          * */
+#ifdef CONFIG_PPC_STD_MMU_64
         if (dsisr & slb_error_bits) {
                 flush_and_reload_slb();
                 /* reset error bits */
@@ -131,6 +137,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
                 /* reset error bits */
                 dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
         }
+#endif
         /* Any other errors we don't understand? */
         if (dsisr & 0xffffffffUL)
                 handled = 0;
@@ -150,6 +157,7 @@ static long mce_handle_common_ierror(uint64_t srr1)
         switch (P7_SRR1_MC_IFETCH(srr1)) {
         case 0:
                 break;
+#ifdef CONFIG_PPC_STD_MMU_64
         case P7_SRR1_MC_IFETCH_SLB_PARITY:
         case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
                 /* flush and reload SLBs for SLB errors. */
@@ -162,6 +170,7 @@ static long mce_handle_common_ierror(uint64_t srr1)
                         handled = 1;
                 }
                 break;
+#endif
         default:
                 break;
         }
@@ -175,10 +184,12 @@ static long mce_handle_ierror_p7(uint64_t srr1)
  
         handled = mce_handle_common_ierror(srr1);
  
+#ifdef CONFIG_PPC_STD_MMU_64
         if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
                 flush_and_reload_slb();
                 handled = 1;
         }
+#endif
         return handled;
  }
  
@@ -321,10 +332,12 @@ static long mce_handle_ierror_p8(uint64_t srr1)
  
         handled = mce_handle_common_ierror(srr1);
  
+#ifdef CONFIG_PPC_STD_MMU_64
         if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
                 flush_and_reload_slb();
                 handled = 1;
         }
+#endif
         return handled;
  }
  
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S

index bf5160fbf9d8437c11afae2b75a5b3d29bab79ff..285ca8c6cc2ea5cd9472466c6d901e81b742712c 100644 (file)
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -599,12 +599,6 @@ _GLOBAL(__bswapdi2)
         mr      r4,r10
         blr
  
-_GLOBAL(abs)
-       srawi   r4,r3,31
-       xor     r3,r3,r4
-       sub     r3,r3,r4
-       blr
-
  #ifdef CONFIG_SMP
  _GLOBAL(start_secondary_resume)
         /* Reset stack */
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c

index 0cab9e8c37948685b8128bd156a15b1ef40e6936..856f9a7944cd91052582cd1fa5531c4bc6c9f9ea 100644 (file)
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -15,8 +15,6 @@
   *       parsing code.
   */
  
-#include <linux/module.h>
-
  #include <linux/types.h>
  #include <linux/errno.h>
  #include <linux/fs.h>
@@ -1231,12 +1229,4 @@ static int __init nvram_init(void)
         
         return rc;
  }
-
-static void __exit nvram_cleanup(void)
-{
-        misc_deregister( &nvram_dev );
-}
-
-module_init(nvram_init);
-module_exit(nvram_cleanup);
-MODULE_LICENSE("GPL");
+device_initcall(nvram_init);
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c

index 59c436189f466fbef0119378028184198ca5a863..2d71269e7dc108f3eb0d26e24b011ed57fecc35d 100644 (file)
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -21,6 +21,35 @@
  #include <asm/firmware.h>
  #include <asm/eeh.h>
  
+static struct pci_bus *find_bus_among_children(struct pci_bus *bus,
+                                              struct device_node *dn)
+{
+       struct pci_bus *child = NULL;
+       struct pci_bus *tmp;
+
+       if (pci_bus_to_OF_node(bus) == dn)
+               return bus;
+
+       list_for_each_entry(tmp, &bus->children, node) {
+               child = find_bus_among_children(tmp, dn);
+               if (child)
+                       break;
+       }
+
+       return child;
+}
+
+struct pci_bus *pci_find_bus_by_node(struct device_node *dn)
+{
+       struct pci_dn *pdn = PCI_DN(dn);
+
+       if (!pdn  || !pdn->phb || !pdn->phb->bus)
+               return NULL;
+
+       return find_bus_among_children(pdn->phb->bus, dn);
+}
+EXPORT_SYMBOL_GPL(pci_find_bus_by_node);
+
  /**
   * pcibios_release_device - release PCI device
   * @dev: PCI device
@@ -38,20 +67,20 @@ void pcibios_release_device(struct pci_dev *dev)
  }
  
  /**
- * pcibios_remove_pci_devices - remove all devices under this bus
+ * pci_hp_remove_devices - remove all devices under this bus
   * @bus: the indicated PCI bus
   *
   * Remove all of the PCI devices under this bus both from the
   * linux pci device tree, and from the powerpc EEH address cache.
   */
-void pcibios_remove_pci_devices(struct pci_bus *bus)
+void pci_hp_remove_devices(struct pci_bus *bus)
  {
         struct pci_dev *dev, *tmp;
         struct pci_bus *child_bus;
  
         /* First go down child busses */
         list_for_each_entry(child_bus, &bus->children, node)
-               pcibios_remove_pci_devices(child_bus);
+               pci_hp_remove_devices(child_bus);
  
         pr_debug("PCI: Removing devices on bus %04x:%02x\n",
                  pci_domain_nr(bus),  bus->number);
@@ -60,11 +89,10 @@ void pcibios_remove_pci_devices(struct pci_bus *bus)
                 pci_stop_and_remove_bus_device(dev);
         }
  }
-
-EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+EXPORT_SYMBOL_GPL(pci_hp_remove_devices);
  
  /**
- * pcibios_add_pci_devices - adds new pci devices to bus
+ * pci_hp_add_devices - adds new pci devices to bus
   * @bus: the indicated PCI bus
   *
   * This routine will find and fixup new pci devices under
@@ -74,7 +102,7 @@ EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
   * is how this routine differs from other, similar pcibios
   * routines.)
   */
-void pcibios_add_pci_devices(struct pci_bus * bus)
+void pci_hp_add_devices(struct pci_bus *bus)
  {
         int slotno, mode, pass, max;
         struct pci_dev *dev;
@@ -92,7 +120,8 @@ void pcibios_add_pci_devices(struct pci_bus * bus)
         if (mode == PCI_PROBE_DEVTREE) {
                 /* use ofdt-based probe */
                 of_rescan_bus(dn, bus);
-       } else if (mode == PCI_PROBE_NORMAL) {
+       } else if (mode == PCI_PROBE_NORMAL &&
+                  dn->child && PCI_DN(dn->child)) {
                 /*
                  * Use legacy probe. In the partial hotplug case, we
                  * probably have grandchildren devices unplugged. So
@@ -114,4 +143,4 @@ void pcibios_add_pci_devices(struct pci_bus * bus)
         }
         pcibios_finish_adding_to_bus(bus);
  }
-EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
+EXPORT_SYMBOL_GPL(pci_hp_add_devices);
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c

index 60bb187cb46ac0cc64c67a4f19fb1687492a25f1..3759df52bd671d883c38aec23fe4b133e6c0d0f7 100644 (file)
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -38,7 +38,7 @@
   * ISA drivers use hard coded offsets.  If no ISA bus exists nothing
   * is mapped on the first 64K of IO space
   */
-unsigned long pci_io_base = ISA_IO_BASE;
+unsigned long pci_io_base;
  EXPORT_SYMBOL(pci_io_base);
  
  static int __init pcibios_init(void)
@@ -47,6 +47,7 @@ static int __init pcibios_init(void)
  
         printk(KERN_INFO "PCI: Probing PCI hardware\n");
  
+       pci_io_base = ISA_IO_BASE;
         /* For now, override phys_mem_access_prot. If we need it,g
          * later, we may move that initialization to each ppc_md
          */
@@ -159,7 +160,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
  
         /* Establish the mapping */
         if (__ioremap_at(phys_page, area->addr, size_page,
-                        _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)
+                        pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)
                 return -ENOMEM;
  
         /* Fixup hose IO resource */
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c

index 38102cb9baa968223dd4bfb353c544928e91984c..ecdccce787193dccbe8a851e0ef4e3cd3e6fee54 100644 (file)
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -282,13 +282,9 @@ void remove_dev_pci_data(struct pci_dev *pdev)
  #endif /* CONFIG_PCI_IOV */
  }
  
-/*
- * Traverse_func that inits the PCI fields of the device node.
- * NOTE: this *must* be done before read/write config to the device.
- */
-void *update_dn_pci_info(struct device_node *dn, void *data)
+struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
+                                       struct device_node *dn)
  {
-       struct pci_controller *phb = data;
         const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL);
         const __be32 *regs;
         struct device_node *parent;
@@ -299,7 +295,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
                 return NULL;
         dn->data = pdn;
         pdn->node = dn;
-       pdn->phb = phb;
+       pdn->phb = hose;
  #ifdef CONFIG_PPC_POWERNV
         pdn->pe_number = IODA_INVALID_PE;
  #endif
@@ -331,8 +327,32 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
         if (pdn->parent)
                 list_add_tail(&pdn->list, &pdn->parent->child_list);
  
-       return NULL;
+       return pdn;
  }
+EXPORT_SYMBOL_GPL(pci_add_device_node_info);
+
+void pci_remove_device_node_info(struct device_node *dn)
+{
+       struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
+#ifdef CONFIG_EEH
+       struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+       if (edev)
+               edev->pdn = NULL;
+#endif
+
+       if (!pdn)
+               return;
+
+       WARN_ON(!list_empty(&pdn->child_list));
+       list_del(&pdn->list);
+       if (pdn->parent)
+               of_node_put(pdn->parent->node);
+
+       dn->data = NULL;
+       kfree(pdn);
+}
+EXPORT_SYMBOL_GPL(pci_remove_device_node_info);
  
  /*
   * Traverse a device tree stopping each PCI device in the tree.
@@ -352,8 +372,9 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
   * one of these nodes we also assume its siblings are non-pci for
   * performance.
   */
-void *traverse_pci_devices(struct device_node *start, traverse_func pre,
-               void *data)
+void *pci_traverse_device_nodes(struct device_node *start,
+                               void *(*fn)(struct device_node *, void *),
+                               void *data)
  {
         struct device_node *dn, *nextdn;
         void *ret;
@@ -368,8 +389,11 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre,
                 if (classp)
                         class = of_read_number(classp, 1);
  
-               if (pre && ((ret = pre(dn, data)) != NULL))
-                       return ret;
+               if (fn) {
+                       ret = fn(dn, data);
+                       if (ret)
+                               return ret;
+               }
  
                 /* If we are a PCI bridge, go down */
                 if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI ||
@@ -391,6 +415,7 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre,
         }
         return NULL;
  }
+EXPORT_SYMBOL_GPL(pci_traverse_device_nodes);
  
  static struct pci_dn *pci_dn_next_one(struct pci_dn *root,
                                       struct pci_dn *pdn)
@@ -432,6 +457,18 @@ void *traverse_pci_dn(struct pci_dn *root,
         return NULL;
  }
  
+static void *add_pdn(struct device_node *dn, void *data)
+{
+       struct pci_controller *hose = data;
+       struct pci_dn *pdn;
+
+       pdn = pci_add_device_node_info(hose, dn);
+       if (!pdn)
+               return ERR_PTR(-ENOMEM);
+
+       return NULL;
+}
+
  /** 
   * pci_devs_phb_init_dynamic - setup pci devices under this PHB
   * phb: pci-to-host bridge (top-level bridge connecting to cpu)
@@ -446,8 +483,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
         struct pci_dn *pdn;
  
         /* PHB nodes themselves must not match */
-       update_dn_pci_info(dn, phb);
-       pdn = dn->data;
+       pdn = pci_add_device_node_info(phb, dn);
         if (pdn) {
                 pdn->devfn = pdn->busno = -1;
                 pdn->vendor_id = pdn->device_id = pdn->class_code = 0;
@@ -456,7 +492,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
         }
  
         /* Update dn->phb ptrs for new phb and children devices */
-       traverse_pci_devices(dn, update_dn_pci_info, phb);
+       pci_traverse_device_nodes(dn, add_pdn, phb);
  }
  
  /** 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c

index 2a9280b945e02bfe45dc4f7fd466fed678d42d43..ea8a28fd6f31e544b3ab15e7b695f5ad42a2fe2e 100644 (file)
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -38,6 +38,7 @@
  #include <linux/random.h>
  #include <linux/hw_breakpoint.h>
  #include <linux/uaccess.h>
+#include <linux/elf-randomize.h>
  
  #include <asm/pgtable.h>
  #include <asm/io.h>
@@ -55,6 +56,7 @@
  #include <asm/firmware.h>
  #endif
  #include <asm/code-patching.h>
+#include <asm/exec.h>
  #include <asm/livepatch.h>
  
  #include <linux/kprobes.h>
@@ -1077,7 +1079,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
         }
  #endif /* CONFIG_PPC64 */
  
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
         batch = this_cpu_ptr(&ppc64_tlb_batch);
         if (batch->active) {
                 current_thread_info()->local_flags |= _TLF_LAZY_MMU;
@@ -1085,7 +1087,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
                         __flush_tlb_pending(batch);
                 batch->active = 0;
         }
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
  #ifdef CONFIG_PPC_ADV_DEBUG_REGS
         switch_booke_debug_regs(&new->thread.debug);
@@ -1131,7 +1133,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
  
         last = _switch(old_thread, new_thread);
  
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
         if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
                 current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
                 batch = this_cpu_ptr(&ppc64_tlb_batch);
@@ -1140,8 +1142,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
  
         if (current_thread_info()->task->thread.regs)
                 restore_math(current_thread_info()->task->thread.regs);
-
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
         return last;
  }
@@ -1376,6 +1377,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
         unsigned long sp_vsid;
         unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
  
+       if (radix_enabled())
+               return;
+
         if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
                 sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T)
                         << SLB_VSID_SHIFT_1T;
@@ -1924,7 +1928,8 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
          * the heap, we can put it above 1TB so it is backed by a 1TB
          * segment. Otherwise the heap will be in the bottom 1TB
          * which always uses 256MB segments and this may result in a
-        * performance penalty.
+        * performance penalty. We don't need to worry about radix. For
+        * radix, mmu_highuser_ssize remains unchanged from 256MB.
          */
         if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T))
                 base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T);
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c

index a15fe1d4e84aec9955622b603823fe44633da98d..946e34ffeae9f82c570f15fd54c87593c5984916 100644 (file)
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -34,6 +34,7 @@
  #include <linux/of.h>
  #include <linux/of_fdt.h>
  #include <linux/libfdt.h>
+#include <linux/cpu.h>
  
  #include <asm/prom.h>
  #include <asm/rtas.h>
@@ -167,6 +168,7 @@ static struct ibm_pa_feature {
          */
         {CPU_FTR_TM_COMP, 0, 0,
          PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0},
+       {0, MMU_FTR_RADIX, 0, 0,                40, 0, 0},
  };
  
  static void __init scan_features(unsigned long node, const unsigned char *ftrs,
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c

index aa610ce8742fe6bb0d3875e0a1d69c37d6741e63..c638e2487a9c77a5fb31f354c390502efa74752d 100644 (file)
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -442,7 +442,7 @@ static void do_event_scan(void)
  }
  
  static void rtas_event_scan(struct work_struct *w);
-DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
+static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
  
  /*
   * Delay should be at least one second since some machines have problems if
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c

index 44c8d03558ac4ef538964c715ae3ed0c800e1d21..8ca79b7503d8a01e83689f562972e9ba05f2d0c9 100644 (file)
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -128,9 +128,7 @@ void machine_restart(char *cmd)
         machine_shutdown();
         if (ppc_md.restart)
                 ppc_md.restart(cmd);
-#ifdef CONFIG_SMP
         smp_send_stop();
-#endif
         printk(KERN_EMERG "System Halted, OK to turn off power\n");
         local_irq_disable();
         while (1) ;
@@ -141,9 +139,7 @@ void machine_power_off(void)
         machine_shutdown();
         if (pm_power_off)
                 pm_power_off();
-#ifdef CONFIG_SMP
         smp_send_stop();
-#endif
         printk(KERN_EMERG "System Halted, OK to turn off power\n");
         local_irq_disable();
         while (1) ;
@@ -159,9 +155,7 @@ void machine_halt(void)
         machine_shutdown();
         if (ppc_md.halt)
                 ppc_md.halt();
-#ifdef CONFIG_SMP
         smp_send_stop();
-#endif
         printk(KERN_EMERG "System Halted, OK to turn off power\n");
         local_irq_disable();
         while (1) ;
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c

index 6669b17525129f8c39ab7b9de8b89fb38adb4354..6ae9bd5086a4a44bdb4f058f63264af2714068bc 100644 (file)
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -31,6 +31,6 @@ void save_processor_state(void)
  void restore_processor_state(void)
  {
  #ifdef CONFIG_PPC32
-       switch_mmu_context(current->active_mm, current->active_mm);
+       switch_mmu_context(current->active_mm, current->active_mm, NULL);
  #endif
  }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c

index 81b0900a39eef095a917a4471f4b09753c988ede..3ed9a5a21d77997fa96f3cb39dde621701c30f52 100644 (file)
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -55,6 +55,7 @@
  #include <linux/delay.h>
  #include <linux/irq_work.h>
  #include <linux/clk-provider.h>
+#include <linux/suspend.h>
  #include <asm/trace.h>
  
  #include <asm/io.h>
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c

index 5f8dcdaa2820167496b5878f9ab883585922a2cf..8d7358f3a27300c4f29e7673f164757282bb1737 100644 (file)
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -87,7 +87,7 @@ struct vio_cmo_dev_entry {
   * @curr: bytes currently allocated
   * @high: high water mark for IO data usage
   */
-struct vio_cmo {
+static struct vio_cmo {
         spinlock_t lock;
         struct delayed_work balance_q;
         struct list_head device_list;
@@ -615,7 +615,7 @@ static u64 vio_dma_get_required_mask(struct device *dev)
          return dma_iommu_ops.get_required_mask(dev);
  }
  
-struct dma_map_ops vio_dma_mapping_ops = {
+static struct dma_map_ops vio_dma_mapping_ops = {
         .alloc             = vio_dma_iommu_alloc_coherent,
         .free              = vio_dma_iommu_free_coherent,
         .mmap              = dma_direct_mmap_coherent,
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c

index c7b78d8336b26d18873231225a1ed6c0bcdde3bc..05f09ae825876cfd2a6b57d10edfd8b760f0c501 100644 (file)
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -447,7 +447,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
         struct revmap_entry *rev;
         struct page *page, *pages[1];
         long index, ret, npages;
-       unsigned long is_io;
+       bool is_ci;
         unsigned int writing, write_ok;
         struct vm_area_struct *vma;
         unsigned long rcbits;
@@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
         smp_rmb();
  
         ret = -EFAULT;
-       is_io = 0;
+       is_ci = false;
         pfn = 0;
         page = NULL;
         pte_size = PAGE_SIZE;
@@ -521,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         pfn = vma->vm_pgoff +
                                 ((hva - vma->vm_start) >> PAGE_SHIFT);
                         pte_size = psize;
-                       is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+                       is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
                         write_ok = vma->vm_flags & VM_WRITE;
                 }
                 up_read(&current->mm->mmap_sem);
@@ -558,10 +558,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                 goto out_put;
  
         /* Check WIMG vs. the actual page we're accessing */
-       if (!hpte_cache_flags_ok(r, is_io)) {
-               if (is_io)
+       if (!hpte_cache_flags_ok(r, is_ci)) {
+               if (is_ci)
                         goto out_put;
-
                 /*
                  * Allow guest to map emulated device memory as
                  * uncacheable, but actually make it cacheable.
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c

index 93243554cae9188390b763bf59541d3c6b0d6fd8..e20beae5ca7a462d9f1cfb7211b9a84592383ddf 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3272,6 +3272,12 @@ static int kvmppc_core_check_processor_compat_hv(void)
         if (!cpu_has_feature(CPU_FTR_HVMODE) ||
             !cpu_has_feature(CPU_FTR_ARCH_206))
                 return -EIO;
+       /*
+        * Disable KVM for Power9 until the required bits are merged.
+        */
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               return -EIO;
+
         return 0;
  }
  
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c

index 4cb8db05f3e55007a1ee54feaf6864d841020e9c..99b4e9d5dd238c70b795db5e9fedbaa7741fc753 100644 (file)
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -175,7 +175,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
         unsigned long g_ptel;
         struct kvm_memory_slot *memslot;
         unsigned hpage_shift;
-       unsigned long is_io;
+       bool is_ci;
         unsigned long *rmap;
         pte_t *ptep;
         unsigned int writing;
@@ -199,7 +199,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
         gfn = gpa >> PAGE_SHIFT;
         memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
         pa = 0;
-       is_io = ~0ul;
+       is_ci = false;
         rmap = NULL;
         if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
                 /* Emulated MMIO - mark this with key=31 */
@@ -250,7 +250,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                         if (writing && !pte_write(pte))
                                 /* make the actual HPTE be read-only */
                                 ptel = hpte_make_readonly(ptel);
-                       is_io = hpte_cache_bits(pte_val(pte));
+                       is_ci = pte_ci(pte);
                         pa = pte_pfn(pte) << PAGE_SHIFT;
                         pa |= hva & (host_pte_size - 1);
                         pa |= gpa & ~PAGE_MASK;
@@ -267,9 +267,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
         else
                 pteh |= HPTE_V_ABSENT;
  
-       /* Check WIMG */
-       if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
-               if (is_io)
+       /* If we had a host PTE mapping, then check WIMG */
+       if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
+               if (is_ci)
                         return H_PARAMETER;
                 /*
                  * Allow guest to map emulated device memory as
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c

index 8129b0db131edfd1532bbd487097c43974baf04a..8e4f64f0b7741d60e90f204c8265d1729bfa1788 100644 (file)
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1713,7 +1713,11 @@ static void kvmppc_core_destroy_vm_pr(struct kvm *kvm)
  
  static int kvmppc_core_check_processor_compat_pr(void)
  {
-       /* we are always compatible */
+       /*
+        * Disable KVM for Power9 until the required bits are merged.
+        */
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               return -EIO;
         return 0;
  }
  
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S

index c44df2dbedd52f5a2726443afe5a9296326b3778..99f37f24185ca890127b3e2a1ebed7ed14f4ca55 100644 (file)
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -217,7 +217,7 @@ _GLOBAL(memcpy)
         bdnz    40b
  65:    blr
  
-_GLOBAL(generic_memcpy)
+generic_memcpy:
         srwi.   r7,r5,3
         addi    r6,r3,-4
         addi    r4,r4,-4
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c

index dc885b30f7a6e0f600dc5694521dfdb11b248f93..3362299b185916dcbf451a06eca040491d317173 100644 (file)
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
                         }
                 }
  #endif
+       break; /* illegal instruction */
  
         case 31:
                 switch ((instr >> 1) & 0x3ff) {
@@ -1818,9 +1819,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
                 case 4:
                         __get_user_asmx(val, op.ea, err, "lwarx");
                         break;
+#ifdef __powerpc64__
                 case 8:
                         __get_user_asmx(val, op.ea, err, "ldarx");
                         break;
+#endif
                 default:
                         return 0;
                 }
@@ -1841,9 +1844,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
                 case 4:
                         __put_user_asmx(op.val, op.ea, err, "stwcx.", cr);
                         break;
+#ifdef __powerpc64__
                 case 8:
                         __put_user_asmx(op.val, op.ea, err, "stdcx.", cr);
                         break;
+#endif
                 default:
                         return 0;
                 }
diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c

index 07f49f1568e5eacccf5ea19929d572ff2eb68cfc..f9de69a04e88d85fc707231e5eb8641384cc1b08 100644 (file)
--- a/arch/powerpc/lib/xor_vmx.c
+++ b/arch/powerpc/lib/xor_vmx.c
@@ -17,7 +17,17 @@
   *
   * Author: Anton Blanchard <anton@au.ibm.com>
   */
+
+/*
+ * Sparse (as at v0.5.0) gets very, very confused by this file.
+ * Make it a bit simpler for it.
+ */
+#if !defined(__CHECKER__)
  #include <altivec.h>
+#else
+#define vec_xor(a, b) a ^ b
+#define vector __attribute__((vector_size(16)))
+#endif
  
  #include <linux/preempt.h>
  #include <linux/export.h>
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile

index adfee3f1aeb9e7d7b8c47a4366a3a1dc05a5224f..f2cea6d5e764b845731cdd168a7194815b7624cf 100644 (file)
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,10 +13,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH)        += mmu_context_nohash.o tlb_nohash.o \
                                    tlb_nohash_low.o
  obj-$(CONFIG_PPC_BOOK3E)       += tlb_low_$(CONFIG_WORD_SIZE)e.o
  hash64-$(CONFIG_PPC_NATIVE)    := hash_native_64.o
-obj-$(CONFIG_PPC_STD_MMU_64)   += hash_utils_64.o slb_low.o slb.o $(hash64-y)
-obj-$(CONFIG_PPC_STD_MMU_32)   += ppc_mmu_32.o hash_low_32.o
-obj-$(CONFIG_PPC_STD_MMU)      += tlb_hash$(CONFIG_WORD_SIZE).o \
-                                  mmu_context_hash$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_BOOK3E_64)   += pgtable-book3e.o
+obj-$(CONFIG_PPC_STD_MMU_64)   += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_RADIX_MMU)    += pgtable-radix.o tlb-radix.o
+obj-$(CONFIG_PPC_STD_MMU_32)   += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
+obj-$(CONFIG_PPC_STD_MMU)      += tlb_hash$(CONFIG_WORD_SIZE).o
  ifeq ($(CONFIG_PPC_STD_MMU_64),y)
  obj-$(CONFIG_PPC_4K_PAGES)     += hash64_4k.o
  obj-$(CONFIG_PPC_64K_PAGES)    += hash64_64k.o
@@ -33,6 +34,7 @@ obj-$(CONFIG_PPC_MM_SLICES)   += slice.o
  obj-y                          += hugetlbpage.o
  ifeq ($(CONFIG_HUGETLB_PAGE),y)
  obj-$(CONFIG_PPC_STD_MMU_64)   += hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_RADIX_MMU)    += hugetlbpage-radix.o
  obj-$(CONFIG_PPC_BOOK3E_MMU)   += hugetlbpage-book3e.o
  endif
  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c

index a1b2713f6e96aa96424519d9fe3d109c160154bd..139dec421e57a578b89eb7140cca17028a1276cc 100644 (file)
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -135,7 +135,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
                 TLBCAM[index].MAS7 = (u64)phys >> 32;
  
         /* Below is unlikely -- only for large user pages or similar */
-       if (pte_user(flags)) {
+       if (pte_user(__pte(flags))) {
            TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
            TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
         }
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c

index 47d1b26effc6a71115d371599590496fc2dd5b96..6333b273d2d59423dee27dab0fea8b6e5274b65a 100644 (file)
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -34,21 +34,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
  
                 old_pte = pte_val(pte);
                 /* If PTE busy, retry the access */
-               if (unlikely(old_pte & _PAGE_BUSY))
+               if (unlikely(old_pte & H_PAGE_BUSY))
                         return 0;
                 /* If PTE permissions don't match, take page fault */
-               if (unlikely(access & ~old_pte))
+               if (unlikely(!check_pte_access(access, old_pte)))
                         return 1;
                 /*
                  * Try to lock the PTE, add ACCESSED and DIRTY if it was
                  * a write access. Since this is 4K insert of 64K page size
-                * also add _PAGE_COMBO
+                * also add H_PAGE_COMBO
                  */
-               new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_RW)
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
                         new_pte |= _PAGE_DIRTY;
-       } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
-                                         old_pte, new_pte));
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
         /*
          * PP bits. _PAGE_USER is already PP bit 0x2, so we only
          * need to add in 0x1 if it's a read-only user page
@@ -60,22 +60,22 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
                 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
  
         vpn  = hpt_vpn(ea, vsid, ssize);
-       if (unlikely(old_pte & _PAGE_HASHPTE)) {
+       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
                 /*
                  * There MIGHT be an HPTE for this pte
                  */
                 hash = hpt_hash(vpn, shift, ssize);
-               if (old_pte & _PAGE_F_SECOND)
+               if (old_pte & H_PAGE_F_SECOND)
                         hash = ~hash;
                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+               slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
  
                 if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K,
                                          MMU_PAGE_4K, ssize, flags) == -1)
                         old_pte &= ~_PAGE_HPTEFLAGS;
         }
  
-       if (likely(!(old_pte & _PAGE_HASHPTE))) {
+       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
  
                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
                 hash = hpt_hash(vpn, shift, ssize);
@@ -115,9 +115,10 @@ repeat:
                                            MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
                         return -1;
                 }
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-               new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+               new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+                       (H_PAGE_F_SECOND | H_PAGE_F_GIX);
         }
-       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
         return 0;
  }
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c

index b2d659cf51c664f1379b8748feaa72146a146590..16644e1f4e6bb1cd858cd94126fafe4458ff7b46 100644 (file)
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -23,7 +23,7 @@ bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
         unsigned long g_idx;
         unsigned long ptev = pte_val(rpte.pte);
  
-       g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT;
+       g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT;
         index = index >> 2;
         if (g_idx & (0x1 << index))
                 return true;
@@ -37,12 +37,12 @@ static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long in
  {
         unsigned long g_idx;
  
-       if (!(ptev & _PAGE_COMBO))
+       if (!(ptev & H_PAGE_COMBO))
                 return ptev;
         index = index >> 2;
         g_idx = 0x1 << index;
  
-       return ptev | (g_idx << _PAGE_F_GIX_SHIFT);
+       return ptev | (g_idx << H_PAGE_F_GIX_SHIFT);
  }
  
  int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
@@ -66,21 +66,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
  
                 old_pte = pte_val(pte);
                 /* If PTE busy, retry the access */
-               if (unlikely(old_pte & _PAGE_BUSY))
+               if (unlikely(old_pte & H_PAGE_BUSY))
                         return 0;
                 /* If PTE permissions don't match, take page fault */
-               if (unlikely(access & ~old_pte))
+               if (unlikely(!check_pte_access(access, old_pte)))
                         return 1;
                 /*
                  * Try to lock the PTE, add ACCESSED and DIRTY if it was
                  * a write access. Since this is 4K insert of 64K page size
-                * also add _PAGE_COMBO
+                * also add H_PAGE_COMBO
                  */
-               new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO;
-               if (access & _PAGE_RW)
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
+               if (access & _PAGE_WRITE)
                         new_pte |= _PAGE_DIRTY;
-       } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
-                                         old_pte, new_pte));
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
         /*
          * Handle the subpage protection bits
          */
@@ -103,21 +103,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
         /*
          *None of the sub 4k page is hashed
          */
-       if (!(old_pte & _PAGE_HASHPTE))
+       if (!(old_pte & H_PAGE_HASHPTE))
                 goto htab_insert_hpte;
         /*
          * Check if the pte was already inserted into the hash table
          * as a 64k HW page, and invalidate the 64k HPTE if so.
          */
-       if (!(old_pte & _PAGE_COMBO)) {
+       if (!(old_pte & H_PAGE_COMBO)) {
                 flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
                 /*
                  * clear the old slot details from the old and new pte.
                  * On hash insert failure we use old pte value and we don't
                  * want slot information there if we have a insert failure.
                  */
-               old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
-               new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
+               old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
+               new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
                 goto htab_insert_hpte;
         }
         /*
@@ -143,15 +143,15 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
                 if (ret == -1)
                         goto htab_insert_hpte;
  
-               *ptep = __pte(new_pte & ~_PAGE_BUSY);
+               *ptep = __pte(new_pte & ~H_PAGE_BUSY);
                 return 0;
         }
  
  htab_insert_hpte:
         /*
-        * handle _PAGE_4K_PFN case
+        * handle H_PAGE_4K_PFN case
          */
-       if (old_pte & _PAGE_4K_PFN) {
+       if (old_pte & H_PAGE_4K_PFN) {
                 /*
                  * All the sub 4k page have the same
                  * physical address.
@@ -199,20 +199,20 @@ repeat:
         }
         /*
          * Insert slot number & secondary bit in PTE second half,
-        * clear _PAGE_BUSY and set appropriate HPTE slot bit
-        * Since we have _PAGE_BUSY set on ptep, we can be sure
+        * clear H_PAGE_BUSY and set appropriate HPTE slot bit
+        * Since we have H_PAGE_BUSY set on ptep, we can be sure
          * nobody is undating hidx.
          */
         hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
         rpte.hidx &= ~(0xfUL << (subpg_index << 2));
         *hidxp = rpte.hidx  | (slot << (subpg_index << 2));
         new_pte = mark_subptegroup_valid(new_pte, subpg_index);
-       new_pte |=  _PAGE_HASHPTE;
+       new_pte |=  H_PAGE_HASHPTE;
         /*
          * check __real_pte for details on matching smp_rmb()
          */
         smp_wmb();
-       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
         return 0;
  }
  
@@ -220,7 +220,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
                     unsigned long vsid, pte_t *ptep, unsigned long trap,
                     unsigned long flags, int ssize)
  {
-
         unsigned long hpte_group;
         unsigned long rflags, pa;
         unsigned long old_pte, new_pte;
@@ -235,27 +234,26 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
  
                 old_pte = pte_val(pte);
                 /* If PTE busy, retry the access */
-               if (unlikely(old_pte & _PAGE_BUSY))
+               if (unlikely(old_pte & H_PAGE_BUSY))
                         return 0;
                 /* If PTE permissions don't match, take page fault */
-               if (unlikely(access & ~old_pte))
+               if (unlikely(!check_pte_access(access, old_pte)))
                         return 1;
                 /*
                  * Check if PTE has the cache-inhibit bit set
                  * If so, bail out and refault as a 4k page
                  */
                 if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
-                   unlikely(old_pte & _PAGE_NO_CACHE))
+                   unlikely(pte_ci(pte)))
                         return 0;
                 /*
                  * Try to lock the PTE, add ACCESSED and DIRTY if it was
                  * a write access.
                  */
-               new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_RW)
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
                         new_pte |= _PAGE_DIRTY;
-       } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
-                                         old_pte, new_pte));
+       } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
  
         rflags = htab_convert_pte_flags(new_pte);
  
@@ -264,22 +262,22 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
                 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
  
         vpn  = hpt_vpn(ea, vsid, ssize);
-       if (unlikely(old_pte & _PAGE_HASHPTE)) {
+       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
                 /*
                  * There MIGHT be an HPTE for this pte
                  */
                 hash = hpt_hash(vpn, shift, ssize);
-               if (old_pte & _PAGE_F_SECOND)
+               if (old_pte & H_PAGE_F_SECOND)
                         hash = ~hash;
                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+               slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
  
                 if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
                                          MMU_PAGE_64K, ssize, flags) == -1)
                         old_pte &= ~_PAGE_HPTEFLAGS;
         }
  
-       if (likely(!(old_pte & _PAGE_HASHPTE))) {
+       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
  
                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
                 hash = hpt_hash(vpn, shift, ssize);
@@ -319,9 +317,10 @@ repeat:
                                            MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
                         return -1;
                 }
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-               new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+               new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+                       (H_PAGE_F_SECOND | H_PAGE_F_GIX);
         }
-       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
         return 0;
  }
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c

index 8eaac81347fdb43c8722884318bc176dba957940..d873f6507f7210fe4b9a7caa774861b9648ba5aa 100644 (file)
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -221,7 +221,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
                 return -1;
  
         hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+       hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
  
         if (!(vflags & HPTE_V_BOLTED)) {
                 DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
@@ -719,6 +719,12 @@ static void native_flush_hash_range(unsigned long number, int local)
         local_irq_restore(flags);
  }
  
+static int native_update_partition_table(u64 patb1)
+{
+       partition_tb->patb1 = cpu_to_be64(patb1);
+       return 0;
+}
+
  void __init hpte_init_native(void)
  {
         ppc_md.hpte_invalidate  = native_hpte_invalidate;
@@ -729,4 +735,7 @@ void __init hpte_init_native(void)
         ppc_md.hpte_clear_all   = native_hpte_clear;
         ppc_md.flush_hash_range = native_flush_hash_range;
         ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               ppc_md.update_partition_table = native_update_partition_table;
  }
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c

index 7635b1c6b5dacfd04793315c31069a42a425e402..59268969a0bc7b0d02e8c10b28f5b214f6209006 100644 (file)
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -167,16 +167,22 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
         if ((pteflags & _PAGE_EXEC) == 0)
                 rflags |= HPTE_R_N;
         /*
-        * PP bits:
+        * PPP bits:
          * Linux uses slb key 0 for kernel and 1 for user.
-        * kernel areas are mapped with PP=00
-        * and there is no kernel RO (_PAGE_KERNEL_RO).
-        * User area is mapped with PP=0x2 for read/write
-        * or PP=0x3 for read-only (including writeable but clean pages).
+        * kernel RW areas are mapped with PPP=0b000
+        * User area is mapped with PPP=0b010 for read/write
+        * or PPP=0b011 for read-only (including writeable but clean pages).
          */
-       if (pteflags & _PAGE_USER) {
-               rflags |= 0x2;
-               if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY)))
+       if (pteflags & _PAGE_PRIVILEGED) {
+               /*
+                * Kernel read only mapped with ppp bits 0b110
+                */
+               if (!(pteflags & _PAGE_WRITE))
+                       rflags |= (HPTE_R_PP0 | 0x2);
+       } else {
+               if (pteflags & _PAGE_RWX)
+                       rflags |= 0x2;
+               if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
                         rflags |= 0x1;
         }
         /*
@@ -186,12 +192,13 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
         /*
          * Add in WIG bits
          */
-       if (pteflags & _PAGE_WRITETHRU)
-               rflags |= HPTE_R_W;
-       if (pteflags & _PAGE_NO_CACHE)
+
+       if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
                 rflags |= HPTE_R_I;
-       if (pteflags & _PAGE_GUARDED)
-               rflags |= HPTE_R_G;
+       if ((pteflags & _PAGE_CACHE_CTL ) == _PAGE_NON_IDEMPOTENT)
+               rflags |= (HPTE_R_I | HPTE_R_G);
+       if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
+               rflags |= (HPTE_R_I | HPTE_R_W);
  
         return rflags;
  }
@@ -669,6 +676,41 @@ int remove_section_mapping(unsigned long start, unsigned long end)
  }
  #endif /* CONFIG_MEMORY_HOTPLUG */
  
+static void __init hash_init_partition_table(phys_addr_t hash_table,
+                                            unsigned long pteg_count)
+{
+       unsigned long ps_field;
+       unsigned long htab_size;
+       unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+
+       /*
+        * slb llp encoding for the page size used in VPM real mode.
+        * We can ignore that for lpid 0
+        */
+       ps_field = 0;
+       htab_size =  __ilog2(pteg_count) - 11;
+
+       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
+       partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+                                               MEMBLOCK_ALLOC_ANYWHERE));
+
+       /* Initialize the Partition Table with no entries */
+       memset((void *)partition_tb, 0, patb_size);
+       partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size);
+       /*
+        * FIXME!! This should be done via update_partition_table
+        * For now UPRT is 0 for us.
+        */
+       partition_tb->patb1 = 0;
+       DBG("Partition table %p\n", partition_tb);
+       /*
+        * update partition table control register,
+        * 64 K size.
+        */
+       mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+
+}
+
  static void __init htab_initialize(void)
  {
         unsigned long table;
@@ -737,8 +779,11 @@ static void __init htab_initialize(void)
                 /* Initialize the HPT with no entries */
                 memset((void *)table, 0, htab_size_bytes);
  
-               /* Set SDR1 */
-               mtspr(SPRN_SDR1, _SDR1);
+               if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                       /* Set SDR1 */
+                       mtspr(SPRN_SDR1, _SDR1);
+               else
+                       hash_init_partition_table(table, pteg_count);
         }
  
         prot = pgprot_val(PAGE_KERNEL);
@@ -823,8 +868,38 @@ static void __init htab_initialize(void)
  #undef KB
  #undef MB
  
-void __init early_init_mmu(void)
+void __init hash__early_init_mmu(void)
  {
+       /*
+        * initialize page table size
+        */
+       __pte_frag_nr = H_PTE_FRAG_NR;
+       __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+
+       __pte_index_size = H_PTE_INDEX_SIZE;
+       __pmd_index_size = H_PMD_INDEX_SIZE;
+       __pud_index_size = H_PUD_INDEX_SIZE;
+       __pgd_index_size = H_PGD_INDEX_SIZE;
+       __pmd_cache_index = H_PMD_CACHE_INDEX;
+       __pte_table_size = H_PTE_TABLE_SIZE;
+       __pmd_table_size = H_PMD_TABLE_SIZE;
+       __pud_table_size = H_PUD_TABLE_SIZE;
+       __pgd_table_size = H_PGD_TABLE_SIZE;
+       /*
+        * 4k use hugepd format, so for hash set them to
+        * zero
+        */
+       __pmd_val_bits = 0;
+       __pud_val_bits = 0;
+       __pgd_val_bits = 0;
+
+       __kernel_virt_start = H_KERN_VIRT_START;
+       __kernel_virt_size = H_KERN_VIRT_SIZE;
+       __vmalloc_start = H_VMALLOC_START;
+       __vmalloc_end = H_VMALLOC_END;
+       vmemmap = (struct page *)H_VMEMMAP_BASE;
+       ioremap_bot = IOREMAP_BASE;
+
         /* Initialize the MMU Hash table and create the linear mapping
          * of memory. Has to be done before SLB initialization as this is
          * currently where the page size encoding is obtained.
@@ -836,12 +911,16 @@ void __init early_init_mmu(void)
  }
  
  #ifdef CONFIG_SMP
-void early_init_mmu_secondary(void)
+void hash__early_init_mmu_secondary(void)
  {
         /* Initialize hash table for that CPU */
-       if (!firmware_has_feature(FW_FEATURE_LPAR))
-               mtspr(SPRN_SDR1, _SDR1);
-
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+               if (!cpu_has_feature(CPU_FTR_ARCH_300))
+                       mtspr(SPRN_SDR1, _SDR1);
+               else
+                       mtspr(SPRN_PTCR,
+                             __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+       }
         /* Initialize SLB */
         slb_initialize();
  }
@@ -920,7 +999,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
   * Userspace sets the subpage permissions using the subpage_prot system call.
   *
   * Result is 0: full permissions, _PAGE_RW: read-only,
- * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
+ * _PAGE_RWX: no access.
   */
  static int subpage_protection(struct mm_struct *mm, unsigned long ea)
  {
@@ -946,8 +1025,13 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea)
         /* extract 2-bit bitfield for this 4k subpage */
         spp >>= 30 - 2 * ((ea >> 12) & 0xf);
  
-       /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */
-       spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0);
+       /*
+        * 0 -> full permission
+        * 1 -> Read only
+        * 2, 3 -> no access.
+        * We return the flags that need to be cleared.
+        */
+       spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
         return spp;
  }
  
@@ -1084,7 +1168,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
         /* Pre-check access permissions (will be re-checked atomically
          * in __hash_page_XX but this pre-check is a fast path
          */
-       if (access & ~pte_val(*ptep)) {
+       if (!check_pte_access(access, pte_val(*ptep))) {
                 DBG_LOW(" no access !\n");
                 rc = 1;
                 goto bail;
@@ -1122,8 +1206,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
  #endif
         /* Do actual hashing */
  #ifdef CONFIG_PPC_64K_PAGES
-       /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
-       if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+       /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
+       if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
                 demote_segment_4k(mm, ea);
                 psize = MMU_PAGE_4K;
         }
@@ -1131,8 +1215,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
         /* If this PTE is non-cacheable and we have restrictions on
          * using non cacheable large pages, then we switch to 4k
          */
-       if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
-           (pte_val(*ptep) & _PAGE_NO_CACHE)) {
+       if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
                 if (user_region) {
                         demote_segment_4k(mm, ea);
                         psize = MMU_PAGE_4K;
@@ -1209,7 +1292,7 @@ EXPORT_SYMBOL_GPL(hash_page);
  int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
                 unsigned long dsisr)
  {
-       unsigned long access = _PAGE_PRESENT;
+       unsigned long access = _PAGE_PRESENT | _PAGE_READ;
         unsigned long flags = 0;
         struct mm_struct *mm = current->mm;
  
@@ -1220,14 +1303,18 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
                 flags |= HPTE_NOHPTE_UPDATE;
  
         if (dsisr & DSISR_ISSTORE)
-               access |= _PAGE_RW;
+               access |= _PAGE_WRITE;
         /*
-        * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
-        * accessing a userspace segment (even from the kernel). We assume
-        * kernel addresses always have the high bit set.
+        * We set _PAGE_PRIVILEGED only when
+        * kernel mode accesses kernel space.
+        *
+        * _PAGE_PRIVILEGED is NOT set
+        * 1) when kernel mode accesses user space
+        * 2) when user space accesses kernel space.
          */
+       access |= _PAGE_PRIVILEGED;
         if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID))
-               access |= _PAGE_USER;
+               access &= ~_PAGE_PRIVILEGED;
  
         if (trap == 0x400)
                 access |= _PAGE_EXEC;
@@ -1235,6 +1322,30 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
         return hash_page_mm(mm, ea, access, trap, flags);
  }
  
+#ifdef CONFIG_PPC_MM_SLICES
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+       int psize = get_slice_psize(mm, ea);
+
+       /* We only prefault standard pages for now */
+       if (unlikely(psize != mm->context.user_psize))
+               return false;
+
+       /*
+        * Don't prefault if subpage protection is enabled for the EA.
+        */
+       if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
+               return false;
+
+       return true;
+}
+#else
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+       return true;
+}
+#endif
+
  void hash_preload(struct mm_struct *mm, unsigned long ea,
                   unsigned long access, unsigned long trap)
  {
@@ -1247,11 +1358,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
  
         BUG_ON(REGION_ID(ea) != USER_REGION_ID);
  
-#ifdef CONFIG_PPC_MM_SLICES
-       /* We only prefault standard pages for now */
-       if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
+       if (!should_hash_preload(mm, ea))
                 return;
-#endif
  
         DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
                 " trap=%lx\n", mm, mm->pgd, ea, access, trap);
@@ -1282,13 +1390,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
  
         WARN_ON(hugepage_shift);
  #ifdef CONFIG_PPC_64K_PAGES
-       /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
+       /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
          * a 64K kernel), then we don't preload, hash_page() will take
          * care of it once we actually try to access the page.
          * That way we don't have to duplicate all of the logic for segment
          * page size demotion here
          */
-       if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
+       if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
                 goto out_exit;
  #endif /* CONFIG_PPC_64K_PAGES */
  
@@ -1570,7 +1678,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
  }
  #endif /* CONFIG_DEBUG_PAGEALLOC */
  
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
                                 phys_addr_t first_memblock_size)
  {
         /* We don't currently support the first MEMBLOCK not mapping 0
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c

index eb2accdd76fd8bb70a762c4a57df8bc2c425ffad..ba3fc229468a7384f36b7e6eacd50621d39cd93e 100644 (file)
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -37,20 +37,20 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
  
                 old_pmd = pmd_val(pmd);
                 /* If PMD busy, retry the access */
-               if (unlikely(old_pmd & _PAGE_BUSY))
+               if (unlikely(old_pmd & H_PAGE_BUSY))
                         return 0;
                 /* If PMD permissions don't match, take page fault */
-               if (unlikely(access & ~old_pmd))
+               if (unlikely(!check_pte_access(access, old_pmd)))
                         return 1;
                 /*
                  * Try to lock the PTE, add ACCESSED and DIRTY if it was
                  * a write access
                  */
-               new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_RW)
+               new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
                         new_pmd |= _PAGE_DIRTY;
-       } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
-                                         old_pmd, new_pmd));
+       } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+
         rflags = htab_convert_pte_flags(new_pmd);
  
  #if 0
@@ -78,7 +78,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                  * base page size. This is because demote_segment won't flush
                  * hash page table entries.
                  */
-               if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) {
+               if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
                         flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
                                             ssize, flags);
                         /*
@@ -125,7 +125,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                 hash = hpt_hash(vpn, shift, ssize);
                 /* insert new entry */
                 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
-               new_pmd |= _PAGE_HASHPTE;
+               new_pmd |= H_PAGE_HASHPTE;
  
  repeat:
                 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
@@ -169,17 +169,17 @@ repeat:
                 mark_hpte_slot_valid(hpte_slot_array, index, slot);
         }
         /*
-        * Mark the pte with _PAGE_COMBO, if we are trying to hash it with
+        * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
          * base page size 4k.
          */
         if (psize == MMU_PAGE_4K)
-               new_pmd |= _PAGE_COMBO;
+               new_pmd |= H_PAGE_COMBO;
         /*
          * The hpte valid is stored in the pgtable whose address is in the
          * second half of the PMD. Order this against clearing of the busy bit in
          * huge pmd.
          */
         smp_wmb();
-       *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+       *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
         return 0;
  }
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c

index 8555fce902fea574608b42d2bb240009e3fe8371..3058560b61213067e00e98799770bffef1775623 100644 (file)
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -47,18 +47,19 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
         do {
                 old_pte = pte_val(*ptep);
                 /* If PTE busy, retry the access */
-               if (unlikely(old_pte & _PAGE_BUSY))
+               if (unlikely(old_pte & H_PAGE_BUSY))
                         return 0;
                 /* If PTE permissions don't match, take page fault */
-               if (unlikely(access & ~old_pte))
+               if (unlikely(!check_pte_access(access, old_pte)))
                         return 1;
+
                 /* Try to lock the PTE, add ACCESSED and DIRTY if it was
                  * a write access */
-               new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-               if (access & _PAGE_RW)
+               new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+               if (access & _PAGE_WRITE)
                         new_pte |= _PAGE_DIRTY;
-       } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
-                                        old_pte, new_pte));
+       } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
         rflags = htab_convert_pte_flags(new_pte);
  
         sz = ((1UL) << shift);
@@ -68,28 +69,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
  
         /* Check if pte already has an hpte (case 2) */
-       if (unlikely(old_pte & _PAGE_HASHPTE)) {
+       if (unlikely(old_pte & H_PAGE_HASHPTE)) {
                 /* There MIGHT be an HPTE for this pte */
                 unsigned long hash, slot;
  
                 hash = hpt_hash(vpn, shift, ssize);
-               if (old_pte & _PAGE_F_SECOND)
+               if (old_pte & H_PAGE_F_SECOND)
                         hash = ~hash;
                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+               slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
  
                 if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
                                          mmu_psize, ssize, flags) == -1)
                         old_pte &= ~_PAGE_HPTEFLAGS;
         }
  
-       if (likely(!(old_pte & _PAGE_HASHPTE))) {
+       if (likely(!(old_pte & H_PAGE_HASHPTE))) {
                 unsigned long hash = hpt_hash(vpn, shift, ssize);
  
                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
  
                 /* clear HPTE slot informations in new PTE */
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
  
                 slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
                                              mmu_psize, ssize);
@@ -105,14 +106,14 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
                         return -1;
                 }
  
-               new_pte |= (slot << _PAGE_F_GIX_SHIFT) &
-                       (_PAGE_F_SECOND | _PAGE_F_GIX);
+               new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+                       (H_PAGE_F_SECOND | H_PAGE_F_GIX);
         }
  
         /*
          * No need to use ldarx/stdcx here
          */
-       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+       *ptep = __pte(new_pte & ~H_PAGE_BUSY);
         return 0;
  }
  
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c

new file mode 100644 (file)

index 0000000..1e11559
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -0,0 +1,87 @@
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+#include <asm/mman.h>
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+       unsigned long ap, shift;
+       struct hstate *hstate = hstate_file(vma->vm_file);
+
+       shift = huge_page_shift(hstate);
+       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+               ap = mmu_get_ap(MMU_PAGE_2M);
+       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
+               ap = mmu_get_ap(MMU_PAGE_1G);
+       else {
+               WARN(1, "Wrong huge page shift\n");
+               return ;
+       }
+       radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+       unsigned long ap, shift;
+       struct hstate *hstate = hstate_file(vma->vm_file);
+
+       shift = huge_page_shift(hstate);
+       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+               ap = mmu_get_ap(MMU_PAGE_2M);
+       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
+               ap = mmu_get_ap(MMU_PAGE_1G);
+       else {
+               WARN(1, "Wrong huge page shift\n");
+               return ;
+       }
+       radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0);
+}
+
+/*
+ * A variant of hugetlb_get_unmapped_area doing a topdown search
+ * FIXME!! should we do as x86 does or non hugetlb area does ?
+ * ie, use topdown or not based on mmap_is_legacy check ?
+ */
+unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+                               unsigned long len, unsigned long pgoff,
+                               unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       struct hstate *h = hstate_file(file);
+       struct vm_unmapped_area_info info;
+
+       if (len & ~huge_page_mask(h))
+               return -EINVAL;
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
+       if (flags & MAP_FIXED) {
+               if (prepare_hugepage_range(file, addr, len))
+                       return -EINVAL;
+               return addr;
+       }
+
+       if (addr) {
+               addr = ALIGN(addr, huge_page_size(h));
+               vma = find_vma(mm, addr);
+               if (TASK_SIZE - len >= addr &&
+                   (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+       /*
+        * We are always doing a topdown search here. Slice code
+        * does that too.
+        */
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = PAGE_SIZE;
+       info.high_limit = current->mm->mmap_base;
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       return vm_unmapped_area(&info);
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c

index d991b9e80dbbc75ef955c78c924c2e273228b70f..5aac1a3f86cde4471ba3460c3b08d543a43fd0e4 100644 (file)
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -711,6 +711,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         struct hstate *hstate = hstate_file(file);
         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
  
+       if (radix_enabled())
+               return radix__hugetlb_get_unmapped_area(file, addr, len,
+                                                      pgoff, flags);
         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
  }
  #endif
@@ -719,14 +722,14 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
  {
  #ifdef CONFIG_PPC_MM_SLICES
         unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
-
-       return 1UL << mmu_psize_to_shift(psize);
-#else
+       /* Radix doesn't use slices, so derive the page size from the vma */
+       if (!radix_enabled())
+               return 1UL << mmu_psize_to_shift(psize);
+#endif
         if (!is_vm_hugetlb_page(vma))
                 return PAGE_SIZE;
  
         return huge_page_size(hstate_vma(vma));
-#endif
  }
  
  static inline bool is_power_of_4(unsigned long x)
@@ -772,8 +775,10 @@ static int __init hugepage_setup_sz(char *str)
  
         size = memparse(str, &str);
  
-       if (add_huge_page_size(size) != 0)
-               printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+       if (add_huge_page_size(size) != 0) {
+               hugetlb_bad_size();
+               pr_err("Invalid huge page size specified(%llu)\n", size);
+       }
  
         return 1;
  }
@@ -823,7 +828,7 @@ static int __init hugetlbpage_init(void)
  {
         int psize;
  
-       if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+       if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
                 return -ENODEV;
  
         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
@@ -863,6 +868,9 @@ static int __init hugetlbpage_init(void)
                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
         else if (mmu_psize_defs[MMU_PAGE_1M].shift)
                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+       else if (mmu_psize_defs[MMU_PAGE_2M].shift)
+               HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
+
  
         return 0;
  }
@@ -1003,9 +1011,9 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                 end = pte_end;
  
         pte = READ_ONCE(*ptep);
-       mask = _PAGE_PRESENT | _PAGE_USER;
+       mask = _PAGE_PRESENT | _PAGE_READ;
         if (write)
-               mask |= _PAGE_RW;
+               mask |= _PAGE_WRITE;
  
         if ((pte_val(pte) & mask) != mask)
                 return 0;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c

index ba655666186d9caab2cd2c68c2570e8e9f8f1d45..33709bdb04196ae3dfac4cb3f7463db2210a3449 100644 (file)
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -66,11 +66,11 @@
  #include "mmu_decl.h"
  
  #ifdef CONFIG_PPC_STD_MMU_64
-#if PGTABLE_RANGE > USER_VSID_RANGE
+#if H_PGTABLE_RANGE > USER_VSID_RANGE
  #warning Limited user VSID range means pagetable space is wasted
  #endif
  
-#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
  #warning TASK_SIZE is smaller than it needs to be.
  #endif
  #endif /* CONFIG_PPC_STD_MMU_64 */
@@ -189,75 +189,6 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
         return 0;
  }
  
-/* On hash-based CPUs, the vmemmap is bolted in the hash table.
- *
- * On Book3E CPUs, the vmemmap is currently mapped in the top half of
- * the vmalloc space using normal page tables, though the size of
- * pages encoded in the PTEs can be different
- */
-
-#ifdef CONFIG_PPC_BOOK3E
-static int __meminit vmemmap_create_mapping(unsigned long start,
-                                           unsigned long page_size,
-                                           unsigned long phys)
-{
-       /* Create a PTE encoding without page size */
-       unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
-               _PAGE_KERNEL_RW;
-
-       /* PTEs only contain page size encodings up to 32M */
-       BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
-
-       /* Encode the size in the PTE */
-       flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
-
-       /* For each PTE for that area, map things. Note that we don't
-        * increment phys because all PTEs are of the large size and
-        * thus must have the low bits clear
-        */
-       for (i = 0; i < page_size; i += PAGE_SIZE)
-               BUG_ON(map_kernel_page(start + i, phys, flags));
-
-       return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void vmemmap_remove_mapping(unsigned long start,
-                                  unsigned long page_size)
-{
-}
-#endif
-#else /* CONFIG_PPC_BOOK3E */
-static int __meminit vmemmap_create_mapping(unsigned long start,
-                                           unsigned long page_size,
-                                           unsigned long phys)
-{
-       int rc = htab_bolt_mapping(start, start + page_size, phys,
-                                  pgprot_val(PAGE_KERNEL),
-                                  mmu_vmemmap_psize, mmu_kernel_ssize);
-       if (rc < 0) {
-               int rc2 = htab_remove_mapping(start, start + page_size,
-                                             mmu_vmemmap_psize,
-                                             mmu_kernel_ssize);
-               BUG_ON(rc2 && (rc2 != -ENOENT));
-       }
-       return rc;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void vmemmap_remove_mapping(unsigned long start,
-                                  unsigned long page_size)
-{
-       int rc = htab_remove_mapping(start, start + page_size,
-                                    mmu_vmemmap_psize,
-                                    mmu_kernel_ssize);
-       BUG_ON((rc < 0) && (rc != -ENOENT));
-       WARN_ON(rc == -ENOENT);
-}
-#endif
-
-#endif /* CONFIG_PPC_BOOK3E */
-
  struct vmemmap_backing *vmemmap_list;
  static struct vmemmap_backing *next;
  static int num_left;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c

index ac79dbde1015ba090e30718c77759c4ec10498ed..2fd57fa48429bd60e0c0605dee812ae9870c41ed 100644 (file)
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -68,12 +68,15 @@ pte_t *kmap_pte;
  EXPORT_SYMBOL(kmap_pte);
  pgprot_t kmap_prot;
  EXPORT_SYMBOL(kmap_prot);
+#define TOP_ZONE ZONE_HIGHMEM
  
  static inline pte_t *virt_to_kpte(unsigned long vaddr)
  {
         return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
                         vaddr), vaddr), vaddr);
  }
+#else
+#define TOP_ZONE ZONE_NORMAL
  #endif
  
  int page_is_ram(unsigned long pfn)
@@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit)
   */
  int dma_pfn_limit_to_zone(u64 pfn_limit)
  {
-       enum zone_type top_zone = ZONE_NORMAL;
         int i;
  
-#ifdef CONFIG_HIGHMEM
-       top_zone = ZONE_HIGHMEM;
-#endif
-
-       for (i = top_zone; i >= 0; i--) {
+       for (i = TOP_ZONE; i >= 0; i--) {
                 if (max_zone_pfns[i] <= pfn_limit)
                         return i;
         }
@@ -289,7 +287,6 @@ void __init paging_init(void)
  {
         unsigned long long total_ram = memblock_phys_mem_size();
         phys_addr_t top_of_ram = memblock_end_of_DRAM();
-       enum zone_type top_zone;
  
  #ifdef CONFIG_PPC32
         unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
@@ -313,13 +310,9 @@ void __init paging_init(void)
                (long int)((top_of_ram - total_ram) >> 20));
  
  #ifdef CONFIG_HIGHMEM
-       top_zone = ZONE_HIGHMEM;
         limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
-#else
-       top_zone = ZONE_NORMAL;
  #endif
-
-       limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT);
+       limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT);
         zone_limits_final = true;
         free_area_init_nodes(max_zone_pfns);
  
@@ -498,7 +491,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
          * We don't need to worry about _PAGE_PRESENT here because we are
          * called with either mm->page_table_lock held or ptl lock held
          */
-       unsigned long access = 0, trap;
+       unsigned long access, trap;
+
+       if (radix_enabled())
+               return;
  
         /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
         if (!pte_young(*ptep) || address >= TASK_SIZE)
@@ -511,13 +507,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
          *
          * We also avoid filling the hash if not coming from a fault
          */
-       if (current->thread.regs == NULL)
-               return;
-       trap = TRAP(current->thread.regs);
-       if (trap == 0x400)
-               access |= _PAGE_EXEC;
-       else if (trap != 0x300)
+
+       trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+       switch (trap) {
+       case 0x300:
+               access = 0UL;
+               break;
+       case 0x400:
+               access = _PAGE_EXEC;
+               break;
+       default:
                 return;
+       }
+
         hash_preload(vma->vm_mm, address, access, trap);
  #endif /* CONFIG_PPC_STD_MMU */
  #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c

index 4087705ba90f34241200e2f30765794ea6b74b55..2f1e44362198d3f16d85fdd4656d37e618b51824 100644 (file)
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -26,6 +26,9 @@
  #include <linux/mm.h>
  #include <linux/random.h>
  #include <linux/sched.h>
+#include <linux/elf-randomize.h>
+#include <linux/security.h>
+#include <linux/mman.h>
  
  /*
   * Top of mmap area (just below the process stack).
@@ -78,6 +81,111 @@ static inline unsigned long mmap_base(unsigned long rnd)
         return PAGE_ALIGN(TASK_SIZE - gap - rnd);
  }
  
+#ifdef CONFIG_PPC_RADIX_MMU
+/*
+ * Same as the generic function, used only for radix, which has no need to
+ * override the generic one. We still have to duplicate it here because hash
+ * selects HAVE_ARCH_UNMAPPED_AREA.
+ */
+static unsigned long
+radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
+                            unsigned long len, unsigned long pgoff,
+                            unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       struct vm_unmapped_area_info info;
+
+       if (len > TASK_SIZE - mmap_min_addr)
+               return -ENOMEM;
+
+       if (flags & MAP_FIXED)
+               return addr;
+
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(mm, addr);
+               if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+                   (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = mm->mmap_base;
+       info.high_limit = TASK_SIZE;
+       info.align_mask = 0;
+       return vm_unmapped_area(&info);
+}
+
+static unsigned long
+radix__arch_get_unmapped_area_topdown(struct file *filp,
+                                    const unsigned long addr0,
+                                    const unsigned long len,
+                                    const unsigned long pgoff,
+                                    const unsigned long flags)
+{
+       struct vm_area_struct *vma;
+       struct mm_struct *mm = current->mm;
+       unsigned long addr = addr0;
+       struct vm_unmapped_area_info info;
+
+       /* requested length too big for entire address space */
+       if (len > TASK_SIZE - mmap_min_addr)
+               return -ENOMEM;
+
+       if (flags & MAP_FIXED)
+               return addr;
+
+       /* requesting a specific address */
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(mm, addr);
+               if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+                               (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+       info.high_limit = mm->mmap_base;
+       info.align_mask = 0;
+       addr = vm_unmapped_area(&info);
+
+       /*
+        * A failed mmap() very likely causes application failure,
+        * so fall back to the bottom-up function here. This scenario
+        * can happen with large stack limits and large mmap()
+        * allocations.
+        */
+       if (addr & ~PAGE_MASK) {
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = TASK_UNMAPPED_BASE;
+               info.high_limit = TASK_SIZE;
+               addr = vm_unmapped_area(&info);
+       }
+
+       return addr;
+}
+
+static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
+                                       unsigned long random_factor)
+{
+       if (mmap_is_legacy()) {
+               mm->mmap_base = TASK_UNMAPPED_BASE;
+               mm->get_unmapped_area = radix__arch_get_unmapped_area;
+       } else {
+               mm->mmap_base = mmap_base(random_factor);
+               mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown;
+       }
+}
+#else
+/* dummy */
+extern void radix__arch_pick_mmap_layout(struct mm_struct *mm,
+                                       unsigned long random_factor);
+#endif
  /*
   * This function, called very early during the creation of a new
   * process VM image, sets up which VM layout function to use:
@@ -89,6 +197,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
         if (current->flags & PF_RANDOMIZE)
                 random_factor = arch_mmap_rnd();
  
+       if (radix_enabled())
+               return radix__arch_pick_mmap_layout(mm, random_factor);
         /*
          * Fall back to the standard layout if the personality
          * bit is set, or if the expected stack growth is unlimited:
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c

new file mode 100644 (file)

index 0000000..227b2a6
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -0,0 +1,187 @@
+/*
+ *  MMU context allocation for 64-bit kernels.
+ *
+ *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+#include "icswx.h"
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDA(mmu_context_ida);
+
+int __init_new_context(void)
+{
+       int index;
+       int err;
+
+again:
+       if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+               return -ENOMEM;
+
+       spin_lock(&mmu_context_lock);
+       err = ida_get_new_above(&mmu_context_ida, 1, &index);
+       spin_unlock(&mmu_context_lock);
+
+       if (err == -EAGAIN)
+               goto again;
+       else if (err)
+               return err;
+
+       if (index > MAX_USER_CONTEXT) {
+               spin_lock(&mmu_context_lock);
+               ida_remove(&mmu_context_ida, index);
+               spin_unlock(&mmu_context_lock);
+               return -ENOMEM;
+       }
+
+       return index;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+static int radix__init_new_context(struct mm_struct *mm, int index)
+{
+       unsigned long rts_field;
+
+       /*
+        * Process table entry = RTS | PGD physical address | PGD index size.
+        */
+       rts_field = 3ull << PPC_BITLSHIFT(2);
+       process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
+       return 0;
+}
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       int index;
+
+       index = __init_new_context();
+       if (index < 0)
+               return index;
+
+       if (radix_enabled()) {
+               radix__init_new_context(mm, index);
+       } else {
+
+               /* The old code would re-promote on fork, we don't do that
+                * when using slices as it could cause problem promoting slices
+                * that have been forced down to 4K
+                *
+                * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+                * explicitly against context.id == 0. This ensures that we
+                * properly initialize context slice details for newly allocated
+                * mm's (which will have id == 0) and don't alter context slice
+                * inherited via fork (which will have id != 0).
+                *
+                * We should not be calling init_new_context() on init_mm. Hence a
+                * check against 0 is ok.
+                */
+               if (mm->context.id == 0)
+                       slice_set_user_psize(mm, mmu_virtual_psize);
+               subpage_prot_init_new_context(mm);
+       }
+       mm->context.id = index;
+#ifdef CONFIG_PPC_ICSWX
+       mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+       if (!mm->context.cop_lockp) {
+               __destroy_context(index);
+               subpage_prot_free(mm);
+               mm->context.id = MMU_NO_CONTEXT;
+               return -ENOMEM;
+       }
+       spin_lock_init(mm->context.cop_lockp);
+#endif /* CONFIG_PPC_ICSWX */
+
+#ifdef CONFIG_PPC_64K_PAGES
+       mm->context.pte_frag = NULL;
+#endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       mm_iommu_init(&mm->context);
+#endif
+       return 0;
+}
+
+void __destroy_context(int context_id)
+{
+       spin_lock(&mmu_context_lock);
+       ida_remove(&mmu_context_ida, context_id);
+       spin_unlock(&mmu_context_lock);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+#ifdef CONFIG_PPC_64K_PAGES
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+       int count;
+       void *pte_frag;
+       struct page *page;
+
+       pte_frag = mm->context.pte_frag;
+       if (!pte_frag)
+               return;
+
+       page = virt_to_page(pte_frag);
+       /* drop all the pending references */
+       count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+       /* We allow PTE_FRAG_NR fragments from a PTE page */
+       if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
+               pgtable_page_dtor(page);
+               free_hot_cold_page(page, 0);
+       }
+}
+
+#else
+static inline void destroy_pagetable_page(struct mm_struct *mm)
+{
+       return;
+}
+#endif
+
+/* Release the MMU state attached to @mm and free its context id. */
+void destroy_context(struct mm_struct *mm)
+{
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       mm_iommu_cleanup(&mm->context);
+#endif
+
+#ifdef CONFIG_PPC_ICSWX
+       drop_cop(mm->context.acop, mm);
+       kfree(mm->context.cop_lockp);
+       mm->context.cop_lockp = NULL;
+#endif /* CONFIG_PPC_ICSWX */
+
+       if (radix_enabled())
+               process_tb[mm->context.id].prtb0 = 0; /* field set at init */
+       else
+               subpage_prot_free(mm);
+       destroy_pagetable_page(mm);
+       __destroy_context(mm->context.id);
+       mm->context.id = MMU_NO_CONTEXT;
+}
+
+#ifdef CONFIG_PPC_RADIX_MMU
+void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+       mtspr(SPRN_PID, next->context.id);
+       asm volatile("isync": : :"memory");
+}
+#endif
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c

deleted file mode 100644 (file)

index 9ca6fe1..0000000
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- *  MMU context allocation for 64-bit kernels.
- *
- *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/export.h>
-#include <linux/gfp.h>
-#include <linux/slab.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-
-#include "icswx.h"
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDA(mmu_context_ida);
-
-int __init_new_context(void)
-{
-       int index;
-       int err;
-
-again:
-       if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
-               return -ENOMEM;
-
-       spin_lock(&mmu_context_lock);
-       err = ida_get_new_above(&mmu_context_ida, 1, &index);
-       spin_unlock(&mmu_context_lock);
-
-       if (err == -EAGAIN)
-               goto again;
-       else if (err)
-               return err;
-
-       if (index > MAX_USER_CONTEXT) {
-               spin_lock(&mmu_context_lock);
-               ida_remove(&mmu_context_ida, index);
-               spin_unlock(&mmu_context_lock);
-               return -ENOMEM;
-       }
-
-       return index;
-}
-EXPORT_SYMBOL_GPL(__init_new_context);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       int index;
-
-       index = __init_new_context();
-       if (index < 0)
-               return index;
-
-       /* The old code would re-promote on fork, we don't do that
-        * when using slices as it could cause problem promoting slices
-        * that have been forced down to 4K
-        */
-       if (slice_mm_new_context(mm))
-               slice_set_user_psize(mm, mmu_virtual_psize);
-       subpage_prot_init_new_context(mm);
-       mm->context.id = index;
-#ifdef CONFIG_PPC_ICSWX
-       mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
-       if (!mm->context.cop_lockp) {
-               __destroy_context(index);
-               subpage_prot_free(mm);
-               mm->context.id = MMU_NO_CONTEXT;
-               return -ENOMEM;
-       }
-       spin_lock_init(mm->context.cop_lockp);
-#endif /* CONFIG_PPC_ICSWX */
-
-#ifdef CONFIG_PPC_64K_PAGES
-       mm->context.pte_frag = NULL;
-#endif
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-       mm_iommu_init(&mm->context);
-#endif
-       return 0;
-}
-
-void __destroy_context(int context_id)
-{
-       spin_lock(&mmu_context_lock);
-       ida_remove(&mmu_context_ida, context_id);
-       spin_unlock(&mmu_context_lock);
-}
-EXPORT_SYMBOL_GPL(__destroy_context);
-
-#ifdef CONFIG_PPC_64K_PAGES
-static void destroy_pagetable_page(struct mm_struct *mm)
-{
-       int count;
-       void *pte_frag;
-       struct page *page;
-
-       pte_frag = mm->context.pte_frag;
-       if (!pte_frag)
-               return;
-
-       page = virt_to_page(pte_frag);
-       /* drop all the pending references */
-       count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
-       /* We allow PTE_FRAG_NR fragments from a PTE page */
-       if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
-               pgtable_page_dtor(page);
-               free_hot_cold_page(page, 0);
-       }
-}
-
-#else
-static inline void destroy_pagetable_page(struct mm_struct *mm)
-{
-       return;
-}
-#endif
-
-
-void destroy_context(struct mm_struct *mm)
-{
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-       mm_iommu_cleanup(&mm->context);
-#endif
-
-#ifdef CONFIG_PPC_ICSWX
-       drop_cop(mm->context.acop, mm);
-       kfree(mm->context.cop_lockp);
-       mm->context.cop_lockp = NULL;
-#endif /* CONFIG_PPC_ICSWX */
-
-       destroy_pagetable_page(mm);
-       __destroy_context(mm->context.id);
-       subpage_prot_free(mm);
-       mm->context.id = MMU_NO_CONTEXT;
-}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c

index 986afbc22c76b2188f8104030e7197bd7eae57ba..7d95bc402dba4b9ba411c5018091b262d4cd4cb6 100644 (file)
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -226,7 +226,8 @@ static void context_check_map(void)
  static void context_check_map(void) { }
  #endif
  
-void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
+                       struct task_struct *tsk)
  {
         unsigned int i, id, cpu = smp_processor_id();
         unsigned long *map;
@@ -334,8 +335,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
         mm->context.active = 0;
  
  #ifdef CONFIG_PPC_MM_SLICES
-       if (slice_mm_new_context(mm))
-               slice_set_user_psize(mm, mmu_virtual_psize);
+       slice_set_user_psize(mm, mmu_virtual_psize);
  #endif
  
         return 0;
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h

index bfb7c0bcabd57a6dd12f87ca1e41f7bacdfe4f28..6af65327c99322b564bacdf66657f62f4e35a05b 100644 (file)
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -108,11 +108,6 @@ extern unsigned long Hash_size, Hash_mask;
  
  #endif /* CONFIG_PPC32 */
  
-#ifdef CONFIG_PPC64
-extern int map_kernel_page(unsigned long ea, unsigned long pa,
-                          unsigned long flags);
-#endif /* CONFIG_PPC64 */
-
  extern unsigned long ioremap_bot;
  extern unsigned long __max_low_memory;
  extern phys_addr_t __initial_memory_limit_addr;
diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c

new file mode 100644 (file)

index 0000000..a229893
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-book3e.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/memblock.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/dma.h>
+
+#include "mmu_decl.h"
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * On Book3E CPUs, the vmemmap is currently mapped in the top half of
+ * the vmalloc space using normal page tables, though the size of
+ * pages encoded in the PTEs can be different
+ */
+int __meminit vmemmap_create_mapping(unsigned long start,
+                                    unsigned long page_size,
+                                    unsigned long phys)
+{
+       /* Create a PTE encoding without page size */
+       unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
+               _PAGE_KERNEL_RW;
+
+       /* PTEs only contain page size encodings up to 32M */
+       BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
+
+       /* Encode the size in the PTE */
+       flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
+
+       /* For each PTE for that area, map things. Note that we don't
+        * increment phys because all PTEs are of the large size and
+        * thus must have the low bits clear
+        */
+       for (i = 0; i < page_size; i += PAGE_SIZE)
+               BUG_ON(map_kernel_page(start + i, phys, flags));
+
+       return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void vmemmap_remove_mapping(unsigned long start,
+                           unsigned long page_size)
+{
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+static __ref void *early_alloc_pgtable(unsigned long size)
+{
+       void *pt;
+
+       pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
+       memset(pt, 0, size);
+
+       return pt;
+}
+
+/*
+ * map_kernel_page maps one page at ea to pa in the kernel (init_mm) page
+ * tables, allocating intermediate levels as needed. It is used by
+ * vmemmap_create_mapping above as well as by ioremap; no HPT is involved.
+ */
+int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+{
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
+       if (slab_is_available()) {
+               pgdp = pgd_offset_k(ea);
+               pudp = pud_alloc(&init_mm, pgdp, ea);
+               if (!pudp)
+                       return -ENOMEM;
+               pmdp = pmd_alloc(&init_mm, pudp, ea);
+               if (!pmdp)
+                       return -ENOMEM;
+               ptep = pte_alloc_kernel(pmdp, ea);
+               if (!ptep)
+                       return -ENOMEM;
+               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+                                                         __pgprot(flags)));
+       } else {
+               pgdp = pgd_offset_k(ea);
+#ifndef __PAGETABLE_PUD_FOLDED
+               if (pgd_none(*pgdp)) {
+                       pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+                       BUG_ON(pudp == NULL);
+                       pgd_populate(&init_mm, pgdp, pudp);
+               }
+#endif /* !__PAGETABLE_PUD_FOLDED */
+               pudp = pud_offset(pgdp, ea);
+               if (pud_none(*pudp)) {
+                       pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+                       BUG_ON(pmdp == NULL);
+                       pud_populate(&init_mm, pudp, pmdp);
+               }
+               pmdp = pmd_offset(pudp, ea);
+               if (!pmd_present(*pmdp)) {
+                       ptep = early_alloc_pgtable(PAGE_SIZE);
+                       BUG_ON(ptep == NULL);
+                       pmd_populate_kernel(&init_mm, pmdp, ptep);
+               }
+               ptep = pte_offset_kernel(pmdp, ea);
+               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+                                                         __pgprot(flags)));
+       }
+
+       smp_wmb();
+       return 0;
+}
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c

new file mode 100644 (file)

index 0000000..eb44511
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+#include <trace/events/thp.h>
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+                         pmd_t *pmdp, pmd_t entry, int dirty)
+{
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_trans_huge(*pmdp));
+       assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+       /* Nothing to relax when the installed pmd already equals @entry. */
+       if (pmd_same(*pmdp, entry))
+               return 0;
+
+       /*
+        * Since we are not supporting SW TLB systems, we don't
+        * have any thing similar to flush_tlb_page_nohash()
+        */
+       __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+       return 1;
+}
+
+/*
+ * Thin wrapper: forwards to __pmdp_test_and_clear_young() using the mm
+ * that owns @vma and returns that helper's result unchanged.
+ */
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                             unsigned long address, pmd_t *pmdp)
+{
+       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+/*
+ * Install a new huge pmd. We should not be called for updating an
+ * existing pmd entry; that should go via pmd_hugepage_update.
+ *
+ * @mm:   address space owning the page table; its page_table_lock is
+ *        asserted to be held under CONFIG_DEBUG_VM
+ * @addr: virtual address, passed to the tracepoint and to set_pte_at()
+ * @pmdp: pmd slot to fill
+ * @pmd:  huge pmd value to store
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+               pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+       assert_spin_locked(&mm->page_table_lock);
+       WARN_ON(!pmd_trans_huge(pmd));
+#endif
+       trace_hugepage_set_pmd(addr, pmd_val(pmd));
+       /* set_pte_at() yields no value, so call it instead of returning it. */
+       set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ *
+ * Clears _PAGE_PRESENT in the huge pmd through pmd_hugepage_update() and
+ * then flushes the TLB for the HPAGE_PMD_SIZE range starting at @address.
+ */
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                    pmd_t *pmdp)
+{
+       pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       /*
+        * This ensures that generic code that relies on IRQ disabling
+        * to prevent a parallel THP split works as expected.
+        */
+       kick_all_cpus_sync();
+}
+
+/* Return @pmd with the bits of @pgprot OR-ed into its value. */
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+       return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+       /* @pfn shifted by PAGE_SHIFT and masked with PTE_RPN_MASK. */
+       pmd_t addr_only = __pmd((pfn << PAGE_SHIFT) & PTE_RPN_MASK);
+
+       /* Add the protection bits on top of the address part. */
+       return pmd_set_protbits(addr_only, pgprot);
+}
+
+/* Build a pmd for @page: converts it with page_to_pfn() and defers to pfn_pmd(). */
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+       return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+       /* Keep only the bits selected by _HPAGE_CHG_MASK ... */
+       pmd_t kept = __pmd(pmd_val(pmd) & _HPAGE_CHG_MASK);
+
+       /* ... and OR the new protection bits into what is left. */
+       return pmd_set_protbits(kept, newprot);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ *
+ * The hook is the place where an HPTE corresponding to the updated huge
+ * pmd could be preloaded into the hash table, but this implementation
+ * is intentionally empty: it performs no preload and ignores all of its
+ * arguments.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                         pmd_t *pmd)
+{
+       return;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c

new file mode 100644 (file)

index 0000000..c23e286
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ * Bolts [start, start + page_size) to @phys with PAGE_KERNEL protection
+ * using mmu_vmemmap_psize / mmu_kernel_ssize.  If bolting fails, the
+ * range is removed again; a removal error other than -ENOENT is fatal
+ * (BUG_ON).
+ *
+ * Returns the result of htab_bolt_mapping().
+ */
+int __meminit hash__vmemmap_create_mapping(unsigned long start,
+                                      unsigned long page_size,
+                                      unsigned long phys)
+{
+       int rc = htab_bolt_mapping(start, start + page_size, phys,
+                                  pgprot_val(PAGE_KERNEL),
+                                  mmu_vmemmap_psize, mmu_kernel_ssize);
+       if (rc < 0) {
+               /* Undo whatever part of the range was bolted before the failure. */
+               int rc2 = htab_remove_mapping(start, start + page_size,
+                                             mmu_vmemmap_psize,
+                                             mmu_kernel_ssize);
+               BUG_ON(rc2 && (rc2 != -ENOENT));
+       }
+       return rc;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Remove the bolted vmemmap mapping for [start, start + page_size).
+ * A negative result other than -ENOENT is fatal (BUG_ON); -ENOENT only
+ * triggers a warning.
+ */
+void hash__vmemmap_remove_mapping(unsigned long start,
+                             unsigned long page_size)
+{
+       int rc = htab_remove_mapping(start, start + page_size,
+                                    mmu_vmemmap_psize,
+                                    mmu_kernel_ssize);
+       BUG_ON((rc < 0) && (rc != -ENOENT));
+       WARN_ON(rc == -ENOENT);
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * hash__map_kernel_page - map one page at kernel address @ea to @pa.
+ *
+ * The original note here states that map_kernel_page is currently only
+ * called by __ioremap (callers are not visible in this file).
+ *
+ * Once slab is available the mapping is entered in init_mm's Linux page
+ * tables, allocating the pud/pmd/pte levels as needed.  Before that no
+ * Linux PTE is created; instead an entry is bolted directly into the
+ * hardware page table with htab_bolt_mapping().
+ *
+ * Returns 0 on success, -ENOMEM if a table allocation or the bolted
+ * mapping fails.
+ */
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+{
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
+       if (slab_is_available()) {
+               pgdp = pgd_offset_k(ea);
+               pudp = pud_alloc(&init_mm, pgdp, ea);
+               if (!pudp)
+                       return -ENOMEM;
+               pmdp = pmd_alloc(&init_mm, pudp, ea);
+               if (!pmdp)
+                       return -ENOMEM;
+               ptep = pte_alloc_kernel(pmdp, ea);
+               if (!ptep)
+                       return -ENOMEM;
+               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+                                                         __pgprot(flags)));
+       } else {
+               /*
+                * If the mm subsystem is not fully up, we cannot create a
+                * linux page table entry for this mapping.  Simply bolt an
+                * entry in the hardware page table.
+                *
+                */
+               if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+                                     mmu_io_psize, mmu_kernel_ssize)) {
+                       printk(KERN_ERR "Failed to do bolted mapping IO "
+                              "memory at %016lx !\n", pa);
+                       return -ENOMEM;
+               }
+       }
+
+       /* Write barrier after the mapping has been created. */
+       smp_wmb();
+       return 0;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * Atomically update a huge pmd: clear the @clr bits and set the @set bits.
+ *
+ * The ldarx/stdcx. loop retries while H_PAGE_BUSY is set in the entry or
+ * when the conditional store fails.  @clr, @set and H_PAGE_BUSY are
+ * converted with cpu_to_be64() before being handed to the asm, and the
+ * loaded value is converted back with be64_to_cpu().
+ *
+ * If the old value had H_PAGE_HASHPTE set, hpte_do_hugepage_flush() is
+ * called for it.  Returns the old pmd value.
+ */
+unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                   pmd_t *pmdp, unsigned long clr,
+                                   unsigned long set)
+{
+       __be64 old_be, tmp;
+       unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!pmd_trans_huge(*pmdp));
+       assert_spin_locked(&mm->page_table_lock);
+#endif
+
+       /*
+        * %0 = old value, %1 = scratch/new value, %3 = pmdp,
+        * %4 = clr, %6 = H_PAGE_BUSY, %7 = set (all big-endian).
+        */
+       __asm__ __volatile__(
+       "1:     ldarx   %0,0,%3\n\
+               and.    %1,%0,%6\n\
+               bne-    1b \n\
+               andc    %1,%0,%4 \n\
+               or      %1,%1,%7\n\
+               stdcx.  %1,0,%3 \n\
+               bne-    1b"
+       : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
+       : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
+         "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
+       : "cc" );
+
+       old = be64_to_cpu(old_be);
+
+       trace_hugepage_update(addr, old, clr, set);
+       if (old & H_PAGE_HASHPTE)
+               hpte_do_hugepage_flush(mm, addr, pmdp, old);
+       return old;
+}
+
+/*
+ * Clear a pmd that is not a transparent huge pmd (checked with
+ * VM_BUG_ON) at an HPAGE_PMD_MASK-aligned @address, wait for all CPUs,
+ * then flush the range covered by the old pmd.
+ *
+ * Returns the pmd value that was installed before the clear.
+ */
+pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                           pmd_t *pmdp)
+{
+       pmd_t pmd;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+       pmd = *pmdp;
+       pmd_clear(pmdp);
+       /*
+        * Wait for all pending hash_page to finish. This is needed
+        * in case of subpage collapse. When we collapse normal pages
+        * to hugepage, we first clear the pmd, then invalidate all
+        * the PTE entries. The assumption here is that any low level
+        * page fault will see a none pmd and take the slow path that
+        * will wait on mmap_sem. But we could very well be in a
+        * hash_page with local ptep pointer value. Such a hash page
+        * can result in adding new HPTE entries for normal subpages.
+        * That means we could be modifying the page content as we
+        * copy them to a huge page. So wait for parallel hash_page
+        * to finish before invalidating HPTE entries. We can do this
+        * by sending an IPI to all the cpus and executing a dummy
+        * function there.
+        */
+       kick_all_cpus_sync();
+       /*
+        * Now invalidate the hpte entries in the range
+        * covered by pmd. This makes sure we take a
+        * fault and will find the pmd as none, which will
+        * result in a major fault which takes mmap_sem and
+        * hence wait for collapse to complete. Without this
+        * the __collapse_huge_page_copy can result in copying
+        * the old content.
+        */
+       flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+       return pmd;
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes.
+ *
+ * Stores @pgtable in the slot PTRS_PER_PMD entries past @pmdp; the
+ * caller must hold mm->page_table_lock (asserted).
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                 pgtable_t pgtable)
+{
+       pgtable_t *pgtable_slot;
+       assert_spin_locked(&mm->page_table_lock);
+       /*
+        * we store the pgtable in the second half of PMD
+        */
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       *pgtable_slot = pgtable;
+       /*
+        * Expose the deposited pgtable to other cpus
+        * before we set the hugepage PTE at pmd level:
+        * hash fault code looks at the deposited pgtable
+        * to store hash index values.
+        */
+       smp_wmb();
+}
+
+/*
+ * Take back the pgtable stored in the slot PTRS_PER_PMD entries past
+ * @pmdp.  The slot is set to NULL and PTE_FRAG_SIZE bytes of the
+ * pgtable are zeroed before it is returned.  The caller must hold
+ * mm->page_table_lock (asserted).
+ */
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+       pgtable_t pgtable;
+       pgtable_t *pgtable_slot;
+
+       assert_spin_locked(&mm->page_table_lock);
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       pgtable = *pgtable_slot;
+       /*
+        * Once we withdraw, mark the entry NULL.
+        */
+       *pgtable_slot = NULL;
+       /*
+        * We store HPTE information in the deposited PTE fragment.
+        * zero out the content on withdraw.
+        */
+       memset(pgtable, 0, PTE_FRAG_SIZE);
+       return pgtable;
+}
+
+/*
+ * Prepare a huge pmd in the user region for splitting by setting
+ * _PAGE_PRIVILEGED on it through pmd_hugepage_update().  @address must
+ * be HPAGE_PMD_MASK aligned (both conditions are VM_BUG_ON checks).
+ */
+void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                              unsigned long address, pmd_t *pmdp)
+{
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
+
+       /*
+        * We can't mark the pmd none here, because that will cause a race
+        * against exit_mmap. We need to continue to mark the pmd TRANS HUGE
+        * while we split, but at the same time we want the rest of the ppc64
+        * code not to insert hash ptes on this, because we will be modifying
+        * the deposited pgtable in the caller of this function. Hence
+        * set _PAGE_PRIVILEGED so that we move the fault handling to a
+        * higher level function and that will serialize against ptl.
+        * We need to flush existing hash pte entries here even though
+        * the translation is still valid, because we will withdraw
+        * pgtable_t after this.
+        */
+       pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ *
+ * @mm:      address space the pmd belongs to
+ * @addr:    address covered by the pmd
+ * @pmdp:    the pmd slot, handed on to flush_hash_hugepage()
+ * @old_pmd: previous pmd value; H_PAGE_COMBO in it selects a 4K base
+ *           page size, otherwise 64K is used
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+                           pmd_t *pmdp, unsigned long old_pmd)
+{
+       int ssize;
+       unsigned int psize;
+       unsigned long vsid;
+       unsigned long flags = 0;
+       const struct cpumask *tmp;
+
+       /* get the base page size,vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+       /* Sanity check only; psize is recomputed from old_pmd below. */
+       psize = get_slice_psize(mm, addr);
+       BUG_ON(psize == MMU_PAGE_16M);
+#endif
+       if (old_pmd & H_PAGE_COMBO)
+               psize = MMU_PAGE_4K;
+       else
+               psize = MMU_PAGE_64K;
+
+       if (!is_kernel_addr(addr)) {
+               ssize = user_segment_size(addr);
+               vsid = get_vsid(mm->context.id, addr, ssize);
+               WARN_ON(vsid == 0);
+       } else {
+               vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+               ssize = mmu_kernel_ssize;
+       }
+
+       /* Request a local update when the mm's cpumask is just this CPU. */
+       tmp = cpumask_of(smp_processor_id());
+       if (cpumask_equal(mm_cpumask(mm), tmp))
+               flags |= HPTE_LOCAL_UPDATE;
+
+       /* This function is void: call the flush, do not "return" it. */
+       flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+/*
+ * Clear a huge pmd (all bits, via pmd_hugepage_update()) and return its
+ * previous value.  The pgtable stored PTRS_PER_PMD entries past @pmdp
+ * has PTE_FRAG_SIZE bytes zeroed, and all CPUs are kicked before
+ * returning.
+ */
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                               unsigned long addr, pmd_t *pmdp)
+{
+       pmd_t old_pmd;
+       pgtable_t pgtable;
+       unsigned long old;
+       pgtable_t *pgtable_slot;
+
+       old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+       old_pmd = __pmd(old);
+       /*
+        * We have pmd == none and we are holding page_table_lock.
+        * So we can safely go and clear the pgtable hash
+        * index info.
+        */
+       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+       pgtable = *pgtable_slot;
+       /*
+        * Let's zero out the old valid and hash index details;
+        * the hash fault path looks at them.
+        */
+       memset(pgtable, 0, PTE_FRAG_SIZE);
+       /*
+        * Serialize against find_linux_pte_or_hugepte which does lock-less
+        * lookup in page tables with local interrupts disabled. For huge pages
+        * it casts pmd_t to pte_t. Since format of pte_t is different from
+        * pmd_t we want to prevent transit from pmd pointing to page table
+        * to pmd pointing to huge page (and back) while interrupts are disabled.
+        * We clear pmd to possibly replace it with page table pointer in
+        * different code paths. So make sure we wait for the parallel
+        * find_linux_pte_or_hugepage to finish.
+        */
+       kick_all_cpus_sync();
+       return old_pmd;
+}
+
+/*
+ * Report whether THP can be used with the hash MMU.
+ *
+ * Returns 1 only if the MMU has MMU_FTR_16M_PAGE, the 16M page shift
+ * equals PMD_SHIFT, and the penc[] tables checked below have an
+ * encoding for 16M pages; otherwise 0.
+ */
+int hash__has_transparent_hugepage(void)
+{
+
+       if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+               return 0;
+       /*
+        * We support THP only if PMD_SIZE is 16MB.
+        */
+       if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+               return 0;
+       /*
+        * We need to make sure that we support 16MB hugepage in a segment
+        * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+        * of 64K.
+        */
+       /*
+        * If we have 64K HPTE, we will be using that by default
+        */
+       if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+           (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+               return 0;
+       /*
+        * Ok we only have 4K HPTE
+        */
+       if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+               return 0;
+
+       return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c

new file mode 100644 (file)

index 0000000..18b2c11
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -0,0 +1,526 @@
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/sched.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/firmware.h>
+
+#include <trace/events/thp.h>
+
+/*
+ * Store @patb1 (converted with cpu_to_be64()) into the patb1 field of
+ * partition_tb.  Always returns 0.
+ */
+static int native_update_partition_table(u64 patb1)
+{
+       partition_tb->patb1 = cpu_to_be64(patb1);
+       return 0;
+}
+
+/*
+ * Allocate @size bytes, aligned to @size, from memblock (anywhere) and
+ * return the zeroed block's kernel virtual address.
+ */
+static __ref void *early_alloc_pgtable(unsigned long size)
+{
+       void *pt;
+
+       pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE));
+       memset(pt, 0, size);
+
+       return pt;
+}
+
+/*
+ * Map one page of size @map_page_size at kernel effective address @ea to
+ * physical address @pa in init_mm's page tables.
+ *
+ * @map_page_size selects the level that holds the leaf entry: PUD_SIZE
+ * writes it into the pud slot, PMD_SIZE into the pmd slot, any other
+ * value into a pte slot.  Once slab is available, missing levels come
+ * from the regular allocators and -ENOMEM is returned on failure; before
+ * that they come from early_alloc_pgtable().
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+                         pgprot_t flags,
+                         unsigned int map_page_size)
+{
+       pgd_t *pgdp;
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+       /*
+        * Make sure task size is correct as per the max addr
+        */
+       BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+       if (slab_is_available()) {
+               pgdp = pgd_offset_k(ea);
+               pudp = pud_alloc(&init_mm, pgdp, ea);
+               if (!pudp)
+                       return -ENOMEM;
+               if (map_page_size == PUD_SIZE) {
+                       ptep = (pte_t *)pudp;
+                       goto set_the_pte;
+               }
+               pmdp = pmd_alloc(&init_mm, pudp, ea);
+               if (!pmdp)
+                       return -ENOMEM;
+               if (map_page_size == PMD_SIZE) {
+                       /* The leaf lives in the pmd slot, not the pud slot. */
+                       ptep = (pte_t *)pmdp;
+                       goto set_the_pte;
+               }
+               ptep = pte_alloc_kernel(pmdp, ea);
+               if (!ptep)
+                       return -ENOMEM;
+       } else {
+               pgdp = pgd_offset_k(ea);
+               if (pgd_none(*pgdp)) {
+                       pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
+                       BUG_ON(pudp == NULL);
+                       pgd_populate(&init_mm, pgdp, pudp);
+               }
+               pudp = pud_offset(pgdp, ea);
+               if (map_page_size == PUD_SIZE) {
+                       ptep = (pte_t *)pudp;
+                       goto set_the_pte;
+               }
+               if (pud_none(*pudp)) {
+                       pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+                       BUG_ON(pmdp == NULL);
+                       pud_populate(&init_mm, pudp, pmdp);
+               }
+               pmdp = pmd_offset(pudp, ea);
+               if (map_page_size == PMD_SIZE) {
+                       /* The leaf lives in the pmd slot, not the pud slot. */
+                       ptep = (pte_t *)pmdp;
+                       goto set_the_pte;
+               }
+               if (!pmd_present(*pmdp)) {
+                       ptep = early_alloc_pgtable(PAGE_SIZE);
+                       BUG_ON(ptep == NULL);
+                       pmd_populate_kernel(&init_mm, pmdp, ptep);
+               }
+               ptep = pte_offset_kernel(pmdp, ea);
+       }
+
+set_the_pte:
+       set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
+       smp_wmb();
+       return 0;
+}
+
+/*
+ * Build the kernel linear mapping for every memblock memory region, then
+ * allocate and fill the host process table and register it through
+ * ppc_md.update_partition_table().
+ *
+ * For each address the largest page size is used that (a) has a non-zero
+ * shift in mmu_psize_defs[], (b) the address is aligned to and (c) still
+ * fits before the end of the region.  Because the choice is made per
+ * address, an unaligned start of a region is mapped with smaller pages
+ * instead of being skipped, and every region is considered for 1G/2M
+ * pages independently of the regions before it.
+ */
+static void __init radix_init_pgtable(void)
+{
+       u64 addr, region_end, run_start;
+       unsigned long rts_field;
+       struct memblock_region *reg;
+       unsigned long linear_page_size, run_page_size;
+
+       /* We don't support slb for radix */
+       mmu_slb_size = 0;
+       /*
+        * Create the linear mapping
+        */
+       for_each_memblock(memory, reg) {
+               /* Only whole PAGE_SIZE pages inside the region are mapped. */
+               addr = _ALIGN_UP(reg->base, PAGE_SIZE);
+               region_end = _ALIGN_DOWN(reg->base + reg->size, PAGE_SIZE);
+               run_start = addr;
+               run_page_size = 0;
+
+               while (addr < region_end) {
+                       u64 gap = region_end - addr;
+
+                       if (mmu_psize_defs[MMU_PAGE_1G].shift &&
+                           !(addr & (PUD_SIZE - 1)) && gap >= PUD_SIZE)
+                               linear_page_size = PUD_SIZE;
+                       else if (mmu_psize_defs[MMU_PAGE_2M].shift &&
+                                !(addr & (PMD_SIZE - 1)) && gap >= PMD_SIZE)
+                               linear_page_size = PMD_SIZE;
+                       else
+                               linear_page_size = PAGE_SIZE;
+
+                       /* Log each run of equally sized mappings once. */
+                       if (linear_page_size != run_page_size) {
+                               if (run_page_size)
+                                       pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
+                                               (unsigned long)run_start,
+                                               (unsigned long)addr,
+                                               run_page_size);
+                               run_start = addr;
+                               run_page_size = linear_page_size;
+                       }
+
+                       radix__map_kernel_page((unsigned long)__va(addr),
+                                             addr, PAGE_KERNEL_X,
+                                             linear_page_size);
+                       addr += linear_page_size;
+               }
+               /* Report the last run of this region, if anything was mapped. */
+               if (run_page_size)
+                       pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
+                               (unsigned long)run_start, (unsigned long)addr,
+                               run_page_size);
+       }
+       /*
+        * Allocate Partition table and process table for the
+        * host.
+        */
+       BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large.");
+       process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
+       /*
+        * Fill in the process table.
+        * we support 52 bits, hence 52-28 = 24, 11000
+        */
+       rts_field = 3ull << PPC_BITLSHIFT(2);
+       process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
+       /*
+        * Fill in the partition table. We are supposed to use the effective
+        * address of the process table here. But our linear mapping also
+        * enables us to use the physical address here.
+        */
+       ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR);
+       pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
+}
+
+/*
+ * Allocate the partition table, fill its patb0 entry with the kernel
+ * radix root (init_mm.pgd), lift the memblock allocation limit and
+ * program SPRN_PTCR with the table's physical address and size field.
+ */
+static void __init radix_init_partition_table(void)
+{
+       unsigned long rts_field;
+       /*
+        * we support 52 bits, hence 52-28 = 24, 11000
+        */
+       rts_field = 3ull << PPC_BITLSHIFT(2);
+
+       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
+       partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT);
+       partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
+                                         RADIX_PGD_INDEX_SIZE | PATB_HR);
+       /* Use an explicit log level, like the other messages in this file. */
+       pr_info("Partition table %p\n", partition_tb);
+
+       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+       /*
+        * update partition table control register,
+        * 64 K size.
+        */
+       mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+}
+
+/* Install native_update_partition_table() as ppc_md.update_partition_table. */
+void __init radix_init_native(void)
+{
+       ppc_md.update_partition_table = native_update_partition_table;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+       /* Page-size shift -> MMU_PAGE_* index; -1 for any other shift. */
+       switch (shift) {
+       case 0xc:
+               return MMU_PAGE_4K;
+       case 0x10:
+               return MMU_PAGE_64K;
+       case 0x15:
+               return MMU_PAGE_2M;
+       case 0x1e:
+               return MMU_PAGE_1G;
+       default:
+               return -1;
+       }
+}
+
+/*
+ * of_scan_flat_dt() callback: read "ibm,processor-radix-AP-encodings"
+ * from a "cpu" node and record shift/AP for every page size that
+ * get_idx_from_shift() knows about in mmu_psize_defs[].
+ *
+ * Each 32-bit cell carries the AP encoding in its top 3 bits and the
+ * page shift in the remaining bits.
+ *
+ * Returns 0 for nodes that are not cpu nodes or lack the property, and
+ * 1 once a cpu node's property has been processed.
+ */
+static int __init radix_dt_scan_page_sizes(unsigned long node,
+                                          const char *uname, int depth,
+                                          void *data)
+{
+       int size = 0;
+       int shift, idx;
+       unsigned int ap;
+       const __be32 *prop;
+       const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+       /* We are scanning "cpu" nodes only */
+       if (type == NULL || strcmp(type, "cpu") != 0)
+               return 0;
+
+       prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
+       if (!prop)
+               return 0;
+
+       pr_info("Page sizes from device-tree:\n");
+       for (; size >= 4; size -= 4, ++prop) {
+
+               struct mmu_psize_def *def;
+
+               /*
+                * top 3 bit is AP encoding; the mask is built unsigned
+                * because 0xe << 28 does not fit in a signed int.
+                */
+               shift = be32_to_cpu(prop[0]) & ~(0xeU << 28);
+               ap = be32_to_cpu(prop[0]) >> 29;
+               pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
+
+               idx = get_idx_from_shift(shift);
+               if (idx < 0)
+                       continue;
+
+               def = &mmu_psize_defs[idx];
+               def->shift = shift;
+               def->ap  = ap;
+       }
+
+       /* needed ? */
+       cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+       return 1;
+}
+
+static void __init radix_init_page_sizes(void)
+{
+       /*
+        * Try to find the available page sizes in the device-tree.
+        * When the scan reports nothing, let's assume we have page 4k
+        * and 64k support.
+        */
+       if (of_scan_flat_dt(radix_dt_scan_page_sizes, NULL) == 0) {
+               mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+               mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+
+               mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+               mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+       }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       /* map vmemmap using 2M if available */
+       if (mmu_psize_defs[MMU_PAGE_2M].shift)
+               mmu_vmemmap_psize = MMU_PAGE_2M;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+}
+
+/*
+ * Early radix MMU setup: sets LPCR_UPRT, selects the base page size,
+ * loads the radix page-table geometry and kernel address-space layout
+ * into the runtime variables, discovers page sizes, and then builds the
+ * partition table (skipped when FW_FEATURE_LPAR is set) and the kernel
+ * linear mapping.
+ */
+void __init radix__early_init_mmu(void)
+{
+       unsigned long lpcr;
+       /*
+        * Set LPCR_UPRT in the LPCR (done unconditionally here).
+        */
+       lpcr = mfspr(SPRN_LPCR);
+       mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
+
+#ifdef CONFIG_PPC_64K_PAGES
+       /* PAGE_SIZE mappings */
+       mmu_virtual_psize = MMU_PAGE_64K;
+#else
+       mmu_virtual_psize = MMU_PAGE_4K;
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       /* vmemmap mapping */
+       mmu_vmemmap_psize = mmu_virtual_psize;
+#endif
+       /*
+        * initialize page table size
+        */
+       __pte_index_size = RADIX_PTE_INDEX_SIZE;
+       __pmd_index_size = RADIX_PMD_INDEX_SIZE;
+       __pud_index_size = RADIX_PUD_INDEX_SIZE;
+       __pgd_index_size = RADIX_PGD_INDEX_SIZE;
+       __pmd_cache_index = RADIX_PMD_INDEX_SIZE;
+       __pte_table_size = RADIX_PTE_TABLE_SIZE;
+       __pmd_table_size = RADIX_PMD_TABLE_SIZE;
+       __pud_table_size = RADIX_PUD_TABLE_SIZE;
+       __pgd_table_size = RADIX_PGD_TABLE_SIZE;
+
+       __pmd_val_bits = RADIX_PMD_VAL_BITS;
+       __pud_val_bits = RADIX_PUD_VAL_BITS;
+       __pgd_val_bits = RADIX_PGD_VAL_BITS;
+
+       __kernel_virt_start = RADIX_KERN_VIRT_START;
+       __kernel_virt_size = RADIX_KERN_VIRT_SIZE;
+       __vmalloc_start = RADIX_VMALLOC_START;
+       __vmalloc_end = RADIX_VMALLOC_END;
+       vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
+       ioremap_bot = IOREMAP_BASE;
+       /*
+        * For now radix also uses the same frag size
+        */
+       __pte_frag_nr = H_PTE_FRAG_NR;
+       __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+
+       radix_init_page_sizes();
+       /* The partition table is only set up here without FW_FEATURE_LPAR. */
+       if (!firmware_has_feature(FW_FEATURE_LPAR))
+               radix_init_partition_table();
+
+       radix_init_pgtable();
+}
+
+/*
+ * Secondary-CPU counterpart of the early MMU setup: sets LPCR_UPRT and,
+ * when FW_FEATURE_LPAR is not set, programs SPRN_PTCR with the physical
+ * address and size field of partition_tb.
+ */
+void radix__early_init_mmu_secondary(void)
+{
+       unsigned long lpcr;
+       /*
+        * Set LPCR_UPRT in the LPCR (done unconditionally here).
+        */
+       lpcr = mfspr(SPRN_LPCR);
+       mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
+       /*
+        * update partition table control register, 64 K size.
+        */
+       if (!firmware_has_feature(FW_FEATURE_LPAR))
+               mtspr(SPRN_PTCR,
+                     __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+}
+
+/*
+ * Derive ppc64_rma_size from the first memblock (capped at 0x40000000,
+ * i.e. 1GB) and set the memblock allocation limit to the end of that
+ * first memblock.  A first memblock that does not start at physical 0
+ * is fatal (BUG_ON).
+ */
+void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+                               phys_addr_t first_memblock_size)
+{
+       /* We don't currently support the first MEMBLOCK not mapping 0
+        * physical on those processors
+        */
+       BUG_ON(first_memblock_base != 0);
+       /*
+        * We limit the allocation that depend on ppc64_rma_size
+        * to first_memblock_size. We also clamp it to 1GB to
+        * avoid some funky things such as RTAS bugs.
+        *
+        * On radix config we really don't have a limitation
+        * on real mode access. But keeping it as above works
+        * well enough.
+        */
+       ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+       /*
+        * Finally limit subsequent allocations. We really don't want
+        * to limit the memblock allocations to rma_size. FIXME!! should
+        * we even limit at all ?
+        */
+       memblock_set_current_limit(first_memblock_base + first_memblock_size);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+                                     unsigned long page_size,
+                                     unsigned long phys)
+{
+       /* PTE encoding used for the vmemmap page. */
+       pgprot_t prot = __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW);
+       int rc;
+
+       /* A failed mapping is fatal; on success this always reports 0. */
+       rc = radix__map_kernel_page(start, phys, prot, page_size);
+       BUG_ON(rc);
+       return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Currently a no-op: nothing is unmapped or freed for the given vmemmap
+ * range (see the FIXME below).
+ */
+void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+       /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * Update a huge pmd by handing it, cast to a pte, to radix__pte_update()
+ * with the @clr and @set masks, and emit the hugepage_update tracepoint.
+ *
+ * Returns the value radix__pte_update() reports as the old entry.
+ */
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                 pmd_t *pmdp, unsigned long clr,
+                                 unsigned long set)
+{
+       unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+       WARN_ON(!radix__pmd_trans_huge(*pmdp));
+       assert_spin_locked(&mm->page_table_lock);
+#endif
+
+       old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
+       trace_hugepage_update(addr, old, clr, set);
+
+       return old;
+}
+
+/*
+ * Clear a pmd that is not a transparent huge pmd (checked with
+ * VM_BUG_ON) at an HPAGE_PMD_MASK-aligned @address, kick all CPUs and
+ * flush the TLB for the HPAGE_PMD_SIZE range.
+ *
+ * Returns the pmd value that was installed before the clear.
+ */
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                       pmd_t *pmdp)
+
+{
+       pmd_t pmd;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+       /*
+        * khugepaged calls this for normal pmd
+        */
+       pmd = *pmdp;
+       pmd_clear(pmdp);
+       /*FIXME!!  Verify whether we need this kick below */
+       kick_all_cpus_sync();
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. In order to save the deposited
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ *
+ * The caller must hold the lock returned by pmd_lockptr() (asserted).
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                pgtable_t pgtable)
+{
+        struct list_head *lh = (struct list_head *) pgtable;
+
+        assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+        /* FIFO */
+        /* First deposit starts a new list; later ones link to the current head. */
+        if (!pmd_huge_pte(mm, pmdp))
+                INIT_LIST_HEAD(lh);
+        else
+                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+        pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+/*
+ * Take the page table currently recorded in pmd_huge_pte() off the
+ * deposit list and return it.  The first two pte slots of the returned
+ * table are reset to 0 before it is handed back.  The caller must hold
+ * the lock returned by pmd_lockptr() (asserted).
+ */
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+        pte_t *ptep;
+        pgtable_t pgtable;
+        struct list_head *lh;
+
+        assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+        /* FIFO */
+        pgtable = pmd_huge_pte(mm, pmdp);
+        lh = (struct list_head *) pgtable;
+        if (list_empty(lh))
+                pmd_huge_pte(mm, pmdp) = NULL;
+        else {
+                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+                list_del(lh);
+        }
+        /* Wipe the two pte slots at the start of the table. */
+        ptep = (pte_t *) pgtable;
+        *ptep = __pte(0);
+        ptep++;
+        *ptep = __pte(0);
+        return pgtable;
+}
+
+
+/*
+ * Clear a huge pmd (all bits, via radix__pmd_hugepage_update()), kick
+ * all CPUs and return the previous pmd value.
+ */
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+                              unsigned long addr, pmd_t *pmdp)
+{
+       pmd_t old_pmd;
+       unsigned long old;
+
+       old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+       old_pmd = __pmd(old);
+       /*
+        * Serialize against find_linux_pte_or_hugepte which does lock-less
+        * lookup in page tables with local interrupts disabled. For huge pages
+        * it casts pmd_t to pte_t. Since format of pte_t is different from
+        * pmd_t we want to prevent transit from pmd pointing to page table
+        * to pmd pointing to huge page (and back) while interrupts are disabled.
+        * We clear pmd to possibly replace it with page table pointer in
+        * different code paths. So make sure we wait for the parallel
+        * find_linux_pte_or_hugepage to finish.
+        */
+       kick_all_cpus_sync();
+       return old_pmd;
+}
+
+int radix__has_transparent_hugepage(void)
+{
+       /* For radix 2M at PMD level means thp */
+       return mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c

index de37ff445362a1215bb2e6d3956bf364f841671a..88a307504b5a0f3798be84c282ab09de47168452 100644 (file)
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -38,14 +38,25 @@ static inline int is_exec_fault(void)
  
  /* We only try to do i/d cache coherency on stuff that looks like
   * reasonably "normal" PTEs. We currently require a PTE to be present
- * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * and we avoid _PAGE_SPECIAL and cache-inhibited PTEs. We also only do that
   * on userspace PTEs
   */
  static inline int pte_looks_normal(pte_t pte)
  {
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+       if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) {
+               if (pte_ci(pte))
+                       return 0;
+               if (pte_user(pte))
+                       return 1;
+       }
+       return 0;
+#else
         return (pte_val(pte) &
-           (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
-           (_PAGE_PRESENT | _PAGE_USER);
+               (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
+               (_PAGE_PRESENT | _PAGE_USER);
+#endif
  }
  
  static struct page *maybe_pte_to_page(pte_t pte)
@@ -71,6 +82,9 @@ static struct page *maybe_pte_to_page(pte_t pte)
  
  static pte_t set_pte_filter(pte_t pte)
  {
+       if (radix_enabled())
+               return pte;
+
         pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
         if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
                                        cpu_has_feature(CPU_FTR_NOEXECUTE))) {
@@ -177,8 +191,8 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
          * _PAGE_PRESENT, but we can be sure that it is not in hpte.
          * Hence we can use set_pte_at for them.
          */
-       VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) ==
-               (_PAGE_PRESENT | _PAGE_USER));
+       VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep));
+
         /*
          * Add the pte bit when tryint set a pte
          */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c

index 347106080bb1e64b87d28477db24e47f3ab3ef77..e009e0604a8ab76f7b9b1efd2004c8c45f210c6c 100644 (file)
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -55,104 +55,63 @@
  
  #include "mmu_decl.h"
  
-#define CREATE_TRACE_POINTS
-#include <trace/events/thp.h>
-
-/* Some sanity checking */
-#if TASK_SIZE_USER64 > PGTABLE_RANGE
-#error TASK_SIZE_USER64 exceeds pagetable range
-#endif
-
  #ifdef CONFIG_PPC_STD_MMU_64
  #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
  #error TASK_SIZE_USER64 exceeds user VSID range
  #endif
  #endif
  
-unsigned long ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PPC_MMU_NOHASH
-static __ref void *early_alloc_pgtable(unsigned long size)
-{
-       void *pt;
-
-       pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
-       memset(pt, 0, size);
-
-       return pt;
-}
-#endif /* CONFIG_PPC_MMU_NOHASH */
-
+#ifdef CONFIG_PPC_BOOK3S_64
  /*
- * map_kernel_page currently only called by __ioremap
- * map_kernel_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
+ * partition table and process table for ISA 3.0
   */
-int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
-{
-       pgd_t *pgdp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-
-       if (slab_is_available()) {
-               pgdp = pgd_offset_k(ea);
-               pudp = pud_alloc(&init_mm, pgdp, ea);
-               if (!pudp)
-                       return -ENOMEM;
-               pmdp = pmd_alloc(&init_mm, pudp, ea);
-               if (!pmdp)
-                       return -ENOMEM;
-               ptep = pte_alloc_kernel(pmdp, ea);
-               if (!ptep)
-                       return -ENOMEM;
-               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
-                                                         __pgprot(flags)));
-       } else {
-#ifdef CONFIG_PPC_MMU_NOHASH
-               pgdp = pgd_offset_k(ea);
-#ifdef PUD_TABLE_SIZE
-               if (pgd_none(*pgdp)) {
-                       pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
-                       BUG_ON(pudp == NULL);
-                       pgd_populate(&init_mm, pgdp, pudp);
-               }
-#endif /* PUD_TABLE_SIZE */
-               pudp = pud_offset(pgdp, ea);
-               if (pud_none(*pudp)) {
-                       pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
-                       BUG_ON(pmdp == NULL);
-                       pud_populate(&init_mm, pudp, pmdp);
-               }
-               pmdp = pmd_offset(pudp, ea);
-               if (!pmd_present(*pmdp)) {
-                       ptep = early_alloc_pgtable(PAGE_SIZE);
-                       BUG_ON(ptep == NULL);
-                       pmd_populate_kernel(&init_mm, pmdp, ptep);
-               }
-               ptep = pte_offset_kernel(pmdp, ea);
-               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
-                                                         __pgprot(flags)));
-#else /* CONFIG_PPC_MMU_NOHASH */
-               /*
-                * If the mm subsystem is not fully up, we cannot create a
-                * linux page table entry for this mapping.  Simply bolt an
-                * entry in the hardware page table.
-                *
-                */
-               if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
-                                     mmu_io_psize, mmu_kernel_ssize)) {
-                       printk(KERN_ERR "Failed to do bolted mapping IO "
-                              "memory at %016lx !\n", pa);
-                       return -ENOMEM;
-               }
-#endif /* !CONFIG_PPC_MMU_NOHASH */
-       }
-
-       smp_wmb();
-       return 0;
-}
-
+struct prtb_entry *process_tb;
+struct patb_entry *partition_tb;
+/*
+ * page table size
+ */
+unsigned long __pte_index_size;
+EXPORT_SYMBOL(__pte_index_size);
+unsigned long __pmd_index_size;
+EXPORT_SYMBOL(__pmd_index_size);
+unsigned long __pud_index_size;
+EXPORT_SYMBOL(__pud_index_size);
+unsigned long __pgd_index_size;
+EXPORT_SYMBOL(__pgd_index_size);
+unsigned long __pmd_cache_index;
+EXPORT_SYMBOL(__pmd_cache_index);
+unsigned long __pte_table_size;
+EXPORT_SYMBOL(__pte_table_size);
+unsigned long __pmd_table_size;
+EXPORT_SYMBOL(__pmd_table_size);
+unsigned long __pud_table_size;
+EXPORT_SYMBOL(__pud_table_size);
+unsigned long __pgd_table_size;
+EXPORT_SYMBOL(__pgd_table_size);
+unsigned long __pmd_val_bits;
+EXPORT_SYMBOL(__pmd_val_bits);
+unsigned long __pud_val_bits;
+EXPORT_SYMBOL(__pud_val_bits);
+unsigned long __pgd_val_bits;
+EXPORT_SYMBOL(__pgd_val_bits);
+unsigned long __kernel_virt_start;
+EXPORT_SYMBOL(__kernel_virt_start);
+unsigned long __kernel_virt_size;
+EXPORT_SYMBOL(__kernel_virt_size);
+unsigned long __vmalloc_start;
+EXPORT_SYMBOL(__vmalloc_start);
+unsigned long __vmalloc_end;
+EXPORT_SYMBOL(__vmalloc_end);
+struct page *vmemmap;
+EXPORT_SYMBOL(vmemmap);
+unsigned long __pte_frag_nr;
+EXPORT_SYMBOL(__pte_frag_nr);
+unsigned long __pte_frag_size_shift;
+EXPORT_SYMBOL(__pte_frag_size_shift);
+unsigned long ioremap_bot;
+#else /* !CONFIG_PPC_BOOK3S_64 */
+unsigned long ioremap_bot = IOREMAP_BASE;
+#endif
  
  /**
   * __ioremap_at - Low level function to establish the page tables
@@ -167,12 +126,8 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
         if ((flags & _PAGE_PRESENT) == 0)
                 flags |= pgprot_val(PAGE_KERNEL);
  
-       /* Non-cacheable page cannot be coherent */
-       if (flags & _PAGE_NO_CACHE)
-               flags &= ~_PAGE_COHERENT;
-
         /* We don't support the 4K PFN hack with ioremap */
-       if (flags & _PAGE_4K_PFN)
+       if (flags & H_PAGE_4K_PFN)
                 return NULL;
  
         WARN_ON(pa & ~PAGE_MASK);
@@ -253,7 +208,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
  
  void __iomem * ioremap(phys_addr_t addr, unsigned long size)
  {
-       unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+       unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
         void *caller = __builtin_return_address(0);
  
         if (ppc_md.ioremap)
@@ -263,7 +218,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
  
  void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
  {
-       unsigned long flags = _PAGE_NO_CACHE;
+       unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
         void *caller = __builtin_return_address(0);
  
         if (ppc_md.ioremap)
@@ -277,11 +232,20 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
         void *caller = __builtin_return_address(0);
  
         /* writeable implies dirty for kernel addresses */
-       if (flags & _PAGE_RW)
+       if (flags & _PAGE_WRITE)
                 flags |= _PAGE_DIRTY;
  
-       /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
-       flags &= ~(_PAGE_USER | _PAGE_EXEC);
+       /* we don't want to let _PAGE_EXEC leak out */
+       flags &= ~_PAGE_EXEC;
+       /*
+        * Force kernel mapping.
+        */
+#if defined(CONFIG_PPC_BOOK3S_64)
+       flags |= _PAGE_PRIVILEGED;
+#else
+       flags &= ~_PAGE_USER;
+#endif
+
  
  #ifdef _PAGE_BAP_SR
         /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
@@ -411,7 +375,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
         return (pte_t *)ret;
  }
  
-pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
  {
         pte_t *pte;
  
@@ -421,8 +385,9 @@ pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
  
         return __alloc_for_cache(mm, kernel);
  }
+#endif /* CONFIG_PPC_64K_PAGES */
  
-void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+void pte_fragment_free(unsigned long *table, int kernel)
  {
         struct page *page = virt_to_page(table);
         if (put_page_testzero(page)) {
@@ -433,15 +398,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
  }
  
  #ifdef CONFIG_SMP
-static void page_table_free_rcu(void *table)
-{
-       struct page *page = virt_to_page(table);
-       if (put_page_testzero(page)) {
-               pgtable_page_dtor(page);
-               free_hot_cold_page(page, 0);
-       }
-}
-
  void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
  {
         unsigned long pgf = (unsigned long)table;
@@ -458,7 +414,7 @@ void __tlb_remove_table(void *_table)
  
         if (!shift)
                 /* PTE page needs special handling */
-               page_table_free_rcu(table);
+               pte_fragment_free(table, 0);
         else {
                 BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                 kmem_cache_free(PGT_CACHE(shift), table);
@@ -469,385 +425,10 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
  {
         if (!shift) {
                 /* PTE page needs special handling */
-               struct page *page = virt_to_page(table);
-               if (put_page_testzero(page)) {
-                       pgtable_page_dtor(page);
-                       free_hot_cold_page(page, 0);
-               }
+               pte_fragment_free(table, 0);
         } else {
                 BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                 kmem_cache_free(PGT_CACHE(shift), table);
         }
  }
  #endif
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-/*
- * This is called when relaxing access to a hugepage. It's also called in the page
- * fault path when we don't hit any of the major fault cases, ie, a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us, we additionally deal with missing execute
- * permission here on some processors
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp, pmd_t entry, int dirty)
-{
-       int changed;
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!pmd_trans_huge(*pmdp));
-       assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-       changed = !pmd_same(*(pmdp), entry);
-       if (changed) {
-               __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
-               /*
-                * Since we are not supporting SW TLB systems, we don't
-                * have any thing similar to flush_tlb_page_nohash()
-                */
-       }
-       return changed;
-}
-
-unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
-                                 pmd_t *pmdp, unsigned long clr,
-                                 unsigned long set)
-{
-
-       unsigned long old, tmp;
-
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!pmd_trans_huge(*pmdp));
-       assert_spin_locked(&mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-       __asm__ __volatile__(
-       "1:     ldarx   %0,0,%3\n\
-               andi.   %1,%0,%6\n\
-               bne-    1b \n\
-               andc    %1,%0,%4 \n\
-               or      %1,%1,%7\n\
-               stdcx.  %1,0,%3 \n\
-               bne-    1b"
-       : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
-       : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
-       : "cc" );
-#else
-       old = pmd_val(*pmdp);
-       *pmdp = __pmd((old & ~clr) | set);
-#endif
-       trace_hugepage_update(addr, old, clr, set);
-       if (old & _PAGE_HASHPTE)
-               hpte_do_hugepage_flush(mm, addr, pmdp, old);
-       return old;
-}
-
-pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd;
-
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       VM_BUG_ON(pmd_trans_huge(*pmdp));
-
-       pmd = *pmdp;
-       pmd_clear(pmdp);
-       /*
-        * Wait for all pending hash_page to finish. This is needed
-        * in case of subpage collapse. When we collapse normal pages
-        * to hugepage, we first clear the pmd, then invalidate all
-        * the PTE entries. The assumption here is that any low level
-        * page fault will see a none pmd and take the slow path that
-        * will wait on mmap_sem. But we could very well be in a
-        * hash_page with local ptep pointer value. Such a hash page
-        * can result in adding new HPTE entries for normal subpages.
-        * That means we could be modifying the page content as we
-        * copy them to a huge page. So wait for parallel hash_page
-        * to finish before invalidating HPTE entries. We can do this
-        * by sending an IPI to all the cpus and executing a dummy
-        * function there.
-        */
-       kick_all_cpus_sync();
-       /*
-        * Now invalidate the hpte entries in the range
-        * covered by pmd. This make sure we take a
-        * fault and will find the pmd as none, which will
-        * result in a major fault which takes mmap_sem and
-        * hence wait for collapse to complete. Without this
-        * the __collapse_huge_page_copy can result in copying
-        * the old content.
-        */
-       flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-       return pmd;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
-                             unsigned long address, pmd_t *pmdp)
-{
-       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-int pmdp_clear_flush_young(struct vm_area_struct *vma,
-                                 unsigned long address, pmd_t *pmdp)
-{
-       return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We want to put the pgtable in pmd and use pgtable for tracking
- * the base page size hptes
- */
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-                               pgtable_t pgtable)
-{
-       pgtable_t *pgtable_slot;
-       assert_spin_locked(&mm->page_table_lock);
-       /*
-        * we store the pgtable in the second half of PMD
-        */
-       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-       *pgtable_slot = pgtable;
-       /*
-        * expose the deposited pgtable to other cpus.
-        * before we set the hugepage PTE at pmd level
-        * hash fault code looks at the deposted pgtable
-        * to store hash index values.
-        */
-       smp_wmb();
-}
-
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
-       pgtable_t pgtable;
-       pgtable_t *pgtable_slot;
-
-       assert_spin_locked(&mm->page_table_lock);
-       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-       pgtable = *pgtable_slot;
-       /*
-        * Once we withdraw, mark the entry NULL.
-        */
-       *pgtable_slot = NULL;
-       /*
-        * We store HPTE information in the deposited PTE fragment.
-        * zero out the content on withdraw.
-        */
-       memset(pgtable, 0, PTE_FRAG_SIZE);
-       return pgtable;
-}
-
-void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                            unsigned long address, pmd_t *pmdp)
-{
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
-
-       /*
-        * We can't mark the pmd none here, because that will cause a race
-        * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
-        * we spilt, but at the same time we wan't rest of the ppc64 code
-        * not to insert hash pte on this, because we will be modifying
-        * the deposited pgtable in the caller of this function. Hence
-        * clear the _PAGE_USER so that we move the fault handling to
-        * higher level function and that will serialize against ptl.
-        * We need to flush existing hash pte entries here even though,
-        * the translation is still valid, because we will withdraw
-        * pgtable_t after this.
-        */
-       pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0);
-}
-
-
-/*
- * set a new huge pmd. We should not be called for updating
- * an existing pmd entry. That should go via pmd_hugepage_update.
- */
-void set_pmd_at(struct mm_struct *mm, unsigned long addr,
-               pmd_t *pmdp, pmd_t pmd)
-{
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
-               (_PAGE_PRESENT | _PAGE_USER));
-       assert_spin_locked(&mm->page_table_lock);
-       WARN_ON(!pmd_trans_huge(pmd));
-#endif
-       trace_hugepage_set_pmd(addr, pmd_val(pmd));
-       return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
-}
-
-/*
- * We use this to invalidate a pmdp entry before switching from a
- * hugepte to regular pmd entry.
- */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-                    pmd_t *pmdp)
-{
-       pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
-
-       /*
-        * This ensures that generic code that rely on IRQ disabling
-        * to prevent a parallel THP split work as expected.
-        */
-       kick_all_cpus_sync();
-}
-
-/*
- * A linux hugepage PMD was changed and the corresponding hash table entries
- * neesd to be flushed.
- */
-void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
-                           pmd_t *pmdp, unsigned long old_pmd)
-{
-       int ssize;
-       unsigned int psize;
-       unsigned long vsid;
-       unsigned long flags = 0;
-       const struct cpumask *tmp;
-
-       /* get the base page size,vsid and segment size */
-#ifdef CONFIG_DEBUG_VM
-       psize = get_slice_psize(mm, addr);
-       BUG_ON(psize == MMU_PAGE_16M);
-#endif
-       if (old_pmd & _PAGE_COMBO)
-               psize = MMU_PAGE_4K;
-       else
-               psize = MMU_PAGE_64K;
-
-       if (!is_kernel_addr(addr)) {
-               ssize = user_segment_size(addr);
-               vsid = get_vsid(mm->context.id, addr, ssize);
-               WARN_ON(vsid == 0);
-       } else {
-               vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-               ssize = mmu_kernel_ssize;
-       }
-
-       tmp = cpumask_of(smp_processor_id());
-       if (cpumask_equal(mm_cpumask(mm), tmp))
-               flags |= HPTE_LOCAL_UPDATE;
-
-       return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
-}
-
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
-{
-       return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
-}
-
-pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
-{
-       unsigned long pmdv;
-
-       pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK;
-       return pmd_set_protbits(__pmd(pmdv), pgprot);
-}
-
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
-{
-       return pfn_pmd(page_to_pfn(page), pgprot);
-}
-
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
-       unsigned long pmdv;
-
-       pmdv = pmd_val(pmd);
-       pmdv &= _HPAGE_CHG_MASK;
-       return pmd_set_protbits(__pmd(pmdv), newprot);
-}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
-                         pmd_t *pmd)
-{
-       return;
-}
-
-pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
-                             unsigned long addr, pmd_t *pmdp)
-{
-       pmd_t old_pmd;
-       pgtable_t pgtable;
-       unsigned long old;
-       pgtable_t *pgtable_slot;
-
-       old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
-       old_pmd = __pmd(old);
-       /*
-        * We have pmd == none and we are holding page_table_lock.
-        * So we can safely go and clear the pgtable hash
-        * index info.
-        */
-       pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-       pgtable = *pgtable_slot;
-       /*
-        * Let's zero out old valid and hash index details
-        * hash fault look at them.
-        */
-       memset(pgtable, 0, PTE_FRAG_SIZE);
-       /*
-        * Serialize against find_linux_pte_or_hugepte which does lock-less
-        * lookup in page tables with local interrupts disabled. For huge pages
-        * it casts pmd_t to pte_t. Since format of pte_t is different from
-        * pmd_t we want to prevent transit from pmd pointing to page table
-        * to pmd pointing to huge page (and back) while interrupts are disabled.
-        * We clear pmd to possibly replace it with page table pointer in
-        * different code paths. So make sure we wait for the parallel
-        * find_linux_pte_or_hugepage to finish.
-        */
-       kick_all_cpus_sync();
-       return old_pmd;
-}
-
-int has_transparent_hugepage(void)
-{
-
-       BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
-               "hugepages can't be allocated by the buddy allocator");
-
-       BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
-                        "We need more than 2 pages to do deferred thp split");
-
-       if (!mmu_has_feature(MMU_FTR_16M_PAGE))
-               return 0;
-       /*
-        * We support THP only if PMD_SIZE is 16MB.
-        */
-       if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
-               return 0;
-       /*
-        * We need to make sure that we support 16MB hugepage in a segement
-        * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
-        * of 64K.
-        */
-       /*
-        * If we have 64K HPTE, we will be using that by default
-        */
-       if (mmu_psize_defs[MMU_PAGE_64K].shift &&
-           (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
-               return 0;
-       /*
-        * Ok we only have 4K HPTE
-        */
-       if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
-               return 0;
-
-       return 1;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c

index 825b6873391f9654f402d1b1a5ede306e5febe60..48fc28bab544771620c8ff48baf2461e5edef41e 100644 (file)
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -32,7 +32,6 @@ enum slb_index {
  };
  
  extern void slb_allocate_realmode(unsigned long ea);
-extern void slb_allocate_user(unsigned long ea);
  
  static void slb_allocate(unsigned long ea)
  {
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S

index 736d18b3cefd3bd16e2d9ab7c34b56bb7f5e8a61..dfdb90cb44039f364d92cecd2e920eccb25d07e5 100644 (file)
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode)
          * check for bad kernel/user address
          * (ea & ~REGION_MASK) >= PGTABLE_RANGE
          */
-       rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4)
+       rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4)
         bne-    8f
  
         srdi    r9,r3,60                /* get region */
@@ -91,7 +91,7 @@ slb_miss_kernel_load_vmemmap:
          * can be demoted from 64K -> 4K dynamically on some machines
          */
         clrldi  r11,r10,48
-       cmpldi  r11,(VMALLOC_SIZE >> 28) - 1
+       cmpldi  r11,(H_VMALLOC_SIZE >> 28) - 1
         bgt     5f
         lhz     r11,PACAVMALLOCSLLP(r13)
         b       6f
@@ -179,56 +179,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
         li      r11,SLB_VSID_USER       /* flags don't much matter */
         b       slb_finish_load
  
-#ifdef __DISABLED__
-
-/* void slb_allocate_user(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- *     r3 = faulting address, r13 = PACA
- *     r9, r10, r11 are clobbered by this function
- * No other registers are examined or changed.
- *
- * It is called with translation enabled in order to be able to walk the
- * page tables. This is not currently used.
- */
-_GLOBAL(slb_allocate_user)
-       /* r3 = faulting address */
-       srdi    r10,r3,28               /* get esid */
-
-       crset   4*cr7+lt                /* set "user" flag for later */
-
-       /* check if we fit in the range covered by the pagetables*/
-       srdi.   r9,r3,PGTABLE_EADDR_SIZE
-       crnot   4*cr0+eq,4*cr0+eq
-       beqlr
-
-       /* now we need to get to the page tables in order to get the page
-        * size encoding from the PMD. In the future, we'll be able to deal
-        * with 1T segments too by getting the encoding from the PGD instead
-        */
-       ld      r9,PACAPGDIR(r13)
-       cmpldi  cr0,r9,0
-       beqlr
-       rlwinm  r11,r10,8,25,28
-       ldx     r9,r9,r11               /* get pgd_t */
-       cmpldi  cr0,r9,0
-       beqlr
-       rlwinm  r11,r10,3,17,28
-       ldx     r9,r9,r11               /* get pmd_t */
-       cmpldi  cr0,r9,0
-       beqlr
-
-       /* build vsid flags */
-       andi.   r11,r9,SLB_VSID_LLP
-       ori     r11,r11,SLB_VSID_USER
-
-       /* get context to calculate proto-VSID */
-       ld      r9,PACACONTEXTID(r13)
-       /* fall through slb_finish_load */
-
-#endif /* __DISABLED__ */
-
-
  /*
   * Finish loading of an SLB entry and return
   *
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c

index 42954f0b47aced31454d44103180df9202527fb6..2b27458902ee888d1ba3480191497c105a184e94 100644 (file)
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -37,8 +37,8 @@
  #include <asm/hugetlb.h>
  
  /* some sanity checks */
-#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error PGTABLE_RANGE exceeds slice_mask high_slices size
+#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
+#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
  #endif
  
  static DEFINE_SPINLOCK(slice_convert_lock);
@@ -395,6 +395,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
  
         /* Sanity checks */
         BUG_ON(mm->task_size == 0);
+       VM_BUG_ON(radix_enabled());
  
         slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
         slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
@@ -568,6 +569,16 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
         unsigned char *hpsizes;
         int index, mask_index;
  
+       /*
+        * Radix doesn't use slices, but can be enabled along with MMU_SLICE
+        */
+       if (radix_enabled()) {
+#ifdef CONFIG_PPC_64K_PAGES
+               return MMU_PAGE_64K;
+#else
+               return MMU_PAGE_4K;
+#endif
+       }
         if (addr < SLICE_LOW_TOP) {
                 u64 lpsizes;
                 lpsizes = mm->context.low_slices_psize;
@@ -605,6 +616,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
  
         slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
  
+       VM_BUG_ON(radix_enabled());
         spin_lock_irqsave(&slice_convert_lock, flags);
  
         old_psize = mm->context.user_psize;
@@ -649,6 +661,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
  {
         struct slice_mask mask = slice_range_to_mask(start, len);
  
+       VM_BUG_ON(radix_enabled());
         slice_convert(mm, mask, psize);
  }
  
@@ -678,6 +691,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
         struct slice_mask mask, available;
         unsigned int psize = mm->context.user_psize;
  
+       if (radix_enabled())
+               return 0;
+
         mask = slice_range_to_mask(addr, len);
         available = slice_mask_for_size(mm, psize);
  #ifdef CONFIG_PPC_64K_PAGES
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c

new file mode 100644 (file)

index 0000000..0fdaf93
--- /dev/null
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -0,0 +1,251 @@
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
+
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+
+static inline void __tlbiel_pid(unsigned long pid, int set)
+{
+       unsigned long rb,rs,ric,prs,r;
+
+       rb = PPC_BIT(53); /* IS = 1 */
+       rb |= set << PPC_BITLSHIFT(51);
+       rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+       ric = 2;  /* invalidate all the caches */
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |"
+                    "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       asm volatile("ptesync": : :"memory");
+}
+
+/*
+ * We use 128 sets in radix mode and 256 sets in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid)
+{
+       int set;
+
+       for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
+               __tlbiel_pid(pid, set);
+       }
+       return;
+}
+
+static inline void _tlbie_pid(unsigned long pid)
+{
+       unsigned long rb,rs,ric,prs,r;
+
+       rb = PPC_BIT(53); /* IS = 1 */
+       rs = pid << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+       ric = 2;  /* invalidate all the caches */
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |"
+                    "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+                             unsigned long ap)
+{
+       unsigned long rb,rs,ric,prs,r;
+
+       rb = va & ~(PPC_BITMASK(52, 63));
+       rb |= ap << PPC_BITLSHIFT(58);
+       rs = pid << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+       ric = 0;  /* no cluster flush yet */
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |"
+                    "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       asm volatile("ptesync": : :"memory");
+}
+
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+                            unsigned long ap)
+{
+       unsigned long rb,rs,ric,prs,r;
+
+       rb = va & ~(PPC_BITMASK(52, 63));
+       rb |= ap << PPC_BITLSHIFT(58);
+       rs = pid << PPC_BITLSHIFT(31);
+       prs = 1; /* process scoped */
+       r = 1;   /* radix format */
+       ric = 0;  /* no cluster flush yet */
+
+       asm volatile("ptesync": : :"memory");
+       asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |"
+                    "(%2 << 17) | (%3 << 18) | (%4 << 21)"
+                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+       asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+void radix__local_flush_tlb_mm(struct mm_struct *mm)
+{
+       unsigned int pid;
+
+       preempt_disable();
+       pid = mm->context.id;
+       if (pid != MMU_NO_CONTEXT)
+               _tlbiel_pid(pid);
+       preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_mm);
+
+void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+                           unsigned long ap, int nid)
+{
+       unsigned int pid;
+
+       preempt_disable();
+       pid = mm ? mm->context.id : 0;
+       if (pid != MMU_NO_CONTEXT)
+               _tlbiel_va(vmaddr, pid, ap);
+       preempt_enable();
+}
+
+void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       /* need the return fix for nohash.c */
+       if (vma && is_vm_hugetlb_page(vma))
+               return __local_flush_hugetlb_page(vma, vmaddr);
+#endif
+       radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+                              mmu_get_ap(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_page);
+
+#ifdef CONFIG_SMP
+static int mm_is_core_local(struct mm_struct *mm)
+{
+       return cpumask_subset(mm_cpumask(mm),
+                             topology_sibling_cpumask(smp_processor_id()));
+}
+
+void radix__flush_tlb_mm(struct mm_struct *mm)
+{
+       unsigned int pid;
+
+       preempt_disable();
+       pid = mm->context.id;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               goto no_context;
+
+       if (!mm_is_core_local(mm)) {
+               int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+               if (lock_tlbie)
+                       raw_spin_lock(&native_tlbie_lock);
+               _tlbie_pid(pid);
+               if (lock_tlbie)
+                       raw_spin_unlock(&native_tlbie_lock);
+       } else
+               _tlbiel_pid(pid);
+no_context:
+       preempt_enable();
+}
+EXPORT_SYMBOL(radix__flush_tlb_mm);
+
+void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+                      unsigned long ap, int nid)
+{
+       unsigned int pid;
+
+       preempt_disable();
+       pid = mm ? mm->context.id : 0;
+       if (unlikely(pid == MMU_NO_CONTEXT))
+               goto bail;
+       if (!mm_is_core_local(mm)) {
+               int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+               if (lock_tlbie)
+                       raw_spin_lock(&native_tlbie_lock);
+               _tlbie_va(vmaddr, pid, ap);
+               if (lock_tlbie)
+                       raw_spin_unlock(&native_tlbie_lock);
+       } else
+               _tlbiel_va(vmaddr, pid, ap);
+bail:
+       preempt_enable();
+}
+
+void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       if (vma && is_vm_hugetlb_page(vma))
+               return flush_hugetlb_page(vma, vmaddr);
+#endif
+       radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+                        mmu_get_ap(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(radix__flush_tlb_page);
+
+#endif /* CONFIG_SMP */
+
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+       int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+       if (lock_tlbie)
+               raw_spin_lock(&native_tlbie_lock);
+       _tlbie_pid(0);
+       if (lock_tlbie)
+               raw_spin_unlock(&native_tlbie_lock);
+}
+EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+
+/*
+ * Currently, for range flushing, we just do a full mm flush, because
+ * we use this in code paths where we don't track the page size.
+ */
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+                    unsigned long end)
+
+{
+       struct mm_struct *mm = vma->vm_mm;
+       radix__flush_tlb_mm(mm);
+}
+EXPORT_SYMBOL(radix__flush_tlb_range);
+
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+       struct mm_struct *mm = tlb->mm;
+       radix__flush_tlb_mm(mm);
+}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c

index f7b80391bee797bff6955b0e03de2a433656e6ed..4517aa43a8b1cc00452c946cea9777c04f13bd25 100644 (file)
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -155,7 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
         batch->index = 0;
  }
  
-void tlb_flush(struct mmu_gather *tlb)
+void hash__tlb_flush(struct mmu_gather *tlb)
  {
         struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
  
@@ -218,7 +218,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
                 pte = pte_val(*ptep);
                 if (is_thp)
                         trace_hugepage_invalidate(start, pte);
-               if (!(pte & _PAGE_HASHPTE))
+               if (!(pte & H_PAGE_HASHPTE))
                         continue;
                 if (unlikely(is_thp))
                         hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
@@ -248,7 +248,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
         start_pte = pte_offset_map(pmd, addr);
         for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
                 unsigned long pteval = pte_val(*pte);
-               if (pteval & _PAGE_HASHPTE)
+               if (pteval & H_PAGE_HASHPTE)
                         hpte_need_flush(mm, addr, pte, pteval, 0);
                 addr += PAGE_SIZE;
         }
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile

index f9c083a5652a4c2a99f6b13778d6f84feed9072e..77b6394a7c50d3a0fa70263e55fec906ca9b7c25 100644 (file)
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -1,6 +1,6 @@
  subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
  
-obj-$(CONFIG_PERF_EVENTS)      += callchain.o
+obj-$(CONFIG_PERF_EVENTS)      += callchain.o perf_regs.o
  
  obj-$(CONFIG_PPC_PERF_CTRS)    += core-book3s.o bhrb.o
  obj64-$(CONFIG_PPC_PERF_CTRS)  += power4-pmu.o ppc970-pmu.o power5-pmu.o \
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c

index 22d9015c1acc80dea12e78c1ac53f8fcd3b19f0a..26d37e6f924e09534bbd41a28f045179858d08db 100644 (file)
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -137,7 +137,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
         offset = addr & ((1UL << shift) - 1);
  
         pte = READ_ONCE(*ptep);
-       if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
+       if (!pte_present(pte) || !pte_user(pte))
                 goto err_out;
         pfn = pte_pfn(pte);
         if (!page_is_ram(pfn))
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c

new file mode 100644 (file)

index 0000000..d24a8a3
--- /dev/null
+++ b/arch/powerpc/perf/perf_regs.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2016 Anju T, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/ptrace.h>
+#include <asm/perf_regs.h>
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+
+static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R0,  gpr[0]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R1,  gpr[1]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R2,  gpr[2]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R3,  gpr[3]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R4,  gpr[4]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R5,  gpr[5]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R6,  gpr[6]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R7,  gpr[7]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R8,  gpr[8]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R9,  gpr[9]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R10, gpr[10]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R11, gpr[11]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R12, gpr[12]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R13, gpr[13]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R14, gpr[14]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R15, gpr[15]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R16, gpr[16]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R17, gpr[17]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R18, gpr[18]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R19, gpr[19]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R20, gpr[20]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R21, gpr[21]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R22, gpr[22]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R23, gpr[23]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R24, gpr[24]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R25, gpr[25]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R26, gpr[26]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R27, gpr[27]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R28, gpr[28]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R29, gpr[29]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R30, gpr[30]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_R31, gpr[31]),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_NIP, nip),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_MSR, msr),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_ORIG_R3, orig_gpr3),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_CTR, ctr),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_LINK, link),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_XER, xer),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_CCR, ccr),
+#ifdef CONFIG_PPC64
+       PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, softe),
+#else
+       PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, mq),
+#endif
+       PT_REGS_OFFSET(PERF_REG_POWERPC_TRAP, trap),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_DAR, dar),
+       PT_REGS_OFFSET(PERF_REG_POWERPC_DSISR, dsisr),
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+       if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX))
+               return 0;
+
+       return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+int perf_reg_validate(u64 mask)
+{
+       if (!mask || mask & REG_RESERVED)
+               return -EINVAL;
+       return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+#ifdef CONFIG_PPC64
+       if (!test_tsk_thread_flag(task, TIF_32BIT))
+               return PERF_SAMPLE_REGS_ABI_64;
+       else
+#endif
+       return PERF_SAMPLE_REGS_ABI_32;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+                       struct pt_regs *regs,
+                       struct pt_regs *regs_user_copy)
+{
+       regs_user->regs = task_pt_regs(current);
+       regs_user->abi  = perf_reg_abi(current);
+}
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h

index 741b77edd03e739ac54296bd441c5e5b09c98d9b..3a2e6e8ebb928f252ea871d00f62f4201aafc6f7 100644 (file)
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -49,3 +49,43 @@ EVENT(PM_L3_PREF_ALL,                                0x4e052)
  EVENT(PM_DTLB_MISS,                            0x300fc)
  /* ITLB Reloaded */
  EVENT(PM_ITLB_MISS,                            0x400fc)
+/* Run_Instructions */
+EVENT(PM_RUN_INST_CMPL,                                0x500fa)
+/* Alternate event code for PM_RUN_INST_CMPL */
+EVENT(PM_RUN_INST_CMPL_ALT,                    0x400fa)
+/* Run_cycles */
+EVENT(PM_RUN_CYC,                              0x600f4)
+/* Alternate event code for Run_cycles */
+EVENT(PM_RUN_CYC_ALT,                          0x200f4)
+/* Marked store completed */
+EVENT(PM_MRK_ST_CMPL,                          0x10134)
+/* Alternate event code for Marked store completed */
+EVENT(PM_MRK_ST_CMPL_ALT,                      0x301e2)
+/* Marked two path branch */
+EVENT(PM_BR_MRK_2PATH,                         0x10138)
+/* Alternate event code for PM_BR_MRK_2PATH */
+EVENT(PM_BR_MRK_2PATH_ALT,                     0x40138)
+/* L3 castouts in Mepf state */
+EVENT(PM_L3_CO_MEPF,                           0x18082)
+/* Alternate event code for PM_L3_CO_MEPF */
+EVENT(PM_L3_CO_MEPF_ALT,                       0x3e05e)
+/* Data cache was reloaded from a location other than L2 due to a marked load */
+EVENT(PM_MRK_DATA_FROM_L2MISS,                 0x1d14e)
+/* Alternate event code for PM_MRK_DATA_FROM_L2MISS */
+EVENT(PM_MRK_DATA_FROM_L2MISS_ALT,             0x401e8)
+/* Alternate event code for  PM_CMPLU_STALL */
+EVENT(PM_CMPLU_STALL_ALT,                      0x1e054)
+/* Two path branch */
+EVENT(PM_BR_2PATH,                             0x20036)
+/* Alternate event code for PM_BR_2PATH */
+EVENT(PM_BR_2PATH_ALT,                         0x40036)
+/* # PPC Dispatched */
+EVENT(PM_INST_DISP,                            0x200f2)
+/* Alternate event code for PM_INST_DISP */
+EVENT(PM_INST_DISP_ALT,                                0x300f2)
+/* Marked filter Match */
+EVENT(PM_MRK_FILT_MATCH,                       0x2013c)
+/* Alternate event code for PM_MRK_FILT_MATCH */
+EVENT(PM_MRK_FILT_MATCH_ALT,                   0x3012e)
+/* Alternate event code for PM_LD_MISS_L1 */
+EVENT(PM_LD_MISS_L1_ALT,                       0x400f0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c

index 690d9186a85520d41ddc38c2089d0ced017061d4..7cf3b4378192886d243ed71c4839e4b2cb8157ac 100644 (file)
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -274,7 +274,8 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
                 /* Ignore Linux defined bits when checking event below */
                 base_event = event & ~EVENT_LINUX_MASK;
  
-               if (pmc >= 5 && base_event != 0x500fa && base_event != 0x600f4)
+               if (pmc >= 5 && base_event != PM_RUN_INST_CMPL &&
+                               base_event != PM_RUN_CYC)
                         return -1;
  
                 mask  |= CNST_PMC_MASK(pmc);
@@ -488,17 +489,17 @@ static int power8_compute_mmcr(u64 event[], int n_ev,
  
  /* Table of alternatives, sorted by column 0 */
  static const unsigned int event_alternatives[][MAX_ALT] = {
-       { 0x10134, 0x301e2 },           /* PM_MRK_ST_CMPL */
-       { 0x10138, 0x40138 },           /* PM_BR_MRK_2PATH */
-       { 0x18082, 0x3e05e },           /* PM_L3_CO_MEPF */
-       { 0x1d14e, 0x401e8 },           /* PM_MRK_DATA_FROM_L2MISS */
-       { 0x1e054, 0x4000a },           /* PM_CMPLU_STALL */
-       { 0x20036, 0x40036 },           /* PM_BR_2PATH */
-       { 0x200f2, 0x300f2 },           /* PM_INST_DISP */
-       { 0x200f4, 0x600f4 },           /* PM_RUN_CYC */
-       { 0x2013c, 0x3012e },           /* PM_MRK_FILT_MATCH */
-       { 0x3e054, 0x400f0 },           /* PM_LD_MISS_L1 */
-       { 0x400fa, 0x500fa },           /* PM_RUN_INST_CMPL */
+       { PM_MRK_ST_CMPL,               PM_MRK_ST_CMPL_ALT },
+       { PM_BR_MRK_2PATH,              PM_BR_MRK_2PATH_ALT },
+       { PM_L3_CO_MEPF,                PM_L3_CO_MEPF_ALT },
+       { PM_MRK_DATA_FROM_L2MISS,      PM_MRK_DATA_FROM_L2MISS_ALT },
+       { PM_CMPLU_STALL_ALT,           PM_CMPLU_STALL },
+       { PM_BR_2PATH,                  PM_BR_2PATH_ALT },
+       { PM_INST_DISP,                 PM_INST_DISP_ALT },
+       { PM_RUN_CYC_ALT,               PM_RUN_CYC },
+       { PM_MRK_FILT_MATCH,            PM_MRK_FILT_MATCH_ALT },
+       { PM_LD_MISS_L1,                PM_LD_MISS_L1_ALT },
+       { PM_RUN_INST_CMPL_ALT,         PM_RUN_INST_CMPL },
  };
  
  /*
@@ -546,17 +547,17 @@ static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[])
                 j = num_alt;
                 for (i = 0; i < num_alt; ++i) {
                         switch (alt[i]) {
-                       case 0x1e:      /* PM_CYC */
-                               alt[j++] = 0x600f4;     /* PM_RUN_CYC */
+                       case PM_CYC:
+                               alt[j++] = PM_RUN_CYC;
                                 break;
-                       case 0x600f4:   /* PM_RUN_CYC */
-                               alt[j++] = 0x1e;
+                       case PM_RUN_CYC:
+                               alt[j++] = PM_CYC;
                                 break;
-                       case 0x2:       /* PM_PPC_CMPL */
-                               alt[j++] = 0x500fa;     /* PM_RUN_INST_CMPL */
+                       case PM_INST_CMPL:
+                               alt[j++] = PM_RUN_INST_CMPL;
                                 break;
-                       case 0x500fa:   /* PM_RUN_INST_CMPL */
-                               alt[j++] = 0x2; /* PM_PPC_CMPL */
+                       case PM_RUN_INST_CMPL:
+                               alt[j++] = PM_INST_CMPL;
                                 break;
                         }
                 }
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype

index 142dff5e96d6c1737bffd6bde6692dccdab8bfb2..77e9b8d591fb6ec5409aecd813fe3a05f52c7c49 100644 (file)
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -72,7 +72,7 @@ config PPC_BOOK3S_64
         select PPC_FPU
         select PPC_HAVE_PMU_SUPPORT
         select SYS_SUPPORTS_HUGETLBFS
-       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE
         select ARCH_SUPPORTS_NUMA_BALANCING
         select IRQ_WORK
  
@@ -331,6 +331,15 @@ config PPC_STD_MMU_64
         def_bool y
         depends on PPC_STD_MMU && PPC64
  
+config PPC_RADIX_MMU
+       bool "Radix MMU Support"
+       depends on PPC_BOOK3S_64
+       default y
+       help
+         Enable support for the Power ISA 3.0 Radix style MMU. Currently this
+         is only implemented by IBM Power9 CPUs, if you don't have one of them
+         you can probably disable this.
+
  config PPC_MMU_NOHASH
         def_bool y
         depends on !PPC_STD_MMU
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c

index f7af74f836934575cbe491d70a3409ca350a6e25..3cbe38fad60974c858afc9ebe088331379ec3e01 100644 (file)
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -24,7 +24,7 @@
  
  #include <linux/interrupt.h>
  #include <linux/list.h>
-#include <linux/module.h>
+#include <linux/init.h>
  #include <linux/ptrace.h>
  #include <linux/slab.h>
  #include <linux/wait.h>
@@ -197,7 +197,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr)
             (REGION_ID(ea) != USER_REGION_ID)) {
  
                 spin_unlock(&spu->register_lock);
-               ret = hash_page(ea, _PAGE_PRESENT, 0x300, dsisr);
+               ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr);
                 spin_lock(&spu->register_lock);
  
                 if (!ret) {
@@ -805,7 +805,4 @@ static int __init init_spu_base(void)
   out:
         return ret;
  }
-module_init(init_spu_base);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>");
+device_initcall(init_spu_base);
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c

index d98f845ac7771b8f198c44b24824ec89faf07b5c..e29e4d5afa2ddd165f31cef0e0a38ffa7babe007 100644 (file)
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -141,8 +141,8 @@ int spufs_handle_class1(struct spu_context *ctx)
         /* we must not hold the lock when entering copro_handle_mm_fault */
         spu_release(ctx);
  
-       access = (_PAGE_PRESENT | _PAGE_USER);
-       access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL;
+       access = (_PAGE_PRESENT | _PAGE_READ);
+       access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL;
         local_irq_save(flags);
         ret = hash_page(ea, access, 0x300, dsisr);
         local_irq_restore(flags);
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c

index 950b3e539057a9f7affed493e49d17979e43b95f..9226df11bf39605f83c99211f94542f0dc949cc0 100644 (file)
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -75,7 +75,7 @@ static int pnv_eeh_init(void)
                  * and P7IOC separately. So we should regard
                  * PE#0 as valid for PHB3 and P7IOC.
                  */
-               if (phb->ioda.reserved_pe != 0)
+               if (phb->ioda.reserved_pe_idx != 0)
                         eeh_add_flag(EEH_VALID_PE_ZERO);
  
                 break;
@@ -1009,8 +1009,9 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option)
  static int pnv_eeh_reset(struct eeh_pe *pe, int option)
  {
         struct pci_controller *hose = pe->phb;
+       struct pnv_phb *phb;
         struct pci_bus *bus;
-       int ret;
+       int64_t rc;
  
         /*
          * For PHB reset, we always have complete reset. For those PEs whose
@@ -1026,45 +1027,39 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
          * reset. The side effect is that EEH core has to clear the frozen
          * state explicitly after BAR restore.
          */
-       if (pe->type & EEH_PE_PHB) {
-               ret = pnv_eeh_phb_reset(hose, option);
-       } else {
-               struct pnv_phb *phb;
-               s64 rc;
+       if (pe->type & EEH_PE_PHB)
+               return pnv_eeh_phb_reset(hose, option);
  
-               /*
-                * The frozen PE might be caused by PAPR error injection
-                * registers, which are expected to be cleared after hitting
-                * frozen PE as stated in the hardware spec. Unfortunately,
-                * that's not true on P7IOC. So we have to clear it manually
-                * to avoid recursive EEH errors during recovery.
-                */
-               phb = hose->private_data;
-               if (phb->model == PNV_PHB_MODEL_P7IOC &&
-                   (option == EEH_RESET_HOT ||
-                   option == EEH_RESET_FUNDAMENTAL)) {
-                       rc = opal_pci_reset(phb->opal_id,
-                                           OPAL_RESET_PHB_ERROR,
-                                           OPAL_ASSERT_RESET);
-                       if (rc != OPAL_SUCCESS) {
-                               pr_warn("%s: Failure %lld clearing "
-                                       "error injection registers\n",
-                                       __func__, rc);
-                               return -EIO;
-                       }
+       /*
+        * The frozen PE might be caused by PAPR error injection
+        * registers, which are expected to be cleared after hitting
+        * frozen PE as stated in the hardware spec. Unfortunately,
+        * that's not true on P7IOC. So we have to clear it manually
+        * to avoid recursive EEH errors during recovery.
+        */
+       phb = hose->private_data;
+       if (phb->model == PNV_PHB_MODEL_P7IOC &&
+           (option == EEH_RESET_HOT ||
+            option == EEH_RESET_FUNDAMENTAL)) {
+               rc = opal_pci_reset(phb->opal_id,
+                                   OPAL_RESET_PHB_ERROR,
+                                   OPAL_ASSERT_RESET);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("%s: Failure %lld clearing error injection registers\n",
+                               __func__, rc);
+                       return -EIO;
                 }
-
-               bus = eeh_pe_bus_get(pe);
-               if (pe->type & EEH_PE_VF)
-                       ret = pnv_eeh_reset_vf_pe(pe, option);
-               else if (pci_is_root_bus(bus) ||
-                       pci_is_root_bus(bus->parent))
-                       ret = pnv_eeh_root_reset(hose, option);
-               else
-                       ret = pnv_eeh_bridge_reset(bus->self, option);
         }
  
-       return ret;
+       bus = eeh_pe_bus_get(pe);
+       if (pe->type & EEH_PE_VF)
+               return pnv_eeh_reset_vf_pe(pe, option);
+
+       if (pci_is_root_bus(bus) ||
+           pci_is_root_bus(bus->parent))
+               return pnv_eeh_root_reset(hose, option);
+
+       return pnv_eeh_bridge_reset(bus->self, option);
  }
  
  /**
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c

index 7229acd9bb3af5bf7eeb15e60d733f22ff6593b0..0459e100b4e78990b1e07b126b7655082fae4591 100644 (file)
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -12,6 +12,7 @@
  #include <linux/export.h>
  #include <linux/pci.h>
  #include <linux/memblock.h>
+#include <linux/iommu.h>
  
  #include <asm/iommu.h>
  #include <asm/pnv-pci.h>
@@ -25,8 +26,6 @@
   * Other types of TCE cache invalidation are not functional in the
   * hardware.
   */
-#define TCE_KILL_INVAL_ALL PPC_BIT(0)
-
  static struct pci_dev *get_pci_dev(struct device_node *dn)
  {
         return PCI_DN(dn)->pcidev;
@@ -138,22 +137,17 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
         struct pnv_ioda_pe *pe;
         struct pci_dn *pdn;
  
-       if (npe->flags & PNV_IODA_PE_PEER) {
-               pe = npe->peers[0];
-               pdev = pe->pdev;
-       } else {
-               pdev = pnv_pci_get_gpu_dev(npe->pdev);
-               if (!pdev)
-                       return NULL;
+       pdev = pnv_pci_get_gpu_dev(npe->pdev);
+       if (!pdev)
+               return NULL;
  
-               pdn = pci_get_pdn(pdev);
-               if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-                       return NULL;
+       pdn = pci_get_pdn(pdev);
+       if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+               return NULL;
  
-               hose = pci_bus_to_host(pdev->bus);
-               phb = hose->private_data;
-               pe = &phb->ioda.pe_array[pdn->pe_number];
-       }
+       hose = pci_bus_to_host(pdev->bus);
+       phb = hose->private_data;
+       pe = &phb->ioda.pe_array[pdn->pe_number];
  
         if (gpdev)
                 *gpdev = pdev;
@@ -161,92 +155,70 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
         return pe;
  }
  
-void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe)
+long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+               struct iommu_table *tbl)
  {
         struct pnv_phb *phb = npe->phb;
+       int64_t rc;
+       const unsigned long size = tbl->it_indirect_levels ?
+               tbl->it_level_size : tbl->it_size;
+       const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+       const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+       pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
+                       start_addr, start_addr + win_size - 1,
+                       IOMMU_PAGE_SIZE(tbl));
+
+       rc = opal_pci_map_pe_dma_window(phb->opal_id,
+                       npe->pe_number,
+                       npe->pe_number,
+                       tbl->it_indirect_levels + 1,
+                       __pa(tbl->it_base),
+                       size << 3,
+                       IOMMU_PAGE_SIZE(tbl));
+       if (rc) {
+               pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
+               return rc;
+       }
+       pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  
-       if (WARN_ON(phb->type != PNV_PHB_NPU ||
-                   !phb->ioda.tce_inval_reg ||
-                   !(npe->flags & PNV_IODA_PE_DEV)))
-               return;
+       /* Add the table to the list so its TCE cache will get invalidated */
+       pnv_pci_link_table_and_group(phb->hose->node, num,
+                       tbl, &npe->table_group);
  
-       mb(); /* Ensure previous TCE table stores are visible */
-       __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL),
-               phb->ioda.tce_inval_reg);
+       return 0;
  }
  
-void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
-                               struct iommu_table *tbl,
-                               unsigned long index,
-                               unsigned long npages,
-                               bool rm)
+long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
  {
         struct pnv_phb *phb = npe->phb;
+       int64_t rc;
  
-       /* We can only invalidate the whole cache on NPU */
-       unsigned long val = TCE_KILL_INVAL_ALL;
-
-       if (WARN_ON(phb->type != PNV_PHB_NPU ||
-                   !phb->ioda.tce_inval_reg ||
-                   !(npe->flags & PNV_IODA_PE_DEV)))
-               return;
-
-       mb(); /* Ensure previous TCE table stores are visible */
-       if (rm)
-               __raw_rm_writeq(cpu_to_be64(val),
-                 (__be64 __iomem *) phb->ioda.tce_inval_reg_phys);
-       else
-               __raw_writeq(cpu_to_be64(val),
-                       phb->ioda.tce_inval_reg);
-}
-
-void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe)
-{
-       struct pnv_ioda_pe *gpe;
-       struct pci_dev *gpdev;
-       int i, avail = -1;
-
-       if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
-               return;
-
-       gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-       if (!gpe)
-               return;
-
-       for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-               /* Nothing to do if the PE is already connected. */
-               if (gpe->peers[i] == npe)
-                       return;
+       pe_info(npe, "Removing DMA window\n");
  
-               if (!gpe->peers[i])
-                       avail = i;
+       rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
+                       npe->pe_number,
+                       0/* levels */, 0/* table address */,
+                       0/* table size */, 0/* page size */);
+       if (rc) {
+               pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
+               return rc;
         }
+       pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  
-       if (WARN_ON(avail < 0))
-               return;
-
-       gpe->peers[avail] = npe;
-       gpe->flags |= PNV_IODA_PE_PEER;
+       pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
+                       &npe->table_group);
  
-       /*
-        * We assume that the NPU devices only have a single peer PE
-        * (the GPU PCIe device PE).
-        */
-       npe->peers[0] = gpe;
-       npe->flags |= PNV_IODA_PE_PEER;
+       return 0;
  }
  
  /*
- * For the NPU we want to point the TCE table at the same table as the
- * real PCI device.
+ * Enables 32 bit DMA on NPU.
   */
-static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
+static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
  {
-       struct pnv_phb *phb = npe->phb;
         struct pci_dev *gpdev;
         struct pnv_ioda_pe *gpe;
-       void *addr;
-       unsigned int size;
         int64_t rc;
  
         /*
@@ -260,14 +232,7 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
         if (!gpe)
                 return;
  
-       addr = (void *)gpe->table_group.tables[0]->it_base;
-       size = gpe->table_group.tables[0]->it_size << 3;
-       rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
-                                       npe->pe_number, 1, __pa(addr),
-                                       size, 0x1000);
-       if (rc != OPAL_SUCCESS)
-               pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n",
-                       __func__, rc, phb->hose->global_number, npe->pe_number);
+       rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
  
         /*
          * We don't initialise npu_pe->tce32_table as we always use
@@ -277,72 +242,120 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
  }
  
  /*
- * Enable/disable bypass mode on the NPU. The NPU only supports one
+ * Enables bypass mode on the NPU. The NPU only supports one
   * window per link, so bypass needs to be explicitly enabled or
   * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
   * active at the same time.
   */
-int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable)
+static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
  {
         struct pnv_phb *phb = npe->phb;
         int64_t rc = 0;
+       phys_addr_t top = memblock_end_of_DRAM();
  
         if (phb->type != PNV_PHB_NPU || !npe->pdev)
                 return -EINVAL;
  
-       if (enable) {
-               /* Enable the bypass window */
-               phys_addr_t top = memblock_end_of_DRAM();
-
-               npe->tce_bypass_base = 0;
-               top = roundup_pow_of_two(top);
-               dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n",
-                        npe->pe_number);
-               rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
-                                       npe->pe_number, npe->pe_number,
-                                       npe->tce_bypass_base, top);
-       } else {
-               /*
-                * Disable the bypass window by replacing it with the
-                * TCE32 window.
-                */
-               pnv_npu_disable_bypass(npe);
-       }
+       rc = pnv_npu_unset_window(npe, 0);
+       if (rc != OPAL_SUCCESS)
+               return rc;
+
+       /* Enable the bypass window */
+
+       top = roundup_pow_of_two(top);
+       dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n",
+                       npe->pe_number);
+       rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+                       npe->pe_number, npe->pe_number,
+                       0 /* bypass base */, top);
+
+       if (rc == OPAL_SUCCESS)
+               pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  
         return rc;
  }
  
-int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
  {
-       struct pci_controller *hose = pci_bus_to_host(npdev->bus);
-       struct pnv_phb *phb = hose->private_data;
-       struct pci_dn *pdn = pci_get_pdn(npdev);
-       struct pnv_ioda_pe *npe, *gpe;
-       struct pci_dev *gpdev;
-       uint64_t top;
-       bool bypass = false;
+       int i;
+       struct pnv_phb *phb;
+       struct pci_dn *pdn;
+       struct pnv_ioda_pe *npe;
+       struct pci_dev *npdev;
  
-       if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-               return -ENXIO;
+       for (i = 0; ; ++i) {
+               npdev = pnv_pci_get_npu_dev(gpdev, i);
  
-       /* We only do bypass if it's enabled on the linked device */
-       npe = &phb->ioda.pe_array[pdn->pe_number];
-       gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-       if (!gpe)
-               return -ENODEV;
+               if (!npdev)
+                       break;
+
+               pdn = pci_get_pdn(npdev);
+               if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+                       return;
+
+               phb = pci_bus_to_host(npdev->bus)->private_data;
+
+               /* We only do bypass if it's enabled on the linked device */
+               npe = &phb->ioda.pe_array[pdn->pe_number];
+
+               if (bypass) {
+                       dev_info(&npdev->dev,
+                                       "Using 64-bit DMA iommu bypass\n");
+                       pnv_npu_dma_set_bypass(npe);
+               } else {
+                       dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+                       pnv_npu_dma_set_32(npe);
+               }
+       }
+}
  
-       if (gpe->tce_bypass_enabled) {
-               top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1;
-               bypass = (dma_mask >= top);
+/* Switch ownership from platform code to external user (e.g. VFIO) */
+void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+{
+       struct pnv_phb *phb = npe->phb;
+       int64_t rc;
+
+       /*
+        * Note: NPU has just a single TVE in the hardware which means that
+        * while used by the kernel, it can have either 32bit window or
+        * DMA bypass but never both. So we deconfigure 32bit window only
+        * if it was enabled at the moment of ownership change.
+        */
+       if (npe->table_group.tables[0]) {
+               pnv_npu_unset_window(npe, 0);
+               return;
         }
  
-       if (bypass)
-               dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n");
-       else
-               dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+       /* Disable bypass */
+       rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+                       npe->pe_number, npe->pe_number,
+                       0 /* bypass base */, 0);
+       if (rc) {
+               pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
+               return;
+       }
+       pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+}
  
-       pnv_npu_dma_set_bypass(npe, bypass);
-       *npdev->dev.dma_mask = dma_mask;
+struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
+{
+       struct pnv_phb *phb = npe->phb;
+       struct pci_bus *pbus = phb->hose->bus;
+       struct pci_dev *npdev, *gpdev = NULL, *gptmp;
+       struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
  
-       return 0;
+       if (!gpe || !gpdev)
+               return NULL;
+
+       list_for_each_entry(npdev, &pbus->devices, bus_list) {
+               gptmp = pnv_pci_get_gpu_dev(npdev);
+
+               if (gptmp != gpdev)
+                       continue;
+
+               pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
+               iommu_group_add_device(gpe->table_group.group, &npdev->dev);
+       }
+
+       return gpe;
  }
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c

index d000f4e219814ec5e6d89d2ff17f30a05895d7db..c0a8201cb4d9bb7a57c381f412a0509e8bc01127 100644 (file)
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -150,15 +150,17 @@ static void print_nx_checkstop_reason(const char *level,
  static void print_checkstop_reason(const char *level,
                                         struct OpalHMIEvent *hmi_evt)
  {
-       switch (hmi_evt->u.xstop_error.xstop_type) {
+       uint8_t type = hmi_evt->u.xstop_error.xstop_type;
+       switch (type) {
         case CHECKSTOP_TYPE_CORE:
                 print_core_checkstop_reason(level, hmi_evt);
                 break;
         case CHECKSTOP_TYPE_NX:
                 print_nx_checkstop_reason(level, hmi_evt);
                 break;
-       case CHECKSTOP_TYPE_UNKNOWN:
-               printk("%s      Unknown Malfunction Alert.\n", level);
+       default:
+               printk("%s      Unknown Malfunction Alert of type %d\n",
+                      level, type);
                 break;
         }
  }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c

index c5baaf3cc4e5ef565bcaadeb125bd6ac2138a4c9..3a5ea8236db8108f2458d848b1fbc211cba8112d 100644 (file)
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -48,15 +48,16 @@
  #include "powernv.h"
  #include "pci.h"
  
-/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE       ((0x10000000 / 0x1000) * 8)
+#define PNV_IODA1_M64_NUM      16      /* Number of M64 BARs   */
+#define PNV_IODA1_M64_SEGS     8       /* Segments per M64 BAR */
+#define PNV_IODA1_DMA32_SEGSIZE        0x10000000
  
  #define POWERNV_IOMMU_DEFAULT_LEVELS   1
  #define POWERNV_IOMMU_MAX_LEVELS       5
  
  static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
  
-static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                             const char *fmt, ...)
  {
         struct va_format vaf;
@@ -87,13 +88,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
         va_end(args);
  }
  
-#define pe_err(pe, fmt, ...)                                   \
-       pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
-#define pe_warn(pe, fmt, ...)                                  \
-       pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
-#define pe_info(pe, fmt, ...)                                  \
-       pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
-
  static bool pnv_iommu_bypass_disabled __read_mostly;
  
  static int __init iommu_setup(char *str)
@@ -122,9 +116,17 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
                 (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
  }
  
+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
+{
+       phb->ioda.pe_array[pe_no].phb = phb;
+       phb->ioda.pe_array[pe_no].pe_number = pe_no;
+
+       return &phb->ioda.pe_array[pe_no];
+}
+
  static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
  {
-       if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
+       if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
                 pr_warn("%s: Invalid PE %d on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);
                 return;
@@ -134,32 +136,31 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
                 pr_debug("%s: PE %d was reserved on PHB#%x\n",
                          __func__, pe_no, phb->hose->global_number);
  
-       phb->ioda.pe_array[pe_no].phb = phb;
-       phb->ioda.pe_array[pe_no].pe_number = pe_no;
+       pnv_ioda_init_pe(phb, pe_no);
  }
  
-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
  {
         unsigned long pe;
  
         do {
                 pe = find_next_zero_bit(phb->ioda.pe_alloc,
-                                       phb->ioda.total_pe, 0);
-               if (pe >= phb->ioda.total_pe)
-                       return IODA_INVALID_PE;
+                                       phb->ioda.total_pe_num, 0);
+               if (pe >= phb->ioda.total_pe_num)
+                       return NULL;
         } while(test_and_set_bit(pe, phb->ioda.pe_alloc));
  
-       phb->ioda.pe_array[pe].phb = phb;
-       phb->ioda.pe_array[pe].pe_number = pe;
-       return pe;
+       return pnv_ioda_init_pe(phb, pe);
  }
  
-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+/*
+ * Release a PE: warn if a device is still attached, wipe the PE
+ * descriptor and return its number to the PHB's allocation bitmap.
+ */
+static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
  {
-       WARN_ON(phb->ioda.pe_array[pe].pdev);
+       struct pnv_phb *phb = pe->phb;
+       /* Saved up front: the memset() below zeroes pe->pe_number */
+       unsigned int pe_num = pe->pe_number;
+
+       WARN_ON(pe->pdev);
  
-       memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
-       clear_bit(pe, phb->ioda.pe_alloc);
+       memset(pe, 0, sizeof(struct pnv_ioda_pe));
+       clear_bit(pe_num, phb->ioda.pe_alloc);
  }
  
  /* The default M64 BAR is shared by all PEs */
@@ -199,13 +200,13 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
          * expected to be 0 or last one of PE capabicity.
          */
         r = &phb->hose->mem_resources[1];
-       if (phb->ioda.reserved_pe == 0)
+       if (phb->ioda.reserved_pe_idx == 0)
                 r->start += phb->ioda.m64_segsize;
-       else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
+       else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                 r->end -= phb->ioda.m64_segsize;
         else
                 pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
-                       phb->ioda.reserved_pe);
+                       phb->ioda.reserved_pe_idx);
  
         return 0;
  
@@ -219,7 +220,7 @@ fail:
         return -EIO;
  }
  
-static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
+static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
                                          unsigned long *pe_bitmap)
  {
         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -246,22 +247,80 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev,
         }
  }
  
-static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus,
-                                    unsigned long *pe_bitmap,
-                                    bool all)
+static int pnv_ioda1_init_m64(struct pnv_phb *phb)
+{
+       struct resource *r;
+       int index;
+
+       /*
+        * There are 16 M64 BARs, each of which has 8 segments. So
+        * there are as many M64 segments as the maximum number of
+        * PEs, which is 128.
+        */
+       for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
+               unsigned long base, segsz = phb->ioda.m64_segsize;
+               int64_t rc;
+
+               base = phb->ioda.m64_base +
+                      index * PNV_IODA1_M64_SEGS * segsz;
+               rc = opal_pci_set_phb_mem_window(phb->opal_id,
+                               OPAL_M64_WINDOW_TYPE, index, base, 0,
+                               PNV_IODA1_M64_SEGS * segsz);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("  Error %lld setting M64 PHB#%d-BAR#%d\n",
+                               rc, phb->hose->global_number, index);
+                       goto fail;
+               }
+
+               rc = opal_pci_phb_mmio_enable(phb->opal_id,
+                               OPAL_M64_WINDOW_TYPE, index,
+                               OPAL_ENABLE_M64_SPLIT);
+               if (rc != OPAL_SUCCESS) {
+                       pr_warn("  Error %lld enabling M64 PHB#%d-BAR#%d\n",
+                               rc, phb->hose->global_number, index);
+                       goto fail;
+               }
+       }
+
+       /*
+        * Exclude the segment used by the reserved PE, which
+        * is expected to be 0 or last supported PE#.
+        */
+       r = &phb->hose->mem_resources[1];
+       if (phb->ioda.reserved_pe_idx == 0)
+               r->start += phb->ioda.m64_segsize;
+       else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
+               r->end -= phb->ioda.m64_segsize;
+       else
+               WARN(1, "Wrong reserved PE#%d on PHB#%d\n",
+                    phb->ioda.reserved_pe_idx, phb->hose->global_number);
+
+       return 0;
+
+fail:
+       for ( ; index >= 0; index--)
+               opal_pci_phb_mmio_enable(phb->opal_id,
+                       OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
+
+       return -EIO;
+}
+
+static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
+                                   unsigned long *pe_bitmap,
+                                   bool all)
  {
         struct pci_dev *pdev;
  
         list_for_each_entry(pdev, &bus->devices, bus_list) {
-               pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap);
+               pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
  
                 if (all && pdev->subordinate)
-                       pnv_ioda2_reserve_m64_pe(pdev->subordinate,
-                                                pe_bitmap, all);
+                       pnv_ioda_reserve_m64_pe(pdev->subordinate,
+                                               pe_bitmap, all);
         }
  }
  
-static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
  {
         struct pci_controller *hose = pci_bus_to_host(bus);
         struct pnv_phb *phb = hose->private_data;
@@ -271,28 +330,28 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
  
         /* Root bus shouldn't use M64 */
         if (pci_is_root_bus(bus))
-               return IODA_INVALID_PE;
+               return NULL;
  
         /* Allocate bitmap */
-       size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+       size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
         pe_alloc = kzalloc(size, GFP_KERNEL);
         if (!pe_alloc) {
                 pr_warn("%s: Out of memory !\n",
                         __func__);
-               return IODA_INVALID_PE;
+               return NULL;
         }
  
         /* Figure out reserved PE numbers by the PE */
-       pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all);
+       pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
  
         /*
          * the current bus might not own M64 window and that's all
          * contributed by its child buses. For the case, we needn't
          * pick M64 dependent PE#.
          */
-       if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
+       if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
                 kfree(pe_alloc);
-               return IODA_INVALID_PE;
+               return NULL;
         }
  
         /*
@@ -301,10 +360,11 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
          */
         master_pe = NULL;
         i = -1;
-       while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
-               phb->ioda.total_pe) {
+       while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
+               phb->ioda.total_pe_num) {
                 pe = &phb->ioda.pe_array[i];
  
+               phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
                 if (!master_pe) {
                         pe->flags |= PNV_IODA_PE_MASTER;
                         INIT_LIST_HEAD(&pe->slaves);
@@ -314,10 +374,30 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all)
                         pe->master = master_pe;
                         list_add_tail(&pe->list, &master_pe->slaves);
                 }
+
+               /*
+                * P7IOC supports M64DT, which helps mapping M64 segment
+                * to one particular PE#. However, PHB3 has fixed mapping
+                * between M64 segment and PE#. In order to have same logic
+                * for P7IOC and PHB3, we enforce fixed mapping between M64
+                * segment and PE# on P7IOC.
+                */
+               if (phb->type == PNV_PHB_IODA1) {
+                       int64_t rc;
+
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                                       pe->pe_number, OPAL_M64_WINDOW_TYPE,
+                                       pe->pe_number / PNV_IODA1_M64_SEGS,
+                                       pe->pe_number % PNV_IODA1_M64_SEGS);
+                       if (rc != OPAL_SUCCESS)
+                               pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n",
+                                       __func__, rc, phb->hose->global_number,
+                                       pe->pe_number);
+               }
         }
  
         kfree(pe_alloc);
-       return master_pe->pe_number;
+       return master_pe;
  }
  
  static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
@@ -328,8 +408,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
         const u32 *r;
         u64 pci_addr;
  
-       /* FIXME: Support M64 for P7IOC */
-       if (phb->type != PNV_PHB_IODA2) {
+       if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
                 pr_info("  Not support M64 window\n");
                 return;
         }
@@ -355,7 +434,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
         hose->mem_offset[1] = res->start - pci_addr;
  
         phb->ioda.m64_size = resource_size(res);
-       phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
+       phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
         phb->ioda.m64_base = pci_addr;
  
         pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
@@ -363,9 +442,12 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
  
         /* Use last M64 BAR to cover M64 window */
         phb->ioda.m64_bar_idx = 15;
-       phb->init_m64 = pnv_ioda2_init_m64;
-       phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
-       phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
+       if (phb->type == PNV_PHB_IODA1)
+               phb->init_m64 = pnv_ioda1_init_m64;
+       else
+               phb->init_m64 = pnv_ioda2_init_m64;
+       phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
+       phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
  }
  
  static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -456,7 +538,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
         s64 rc;
  
         /* Sanity check on PE number */
-       if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
+       if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
                 return OPAL_EEH_STOPPED_PERM_UNAVAIL;
  
         /*
@@ -808,44 +890,6 @@ out:
         return 0;
  }
  
-static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
-                                      struct pnv_ioda_pe *pe)
-{
-       struct pnv_ioda_pe *lpe;
-
-       list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
-               if (lpe->dma_weight < pe->dma_weight) {
-                       list_add_tail(&pe->dma_link, &lpe->dma_link);
-                       return;
-               }
-       }
-       list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
-}
-
-static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
-{
-       /* This is quite simplistic. The "base" weight of a device
-        * is 10. 0 means no DMA is to be accounted for it.
-        */
-
-       /* If it's a bridge, no DMA */
-       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
-               return 0;
-
-       /* Reduce the weight of slow USB controllers */
-       if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
-           dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
-           dev->class == PCI_CLASS_SERIAL_USB_EHCI)
-               return 3;
-
-       /* Increase the weight of RAID (includes Obsidian) */
-       if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
-               return 15;
-
-       /* Default */
-       return 10;
-}
-
  #ifdef CONFIG_PCI_IOV
  static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
  {
@@ -919,7 +963,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
         struct pnv_phb *phb = hose->private_data;
         struct pci_dn *pdn = pci_get_pdn(dev);
         struct pnv_ioda_pe *pe;
-       int pe_num;
  
         if (!pdn) {
                 pr_err("%s: Device tree node not associated properly\n",
@@ -929,8 +972,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
         if (pdn->pe_number != IODA_INVALID_PE)
                 return NULL;
  
-       pe_num = pnv_ioda_alloc_pe(phb);
-       if (pe_num == IODA_INVALID_PE) {
+       pe = pnv_ioda_alloc_pe(phb);
+       if (!pe) {
                 pr_warning("%s: Not enough PE# available, disabling device\n",
                            pci_name(dev));
                 return NULL;
@@ -943,14 +986,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
          *
          * At some point we want to remove the PDN completely anyways
          */
-       pe = &phb->ioda.pe_array[pe_num];
         pci_dev_get(dev);
         pdn->pcidev = dev;
-       pdn->pe_number = pe_num;
+       pdn->pe_number = pe->pe_number;
         pe->flags = PNV_IODA_PE_DEV;
         pe->pdev = dev;
         pe->pbus = NULL;
-       pe->tce32_seg = -1;
         pe->mve_number = -1;
         pe->rid = dev->bus->number << 8 | pdn->devfn;
  
@@ -958,23 +999,15 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
  
         if (pnv_ioda_configure_pe(phb, pe)) {
                 /* XXX What do we do here ? */
-               if (pe_num)
-                       pnv_ioda_free_pe(phb, pe_num);
+               pnv_ioda_free_pe(pe);
                 pdn->pe_number = IODA_INVALID_PE;
                 pe->pdev = NULL;
                 pci_dev_put(dev);
                 return NULL;
         }
  
-       /* Assign a DMA weight to the device */
-       pe->dma_weight = pnv_ioda_dma_weight(dev);
-       if (pe->dma_weight != 0) {
-               phb->ioda.dma_weight += pe->dma_weight;
-               phb->ioda.dma_pe_count++;
-       }
-
-       /* Link the PE */
-       pnv_ioda_link_pe_by_weight(phb, pe);
+       /* Put PE to the list */
+       list_add_tail(&pe->list, &phb->ioda.pe_list);
  
         return pe;
  }
@@ -993,7 +1026,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
                 }
                 pdn->pcidev = dev;
                 pdn->pe_number = pe->pe_number;
-               pe->dma_weight += pnv_ioda_dma_weight(dev);
                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
                         pnv_ioda_setup_same_PE(dev->subordinate, pe);
         }
@@ -1005,49 +1037,44 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
   * subordinate PCI devices and buses. The second type of PE is normally
   * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
   */
-static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
+static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
  {
         struct pci_controller *hose = pci_bus_to_host(bus);
         struct pnv_phb *phb = hose->private_data;
-       struct pnv_ioda_pe *pe;
-       int pe_num = IODA_INVALID_PE;
+       struct pnv_ioda_pe *pe = NULL;
  
         /* Check if PE is determined by M64 */
         if (phb->pick_m64_pe)
-               pe_num = phb->pick_m64_pe(bus, all);
+               pe = phb->pick_m64_pe(bus, all);
  
         /* The PE number isn't pinned by M64 */
-       if (pe_num == IODA_INVALID_PE)
-               pe_num = pnv_ioda_alloc_pe(phb);
+       if (!pe)
+               pe = pnv_ioda_alloc_pe(phb);
  
-       if (pe_num == IODA_INVALID_PE) {
+       if (!pe) {
                 pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                         __func__, pci_domain_nr(bus), bus->number);
-               return;
+               return NULL;
         }
  
-       pe = &phb->ioda.pe_array[pe_num];
         pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
         pe->pbus = bus;
         pe->pdev = NULL;
-       pe->tce32_seg = -1;
         pe->mve_number = -1;
         pe->rid = bus->busn_res.start << 8;
-       pe->dma_weight = 0;
  
         if (all)
                 pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
-                       bus->busn_res.start, bus->busn_res.end, pe_num);
+                       bus->busn_res.start, bus->busn_res.end, pe->pe_number);
         else
                 pe_info(pe, "Secondary bus %d associated with PE#%d\n",
-                       bus->busn_res.start, pe_num);
+                       bus->busn_res.start, pe->pe_number);
  
         if (pnv_ioda_configure_pe(phb, pe)) {
                 /* XXX What do we do here ? */
-               if (pe_num)
-                       pnv_ioda_free_pe(phb, pe_num);
+               pnv_ioda_free_pe(pe);
                 pe->pbus = NULL;
-               return;
+               return NULL;
         }
  
         /* Associate it with all child devices */
@@ -1056,16 +1083,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
         /* Put PE to the list */
         list_add_tail(&pe->list, &phb->ioda.pe_list);
  
-       /* Account for one DMA PE if at least one DMA capable device exist
-        * below the bridge
-        */
-       if (pe->dma_weight != 0) {
-               phb->ioda.dma_weight += pe->dma_weight;
-               phb->ioda.dma_pe_count++;
-       }
-
-       /* Link the PE */
-       pnv_ioda_link_pe_by_weight(phb, pe);
+       return pe;
  }
  
  static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
@@ -1088,7 +1106,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
          * same GPU get assigned the same PE.
          */
         gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
-       for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) {
+       for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
                 pe = &phb->ioda.pe_array[pe_num];
                 if (!pe->pdev)
                         continue;
@@ -1106,7 +1124,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
                         rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
                         npu_pdn->pcidev = npu_pdev;
                         npu_pdn->pe_number = pe_num;
-                       pe->dma_weight += pnv_ioda_dma_weight(npu_pdev);
                         phb->ioda.pe_rmap[rid] = pe->pe_number;
  
                         /* Map the PE to this link */
@@ -1378,7 +1395,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
  
                 pnv_ioda_deconfigure_pe(phb, pe);
  
-               pnv_ioda_free_pe(phb, pe->pe_number);
+               pnv_ioda_free_pe(pe);
         }
  }
  
@@ -1387,6 +1404,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
         struct pci_bus        *bus;
         struct pci_controller *hose;
         struct pnv_phb        *phb;
+       struct pnv_ioda_pe    *pe;
         struct pci_dn         *pdn;
         struct pci_sriov      *iov;
         u16                    num_vfs, i;
@@ -1411,8 +1429,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
                 /* Release PE numbers */
                 if (pdn->m64_single_mode) {
                         for (i = 0; i < num_vfs; i++) {
-                               if (pdn->pe_num_map[i] != IODA_INVALID_PE)
-                                       pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+                               if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+                                       continue;
+
+                               pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+                               pnv_ioda_free_pe(pe);
                         }
                 } else
                         bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1454,7 +1475,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
                 pe->flags = PNV_IODA_PE_VF;
                 pe->pbus = NULL;
                 pe->parent_dev = pdev;
-               pe->tce32_seg = -1;
                 pe->mve_number = -1;
                 pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
                            pci_iov_virtfn_devfn(pdev, vf_index);
@@ -1466,8 +1486,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
  
                 if (pnv_ioda_configure_pe(phb, pe)) {
                         /* XXX What do we do here ? */
-                       if (pe_num)
-                               pnv_ioda_free_pe(phb, pe_num);
+                       pnv_ioda_free_pe(pe);
                         pe->pdev = NULL;
                         continue;
                 }
@@ -1486,6 +1505,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
         struct pci_bus        *bus;
         struct pci_controller *hose;
         struct pnv_phb        *phb;
+       struct pnv_ioda_pe    *pe;
         struct pci_dn         *pdn;
         int                    ret;
         u16                    i;
@@ -1528,18 +1548,20 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
                 /* Calculate available PE for required VFs */
                 if (pdn->m64_single_mode) {
                         for (i = 0; i < num_vfs; i++) {
-                               pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
-                               if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
+                               pe = pnv_ioda_alloc_pe(phb);
+                               if (!pe) {
                                         ret = -EBUSY;
                                         goto m64_failed;
                                 }
+
+                               pdn->pe_num_map[i] = pe->pe_number;
                         }
                 } else {
                         mutex_lock(&phb->ioda.pe_alloc_mutex);
                         *pdn->pe_num_map = bitmap_find_next_zero_area(
-                               phb->ioda.pe_alloc, phb->ioda.total_pe,
+                               phb->ioda.pe_alloc, phb->ioda.total_pe_num,
                                 0, num_vfs, 0);
-                       if (*pdn->pe_num_map >= phb->ioda.total_pe) {
+                       if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
                                 mutex_unlock(&phb->ioda.pe_alloc_mutex);
                                 dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
                                 kfree(pdn->pe_num_map);
@@ -1577,8 +1599,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
  m64_failed:
         if (pdn->m64_single_mode) {
                 for (i = 0; i < num_vfs; i++) {
-                       if (pdn->pe_num_map[i] != IODA_INVALID_PE)
-                               pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+                       if (pdn->pe_num_map[i] == IODA_INVALID_PE)
+                               continue;
+
+                       pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
+                       pnv_ioda_free_pe(pe);
                 }
         } else
                 bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
@@ -1640,8 +1665,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
         struct pnv_ioda_pe *pe;
         uint64_t top;
         bool bypass = false;
-       struct pci_dev *linked_npu_dev;
-       int i;
  
         if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
                 return -ENODEV;;
@@ -1662,15 +1685,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
         *pdev->dev.dma_mask = dma_mask;
  
         /* Update peer npu devices */
-       if (pe->flags & PNV_IODA_PE_PEER)
-               for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-                       if (!pe->peers[i])
-                               continue;
-
-                       linked_npu_dev = pe->peers[i]->pdev;
-                       if (dma_get_mask(&linked_npu_dev->dev) != dma_mask)
-                               dma_set_mask(&linked_npu_dev->dev, dma_mask);
-               }
+       pnv_npu_try_dma_set_bypass(pdev, bypass);
  
         return 0;
  }
@@ -1811,28 +1826,34 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
         .get = pnv_tce_get,
  };
  
-static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
+#define TCE_KILL_INVAL_ALL  PPC_BIT(0)
+#define TCE_KILL_INVAL_PE   PPC_BIT(1)
+#define TCE_KILL_INVAL_TCE  PPC_BIT(2)
+
+/*
+ * Write the TCE_KILL_INVAL_ALL command to the PHB's TCE kill register.
+ *
+ * @phb: PHB whose TCE invalidation register is written
+ * @rm:  when true the register is reached through tce_inval_reg_phys
+ *       using __raw_rm_writeq(); otherwise tce_inval_reg is written
+ *       with __raw_writeq()
+ */
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+       const unsigned long cmd = TCE_KILL_INVAL_ALL;
+       __be64 __iomem *reg_phys;
+
+       /* Ensure previous TCE table stores are visible */
+       mb();
+
+       if (!rm) {
+               __raw_writeq(cpu_to_be64(cmd), phb->ioda.tce_inval_reg);
+               return;
+       }
+
+       reg_phys = (__be64 __iomem *)phb->ioda.tce_inval_reg_phys;
+       __raw_rm_writeq(cpu_to_be64(cmd), reg_phys);
+}
+
+static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
  {
         /* 01xb - invalidate TCEs that match the specified PE# */
-       unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+       unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
         struct pnv_phb *phb = pe->phb;
-       struct pnv_ioda_pe *npe;
-       int i;
  
         if (!phb->ioda.tce_inval_reg)
                 return;
  
         mb(); /* Ensure above stores are visible */
         __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
-
-       if (pe->flags & PNV_IODA_PE_PEER)
-               for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-                       npe = pe->peers[i];
-                       if (!npe || npe->phb->type != PNV_PHB_NPU)
-                               continue;
-
-                       pnv_npu_tce_invalidate_entire(npe);
-               }
  }
  
  static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
@@ -1842,7 +1863,7 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
         unsigned long start, end, inc;
  
         /* We'll invalidate DMA address in PE scope */
-       start = 0x2ull << 60;
+       start = TCE_KILL_INVAL_TCE;
         start |= (pe_number & 0xFF);
         end = start;
  
@@ -1867,28 +1888,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
         struct iommu_table_group_link *tgl;
  
         list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
-               struct pnv_ioda_pe *npe;
                 struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                                 struct pnv_ioda_pe, table_group);
                 __be64 __iomem *invalidate = rm ?
                         (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
                         pe->phb->ioda.tce_inval_reg;
-               int i;
  
+               if (pe->phb->type == PNV_PHB_NPU) {
+                       /*
+                        * The NVLink hardware does not support TCE kill
+                        * per TCE entry so we have to invalidate
+                        * the entire cache for it.
+                        */
+                       pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm);
+                       continue;
+               }
                 pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
                         invalidate, tbl->it_page_shift,
                         index, npages);
-
-               if (pe->flags & PNV_IODA_PE_PEER)
-                       /* Invalidate PEs using the same TCE table */
-                       for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) {
-                               npe = pe->peers[i];
-                               if (!npe || npe->phb->type != PNV_PHB_NPU)
-                                       continue;
-
-                               pnv_npu_tce_invalidate(npe, tbl, index,
-                                                       npages, rm);
-                       }
         }
  }
  
@@ -1945,56 +1962,140 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
         .free = pnv_ioda2_table_free,
  };
  
-static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
-                                     struct pnv_ioda_pe *pe, unsigned int base,
-                                     unsigned int segs)
+/*
+ * Add one device's contribution to a running DMA weight total.
+ *
+ * @dev:  PCI device being examined
+ * @data: pointer to the unsigned int accumulator to add to
+ *
+ * Devices whose header type is not PCI_HEADER_TYPE_NORMAL add nothing.
+ * USB UHCI/OHCI/EHCI controllers add 3, RAID storage adds 15 and any
+ * other device adds the base weight of 10.
+ *
+ * Always returns 0. The (dev, data) signature lets this be handed to
+ * pci_walk_bus() as its callback.
+ */
+static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
+{
+       unsigned int *total = data;
+       unsigned int share;
+
+       /* 0 means no DMA is to be accounted for this device */
+       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+               return 0;
+
+       switch (dev->class) {
+       case PCI_CLASS_SERIAL_USB_UHCI:
+       case PCI_CLASS_SERIAL_USB_OHCI:
+       case PCI_CLASS_SERIAL_USB_EHCI:
+               share = 3;
+               break;
+       default:
+               if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+                       share = 15;
+               else
+                       share = 10;
+               break;
+       }
+
+       *total += share;
+       return 0;
+}
+
+/*
+ * Compute the DMA32 weight of a PE by summing what
+ * pnv_pci_ioda_dev_dma_weight() reports for the devices it covers.
+ *
+ * Returns the accumulated weight. The result is 0 when none of the
+ * cases below applies or when no visited device adds any weight.
+ */
+static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
+{
+       unsigned int sum = 0;
+       struct pci_dev *child;
+
+#ifdef CONFIG_PCI_IOV
+       /* SRIOV VF has same DMA32 weight as its PF */
+       if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
+               pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &sum);
+               return sum;
+       }
+#endif
+
+       /* PNV_IODA_PE_DEV: only pe->pdev is weighed */
+       if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
+               pnv_pci_ioda_dev_dma_weight(pe->pdev, &sum);
+               return sum;
+       }
+
+       /* PNV_IODA_PE_BUS: the devices listed directly on pe->pbus */
+       if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
+               list_for_each_entry(child, &pe->pbus->devices, bus_list)
+                       pnv_pci_ioda_dev_dma_weight(child, &sum);
+               return sum;
+       }
+
+       /* PNV_IODA_PE_BUS_ALL: every device pci_walk_bus() visits */
+       if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus)
+               pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &sum);
+
+       return sum;
+}
+
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
+                                      struct pnv_ioda_pe *pe)
  {
  
         struct page *tce_mem = NULL;
         struct iommu_table *tbl;
-       unsigned int i;
+       unsigned int weight, total_weight = 0;
+       unsigned int tce32_segsz, base, segs, avail, i;
         int64_t rc;
         void *addr;
  
         /* XXX FIXME: Handle 64-bit only DMA devices */
         /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
         /* XXX FIXME: Allocate multi-level tables on PHB3 */
+       weight = pnv_pci_ioda_pe_dma_weight(pe);
+       if (!weight)
+               return;
  
-       /* We shouldn't already have a 32-bit DMA associated */
-       if (WARN_ON(pe->tce32_seg >= 0))
+       pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
+                    &total_weight);
+       segs = (weight * phb->ioda.dma32_count) / total_weight;
+       if (!segs)
+               segs = 1;
+
+       /*
+        * Allocate contiguous DMA32 segments. We begin with the expected
+        * number of segments. With one more attempt, the number of DMA32
+        * segments to be allocated is decreased by one until one segment
+        * is allocated successfully.
+        */
+       do {
+               for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
+                       for (avail = 0, i = base; i < base + segs; i++) {
+                               if (phb->ioda.dma32_segmap[i] ==
+                                   IODA_INVALID_PE)
+                                       avail++;
+                       }
+
+                       if (avail == segs)
+                               goto found;
+               }
+       } while (--segs);
+
+       if (!segs) {
+               pe_warn(pe, "No available DMA32 segments\n");
                 return;
+       }
  
+found:
         tbl = pnv_pci_table_alloc(phb->hose->node);
         iommu_register_group(&pe->table_group, phb->hose->global_number,
                         pe->pe_number);
         pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
  
         /* Grab a 32-bit TCE table */
-       pe->tce32_seg = base;
+       pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
+               weight, total_weight, base, segs);
         pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
-               (base << 28), ((base + segs) << 28) - 1);
+               base * PNV_IODA1_DMA32_SEGSIZE,
+               (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
  
         /* XXX Currently, we allocate one big contiguous table for the
          * TCEs. We only really need one chunk per 256M of TCE space
          * (ie per segment) but that's an optimization for later, it
          * requires some added smarts with our get/put_tce implementation
+        *
+        * Each TCE page is 4KB in size and each TCE entry occupies 8
+        * bytes
          */
+       tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
         tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-                                  get_order(TCE32_TABLE_SIZE * segs));
+                                  get_order(tce32_segsz * segs));
         if (!tce_mem) {
                 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
                 goto fail;
         }
         addr = page_address(tce_mem);
-       memset(addr, 0, TCE32_TABLE_SIZE * segs);
+       memset(addr, 0, tce32_segsz * segs);
  
         /* Configure HW */
         for (i = 0; i < segs; i++) {
                 rc = opal_pci_map_pe_dma_window(phb->opal_id,
                                               pe->pe_number,
                                               base + i, 1,
-                                             __pa(addr) + TCE32_TABLE_SIZE * i,
-                                             TCE32_TABLE_SIZE, 0x1000);
+                                             __pa(addr) + tce32_segsz * i,
+                                             tce32_segsz, IOMMU_PAGE_SIZE_4K);
                 if (rc) {
                         pe_err(pe, " Failed to configure 32-bit TCE table,"
                                " err %ld\n", rc);
@@ -2002,9 +2103,14 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                 }
         }
  
+       /* Setup DMA32 segment mapping */
+       for (i = base; i < base + segs; i++)
+               phb->ioda.dma32_segmap[i] = pe->pe_number;
+
         /* Setup linux iommu table */
-       pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
-                                 base << 28, IOMMU_PAGE_SHIFT_4K);
+       pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
+                                 base * PNV_IODA1_DMA32_SEGSIZE,
+                                 IOMMU_PAGE_SHIFT_4K);
  
         /* OPAL variant of P7IOC SW invalidated TCEs */
         if (phb->ioda.tce_inval_reg)
@@ -2031,10 +2137,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
         return;
   fail:
         /* XXX Failure: Try to fallback to 64-bit only ? */
-       if (pe->tce32_seg >= 0)
-               pe->tce32_seg = -1;
         if (tce_mem)
-               __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+               __free_pages(tce_mem, get_order(tce32_segsz * segs));
         if (tbl) {
                 pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
                 iommu_free_table(tbl, "pnv");
@@ -2075,7 +2179,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
  
         pnv_pci_link_table_and_group(phb->hose->node, num,
                         tbl, &pe->table_group);
-       pnv_pci_ioda2_tce_invalidate_entire(pe);
+       pnv_pci_ioda2_tce_invalidate_pe(pe);
  
         return 0;
  }
@@ -2219,7 +2323,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
         if (ret)
                 pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
         else
-               pnv_pci_ioda2_tce_invalidate_entire(pe);
+               pnv_pci_ioda2_tce_invalidate_pe(pe);
  
         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
  
@@ -2288,6 +2392,116 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
         .take_ownership = pnv_ioda2_take_ownership,
         .release_ownership = pnv_ioda2_release_ownership,
  };
+
+/*
+ * iommu_group_for_each_dev() callback that looks for a device sitting
+ * behind an NPU PHB.
+ *
+ * @dev:    device of the IOMMU group currently being visited
+ * @opaque: struct pnv_ioda_pe ** that receives the PE of the match
+ *
+ * Returns 1 after storing the device's PE through @opaque when the
+ * device has a valid PE number and its PHB is of type PNV_PHB_NPU.
+ * Returns 0, leaving @opaque untouched, otherwise.
+ */
+static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
+{
+       struct pnv_ioda_pe **result = opaque;
+       struct pci_dev *pcidev = container_of(dev, struct pci_dev, dev);
+       struct pci_dn *node = pci_get_pdn(pcidev);
+       struct pci_controller *host;
+       struct pnv_phb *host_phb;
+
+       if (node && node->pe_number != IODA_INVALID_PE) {
+               host = pci_bus_to_host(pcidev->bus);
+               host_phb = host->private_data;
+
+               if (host_phb->type == PNV_PHB_NPU) {
+                       *result = &host_phb->ioda.pe_array[node->pe_number];
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Return the PE of the NPU associated with a table group.
+ *
+ * This assumes that the NPU is in the same IOMMU group as the GPU and
+ * that the group holds no other PEs. BUG_ON() fires when the group walk
+ * returns 0 or leaves the PE pointer NULL.
+ */
+static struct pnv_ioda_pe *gpe_table_group_to_npe(
+               struct iommu_table_group *table_group)
+{
+       struct pnv_ioda_pe *npu_pe = NULL;
+       int found;
+
+       found = iommu_group_for_each_dev(table_group->group, &npu_pe,
+                                        gpe_table_group_to_npe_cb);
+       BUG_ON(!(found && npu_pe));
+
+       return npu_pe;
+}
+
+/*
+ * set_window handler used by pnv_pci_ioda2_npu_ops.
+ *
+ * Programs the window through pnv_pci_ioda2_set_window() first, then
+ * repeats it on the associated NPU PE with pnv_npu_set_window(). If the
+ * NPU step fails, the first step is undone with
+ * pnv_pci_ioda2_unset_window().
+ *
+ * Returns 0 when both steps succeed, otherwise the non-zero result of
+ * the step that failed.
+ */
+static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
+               int num, struct iommu_table *tbl)
+{
+       struct pnv_ioda_pe *npe;
+       long rc;
+
+       rc = pnv_pci_ioda2_set_window(table_group, num, tbl);
+       if (rc)
+               return rc;
+
+       npe = gpe_table_group_to_npe(table_group);
+       rc = pnv_npu_set_window(npe, num, tbl);
+       if (!rc)
+               return 0;
+
+       /* pnv_npu_set_window() failed: roll the first step back */
+       pnv_pci_ioda2_unset_window(table_group, num);
+       return rc;
+}
+
+/*
+ * unset_window handler used by pnv_pci_ioda2_npu_ops.
+ *
+ * Removes the window through pnv_pci_ioda2_unset_window() and, only if
+ * that succeeds, removes it from the associated NPU PE as well.
+ *
+ * Returns the non-zero result of the first step if it fails, otherwise
+ * whatever pnv_npu_unset_window() returns.
+ */
+static long pnv_pci_ioda2_npu_unset_window(
+               struct iommu_table_group *table_group,
+               int num)
+{
+       struct pnv_ioda_pe *npe;
+       long rc;
+
+       rc = pnv_pci_ioda2_unset_window(table_group, num);
+       if (rc)
+               return rc;
+
+       npe = gpe_table_group_to_npe(table_group);
+       return pnv_npu_unset_window(npe, num);
+}
+
+/*
+ * take_ownership handler used by pnv_pci_ioda2_npu_ops: hands the NPU PE
+ * over first, then the table group itself.
+ */
+static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
+{
+       struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
+
+       /*
+        * Detach NPU first as pnv_ioda2_take_ownership() will destroy
+        * the iommu_table if 32bit DMA is enabled.
+        */
+       pnv_npu_take_ownership(npe);
+       pnv_ioda2_take_ownership(table_group);
+}
+
+/*
+ * Table group operations installed by pnv_pci_ioda_setup_iommu_api() on
+ * the PE returned by pnv_pci_npu_setup_iommu(). The window and
+ * take_ownership handlers are the NPU-aware wrappers; the remaining
+ * handlers are the plain IODA2 ones.
+ */
+static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
+       /* Plain IODA2 handlers */
+       .get_table_size = pnv_pci_ioda2_get_table_size,
+       .create_table = pnv_pci_ioda2_create_table,
+       .release_ownership = pnv_ioda2_release_ownership,
+
+       /* Wrappers that also act on the associated NPU PE */
+       .set_window = pnv_pci_ioda2_npu_set_window,
+       .unset_window = pnv_pci_ioda2_npu_unset_window,
+       .take_ownership = pnv_ioda2_npu_take_ownership,
+};
+
+/*
+ * Add NPU devices to their IOMMU groups.
+ *
+ * For every PE on every PHB of type PNV_PHB_NPU this calls
+ * pnv_pci_npu_setup_iommu() and, when that returns a PE, switches that
+ * PE's table group over to pnv_pci_ioda2_npu_ops.
+ */
+static void pnv_pci_ioda_setup_iommu_api(void)
+{
+       struct pci_controller *hose, *next_hose;
+       struct pnv_ioda_pe *npe, *gpe;
+       struct pnv_phb *phb;
+
+       /*
+        * Now we have all PHBs discovered, time to add NPU devices to
+        * the corresponding IOMMU groups.
+        */
+       list_for_each_entry_safe(hose, next_hose, &hose_list, list_node) {
+               phb = hose->private_data;
+
+               if (phb->type == PNV_PHB_NPU) {
+                       list_for_each_entry(npe, &phb->ioda.pe_list, list) {
+                               gpe = pnv_pci_npu_setup_iommu(npe);
+                               if (!gpe)
+                                       continue;
+
+                               gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
+                       }
+               }
+       }
+}
+#else /* !CONFIG_IOMMU_API */
+/* CONFIG_IOMMU_API is disabled: this variant does nothing */
+static void pnv_pci_ioda_setup_iommu_api(void) { }
  #endif
  
  static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
@@ -2443,10 +2657,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
  {
         int64_t rc;
  
-       /* We shouldn't already have a 32-bit DMA associated */
-       if (WARN_ON(pe->tce32_seg >= 0))
-               return;
-
         /* TVE #1 is selected by PCI address bit 59 */
         pe->tce_bypass_base = 1ull << 59;
  
@@ -2454,7 +2664,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
                         pe->pe_number);
  
         /* The PE will reserve all possible 32-bits space */
-       pe->tce32_seg = 0;
         pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
                 phb->ioda.m32_pci_base);
  
@@ -2470,11 +2679,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
  #endif
  
         rc = pnv_pci_ioda2_setup_default_config(pe);
-       if (rc) {
-               if (pe->tce32_seg >= 0)
-                       pe->tce32_seg = -1;
+       if (rc)
                 return;
-       }
  
         if (pe->flags & PNV_IODA_PE_DEV)
                 iommu_add_device(&pe->pdev->dev);
@@ -2485,47 +2691,24 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
  static void pnv_ioda_setup_dma(struct pnv_phb *phb)
  {
         struct pci_controller *hose = phb->hose;
-       unsigned int residual, remaining, segs, tw, base;
         struct pnv_ioda_pe *pe;
+       unsigned int weight;
  
         /* If we have more PE# than segments available, hand out one
          * per PE until we run out and let the rest fail. If not,
          * then we assign at least one segment per PE, plus more based
          * on the amount of devices under that PE
          */
-       if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
-               residual = 0;
-       else
-               residual = phb->ioda.tce32_count -
-                       phb->ioda.dma_pe_count;
-
-       pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
-               hose->global_number, phb->ioda.tce32_count);
-       pr_info("PCI: %d PE# for a total weight of %d\n",
-               phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+       pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n",
+               hose->global_number, phb->ioda.dma32_count);
  
         pnv_pci_ioda_setup_opal_tce_kill(phb);
  
-       /* Walk our PE list and configure their DMA segments, hand them
-        * out one base segment plus any residual segments based on
-        * weight
-        */
-       remaining = phb->ioda.tce32_count;
-       tw = phb->ioda.dma_weight;
-       base = 0;
-       list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
-               if (!pe->dma_weight)
-                       continue;
-               if (!remaining) {
-                       pe_warn(pe, "No DMA32 resources available\n");
+       /* Walk our PE list and configure their DMA segments */
+       list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+               weight = pnv_pci_ioda_pe_dma_weight(pe);
+               if (!weight)
                         continue;
-               }
-               segs = 1;
-               if (residual) {
-                       segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
-                       if (segs > remaining)
-                               segs = remaining;
-               }
  
                 /*
                  * For IODA2 compliant PHB3, we needn't care about the weight.
@@ -2533,12 +2716,9 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
                  * the specific PE.
                  */
                 if (phb->type == PNV_PHB_IODA1) {
-                       pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
-                               pe->dma_weight, segs);
-                       pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+                       pnv_pci_ioda1_setup_dma_pe(phb, pe);
                 } else if (phb->type == PNV_PHB_IODA2) {
                         pe_info(pe, "Assign DMA32 space\n");
-                       segs = 0;
                         pnv_pci_ioda2_setup_dma_pe(phb, pe);
                 } else if (phb->type == PNV_PHB_NPU) {
                         /*
@@ -2548,9 +2728,6 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
                          * as the PHB3 TVT.
                          */
                 }
-
-               remaining -= segs;
-               base += segs;
         }
  }
  
@@ -2858,7 +3035,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
         pdn->m64_single_mode = false;
  
         total_vfs = pci_sriov_get_totalvfs(pdev);
-       mul = phb->ioda.total_pe;
+       mul = phb->ioda.total_pe_num;
         total_vf_bar_sz = 0;
  
         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -2929,19 +3106,72 @@ truncate_iov:
  }
  #endif /* CONFIG_PCI_IOV */
  
+/*
+ * Map the segments backing one resource to a PE.
+ *
+ * @pe:  PE that takes over the segments
+ * @res: resource to map; ignored when NULL, when it has no flags or
+ *       when its start lies beyond its end
+ *
+ * I/O resources are rebased against the PHB's io_pci_base and walked in
+ * io_segsize steps. Memory resources for which pnv_pci_is_mem_pref_64()
+ * is false are rebased against mem_offset[0] plus m32_pci_base and
+ * walked in m32_segsize steps. Each segment is recorded in the matching
+ * segment map and handed to OPAL. Any other resource is left untouched.
+ *
+ * The walk stops at the first OPAL failure, which is logged, or once
+ * the segment index reaches total_pe_num.
+ */
+static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
+                                 struct resource *res)
+{
+       struct pnv_phb *phb = pe->phb;
+       struct pci_bus_region region;
+       int index;
+       int64_t rc;
+
+       if (!res || !res->flags || res->start > res->end)
+               return;
+
+       if (res->flags & IORESOURCE_IO) {
+               region.start = res->start - phb->ioda.io_pci_base;
+               region.end   = res->end - phb->ioda.io_pci_base;
+               index = region.start / phb->ioda.io_segsize;
+
+               while (index < phb->ioda.total_pe_num &&
+                      region.start <= region.end) {
+                       /*
+                        * The map is updated before the OPAL call, so the
+                        * entry stays set even if the call fails.
+                        */
+                       phb->ioda.io_segmap[index] = pe->pe_number;
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                               pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+                       if (rc != OPAL_SUCCESS) {
+                               pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n",
+                                      __func__, rc, index, pe->pe_number);
+                               break;
+                       }
+
+                       region.start += phb->ioda.io_segsize;
+                       index++;
+               }
+       } else if ((res->flags & IORESOURCE_MEM) &&
+                  !pnv_pci_is_mem_pref_64(res->flags)) {
+               region.start = res->start -
+                              phb->hose->mem_offset[0] -
+                              phb->ioda.m32_pci_base;
+               region.end   = res->end -
+                              phb->hose->mem_offset[0] -
+                              phb->ioda.m32_pci_base;
+               index = region.start / phb->ioda.m32_segsize;
+
+               while (index < phb->ioda.total_pe_num &&
+                      region.start <= region.end) {
+                       /* Same ordering as the IO case: map entry first */
+                       phb->ioda.m32_segmap[index] = pe->pe_number;
+                       rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+                               pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+                       if (rc != OPAL_SUCCESS) {
+                               /* Newline-terminated like the IO message */
+                               pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d\n",
+                                      __func__, rc, index, pe->pe_number);
+                               break;
+                       }
+
+                       region.start += phb->ioda.m32_segsize;
+                       index++;
+               }
+       }
+}
+
  /*
   * This function is supposed to be called on basis of PE from top
   * to bottom style. So the the I/O or MMIO segment assigned to
   * parent PE could be overrided by its child PEs if necessary.
   */
-static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
-                                 struct pnv_ioda_pe *pe)
+static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
  {
-       struct pnv_phb *phb = hose->private_data;
-       struct pci_bus_region region;
-       struct resource *res;
-       int i, index;
-       int rc;
+       struct pci_dev *pdev;
+       int i;
  
         /*
          * NOTE: We only care PCI bus based PE for now. For PCI
@@ -2950,57 +3180,20 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
          */
         BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
  
-       pci_bus_for_each_resource(pe->pbus, res, i) {
-               if (!res || !res->flags ||
-                   res->start > res->end)
-                       continue;
+       list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
+               for (i = 0; i <= PCI_ROM_RESOURCE; i++)
+                       pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
  
-               if (res->flags & IORESOURCE_IO) {
-                       region.start = res->start - phb->ioda.io_pci_base;
-                       region.end   = res->end - phb->ioda.io_pci_base;
-                       index = region.start / phb->ioda.io_segsize;
-
-                       while (index < phb->ioda.total_pe &&
-                              region.start <= region.end) {
-                               phb->ioda.io_segmap[index] = pe->pe_number;
-                               rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-                                       pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
-                               if (rc != OPAL_SUCCESS) {
-                                       pr_err("%s: OPAL error %d when mapping IO "
-                                              "segment #%d to PE#%d\n",
-                                              __func__, rc, index, pe->pe_number);
-                                       break;
-                               }
-
-                               region.start += phb->ioda.io_segsize;
-                               index++;
-                       }
-               } else if ((res->flags & IORESOURCE_MEM) &&
-                          !pnv_pci_is_mem_pref_64(res->flags)) {
-                       region.start = res->start -
-                                      hose->mem_offset[0] -
-                                      phb->ioda.m32_pci_base;
-                       region.end   = res->end -
-                                      hose->mem_offset[0] -
-                                      phb->ioda.m32_pci_base;
-                       index = region.start / phb->ioda.m32_segsize;
-
-                       while (index < phb->ioda.total_pe &&
-                              region.start <= region.end) {
-                               phb->ioda.m32_segmap[index] = pe->pe_number;
-                               rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-                                       pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
-                               if (rc != OPAL_SUCCESS) {
-                                       pr_err("%s: OPAL error %d when mapping M32 "
-                                              "segment#%d to PE#%d",
-                                              __func__, rc, index, pe->pe_number);
-                                       break;
-                               }
-
-                               region.start += phb->ioda.m32_segsize;
-                               index++;
-                       }
-               }
+               /*
+                * If the PE contains all subordinate PCI buses, the
+                * windows of the child bridges should be mapped to
+                * the PE as well.
+                */
+               if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
+                       continue;
+               for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
+                       pnv_ioda_setup_pe_res(pe,
+                               &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
         }
  }
  
@@ -3018,7 +3211,7 @@ static void pnv_pci_ioda_setup_seg(void)
                         continue;
  
                 list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-                       pnv_ioda_setup_pe_seg(hose, pe);
+                       pnv_ioda_setup_pe_seg(pe);
                 }
         }
  }
@@ -3035,6 +3228,8 @@ static void pnv_pci_ioda_setup_DMA(void)
                 phb = hose->private_data;
                 phb->initialized = 1;
         }
+
+       pnv_pci_ioda_setup_iommu_api();
  }
  
  static void pnv_pci_ioda_create_dbgfs(void)
@@ -3056,27 +3251,6 @@ static void pnv_pci_ioda_create_dbgfs(void)
  #endif /* CONFIG_DEBUG_FS */
  }
  
-static void pnv_npu_ioda_fixup(void)
-{
-       bool enable_bypass;
-       struct pci_controller *hose, *tmp;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-               phb = hose->private_data;
-               if (phb->type != PNV_PHB_NPU)
-                       continue;
-
-               list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
-                       enable_bypass = dma_get_mask(&pe->pdev->dev) ==
-                               DMA_BIT_MASK(64);
-                       pnv_npu_init_dma_pe(pe);
-                       pnv_npu_dma_set_bypass(pe, enable_bypass);
-               }
-       }
-}
-
  static void pnv_pci_ioda_fixup(void)
  {
         pnv_pci_ioda_setup_PEs();
@@ -3089,9 +3263,6 @@ static void pnv_pci_ioda_fixup(void)
         eeh_init();
         eeh_addr_cache_build();
  #endif
-
-       /* Link NPU IODA tables to their PCI devices. */
-       pnv_npu_ioda_fixup();
  }
  
  /*
@@ -3195,12 +3366,6 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
         return true;
  }
  
-static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
-                              u32 devfn)
-{
-       return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
-}
-
  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
  {
         struct pnv_phb *phb = hose->private_data;
@@ -3210,31 +3375,39 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
  }
  
  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
-       .dma_dev_setup = pnv_pci_dma_dev_setup,
-       .dma_bus_setup = pnv_pci_dma_bus_setup,
+       .dma_dev_setup          = pnv_pci_dma_dev_setup,
+       .dma_bus_setup          = pnv_pci_dma_bus_setup,
  #ifdef CONFIG_PCI_MSI
-       .setup_msi_irqs = pnv_setup_msi_irqs,
-       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+       .setup_msi_irqs         = pnv_setup_msi_irqs,
+       .teardown_msi_irqs      = pnv_teardown_msi_irqs,
  #endif
-       .enable_device_hook = pnv_pci_enable_device_hook,
-       .window_alignment = pnv_pci_window_alignment,
-       .reset_secondary_bus = pnv_pci_reset_secondary_bus,
-       .dma_set_mask = pnv_pci_ioda_dma_set_mask,
-       .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
-       .shutdown = pnv_pci_ioda_shutdown,
+       .enable_device_hook     = pnv_pci_enable_device_hook,
+       .window_alignment       = pnv_pci_window_alignment,
+       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+       .dma_set_mask           = pnv_pci_ioda_dma_set_mask,
+       .dma_get_required_mask  = pnv_pci_ioda_dma_get_required_mask,
+       .shutdown               = pnv_pci_ioda_shutdown,
  };
  
+static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+{
+       dev_err_once(&npdev->dev,
+                       "%s operation unsupported for NVLink devices\n",
+                       __func__);
+       return -EPERM;
+}
+
  static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-       .dma_dev_setup = pnv_pci_dma_dev_setup,
+       .dma_dev_setup          = pnv_pci_dma_dev_setup,
  #ifdef CONFIG_PCI_MSI
-       .setup_msi_irqs = pnv_setup_msi_irqs,
-       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+       .setup_msi_irqs         = pnv_setup_msi_irqs,
+       .teardown_msi_irqs      = pnv_teardown_msi_irqs,
  #endif
-       .enable_device_hook = pnv_pci_enable_device_hook,
-       .window_alignment = pnv_pci_window_alignment,
-       .reset_secondary_bus = pnv_pci_reset_secondary_bus,
-       .dma_set_mask = pnv_npu_dma_set_mask,
-       .shutdown = pnv_pci_ioda_shutdown,
+       .enable_device_hook     = pnv_pci_enable_device_hook,
+       .window_alignment       = pnv_pci_window_alignment,
+       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+       .dma_set_mask           = pnv_npu_dma_set_mask,
+       .shutdown               = pnv_pci_ioda_shutdown,
  };
  
  static void __init pnv_pci_init_ioda_phb(struct device_node *np,
@@ -3242,10 +3415,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
  {
         struct pci_controller *hose;
         struct pnv_phb *phb;
-       unsigned long size, m32map_off, pemap_off, iomap_off = 0;
+       unsigned long size, m64map_off, m32map_off, pemap_off;
+       unsigned long iomap_off = 0, dma32map_off = 0;
         const __be64 *prop64;
         const __be32 *prop32;
         int len;
+       unsigned int segno;
         u64 phb_id;
         void *aux;
         long rc;
@@ -3306,13 +3481,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                 pr_err("  Failed to map registers !\n");
  
         /* Initialize more IODA stuff */
-       phb->ioda.total_pe = 1;
+       phb->ioda.total_pe_num = 1;
         prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
         if (prop32)
-               phb->ioda.total_pe = be32_to_cpup(prop32);
+               phb->ioda.total_pe_num = be32_to_cpup(prop32);
         prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
         if (prop32)
-               phb->ioda.reserved_pe = be32_to_cpup(prop32);
+               phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
  
         /* Parse 64-bit MMIO range */
         pnv_ioda_parse_m64_window(phb);
@@ -3321,36 +3496,58 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
         /* FW Has already off top 64k of M32 space (MSI space) */
         phb->ioda.m32_size += 0x10000;
  
-       phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+       phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
         phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
         phb->ioda.io_size = hose->pci_io_size;
-       phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+       phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
         phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
  
+       /* Calculate how many 32-bit TCE segments we have */
+       phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                               PNV_IODA1_DMA32_SEGSIZE;
+
         /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
-       size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+       size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+                       sizeof(unsigned long));
+       m64map_off = size;
+       size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
         m32map_off = size;
-       size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
+       size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
         if (phb->type == PNV_PHB_IODA1) {
                 iomap_off = size;
-               size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
+               size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+               dma32map_off = size;
+               size += phb->ioda.dma32_count *
+                       sizeof(phb->ioda.dma32_segmap[0]);
         }
         pemap_off = size;
-       size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+       size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
         aux = memblock_virt_alloc(size, 0);
         phb->ioda.pe_alloc = aux;
+       phb->ioda.m64_segmap = aux + m64map_off;
         phb->ioda.m32_segmap = aux + m32map_off;
-       if (phb->type == PNV_PHB_IODA1)
+       for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
+               phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
+               phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
+       }
+       if (phb->type == PNV_PHB_IODA1) {
                 phb->ioda.io_segmap = aux + iomap_off;
+               for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
+                       phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
+
+               phb->ioda.dma32_segmap = aux + dma32map_off;
+               for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+                       phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
+       }
         phb->ioda.pe_array = aux + pemap_off;
-       set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
+       set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc);
  
-       INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
         INIT_LIST_HEAD(&phb->ioda.pe_list);
         mutex_init(&phb->ioda.pe_list_mutex);
  
         /* Calculate how many 32-bit TCE segments we have */
-       phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+       phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                               PNV_IODA1_DMA32_SEGSIZE;
  
  #if 0 /* We should really do that ... */
         rc = opal_pci_set_phb_mem_window(opal->phb_id,
@@ -3362,7 +3559,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
  #endif
  
         pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
-               phb->ioda.total_pe, phb->ioda.reserved_pe,
+               phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
                 phb->ioda.m32_size, phb->ioda.m32_segsize);
         if (phb->ioda.m64_size)
                 pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
@@ -3377,12 +3574,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
         phb->freeze_pe = pnv_ioda_freeze_pe;
         phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
  
-       /* Setup RID -> PE mapping function */
-       phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
-
-       /* Setup TCEs */
-       phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
-
         /* Setup MSI support */
         pnv_pci_init_ioda_msis(phb);
  
@@ -3395,10 +3586,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
          */
         ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
  
-       if (phb->type == PNV_PHB_NPU)
+       if (phb->type == PNV_PHB_NPU) {
                 hose->controller_ops = pnv_npu_ioda_controller_ops;
-       else
+       } else {
+               phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
                 hose->controller_ops = pnv_pci_ioda_controller_ops;
+       }
  
  #ifdef CONFIG_PCI_IOV
         ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c

index 73c8dc2a353fdd540b9a05a5aa208b3e50970e47..1d92bd93bcd9f5c8d28d760088bb3fcdeb14365f 100644 (file)
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -39,9 +39,6 @@
  /* Delay in usec */
  #define PCI_RESET_DELAY_US     3000000
  
-#define cfg_dbg(fmt...)        do { } while(0)
-//#define cfg_dbg(fmt...)      printk(fmt)
-
  #ifdef CONFIG_PCI_MSI
  int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
  {
@@ -370,7 +367,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
         struct pnv_phb *phb = pdn->phb->private_data;
         u8      fstate;
         __be16  pcierr;
-       int     pe_no;
+       unsigned int pe_no;
         s64     rc;
  
         /*
@@ -380,7 +377,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
          */
         pe_no = pdn->pe_number;
         if (pe_no == IODA_INVALID_PE) {
-               pe_no = phb->ioda.reserved_pe;
+               pe_no = phb->ioda.reserved_pe_idx;
         }
  
         /*
@@ -402,8 +399,8 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
                 }
         }
  
-       cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
-               (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+       pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+                (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
  
         /* Clear the frozen state if applicable */
         if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
@@ -451,8 +448,8 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
                 return PCIBIOS_FUNC_NOT_SUPPORTED;
         }
  
-       cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
-               __func__, pdn->busno, pdn->devfn, where, size, *val);
+       pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+                __func__, pdn->busno, pdn->devfn, where, size, *val);
         return PCIBIOS_SUCCESSFUL;
  }
  
@@ -462,8 +459,8 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
         struct pnv_phb *phb = pdn->phb->private_data;
         u32 bdfn = (pdn->busno << 8) | pdn->devfn;
  
-       cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
-               pdn->busno, pdn->devfn, where, size, val);
+       pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+                __func__, pdn->busno, pdn->devfn, where, size, val);
         switch (size) {
         case 1:
                 opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h

index 3f814f382b2e793bf0b5374abb74bebf35246a98..7dee25e304db2cbe1faedfe3329754605e678beb 100644 (file)
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -24,7 +24,6 @@ enum pnv_phb_model {
  #define PNV_IODA_PE_MASTER     (1 << 3)        /* Master PE in compound case   */
  #define PNV_IODA_PE_SLAVE      (1 << 4)        /* Slave PE in compound case    */
  #define PNV_IODA_PE_VF         (1 << 5)        /* PE for one VF                */
-#define PNV_IODA_PE_PEER       (1 << 6)        /* PE has peers                 */
  
  /* Data associated with a PE, including IOMMU tracking etc.. */
  struct pnv_phb;
@@ -32,9 +31,6 @@ struct pnv_ioda_pe {
         unsigned long           flags;
         struct pnv_phb          *phb;
  
-#define PNV_IODA_MAX_PEER_PES  8
-       struct pnv_ioda_pe      *peers[PNV_IODA_MAX_PEER_PES];
-
         /* A PE can be associated with a single device or an
          * entire bus (& children). In the former case, pdev
          * is populated, in the later case, pbus is.
@@ -53,14 +49,7 @@ struct pnv_ioda_pe {
         /* PE number */
         unsigned int            pe_number;
  
-       /* "Weight" assigned to the PE for the sake of DMA resource
-        * allocations
-        */
-       unsigned int            dma_weight;
-
         /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
-       int                     tce32_seg;
-       int                     tce32_segcount;
         struct iommu_table_group table_group;
  
         /* 64-bit TCE bypass region */
@@ -78,7 +67,6 @@ struct pnv_ioda_pe {
         struct list_head        slaves;
  
         /* Link in list of PE#s */
-       struct list_head        dma_link;
         struct list_head        list;
  };
  
@@ -110,19 +98,18 @@ struct pnv_phb {
                          unsigned int is_64, struct msi_msg *msg);
         void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
         void (*fixup_phb)(struct pci_controller *hose);
-       u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
         int (*init_m64)(struct pnv_phb *phb);
         void (*reserve_m64_pe)(struct pci_bus *bus,
                                unsigned long *pe_bitmap, bool all);
-       int (*pick_m64_pe)(struct pci_bus *bus, bool all);
+       struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
         int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
         void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
         int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
  
         struct {
                 /* Global bridge info */
-               unsigned int            total_pe;
-               unsigned int            reserved_pe;
+               unsigned int            total_pe_num;
+               unsigned int            reserved_pe_idx;
  
                 /* 32-bit MMIO window */
                 unsigned int            m32_size;
@@ -141,15 +128,19 @@ struct pnv_phb {
                 unsigned int            io_segsize;
                 unsigned int            io_pci_base;
  
-               /* PE allocation bitmap */
-               unsigned long           *pe_alloc;
-               /* PE allocation mutex */
+               /* PE allocation */
                 struct mutex            pe_alloc_mutex;
+               unsigned long           *pe_alloc;
+               struct pnv_ioda_pe      *pe_array;
  
                 /* M32 & IO segment maps */
+               unsigned int            *m64_segmap;
                 unsigned int            *m32_segmap;
                 unsigned int            *io_segmap;
-               struct pnv_ioda_pe      *pe_array;
+
+               /* DMA32 segment maps - IODA1 only */
+               unsigned int            dma32_count;
+               unsigned int            *dma32_segmap;
  
                 /* IRQ chip */
                 int                     irq_chip_init;
@@ -167,20 +158,6 @@ struct pnv_phb {
                  */
                 unsigned char           pe_rmap[0x10000];
  
-               /* 32-bit TCE tables allocation */
-               unsigned long           tce32_count;
-
-               /* Total "weight" for the sake of DMA resources
-                * allocation
-                */
-               unsigned int            dma_weight;
-               unsigned int            dma_pe_count;
-
-               /* Sorted list of used PE's, sorted at
-                * boot for resource allocation purposes
-                */
-               struct list_head        pe_dma_list;
-
                 /* TCE cache invalidate registers (physical and
                  * remapped)
                  */
@@ -236,16 +213,23 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
  
+extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+                           const char *fmt, ...);
+#define pe_err(pe, fmt, ...)                                   \
+       pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+#define pe_warn(pe, fmt, ...)                                  \
+       pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define pe_info(pe, fmt, ...)                                  \
+       pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
+
  /* Nvlink functions */
-extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe);
-extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
-                                      struct iommu_table *tbl,
-                                      unsigned long index,
-                                      unsigned long npages,
-                                      bool rm);
-extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
-extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
-extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled);
-extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask);
+extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+               struct iommu_table *tbl);
+extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
+extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
+extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
  
  #endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c

index 1acb0c72d9231d980080af3acbae7120b720c67c..ee6430bedcc3181edd39615097be788f8fae7155 100644 (file)
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -273,7 +273,10 @@ static int __init pnv_probe(void)
         if (!of_flat_dt_is_compatible(root, "ibm,powernv"))
                 return 0;
  
-       hpte_init_native();
+       if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled())
+               radix_init_native();
+       else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64))
+               hpte_init_native();
  
         if (firmware_has_feature(FW_FEATURE_OPAL))
                 pnv_setup_machdep_opal();
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c

index 2f95d33cf34a491d5970dd14ff5b88f757b31181..c9a3e677192a38e19b3f07a23fcbcfb7fcc3c3a7 100644 (file)
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
         vflags &= ~HPTE_V_SECONDARY;
  
         hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
+       hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags;
  
         spin_lock_irqsave(&ps3_htab_lock, flags);
  
diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c

index a0bca05e26b0c7478c7483c43b2437fa0f6a61d7..492b2575e0d2e7048dc52df2a67ea985234c944a 100644 (file)
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -205,7 +205,7 @@ static void spu_unmap(struct spu *spu)
  static int __init setup_areas(struct spu *spu)
  {
         struct table {char* name; unsigned long addr; unsigned long size;};
-       static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3;
+       unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
  
         spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
                                            sizeof(struct spe_shadow),
@@ -216,7 +216,7 @@ static int __init setup_areas(struct spu *spu)
         }
  
         spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys,
-               LS_SIZE, _PAGE_NO_CACHE);
+               LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0))));
  
         if (!spu->local_store) {
                 pr_debug("%s:%d: ioremap local_store failed\n",
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c

index e9ff44cd5d86e3491eaa7c5a7a93c48156b480f1..2ce138542083bd82af773d5015025d0a2a3d0890 100644 (file)
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -116,6 +116,155 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn)
         return new_prop;
  }
  
+static void dlpar_update_drconf_property(struct device_node *dn,
+                                        struct property *prop)
+{
+       struct of_drconf_cell *lmbs;
+       u32 num_lmbs, *p;
+       int i;
+
+       /* Convert the property back to BE */
+       p = prop->value;
+       num_lmbs = *p;
+       *p = cpu_to_be32(*p);
+       p++;
+
+       lmbs = (struct of_drconf_cell *)p;
+       for (i = 0; i < num_lmbs; i++) {
+               lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr);
+               lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index);
+               lmbs[i].flags = cpu_to_be32(lmbs[i].flags);
+       }
+
+       rtas_hp_event = true;
+       of_update_property(dn, prop);
+       rtas_hp_event = false;
+}
+
+static int dlpar_update_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+       struct device_node *dn;
+       struct property *prop;
+       struct of_drconf_cell *lmbs;
+       u32 *p, num_lmbs;
+       int i;
+
+       dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+       if (!dn)
+               return -ENODEV;
+
+       prop = dlpar_clone_drconf_property(dn);
+       if (!prop) {
+               of_node_put(dn);
+               return -ENODEV;
+       }
+
+       p = prop->value;
+       num_lmbs = *p++;
+       lmbs = (struct of_drconf_cell *)p;
+
+       for (i = 0; i < num_lmbs; i++) {
+               if (lmbs[i].drc_index == lmb->drc_index) {
+                       lmbs[i].flags = lmb->flags;
+                       lmbs[i].aa_index = lmb->aa_index;
+
+                       dlpar_update_drconf_property(dn, prop);
+                       break;
+               }
+       }
+
+       of_node_put(dn);
+       return 0;
+}
+
+static u32 lookup_lmb_associativity_index(struct of_drconf_cell *lmb)
+{
+       struct device_node *parent, *lmb_node, *dr_node;
+       const u32 *lmb_assoc;
+       const u32 *assoc_arrays;
+       u32 aa_index;
+       int aa_arrays, aa_array_entries, aa_array_sz;
+       int i;
+
+       parent = of_find_node_by_path("/");
+       if (!parent)
+               return -ENODEV;
+
+       lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index),
+                                            parent);
+       of_node_put(parent);
+       if (!lmb_node)
+               return -EINVAL;
+
+       lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL);
+       if (!lmb_assoc) {
+               dlpar_free_cc_nodes(lmb_node);
+               return -ENODEV;
+       }
+
+       dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+       if (!dr_node) {
+               dlpar_free_cc_nodes(lmb_node);
+               return -ENODEV;
+       }
+
+       assoc_arrays = of_get_property(dr_node,
+                                      "ibm,associativity-lookup-arrays",
+                                      NULL);
+       of_node_put(dr_node);
+       if (!assoc_arrays) {
+               dlpar_free_cc_nodes(lmb_node);
+               return -ENODEV;
+       }
+
+       /* The ibm,associativity-lookup-arrays property is defined to be
+        * a 32-bit value specifying the number of associativity arrays
+        * followed by a 32-bit value specifying the number of entries per
+        * array, followed by the associativity arrays.
+        */
+       aa_arrays = be32_to_cpu(assoc_arrays[0]);
+       aa_array_entries = be32_to_cpu(assoc_arrays[1]);
+       aa_array_sz = aa_array_entries * sizeof(u32);
+
+       aa_index = -1;
+       for (i = 0; i < aa_arrays; i++) {
+               int indx = (i * aa_array_entries) + 2;
+
+               if (memcmp(&assoc_arrays[indx], &lmb_assoc[1], aa_array_sz))
+                       continue;
+
+               aa_index = i;
+               break;
+       }
+
+       dlpar_free_cc_nodes(lmb_node);
+       return aa_index;
+}
+
+static int dlpar_add_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+       int aa_index;
+
+       lmb->flags |= DRCONF_MEM_ASSIGNED;
+
+       aa_index = lookup_lmb_associativity_index(lmb);
+       if (aa_index < 0) {
+               pr_err("Couldn't find associativity index for drc index %x\n",
+                      lmb->drc_index);
+               return aa_index;
+       }
+
+       lmb->aa_index = aa_index;
+       return dlpar_update_device_tree_lmb(lmb);
+}
+
+static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+       lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+       lmb->aa_index = 0xffffffff;
+       return dlpar_update_device_tree_lmb(lmb);
+}
+
  static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
  {
         unsigned long section_nr;
@@ -243,8 +392,8 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
         memblock_remove(lmb->base_addr, block_sz);
  
         dlpar_release_drc(lmb->drc_index);
+       dlpar_remove_device_tree_lmb(lmb);
  
-       lmb->flags &= ~DRCONF_MEM_ASSIGNED;
         return 0;
  }
  
@@ -384,43 +533,32 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop)
  
  #endif /* CONFIG_MEMORY_HOTREMOVE */
  
-static int dlpar_add_lmb(struct of_drconf_cell *lmb)
+static int dlpar_add_lmb_memory(struct of_drconf_cell *lmb)
  {
         struct memory_block *mem_block;
         unsigned long block_sz;
         int nid, rc;
  
-       if (lmb->flags & DRCONF_MEM_ASSIGNED)
-               return -EINVAL;
-
         block_sz = memory_block_size_bytes();
  
-       rc = dlpar_acquire_drc(lmb->drc_index);
-       if (rc)
-               return rc;
-
         /* Find the node id for this address */
         nid = memory_add_physaddr_to_nid(lmb->base_addr);
  
         /* Add the memory */
         rc = add_memory(nid, lmb->base_addr, block_sz);
-       if (rc) {
-               dlpar_release_drc(lmb->drc_index);
+       if (rc)
                 return rc;
-       }
  
         /* Register this block of memory */
         rc = memblock_add(lmb->base_addr, block_sz);
         if (rc) {
                 remove_memory(nid, lmb->base_addr, block_sz);
-               dlpar_release_drc(lmb->drc_index);
                 return rc;
         }
  
         mem_block = lmb_to_memblock(lmb);
         if (!mem_block) {
                 remove_memory(nid, lmb->base_addr, block_sz);
-               dlpar_release_drc(lmb->drc_index);
                 return -EINVAL;
         }
  
@@ -428,7 +566,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb)
         put_device(&mem_block->dev);
         if (rc) {
                 remove_memory(nid, lmb->base_addr, block_sz);
-               dlpar_release_drc(lmb->drc_index);
                 return rc;
         }
  
@@ -436,6 +573,34 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb)
         return 0;
  }
  
+static int dlpar_add_lmb(struct of_drconf_cell *lmb)
+{
+       int rc;
+
+       if (lmb->flags & DRCONF_MEM_ASSIGNED)
+               return -EINVAL;
+
+       rc = dlpar_acquire_drc(lmb->drc_index);
+       if (rc)
+               return rc;
+
+       rc = dlpar_add_device_tree_lmb(lmb);
+       if (rc) {
+               pr_err("Couldn't update device tree for drc index %x\n",
+                      lmb->drc_index);
+               dlpar_release_drc(lmb->drc_index);
+               return rc;
+       }
+
+       rc = dlpar_add_lmb_memory(lmb);
+       if (rc) {
+               dlpar_remove_device_tree_lmb(lmb);
+               dlpar_release_drc(lmb->drc_index);
+       }
+
+       return rc;
+}
+
  static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop)
  {
         struct of_drconf_cell *lmbs;
@@ -536,31 +701,6 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop)
         return rc;
  }
  
-static void dlpar_update_drconf_property(struct device_node *dn,
-                                        struct property *prop)
-{
-       struct of_drconf_cell *lmbs;
-       u32 num_lmbs, *p;
-       int i;
-
-       /* Convert the property back to BE */
-       p = prop->value;
-       num_lmbs = *p;
-       *p = cpu_to_be32(*p);
-       p++;
-
-       lmbs = (struct of_drconf_cell *)p;
-       for (i = 0; i < num_lmbs; i++) {
-               lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr);
-               lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index);
-               lmbs[i].flags = cpu_to_be32(lmbs[i].flags);
-       }
-
-       rtas_hp_event = true;
-       of_update_property(dn, prop);
-       rtas_hp_event = false;
-}
-
  int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
  {
         struct device_node *dn;
@@ -608,10 +748,7 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
                 break;
         }
  
-       if (rc)
-               dlpar_free_drconf_property(prop);
-       else
-               dlpar_update_drconf_property(dn, prop);
+       dlpar_free_drconf_property(prop);
  
  dlpar_memory_out:
         of_node_put(dn);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c

index bd98ce2be17b766182b4b5df4358c8d731b8c305..b7dfc1359d0113d570fe9ea7f702c728814dc66f 100644 (file)
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -912,7 +912,8 @@ machine_arch_initcall(pseries, find_existing_ddw_windows);
  static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
                         struct ddw_query_response *query)
  {
-       struct eeh_dev *edev;
+       struct device_node *dn;
+       struct pci_dn *pdn;
         u32 cfg_addr;
         u64 buid;
         int ret;
@@ -923,11 +924,10 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
          * Retrieve them from the pci device, not the node with the
          * dma-window property
          */
-       edev = pci_dev_to_eeh_dev(dev);
-       cfg_addr = edev->config_addr;
-       if (edev->pe_config_addr)
-               cfg_addr = edev->pe_config_addr;
-       buid = edev->phb->buid;
+       dn = pci_device_to_OF_node(dev);
+       pdn = PCI_DN(dn);
+       buid = pdn->phb->buid;
+       cfg_addr = (pdn->busno << 8) | pdn->devfn;
  
         ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
                   cfg_addr, BUID_HI(buid), BUID_LO(buid));
@@ -941,7 +941,8 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
                         struct ddw_create_response *create, int page_shift,
                         int window_shift)
  {
-       struct eeh_dev *edev;
+       struct device_node *dn;
+       struct pci_dn *pdn;
         u32 cfg_addr;
         u64 buid;
         int ret;
@@ -952,11 +953,10 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
          * Retrieve them from the pci device, not the node with the
          * dma-window property
          */
-       edev = pci_dev_to_eeh_dev(dev);
-       cfg_addr = edev->config_addr;
-       if (edev->pe_config_addr)
-               cfg_addr = edev->pe_config_addr;
-       buid = edev->phb->buid;
+       dn = pci_device_to_OF_node(dev);
+       pdn = PCI_DN(dn);
+       buid = pdn->phb->buid;
+       cfg_addr = (pdn->busno << 8) | pdn->devfn;
  
         do {
                 /* extra outputs are LIOBN and dma-addr (hi, lo) */
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c

index 2415a0d31f8fd82117af5e7ab80582229a40c555..7f6100d91b4b9284cdec12c15accc8f3d2b3bee2 100644 (file)
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -89,18 +89,21 @@ void vpa_init(int cpu)
                        "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
                 return;
         }
+
+#ifdef CONFIG_PPC_STD_MMU_64
         /*
          * PAPR says this feature is SLB-Buffer but firmware never
          * reports that.  All SPLPAR support SLB shadow buffer.
          */
-       addr = __pa(paca[cpu].slb_shadow_ptr);
-       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+       if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
+               addr = __pa(paca[cpu].slb_shadow_ptr);
                 ret = register_slb_shadow(hwcpu, addr);
                 if (ret)
                         pr_err("WARNING: SLB shadow buffer registration for "
                                "cpu %d (hw %d) of area %lx failed with %ld\n",
                                cpu, hwcpu, addr, ret);
         }
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
         /*
          * Register dispatch trace log, if one has been allocated.
@@ -123,6 +126,8 @@ void vpa_init(int cpu)
         }
  }
  
+#ifdef CONFIG_PPC_STD_MMU_64
+
  static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
                                      unsigned long vpn, unsigned long pa,
                                      unsigned long rflags, unsigned long vflags,
@@ -139,7 +144,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
                          hpte_group, vpn,  pa, rflags, vflags, psize);
  
         hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+       hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
  
         if (!(vflags & HPTE_V_BOLTED))
                 pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
@@ -152,10 +157,6 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
         /* Exact = 0                   */
         flags = 0;
  
-       /* Make pHyp happy */
-       if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
-               hpte_r &= ~HPTE_R_M;
-
         if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
                 flags |= H_COALESCE_CAND;
  
@@ -659,6 +660,8 @@ static void pSeries_set_page_state(struct page *page, int order,
  
  void arch_free_page(struct page *page, int order)
  {
+       if (radix_enabled())
+               return;
         if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
                 return;
  
@@ -666,7 +669,8 @@ void arch_free_page(struct page *page, int order)
  }
  EXPORT_SYMBOL(arch_free_page);
  
-#endif
+#endif /* CONFIG_PPC_SMLPAR */
+#endif /* CONFIG_PPC_STD_MMU_64 */
  
  #ifdef CONFIG_TRACEPOINTS
  #ifdef HAVE_JUMP_LABEL
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c

index c9fecf09b8fada71dec94e9833c35be524b498e4..afa05a2cb7029b4df7b5fa28d099d8776650aed1 100644 (file)
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -484,8 +484,9 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
         seq_printf(m, "shared_processor_mode=%d\n",
                    lppaca_shared_proc(get_lppaca()));
  
+#ifdef CONFIG_PPC_STD_MMU_64
         seq_printf(m, "slb_size=%d\n", mmu_slb_size);
-
+#endif
         parse_em_data(m);
  
         return 0;
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c

index ceb18d34945978d9bc357dce09800882f27dc83a..a560a98bcf3bc2552486b552ef8e8a887425f0ba 100644 (file)
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -191,8 +191,8 @@ static int update_dt_node(__be32 phandle, s32 scope)
                                 break;
  
                         case 0x80000000:
-                               prop = of_find_property(dn, prop_name, NULL);
-                               of_remove_property(dn, prop);
+                               of_remove_property(dn, of_find_property(dn,
+                                                       prop_name, NULL));
                                 prop = NULL;
                                 break;
  
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c

index 272e9ec1ab54173b95db1c57f93dff60cf2276fe..543a6386f3eb2c4de4a80a7dde5f42e7267ca27d 100644 (file)
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -305,7 +305,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request)
         memset(&counts, 0, sizeof(struct msi_counts));
  
         /* Work out how many devices we have below this PE */
-       traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+       pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts);
  
         if (counts.num_devices == 0) {
                 pr_err("rtas_msi: found 0 devices under PE for %s\n",
@@ -320,7 +320,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request)
         /* else, we have some more calculating to do */
         counts.requestor = pci_device_to_OF_node(dev);
         counts.request = request;
-       traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+       pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts);
  
         /* If the quota isn't an integer multiple of the total, we can
          * use the remainder as spare MSIs for anyone that wants them. */
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c

index 5d4a3df59d0c95fedaa7c2235bb7dbc034d7d9f6..906dbaa97fe2850abba20b597b55b4a2a14627ae 100644 (file)
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -34,38 +34,6 @@
  
  #include "pseries.h"
  
-static struct pci_bus *
-find_bus_among_children(struct pci_bus *bus,
-                        struct device_node *dn)
-{
-       struct pci_bus *child = NULL;
-       struct pci_bus *tmp;
-       struct device_node *busdn;
-
-       busdn = pci_bus_to_OF_node(bus);
-       if (busdn == dn)
-               return bus;
-
-       list_for_each_entry(tmp, &bus->children, node) {
-               child = find_bus_among_children(tmp, dn);
-               if (child)
-                       break;
-       };
-       return child;
-}
-
-struct pci_bus *
-pcibios_find_pci_bus(struct device_node *dn)
-{
-       struct pci_dn *pdn = dn->data;
-
-       if (!pdn  || !pdn->phb || !pdn->phb->bus)
-               return NULL;
-
-       return find_bus_among_children(pdn->phb->bus, dn);
-}
-EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
-
  struct pci_controller *init_phb_dynamic(struct device_node *dn)
  {
         struct pci_controller *phb;
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c

index 7c7fcc04254948837a3a747ef83fd6cdd6babb3b..cc66c49f07aa1b8a6825382216efb38486665460 100644 (file)
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -303,7 +303,6 @@ static int do_remove_property(char *buf, size_t bufsize)
  {
         struct device_node *np;
         char *tmp;
-       struct property *prop;
         buf = parse_node(buf, bufsize, &np);
  
         if (!np)
@@ -316,9 +315,7 @@ static int do_remove_property(char *buf, size_t bufsize)
         if (strlen(buf) == 0)
                 return -EINVAL;
  
-       prop = of_find_property(np, buf, NULL);
-
-       return of_remove_property(np, prop);
+       return of_remove_property(np, of_find_property(np, buf, NULL));
  }
  
  static int do_update_property(char *buf, size_t bufsize)
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c

index 6e944fc6e5f979d2559fd805970a81af7886315f..9883bc7ea0079dfd2754f110794bcc834fbe4273 100644 (file)
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -235,6 +235,8 @@ static void __init pseries_discover_pic(void)
  
         for_each_node_by_name(np, "interrupt-controller") {
                 typep = of_get_property(np, "compatible", NULL);
+               if (!typep)
+                       continue;
                 if (strstr(typep, "open-pic")) {
                         pSeries_mpic_node = of_node_get(np);
                         ppc_md.init_IRQ       = pseries_mpic_init_IRQ;
@@ -265,7 +267,7 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act
                 pdn = parent ? PCI_DN(parent) : NULL;
                 if (pdn) {
                         /* Create pdn and EEH device */
-                       update_dn_pci_info(np, pdn->phb);
+                       pci_add_device_node_info(pdn->phb, np);
                         eeh_dev_init(PCI_DN(np), pdn->phb);
                 }
  
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c

index 85729f49764fc04c98604c119cf28f69e7da107b..0ef9df49f0f2c2aca631963a188057d64ffe4098 100644 (file)
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -37,6 +37,7 @@
  #include <asm/pci-bridge.h>
  #include <asm/ppc-pci.h>
  #include <asm/machdep.h>
+#include <asm/mpc85xx.h>
  #include <asm/disassemble.h>
  #include <asm/ppc-opcode.h>
  #include <sysdev/fsl_soc.h>
@@ -527,6 +528,8 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary)
         u8 hdr_type, progif;
         struct device_node *dev;
         struct ccsr_pci __iomem *pci;
+       u16 temp;
+       u32 svr = mfspr(SPRN_SVR);
  
         dev = pdev->dev.of_node;
  
@@ -596,6 +599,27 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary)
                         PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS;
                 if (fsl_pcie_check_link(hose))
                         hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
+       } else {
+               /*
+                * Set PBFR(PCI Bus Function Register)[10] = 1 to
+                * disable the combining of crossing cacheline
+                * boundary requests into one burst transaction.
+                * PCI-X operation is not affected.
+                * Fix erratum PCI 5 on MPC8548
+                */
+#define PCI_BUS_FUNCTION 0x44
+#define PCI_BUS_FUNCTION_MDS 0x400     /* Master disable streaming */
+               if (((SVR_SOC_VER(svr) == SVR_8543) ||
+                    (SVR_SOC_VER(svr) == SVR_8545) ||
+                    (SVR_SOC_VER(svr) == SVR_8547) ||
+                    (SVR_SOC_VER(svr) == SVR_8548)) &&
+                   !early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX)) {
+                       early_read_config_word(hose, 0, 0,
+                                       PCI_BUS_FUNCTION, &temp);
+                       temp |= PCI_BUS_FUNCTION_MDS;
+                       early_write_config_word(hose, 0, 0,
+                                       PCI_BUS_FUNCTION, temp);
+               }
         }
  
         printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. "
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c

index afe3c7cd395d3ddce0077d79044f71f3030fd164..7de45b2df36699abf7cb4ce98e710aba0bce2ab3 100644 (file)
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -2004,8 +2004,15 @@ static struct syscore_ops mpic_syscore_ops = {
  
  static int mpic_init_sys(void)
  {
+       int rc;
+
         register_syscore_ops(&mpic_syscore_ops);
-       subsys_system_register(&mpic_subsys, NULL);
+       rc = subsys_system_register(&mpic_subsys, NULL);
+       if (rc) {
+               unregister_syscore_ops(&mpic_syscore_ops);
+               pr_err("mpic: Failed to register subsystem!\n");
+               return rc;
+       }
  
         return 0;
  }
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile

index 436062dbb6e2e5c4141c70c2e43d9080679199b2..0b2f771593ebde86b39e636f6ed2a280603c7159 100644 (file)
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -7,7 +7,7 @@ UBSAN_SANITIZE := n
  
  ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
  
-obj-y                  += xmon.o nonstdio.o
+obj-y                  += xmon.o nonstdio.o spr_access.o
  
  ifdef CONFIG_XMON_DISASSEMBLY
  obj-y                  += ppc-dis.o ppc-opc.o
diff --git a/arch/powerpc/xmon/spr_access.S b/arch/powerpc/xmon/spr_access.S

new file mode 100644 (file)

index 0000000..84ad742
--- /dev/null
+++ b/arch/powerpc/xmon/spr_access.S
@@ -0,0 +1,45 @@
+#include <asm/ppc_asm.h>
+
+/* unsigned long xmon_mfspr(sprn, default_value) */
+_GLOBAL(xmon_mfspr)
+       ld      r5, .Lmfspr_table@got(r2)
+       b       xmon_mxspr
+
+/* void xmon_mtspr(sprn, new_value) */
+_GLOBAL(xmon_mtspr)
+       ld      r5, .Lmtspr_table@got(r2)
+       b       xmon_mxspr
+
+/*
+ * r3 = sprn
+ * r4 = default or new value
+ * r5 = table base
+ */
+xmon_mxspr:
+       /*
+        * To index into the table of mxsprs we need:
+        *  i = (sprn & 0x3ff) * 8
+        * or using rlwinm:
+        *  i = (sprn << 3) & (0x3ff << 3)
+        */
+       rlwinm  r3, r3, 3, 0x3ff << 3
+       add     r5, r5, r3
+       mtctr   r5
+       mr      r3, r4 /* put default_value in r3 for mfspr */
+       bctr
+
+.Lmfspr_table:
+       spr = 0
+       .rept   1024
+       mfspr   r3, spr
+       blr
+       spr = spr + 1
+       .endr
+
+.Lmtspr_table:
+       spr = 0
+       .rept   1024
+       mtspr   spr, r4
+       blr
+       spr = spr + 1
+       .endr
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c

index 942796fa476750221133664d35d701a97b659499..c5e155108be5ac65dc3de9f0b9eb469aa1eb3d26 100644 (file)
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -86,6 +86,7 @@ static char tmpstr[128];
  
  static long bus_error_jmp[JMP_BUF_LEN];
  static int catch_memory_errors;
+static int catch_spr_faults;
  static long *xmon_fault_jmp[NR_CPUS];
  
  /* Breakpoint stuff */
@@ -147,7 +148,7 @@ void getstring(char *, int);
  static void flush_input(void);
  static int inchar(void);
  static void take_input(char *);
-static unsigned long read_spr(int);
+static int  read_spr(int, unsigned long *);
  static void write_spr(int, unsigned long);
  static void super_regs(void);
  static void remove_bpts(void);
@@ -250,6 +251,9 @@ Commands:\n\
    sdi #        disassemble spu local store for spu # (in hex)\n"
  #endif
  "  S   print special registers\n\
+  Sa    print all SPRs\n\
+  Sr # read SPR #\n\
+  Sw #v write v to SPR #\n\
    t    print backtrace\n\
    x    exit monitor and recover\n\
    X    exit monitor and don't recover\n"
@@ -442,6 +446,12 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
  #ifdef CONFIG_SMP
         cpu = smp_processor_id();
         if (cpumask_test_cpu(cpu, &cpus_in_xmon)) {
+               /*
+                * We catch SPR read/write faults here because the 0x700, 0xf60
+                * etc. handlers don't call debugger_fault_handler().
+                */
+               if (catch_spr_faults)
+                       longjmp(bus_error_jmp, 1);
                 get_output_lock();
                 excprint(regs);
                 printf("cpu 0x%x: Exception %lx %s in xmon, "
@@ -1635,89 +1645,87 @@ static void cacheflush(void)
         catch_memory_errors = 0;
  }
  
-static unsigned long
-read_spr(int n)
+extern unsigned long xmon_mfspr(int spr, unsigned long default_value);
+extern void xmon_mtspr(int spr, unsigned long value);
+
+static int
+read_spr(int n, unsigned long *vp)
  {
-       unsigned int instrs[2];
-       unsigned long (*code)(void);
         unsigned long ret = -1UL;
-#ifdef CONFIG_PPC64
-       unsigned long opd[3];
-
-       opd[0] = (unsigned long)instrs;
-       opd[1] = 0;
-       opd[2] = 0;
-       code = (unsigned long (*)(void)) opd;
-#else
-       code = (unsigned long (*)(void)) instrs;
-#endif
-
-       /* mfspr r3,n; blr */
-       instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6);
-       instrs[1] = 0x4e800020;
-       store_inst(instrs);
-       store_inst(instrs+1);
+       int ok = 0;
  
         if (setjmp(bus_error_jmp) == 0) {
-               catch_memory_errors = 1;
+               catch_spr_faults = 1;
                 sync();
  
-               ret = code();
+               ret = xmon_mfspr(n, *vp);
  
                 sync();
-               /* wait a little while to see if we get a machine check */
-               __delay(200);
-               n = size;
+               *vp = ret;
+               ok = 1;
         }
+       catch_spr_faults = 0;
  
-       return ret;
+       return ok;
  }
  
  static void
  write_spr(int n, unsigned long val)
  {
-       unsigned int instrs[2];
-       unsigned long (*code)(unsigned long);
-#ifdef CONFIG_PPC64
-       unsigned long opd[3];
-
-       opd[0] = (unsigned long)instrs;
-       opd[1] = 0;
-       opd[2] = 0;
-       code = (unsigned long (*)(unsigned long)) opd;
-#else
-       code = (unsigned long (*)(unsigned long)) instrs;
-#endif
-
-       instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6);
-       instrs[1] = 0x4e800020;
-       store_inst(instrs);
-       store_inst(instrs+1);
-
         if (setjmp(bus_error_jmp) == 0) {
-               catch_memory_errors = 1;
+               catch_spr_faults = 1;
                 sync();
  
-               code(val);
+               xmon_mtspr(n, val);
  
                 sync();
-               /* wait a little while to see if we get a machine check */
-               __delay(200);
-               n = size;
+       } else {
+               printf("SPR 0x%03x (%4d) Faulted during write\n", n, n);
         }
+       catch_spr_faults = 0;
  }
  
  static unsigned long regno;
  extern char exc_prolog;
  extern char dec_exc;
  
+static void dump_one_spr(int spr, bool show_unimplemented)
+{
+       unsigned long val;
+
+       val = 0xdeadbeef;
+       if (!read_spr(spr, &val)) {
+               printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr);
+               return;
+       }
+
+       if (val == 0xdeadbeef) {
+               /* Looks like read was a nop, confirm */
+               val = 0x0badcafe;
+               if (!read_spr(spr, &val)) {
+                       printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr);
+                       return;
+               }
+
+               if (val == 0x0badcafe) {
+                       if (show_unimplemented)
+                               printf("SPR 0x%03x (%4d) Unimplemented\n", spr, spr);
+                       return;
+               }
+       }
+
+       printf("SPR 0x%03x (%4d) = 0x%lx\n", spr, spr, val);
+}
+
  static void super_regs(void)
  {
         int cmd;
-       unsigned long val;
+       int spr;
  
         cmd = skipbl();
-       if (cmd == '\n') {
+
+       switch (cmd) {
+       case '\n': {
                 unsigned long sp, toc;
                 asm("mr %0,1" : "=r" (sp) :);
                 asm("mr %0,2" : "=r" (toc) :);
@@ -1730,21 +1738,29 @@ static void super_regs(void)
                        mfspr(SPRN_DEC), mfspr(SPRN_SPRG2));
                 printf("sp   = "REG"  sprg3= "REG"\n", sp, mfspr(SPRN_SPRG3));
                 printf("toc  = "REG"  dar  = "REG"\n", toc, mfspr(SPRN_DAR));
-
                 return;
         }
-
-       scanhex(&regno);
-       switch (cmd) {
-       case 'w':
-               val = read_spr(regno);
+       case 'w': {
+               unsigned long val;
+               scanhex(&regno);
+               val = 0;
+               read_spr(regno, &val);
                 scanhex(&val);
                 write_spr(regno, val);
-               /* fall through */
+               dump_one_spr(regno, true);
+               break;
+       }
         case 'r':
-               printf("spr %lx = %lx\n", regno, read_spr(regno));
+               scanhex(&regno);
+               dump_one_spr(regno, true);
+               break;
+       case 'a':
+               /* dump ALL SPRs */
+               for (spr = 1; spr < 1024; ++spr)
+                       dump_one_spr(spr, false);
                 break;
         }
+
         scannl();
  }
  
@@ -2913,7 +2929,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid,
         printf("%s", after);
  }
  
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
  void dump_segments(void)
  {
         int i;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h

index 2f66645587a2a134d9d2a304ae6508fedfc1fd8a..18d2beb89340a6deeb4e9cdb48cca5a27b23155d 100644 (file)
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1223,6 +1223,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
         return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
  }
  
+#define has_transparent_hugepage has_transparent_hugepage
  static inline int has_transparent_hugepage(void)
  {
         return MACHINE_HAS_HPAGE ? 1 : 0;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h

index f089cfa249f335b419de702333ddc341e7e3acc3..93ce0ada3c63fe1dfbd918125d7663886a78031d 100644 (file)
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,8 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
         return pte_val(pte) & _PAGE_PMD_HUGE;
  }
  
-#define has_transparent_hugepage() 1
-
  static inline pmd_t pmd_mkold(pmd_t pmd)
  {
         pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h

index 96cecf55522ef492f1afbdd143ca80536d63d31d..2a26cc4fefc27fda65d15be3b3a0b719231ad609 100644 (file)
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -487,7 +487,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define has_transparent_hugepage() 1
  #define pmd_trans_huge pmd_huge_page
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c

index a992238e9b58260fdfa067c4cc2c0d348ceda41f..153020abd2f5c8a7476907ac62531d70f8c6f225 100644 (file)
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void)
                 cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
                 cpu_2_node[best_cpu] = node;
                 cpumask_clear_cpu(best_cpu, &unbound_cpus);
-               node = next_node(node, default_nodes);
-               if (node == MAX_NUMNODES)
-                       node = first_node(default_nodes);
+               node = next_node_in(node, default_nodes);
         }
  
         /* Print out node assignments and set defaults for disabled cpus */
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c

index e212c64682c55bcaaa9b4aa78c13cf6326a95e39..77ceaa343fcef10956b73222f7033031035c3897 100644 (file)
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -308,11 +308,16 @@ static bool saw_hugepagesz;
  
  static __init int setup_hugepagesz(char *opt)
  {
+       int rc;
+
         if (!saw_hugepagesz) {
                 saw_hugepagesz = true;
                 memset(huge_shift, 0, sizeof(huge_shift));
         }
-       return __setup_hugepagesz(memparse(opt, NULL));
+       rc = __setup_hugepagesz(memparse(opt, NULL));
+       if (rc)
+               hugetlb_bad_size();
+       return rc;
  }
  __setup("hugepagesz=", setup_hugepagesz);
  
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c

index a0582b7f41d3357b4db0507df4a7153a028b0fe7..adce25462b0dbaf6bc8147583333b7ee1863b39d 100644 (file)
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
                          * Hacky direct set to avoid unnecessary
                          * lock take/release for EVERY page here.
                          */
-                       p->_count.counter = 0;
+                       p->_refcount.counter = 0;
                         p->_mapcount.counter = -1;
                 }
                 init_page_count(page);
diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S

index 8e1b47792b319c022a6e5a692fbbb9a64b699b5e..c9dae1cd2919277376db3b7f378b6c547b2a3b05 100644 (file)
--- a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
+++ b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
@@ -296,7 +296,11 @@ W14  = TMP_
  #
  ENTRY(sha1_x8_avx2)
  
-       push    RSP_SAVE
+       # save callee-saved clobbered registers to comply with C function ABI
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
  
         #save rsp
         mov     %rsp, RSP_SAVE
@@ -446,7 +450,12 @@ lloop:
         ## Postamble
  
         mov     RSP_SAVE, %rsp
-       pop     RSP_SAVE
+
+       # restore callee-saved clobbered registers
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
  
         ret
  ENDPROC(sha1_x8_avx2)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index f86491a7bc9dd1c8c96f52f28b9befd0ff59d6ea..1a27396b6ea04df12b2cff8930341320e7a4a819 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
         return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
  }
  
+#define has_transparent_hugepage has_transparent_hugepage
  static inline int has_transparent_hugepage(void)
  {
         return boot_cpu_has(X86_FEATURE_PSE);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c

index 14a95054d4e058a85f6b8d80c162aa2617e7f848..2ae8584b44c73d7c93b30b80ca643109f6e5bcf5 100644 (file)
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -165,6 +165,7 @@ static __init int setup_hugepagesz(char *opt)
         } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
                 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
         } else {
+               hugetlb_bad_size();
                 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
                         ps >> 20);
                 return 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c

index f70c1ff4612515b310ee59fdb843ae15eb2b2a33..9c086c57105c18cac6bbb4030c6c448eadf03af8 100644 (file)
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -617,9 +617,7 @@ static void __init numa_init_array(void)
                 if (early_cpu_to_node(i) != NUMA_NO_NODE)
                         continue;
                 numa_set_node(i, rr);
-               rr = next_node(rr, node_online_map);
-               if (rr == MAX_NUMNODES)
-                       rr = first_node(node_online_map);
+               rr = next_node_in(rr, node_online_map);
         }
  }
  
diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c

index f0099360039e247c91ea39e95e6f092f7505d42c..a5b5c87e21149f4df50d6dc21e6de016bbae29af 100644 (file)
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -336,16 +336,7 @@ static void amba_device_release(struct device *dev)
         kfree(d);
  }
  
-/**
- *     amba_device_add - add a previously allocated AMBA device structure
- *     @dev: AMBA device allocated by amba_device_alloc
- *     @parent: resource parent for this devices resources
- *
- *     Claim the resource, and read the device cell ID if not already
- *     initialized.  Register the AMBA device with the Linux device
- *     manager.
- */
-int amba_device_add(struct amba_device *dev, struct resource *parent)
+static int amba_device_try_add(struct amba_device *dev, struct resource *parent)
  {
         u32 size;
         void __iomem *tmp;
@@ -373,6 +364,12 @@ int amba_device_add(struct amba_device *dev, struct resource *parent)
                 goto err_release;
         }
  
+       ret = dev_pm_domain_attach(&dev->dev, true);
+       if (ret == -EPROBE_DEFER) {
+               iounmap(tmp);
+               goto err_release;
+       }
+
         ret = amba_get_enable_pclk(dev);
         if (ret == 0) {
                 u32 pid, cid;
@@ -398,6 +395,7 @@ int amba_device_add(struct amba_device *dev, struct resource *parent)
         }
  
         iounmap(tmp);
+       dev_pm_domain_detach(&dev->dev, true);
  
         if (ret)
                 goto err_release;
@@ -421,6 +419,88 @@ int amba_device_add(struct amba_device *dev, struct resource *parent)
   err_out:
         return ret;
  }
+
+/*
+ * Registration of an AMBA device requires reading its pid and cid registers.
+ * To do this, the device must be turned on (if it is part of a power domain)
+ * and have its clocks enabled. However, in some cases those resources might
+ * not be available yet. Returning EPROBE_DEFER is not a solution in that case,
+ * because callers don't handle this special error code. Instead, such devices
+ * are added to a special list and their registration is retried from a
+ * periodic worker until all resources are available and registration succeeds.
+ */
+struct deferred_device {
+       struct amba_device *dev;
+       struct resource *parent;
+       struct list_head node;
+};
+
+static LIST_HEAD(deferred_devices);
+static DEFINE_MUTEX(deferred_devices_lock);
+
+static void amba_deferred_retry_func(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(deferred_retry_work, amba_deferred_retry_func);
+
+#define DEFERRED_DEVICE_TIMEOUT (msecs_to_jiffies(5 * 1000))
+
+static void amba_deferred_retry_func(struct work_struct *dummy)
+{
+       struct deferred_device *ddev, *tmp;
+
+       mutex_lock(&deferred_devices_lock);
+
+       list_for_each_entry_safe(ddev, tmp, &deferred_devices, node) {
+               int ret = amba_device_try_add(ddev->dev, ddev->parent);
+
+               if (ret == -EPROBE_DEFER)
+                       continue;
+
+               list_del_init(&ddev->node);
+               kfree(ddev);
+       }
+
+       if (!list_empty(&deferred_devices))
+               schedule_delayed_work(&deferred_retry_work,
+                                     DEFERRED_DEVICE_TIMEOUT);
+
+       mutex_unlock(&deferred_devices_lock);
+}
+
+/**
+ *     amba_device_add - add a previously allocated AMBA device structure
+ *     @dev: AMBA device allocated by amba_device_alloc
+ *     @parent: resource parent for this devices resources
+ *
+ *     Claim the resource, and read the device cell ID if not already
+ *     initialized.  Register the AMBA device with the Linux device
+ *     manager.
+ */
+int amba_device_add(struct amba_device *dev, struct resource *parent)
+{
+       int ret = amba_device_try_add(dev, parent);
+
+       if (ret == -EPROBE_DEFER) {
+               struct deferred_device *ddev;
+
+               ddev = kmalloc(sizeof(*ddev), GFP_KERNEL);
+               if (!ddev)
+                       return -ENOMEM;
+
+               ddev->dev = dev;
+               ddev->parent = parent;
+               ret = 0;
+
+               mutex_lock(&deferred_devices_lock);
+
+               if (list_empty(&deferred_devices))
+                       schedule_delayed_work(&deferred_retry_work,
+                                             DEFERRED_DEVICE_TIMEOUT);
+               list_add_tail(&ddev->node, &deferred_devices);
+
+               mutex_unlock(&deferred_devices_lock);
+       }
+       return ret;
+}
  EXPORT_SYMBOL_GPL(amba_device_add);
  
  static struct amba_device *
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c

index 437b3a822f4482c5a6e34a2c1f827f5560d1f56e..d597e432e195305906d357e113b90659cbdadbd6 100644 (file)
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -861,7 +861,7 @@ rqbiocnt(struct request *r)
   * discussion.
   *
   * We cannot use get_page in the workaround, because it insists on a
- * positive page count as a precondition.  So we use _count directly.
+ * positive page count as a precondition.  So we use _refcount directly.
   */
  static void
  bio_pageinc(struct bio *bio)
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c

index b7b576e53e926a5c8e94bd614b3a60b4d1c6f222..ff44016ea0312e4b6c50ff1d9f0126f2d63dcddc 100644 (file)
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -300,7 +300,7 @@ static int pmu_set_cpu_speed(int low_speed)
                 _set_L3CR(save_l3cr);
  
         /* Restore userland MMU context */
-       switch_mmu_context(NULL, current->active_mm);
+       switch_mmu_context(NULL, current->active_mm, NULL);
  
  #ifdef DEBUG_FREQ
         printk(KERN_DEBUG "HID1, after: %x\n", mfspr(SPRN_HID1));
diff --git a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c

index 106679bca6cb42e830cbc1f707928a850dc82115..f9c79dabce20029401f7d25cc58cf1400f324da1 100644 (file)
--- a/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
+++ b/drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c
@@ -157,7 +157,7 @@ struct device_node * __init tilcdc_get_overlay(struct kfree_table *kft)
         if (!overlay_data || kfree_table_add(kft, overlay_data))
                 return NULL;
  
-       of_fdt_unflatten_tree(overlay_data, &overlay);
+       of_fdt_unflatten_tree(overlay_data, NULL, &overlay);
         if (!overlay) {
                 pr_warn("%s: Unfattening overlay tree failed\n", __func__);
                 return NULL;
diff --git a/drivers/hsi/controllers/Kconfig b/drivers/hsi/controllers/Kconfig

index 6aba2780817227bdcff05132f10bd8d8eeabceb7..48e4eda186ccbcdcded58b89105a5a525eed1d88 100644 (file)
--- a/drivers/hsi/controllers/Kconfig
+++ b/drivers/hsi/controllers/Kconfig
@@ -5,15 +5,11 @@ comment "HSI controllers"
  
  config OMAP_SSI
         tristate "OMAP SSI hardware driver"
-       depends on HSI && OF && (ARCH_OMAP3 || (ARM && COMPILE_TEST))
+       depends on HSI && OF && ARM && COMMON_CLK
+       depends on ARCH_OMAP3 || COMPILE_TEST
         ---help---
           SSI is a legacy version of HSI. It is usually used to connect
           an application engine with a cellular modem.
           If you say Y here, you will enable the OMAP SSI hardware driver.
  
           If unsure, say N.
-
-config OMAP_SSI_PORT
-       tristate
-       default m if OMAP_SSI=m
-       default y if OMAP_SSI=y
diff --git a/drivers/hsi/controllers/Makefile b/drivers/hsi/controllers/Makefile

index d2665cf9c5450e4a951d129a6de253df79972525..7aba9c7f71bb23c4ac922cea4d2efb8ed73b571d 100644 (file)
--- a/drivers/hsi/controllers/Makefile
+++ b/drivers/hsi/controllers/Makefile
@@ -2,5 +2,5 @@
  # Makefile for HSI controllers drivers
  #
  
-obj-$(CONFIG_OMAP_SSI)         += omap_ssi.o
-obj-$(CONFIG_OMAP_SSI_PORT)    += omap_ssi_port.o
+omap_ssi-objs          += omap_ssi_core.o omap_ssi_port.o
+obj-$(CONFIG_OMAP_SSI) += omap_ssi.o
diff --git a/drivers/hsi/controllers/omap_ssi.c b/drivers/hsi/controllers/omap_ssi.c

deleted file mode 100644 (file)

index 27b91f1..0000000
--- a/drivers/hsi/controllers/omap_ssi.c
+++ /dev/null
@@ -1,610 +0,0 @@
-/* OMAP SSI driver.
- *
- * Copyright (C) 2010 Nokia Corporation. All rights reserved.
- * Copyright (C) 2014 Sebastian Reichel <sre@kernel.org>
- *
- * Contact: Carlos Chinea <carlos.chinea@nokia.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-#include <linux/compiler.h>
-#include <linux/err.h>
-#include <linux/ioport.h>
-#include <linux/io.h>
-#include <linux/gpio.h>
-#include <linux/clk.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/dma-mapping.h>
-#include <linux/dmaengine.h>
-#include <linux/delay.h>
-#include <linux/seq_file.h>
-#include <linux/scatterlist.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/debugfs.h>
-#include <linux/pm_runtime.h>
-#include <linux/of_platform.h>
-#include <linux/hsi/hsi.h>
-#include <linux/idr.h>
-
-#include "omap_ssi_regs.h"
-#include "omap_ssi.h"
-
-/* For automatically allocated device IDs */
-static DEFINE_IDA(platform_omap_ssi_ida);
-
-#ifdef CONFIG_DEBUG_FS
-static int ssi_debug_show(struct seq_file *m, void *p __maybe_unused)
-{
-       struct hsi_controller *ssi = m->private;
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       void __iomem *sys = omap_ssi->sys;
-
-       pm_runtime_get_sync(ssi->device.parent);
-       seq_printf(m, "REVISION\t: 0x%08x\n",  readl(sys + SSI_REVISION_REG));
-       seq_printf(m, "SYSCONFIG\t: 0x%08x\n", readl(sys + SSI_SYSCONFIG_REG));
-       seq_printf(m, "SYSSTATUS\t: 0x%08x\n", readl(sys + SSI_SYSSTATUS_REG));
-       pm_runtime_put_sync(ssi->device.parent);
-
-       return 0;
-}
-
-static int ssi_debug_gdd_show(struct seq_file *m, void *p __maybe_unused)
-{
-       struct hsi_controller *ssi = m->private;
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       void __iomem *gdd = omap_ssi->gdd;
-       void __iomem *sys = omap_ssi->sys;
-       int lch;
-
-       pm_runtime_get_sync(ssi->device.parent);
-
-       seq_printf(m, "GDD_MPU_STATUS\t: 0x%08x\n",
-               readl(sys + SSI_GDD_MPU_IRQ_STATUS_REG));
-       seq_printf(m, "GDD_MPU_ENABLE\t: 0x%08x\n\n",
-               readl(sys + SSI_GDD_MPU_IRQ_ENABLE_REG));
-       seq_printf(m, "HW_ID\t\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_HW_ID_REG));
-       seq_printf(m, "PPORT_ID\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_PPORT_ID_REG));
-       seq_printf(m, "MPORT_ID\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_MPORT_ID_REG));
-       seq_printf(m, "TEST\t\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_TEST_REG));
-       seq_printf(m, "GCR\t\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_GCR_REG));
-
-       for (lch = 0; lch < SSI_MAX_GDD_LCH; lch++) {
-               seq_printf(m, "\nGDD LCH %d\n=========\n", lch);
-               seq_printf(m, "CSDP\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CSDP_REG(lch)));
-               seq_printf(m, "CCR\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CCR_REG(lch)));
-               seq_printf(m, "CICR\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CICR_REG(lch)));
-               seq_printf(m, "CSR\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CSR_REG(lch)));
-               seq_printf(m, "CSSA\t\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_CSSA_REG(lch)));
-               seq_printf(m, "CDSA\t\t: 0x%08x\n",
-                               readl(gdd + SSI_GDD_CDSA_REG(lch)));
-               seq_printf(m, "CEN\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CEN_REG(lch)));
-               seq_printf(m, "CSAC\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CSAC_REG(lch)));
-               seq_printf(m, "CDAC\t\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CDAC_REG(lch)));
-               seq_printf(m, "CLNK_CTRL\t: 0x%04x\n",
-                               readw(gdd + SSI_GDD_CLNK_CTRL_REG(lch)));
-       }
-
-       pm_runtime_put_sync(ssi->device.parent);
-
-       return 0;
-}
-
-static int ssi_regs_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ssi_debug_show, inode->i_private);
-}
-
-static int ssi_gdd_regs_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ssi_debug_gdd_show, inode->i_private);
-}
-
-static const struct file_operations ssi_regs_fops = {
-       .open           = ssi_regs_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static const struct file_operations ssi_gdd_regs_fops = {
-       .open           = ssi_gdd_regs_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int __init ssi_debug_add_ctrl(struct hsi_controller *ssi)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       struct dentry *dir;
-
-       /* SSI controller */
-       omap_ssi->dir = debugfs_create_dir(dev_name(&ssi->device), NULL);
-       if (!omap_ssi->dir)
-               return -ENOMEM;
-
-       debugfs_create_file("regs", S_IRUGO, omap_ssi->dir, ssi,
-                                                               &ssi_regs_fops);
-       /* SSI GDD (DMA) */
-       dir = debugfs_create_dir("gdd", omap_ssi->dir);
-       if (!dir)
-               goto rback;
-       debugfs_create_file("regs", S_IRUGO, dir, ssi, &ssi_gdd_regs_fops);
-
-       return 0;
-rback:
-       debugfs_remove_recursive(omap_ssi->dir);
-
-       return -ENOMEM;
-}
-
-static void ssi_debug_remove_ctrl(struct hsi_controller *ssi)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-
-       debugfs_remove_recursive(omap_ssi->dir);
-}
-#endif /* CONFIG_DEBUG_FS */
-
-/*
- * FIXME: Horrible HACK needed until we remove the useless wakeline test
- * in the CMT. To be removed !!!!
- */
-void ssi_waketest(struct hsi_client *cl, unsigned int enable)
-{
-       struct hsi_port *port = hsi_get_port(cl);
-       struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
-       struct hsi_controller *ssi = to_hsi_controller(port->device.parent);
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-
-       omap_port->wktest = !!enable;
-       if (omap_port->wktest) {
-               pm_runtime_get_sync(ssi->device.parent);
-               writel_relaxed(SSI_WAKE(0),
-                               omap_ssi->sys + SSI_SET_WAKE_REG(port->num));
-       } else {
-               writel_relaxed(SSI_WAKE(0),
-                               omap_ssi->sys + SSI_CLEAR_WAKE_REG(port->num));
-               pm_runtime_put_sync(ssi->device.parent);
-       }
-}
-EXPORT_SYMBOL_GPL(ssi_waketest);
-
-static void ssi_gdd_complete(struct hsi_controller *ssi, unsigned int lch)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       struct hsi_msg *msg = omap_ssi->gdd_trn[lch].msg;
-       struct hsi_port *port = to_hsi_port(msg->cl->device.parent);
-       struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
-       unsigned int dir;
-       u32 csr;
-       u32 val;
-
-       spin_lock(&omap_ssi->lock);
-
-       val = readl(omap_ssi->sys + SSI_GDD_MPU_IRQ_ENABLE_REG);
-       val &= ~SSI_GDD_LCH(lch);
-       writel_relaxed(val, omap_ssi->sys + SSI_GDD_MPU_IRQ_ENABLE_REG);
-
-       if (msg->ttype == HSI_MSG_READ) {
-               dir = DMA_FROM_DEVICE;
-               val = SSI_DATAAVAILABLE(msg->channel);
-               pm_runtime_put_sync(ssi->device.parent);
-       } else {
-               dir = DMA_TO_DEVICE;
-               val = SSI_DATAACCEPT(msg->channel);
-               /* Keep clocks reference for write pio event */
-       }
-       dma_unmap_sg(&ssi->device, msg->sgt.sgl, msg->sgt.nents, dir);
-       csr = readw(omap_ssi->gdd + SSI_GDD_CSR_REG(lch));
-       omap_ssi->gdd_trn[lch].msg = NULL; /* release GDD lch */
-       dev_dbg(&port->device, "DMA completed ch %d ttype %d\n",
-                               msg->channel, msg->ttype);
-       spin_unlock(&omap_ssi->lock);
-       if (csr & SSI_CSR_TOUR) { /* Timeout error */
-               msg->status = HSI_STATUS_ERROR;
-               msg->actual_len = 0;
-               spin_lock(&omap_port->lock);
-               list_del(&msg->link); /* Dequeue msg */
-               spin_unlock(&omap_port->lock);
-               msg->complete(msg);
-               return;
-       }
-       spin_lock(&omap_port->lock);
-       val |= readl(omap_ssi->sys + SSI_MPU_ENABLE_REG(port->num, 0));
-       writel_relaxed(val, omap_ssi->sys + SSI_MPU_ENABLE_REG(port->num, 0));
-       spin_unlock(&omap_port->lock);
-
-       msg->status = HSI_STATUS_COMPLETED;
-       msg->actual_len = sg_dma_len(msg->sgt.sgl);
-}
-
-static void ssi_gdd_tasklet(unsigned long dev)
-{
-       struct hsi_controller *ssi = (struct hsi_controller *)dev;
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       void __iomem *sys = omap_ssi->sys;
-       unsigned int lch;
-       u32 status_reg;
-
-       pm_runtime_get_sync(ssi->device.parent);
-
-       status_reg = readl(sys + SSI_GDD_MPU_IRQ_STATUS_REG);
-       for (lch = 0; lch < SSI_MAX_GDD_LCH; lch++) {
-               if (status_reg & SSI_GDD_LCH(lch))
-                       ssi_gdd_complete(ssi, lch);
-       }
-       writel_relaxed(status_reg, sys + SSI_GDD_MPU_IRQ_STATUS_REG);
-       status_reg = readl(sys + SSI_GDD_MPU_IRQ_STATUS_REG);
-
-       pm_runtime_put_sync(ssi->device.parent);
-
-       if (status_reg)
-               tasklet_hi_schedule(&omap_ssi->gdd_tasklet);
-       else
-               enable_irq(omap_ssi->gdd_irq);
-
-}
-
-static irqreturn_t ssi_gdd_isr(int irq, void *ssi)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-
-       tasklet_hi_schedule(&omap_ssi->gdd_tasklet);
-       disable_irq_nosync(irq);
-
-       return IRQ_HANDLED;
-}
-
-static unsigned long ssi_get_clk_rate(struct hsi_controller *ssi)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       unsigned long rate = clk_get_rate(omap_ssi->fck);
-       return rate;
-}
-
-static int __init ssi_get_iomem(struct platform_device *pd,
-               const char *name, void __iomem **pbase, dma_addr_t *phy)
-{
-       struct resource *mem;
-       void __iomem *base;
-       struct hsi_controller *ssi = platform_get_drvdata(pd);
-
-       mem = platform_get_resource_byname(pd, IORESOURCE_MEM, name);
-       base = devm_ioremap_resource(&ssi->device, mem);
-       if (IS_ERR(base))
-               return PTR_ERR(base);
-
-       *pbase = base;
-
-       if (phy)
-               *phy = mem->start;
-
-       return 0;
-}
-
-static int __init ssi_add_controller(struct hsi_controller *ssi,
-                                               struct platform_device *pd)
-{
-       struct omap_ssi_controller *omap_ssi;
-       int err;
-
-       omap_ssi = devm_kzalloc(&ssi->device, sizeof(*omap_ssi), GFP_KERNEL);
-       if (!omap_ssi) {
-               dev_err(&pd->dev, "not enough memory for omap ssi\n");
-               return -ENOMEM;
-       }
-
-       err = ida_simple_get(&platform_omap_ssi_ida, 0, 0, GFP_KERNEL);
-       if (err < 0)
-               goto out_err;
-       ssi->id = err;
-
-       ssi->owner = THIS_MODULE;
-       ssi->device.parent = &pd->dev;
-       dev_set_name(&ssi->device, "ssi%d", ssi->id);
-       hsi_controller_set_drvdata(ssi, omap_ssi);
-       omap_ssi->dev = &ssi->device;
-       err = ssi_get_iomem(pd, "sys", &omap_ssi->sys, NULL);
-       if (err < 0)
-               goto out_err;
-       err = ssi_get_iomem(pd, "gdd", &omap_ssi->gdd, NULL);
-       if (err < 0)
-               goto out_err;
-       err = platform_get_irq_byname(pd, "gdd_mpu");
-       if (err < 0) {
-               dev_err(&pd->dev, "GDD IRQ resource missing\n");
-               goto out_err;
-       }
-       omap_ssi->gdd_irq = err;
-       tasklet_init(&omap_ssi->gdd_tasklet, ssi_gdd_tasklet,
-                                                       (unsigned long)ssi);
-       err = devm_request_irq(&ssi->device, omap_ssi->gdd_irq, ssi_gdd_isr,
-                                               0, "gdd_mpu", ssi);
-       if (err < 0) {
-               dev_err(&ssi->device, "Request GDD IRQ %d failed (%d)",
-                                                       omap_ssi->gdd_irq, err);
-               goto out_err;
-       }
-
-       omap_ssi->port = devm_kzalloc(&ssi->device,
-               sizeof(struct omap_ssi_port *) * ssi->num_ports, GFP_KERNEL);
-       if (!omap_ssi->port) {
-               err = -ENOMEM;
-               goto out_err;
-       }
-
-       omap_ssi->fck = devm_clk_get(&ssi->device, "ssi_ssr_fck");
-       if (IS_ERR(omap_ssi->fck)) {
-               dev_err(&pd->dev, "Could not acquire clock \"ssi_ssr_fck\": %li\n",
-                       PTR_ERR(omap_ssi->fck));
-               err = -ENODEV;
-               goto out_err;
-       }
-
-       /* TODO: find register, which can be used to detect context loss */
-       omap_ssi->get_loss = NULL;
-
-       omap_ssi->max_speed = UINT_MAX;
-       spin_lock_init(&omap_ssi->lock);
-       err = hsi_register_controller(ssi);
-
-       if (err < 0)
-               goto out_err;
-
-       return 0;
-
-out_err:
-       ida_simple_remove(&platform_omap_ssi_ida, ssi->id);
-       return err;
-}
-
-static int __init ssi_hw_init(struct hsi_controller *ssi)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       unsigned int i;
-       u32 val;
-       int err;
-
-       err = pm_runtime_get_sync(ssi->device.parent);
-       if (err < 0) {
-               dev_err(&ssi->device, "runtime PM failed %d\n", err);
-               return err;
-       }
-       /* Reseting SSI controller */
-       writel_relaxed(SSI_SOFTRESET, omap_ssi->sys + SSI_SYSCONFIG_REG);
-       val = readl(omap_ssi->sys + SSI_SYSSTATUS_REG);
-       for (i = 0; ((i < 20) && !(val & SSI_RESETDONE)); i++) {
-               msleep(20);
-               val = readl(omap_ssi->sys + SSI_SYSSTATUS_REG);
-       }
-       if (!(val & SSI_RESETDONE)) {
-               dev_err(&ssi->device, "SSI HW reset failed\n");
-               pm_runtime_put_sync(ssi->device.parent);
-               return -EIO;
-       }
-       /* Reseting GDD */
-       writel_relaxed(SSI_SWRESET, omap_ssi->gdd + SSI_GDD_GRST_REG);
-       /* Get FCK rate in KHz */
-       omap_ssi->fck_rate = DIV_ROUND_CLOSEST(ssi_get_clk_rate(ssi), 1000);
-       dev_dbg(&ssi->device, "SSI fck rate %lu KHz\n", omap_ssi->fck_rate);
-       /* Set default PM settings */
-       val = SSI_AUTOIDLE | SSI_SIDLEMODE_SMART | SSI_MIDLEMODE_SMART;
-       writel_relaxed(val, omap_ssi->sys + SSI_SYSCONFIG_REG);
-       omap_ssi->sysconfig = val;
-       writel_relaxed(SSI_CLK_AUTOGATING_ON, omap_ssi->sys + SSI_GDD_GCR_REG);
-       omap_ssi->gdd_gcr = SSI_CLK_AUTOGATING_ON;
-       pm_runtime_put_sync(ssi->device.parent);
-
-       return 0;
-}
-
-static void ssi_remove_controller(struct hsi_controller *ssi)
-{
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       int id = ssi->id;
-       tasklet_kill(&omap_ssi->gdd_tasklet);
-       hsi_unregister_controller(ssi);
-       ida_simple_remove(&platform_omap_ssi_ida, id);
-}
-
-static inline int ssi_of_get_available_ports_count(const struct device_node *np)
-{
-       struct device_node *child;
-       int num = 0;
-
-       for_each_available_child_of_node(np, child)
-               if (of_device_is_compatible(child, "ti,omap3-ssi-port"))
-                       num++;
-
-       return num;
-}
-
-static int ssi_remove_ports(struct device *dev, void *c)
-{
-       struct platform_device *pdev = to_platform_device(dev);
-
-       of_device_unregister(pdev);
-
-       return 0;
-}
-
-static int __init ssi_probe(struct platform_device *pd)
-{
-       struct platform_device *childpdev;
-       struct device_node *np = pd->dev.of_node;
-       struct device_node *child;
-       struct hsi_controller *ssi;
-       int err;
-       int num_ports;
-
-       if (!np) {
-               dev_err(&pd->dev, "missing device tree data\n");
-               return -EINVAL;
-       }
-
-       num_ports = ssi_of_get_available_ports_count(np);
-
-       ssi = hsi_alloc_controller(num_ports, GFP_KERNEL);
-       if (!ssi) {
-               dev_err(&pd->dev, "No memory for controller\n");
-               return -ENOMEM;
-       }
-
-       platform_set_drvdata(pd, ssi);
-
-       err = ssi_add_controller(ssi, pd);
-       if (err < 0)
-               goto out1;
-
-       pm_runtime_irq_safe(&pd->dev);
-       pm_runtime_enable(&pd->dev);
-
-       err = ssi_hw_init(ssi);
-       if (err < 0)
-               goto out2;
-#ifdef CONFIG_DEBUG_FS
-       err = ssi_debug_add_ctrl(ssi);
-       if (err < 0)
-               goto out2;
-#endif
-
-       for_each_available_child_of_node(np, child) {
-               if (!of_device_is_compatible(child, "ti,omap3-ssi-port"))
-                       continue;
-
-               childpdev = of_platform_device_create(child, NULL, &pd->dev);
-               if (!childpdev) {
-                       err = -ENODEV;
-                       dev_err(&pd->dev, "failed to create ssi controller port\n");
-                       goto out3;
-               }
-       }
-
-       dev_info(&pd->dev, "ssi controller %d initialized (%d ports)!\n",
-               ssi->id, num_ports);
-       return err;
-out3:
-       device_for_each_child(&pd->dev, NULL, ssi_remove_ports);
-out2:
-       ssi_remove_controller(ssi);
-out1:
-       platform_set_drvdata(pd, NULL);
-       pm_runtime_disable(&pd->dev);
-
-       return err;
-}
-
-static int __exit ssi_remove(struct platform_device *pd)
-{
-       struct hsi_controller *ssi = platform_get_drvdata(pd);
-
-#ifdef CONFIG_DEBUG_FS
-       ssi_debug_remove_ctrl(ssi);
-#endif
-       ssi_remove_controller(ssi);
-       platform_set_drvdata(pd, NULL);
-
-       pm_runtime_disable(&pd->dev);
-
-       /* cleanup of of_platform_populate() call */
-       device_for_each_child(&pd->dev, NULL, ssi_remove_ports);
-
-       return 0;
-}
-
-#ifdef CONFIG_PM
-static int omap_ssi_runtime_suspend(struct device *dev)
-{
-       struct hsi_controller *ssi = dev_get_drvdata(dev);
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-
-       dev_dbg(dev, "runtime suspend!\n");
-
-       if (omap_ssi->get_loss)
-               omap_ssi->loss_count =
-                               omap_ssi->get_loss(ssi->device.parent);
-
-       return 0;
-}
-
-static int omap_ssi_runtime_resume(struct device *dev)
-{
-       struct hsi_controller *ssi = dev_get_drvdata(dev);
-       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-
-       dev_dbg(dev, "runtime resume!\n");
-
-       if ((omap_ssi->get_loss) && (omap_ssi->loss_count ==
-                               omap_ssi->get_loss(ssi->device.parent)))
-               return 0;
-
-       writel_relaxed(omap_ssi->gdd_gcr, omap_ssi->gdd + SSI_GDD_GCR_REG);
-
-       return 0;
-}
-
-static const struct dev_pm_ops omap_ssi_pm_ops = {
-       SET_RUNTIME_PM_OPS(omap_ssi_runtime_suspend, omap_ssi_runtime_resume,
-               NULL)
-};
-
-#define DEV_PM_OPS     (&omap_ssi_pm_ops)
-#else
-#define DEV_PM_OPS     NULL
-#endif
-
-#ifdef CONFIG_OF
-static const struct of_device_id omap_ssi_of_match[] = {
-       { .compatible = "ti,omap3-ssi", },
-       {},
-};
-MODULE_DEVICE_TABLE(of, omap_ssi_of_match);
-#else
-#define omap_ssi_of_match NULL
-#endif
-
-static struct platform_driver ssi_pdriver = {
-       .remove = __exit_p(ssi_remove),
-       .driver = {
-               .name   = "omap_ssi",
-               .pm     = DEV_PM_OPS,
-               .of_match_table = omap_ssi_of_match,
-       },
-};
-
-module_platform_driver_probe(ssi_pdriver, ssi_probe);
-
-MODULE_ALIAS("platform:omap_ssi");
-MODULE_AUTHOR("Carlos Chinea <carlos.chinea@nokia.com>");
-MODULE_AUTHOR("Sebastian Reichel <sre@kernel.org>");
-MODULE_DESCRIPTION("Synchronous Serial Interface Driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/hsi/controllers/omap_ssi.h b/drivers/hsi/controllers/omap_ssi.h

index f9aaf37262be4cb120201313c30b167896f66aaf..7b4dec2c69ff475b6ca2dc322b479af213a7e16b 100644 (file)
--- a/drivers/hsi/controllers/omap_ssi.h
+++ b/drivers/hsi/controllers/omap_ssi.h
@@ -27,7 +27,7 @@
  #include <linux/module.h>
  #include <linux/platform_device.h>
  #include <linux/hsi/hsi.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
  #include <linux/interrupt.h>
  #include <linux/io.h>
  
@@ -97,7 +97,7 @@ struct omap_ssi_port {
         struct list_head        brkqueue;
         unsigned int            irq;
         int                     wake_irq;
-       int                     wake_gpio;
+       struct gpio_desc        *wake_gpio;
         struct tasklet_struct   pio_tasklet;
         struct tasklet_struct   wake_tasklet;
         bool                    wktest:1; /* FIXME: HACK to be removed */
@@ -134,6 +134,8 @@ struct gdd_trn {
   * @gdd_tasklet: bottom half for DMA transfers
   * @gdd_trn: Array of GDD transaction data for ongoing GDD transfers
   * @lock: lock to serialize access to GDD
+ * @fck_nb: DVFS notifier block
+ * @fck_rate: clock rate
   * @loss_count: To follow if we need to restore context or not
   * @max_speed: Maximum TX speed (Kb/s) set by the clients.
   * @sysconfig: SSI controller saved context
@@ -151,6 +153,7 @@ struct omap_ssi_controller {
         struct tasklet_struct   gdd_tasklet;
         struct gdd_trn          gdd_trn[SSI_MAX_GDD_LCH];
         spinlock_t              lock;
+       struct notifier_block   fck_nb;
         unsigned long           fck_rate;
         u32                     loss_count;
         u32                     max_speed;
@@ -164,4 +167,9 @@ struct omap_ssi_controller {
  #endif
  };
  
+void omap_ssi_port_update_fclk(struct hsi_controller *ssi,
+                              struct omap_ssi_port *omap_port);
+
+extern struct platform_driver ssi_port_pdriver;
+
  #endif /* __LINUX_HSI_OMAP_SSI_H__ */
diff --git a/drivers/hsi/controllers/omap_ssi_core.c b/drivers/hsi/controllers/omap_ssi_core.c

new file mode 100644 (file)

index 0000000..a3e0feb
--- /dev/null
+++ b/drivers/hsi/controllers/omap_ssi_core.c
@@ -0,0 +1,693 @@
+/* OMAP SSI driver.
+ *
+ * Copyright (C) 2010 Nokia Corporation. All rights reserved.
+ * Copyright (C) 2014 Sebastian Reichel <sre@kernel.org>
+ *
+ * Contact: Carlos Chinea <carlos.chinea@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/scatterlist.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/pm_runtime.h>
+#include <linux/of_platform.h>
+#include <linux/hsi/hsi.h>
+#include <linux/idr.h>
+
+#include "omap_ssi_regs.h"
+#include "omap_ssi.h"
+
+/* For automatically allocated device IDs */
+static DEFINE_IDA(platform_omap_ssi_ida);
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * debugfs show callback for the controller "regs" file: prints the SSI
+ * REVISION, SYSCONFIG and SYSSTATUS registers. The parent device is
+ * runtime-resumed while the registers are read. Always returns 0.
+ */
+static int ssi_debug_show(struct seq_file *m, void *p __maybe_unused)
+{
+       struct hsi_controller *ssi = m->private;
+       struct omap_ssi_controller *ctrl = hsi_controller_drvdata(ssi);
+       struct device *parent = ssi->device.parent;
+       void __iomem *base = ctrl->sys;
+
+       pm_runtime_get_sync(parent);
+
+       seq_printf(m, "REVISION\t: 0x%08x\n",
+                  readl(base + SSI_REVISION_REG));
+       seq_printf(m, "SYSCONFIG\t: 0x%08x\n",
+                  readl(base + SSI_SYSCONFIG_REG));
+       seq_printf(m, "SYSSTATUS\t: 0x%08x\n",
+                  readl(base + SSI_SYSSTATUS_REG));
+
+       pm_runtime_put_sync(parent);
+
+       return 0;
+}
+
+/*
+ * debugfs show callback for the "gdd/regs" file: prints the GDD interrupt
+ * status/enable registers (read through the "sys" window), the global GDD
+ * registers and then the per-channel registers of each of the
+ * SSI_MAX_GDD_LCH logical channels. The parent device is runtime-resumed
+ * while the registers are read. Always returns 0.
+ */
+static int ssi_debug_gdd_show(struct seq_file *m, void *p __maybe_unused)
+{
+       struct hsi_controller *ssi = m->private;
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+       void __iomem *gdd = omap_ssi->gdd;
+       void __iomem *sys = omap_ssi->sys;
+       int lch;
+
+       pm_runtime_get_sync(ssi->device.parent);
+
+       /* GDD interrupt registers and global GDD registers */
+       seq_printf(m, "GDD_MPU_STATUS\t: 0x%08x\n",
+               readl(sys + SSI_GDD_MPU_IRQ_STATUS_REG));
+       seq_printf(m, "GDD_MPU_ENABLE\t: 0x%08x\n\n",
+               readl(sys + SSI_GDD_MPU_IRQ_ENABLE_REG));
+       seq_printf(m, "HW_ID\t\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_HW_ID_REG));
+       seq_printf(m, "PPORT_ID\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_PPORT_ID_REG));
+       seq_printf(m, "MPORT_ID\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_MPORT_ID_REG));
+       seq_printf(m, "TEST\t\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_TEST_REG));
+       seq_printf(m, "GCR\t\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_GCR_REG));
+
+       /* Per logical channel registers (16-bit ones are read with readw) */
+       for (lch = 0; lch < SSI_MAX_GDD_LCH; lch++) {
+               seq_printf(m, "\nGDD LCH %d\n=========\n", lch);
+               seq_printf(m, "CSDP\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CSDP_REG(lch)));
+               seq_printf(m, "CCR\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CCR_REG(lch)));
+               seq_printf(m, "CICR\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CICR_REG(lch)));
+               seq_printf(m, "CSR\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CSR_REG(lch)));
+               seq_printf(m, "CSSA\t\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_CSSA_REG(lch)));
+               seq_printf(m, "CDSA\t\t: 0x%08x\n",
+                               readl(gdd + SSI_GDD_CDSA_REG(lch)));
+               seq_printf(m, "CEN\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CEN_REG(lch)));
+               seq_printf(m, "CSAC\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CSAC_REG(lch)));
+               seq_printf(m, "CDAC\t\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CDAC_REG(lch)));
+               seq_printf(m, "CLNK_CTRL\t: 0x%04x\n",
+                               readw(gdd + SSI_GDD_CLNK_CTRL_REG(lch)));
+       }
+
+       pm_runtime_put_sync(ssi->device.parent);
+
+       return 0;
+}
+
+/* Open handler for the controller "regs" debugfs file. */
+static int ssi_regs_open(struct inode *inode, struct file *file)
+{
+       void *ctrl = inode->i_private;
+
+       return single_open(file, ssi_debug_show, ctrl);
+}
+
+/* Open handler for the "gdd/regs" debugfs file. */
+static int ssi_gdd_regs_open(struct inode *inode, struct file *file)
+{
+       void *ctrl = inode->i_private;
+
+       return single_open(file, ssi_debug_gdd_show, ctrl);
+}
+
+/* File operations of the controller "regs" debugfs file (seq_file based). */
+static const struct file_operations ssi_regs_fops = {
+       .open           = ssi_regs_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+/* File operations of the "gdd/regs" debugfs file (seq_file based). */
+static const struct file_operations ssi_gdd_regs_fops = {
+       .open           = ssi_gdd_regs_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+/*
+ * Create the controller's debugfs tree: a top-level directory named after
+ * the controller device holding a "regs" file, plus a "gdd" sub-directory
+ * with its own "regs" file. Returns 0 on success or -ENOMEM when one of
+ * the directories cannot be created; in the latter case anything created
+ * so far is removed again.
+ */
+static int ssi_debug_add_ctrl(struct hsi_controller *ssi)
+{
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+       struct dentry *root;
+       struct dentry *gdd_dir;
+
+       /* SSI controller */
+       root = debugfs_create_dir(dev_name(&ssi->device), NULL);
+       omap_ssi->dir = root;
+       if (!root)
+               return -ENOMEM;
+
+       debugfs_create_file("regs", S_IRUGO, root, ssi, &ssi_regs_fops);
+
+       /* SSI GDD (DMA) */
+       gdd_dir = debugfs_create_dir("gdd", root);
+       if (!gdd_dir) {
+               debugfs_remove_recursive(root);
+               return -ENOMEM;
+       }
+
+       debugfs_create_file("regs", S_IRUGO, gdd_dir, ssi, &ssi_gdd_regs_fops);
+
+       return 0;
+}
+
+/* Tear down the controller's whole debugfs tree. */
+static void ssi_debug_remove_ctrl(struct hsi_controller *ssi)
+{
+       struct omap_ssi_controller *ctrl = hsi_controller_drvdata(ssi);
+       struct dentry *root = ctrl->dir;
+
+       debugfs_remove_recursive(root);
+}
+#endif /* CONFIG_DEBUG_FS */
+
+/*
+ * FIXME: Horrible HACK needed until we remove the useless wakeline test
+ * in the CMT. To be removed !!!!
+ *
+ * ssi_waketest - force the wake line of a client's port on or off
+ * @cl: HSI client whose port is affected
+ * @enable: non-zero to set the wake line, zero to clear it
+ *
+ * Enabling takes a runtime PM reference on the controller's parent device
+ * and writes SSI_WAKE(0) to the port's SET_WAKE register; disabling writes
+ * SSI_WAKE(0) to CLEAR_WAKE and drops a runtime PM reference. The calls are
+ * therefore expected to be balanced by the caller.
+ */
+void ssi_waketest(struct hsi_client *cl, unsigned int enable)
+{
+       struct hsi_port *port = hsi_get_port(cl);
+       struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
+       struct hsi_controller *ssi = to_hsi_controller(port->device.parent);
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+
+       omap_port->wktest = !!enable;
+       if (omap_port->wktest) {
+               pm_runtime_get_sync(ssi->device.parent);
+               writel_relaxed(SSI_WAKE(0),
+                               omap_ssi->sys + SSI_SET_WAKE_REG(port->num));
+       } else {
+               writel_relaxed(SSI_WAKE(0),
+                               omap_ssi->sys + SSI_CLEAR_WAKE_REG(port->num));
+               pm_runtime_put_sync(ssi->device.parent);
+       }
+}
+EXPORT_SYMBOL_GPL(ssi_waketest);
+
+/*
+ * ssi_gdd_complete - finish the DMA transfer of one GDD logical channel
+ * @ssi: HSI controller
+ * @lch: logical channel whose interrupt status bit was found set
+ *
+ * Masks the channel's GDD interrupt, unmaps the message's scatterlist,
+ * reads the channel status register and releases the channel. On a timeout
+ * (SSI_CSR_TOUR) the message is marked as failed, dequeued and completed
+ * right away. Otherwise the matching per-channel data-available (read) or
+ * data-accept (write) interrupt is enabled and the message is marked as
+ * completed with its actual length; its completion callback is not invoked
+ * here.
+ */
+static void ssi_gdd_complete(struct hsi_controller *ssi, unsigned int lch)
+{
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+       struct hsi_msg *msg = omap_ssi->gdd_trn[lch].msg;
+       struct hsi_port *port = to_hsi_port(msg->cl->device.parent);
+       struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
+       unsigned int dir;
+       u32 csr;
+       u32 val;
+
+       spin_lock(&omap_ssi->lock);
+
+       /* Mask the GDD interrupt of this logical channel */
+       val = readl(omap_ssi->sys + SSI_GDD_MPU_IRQ_ENABLE_REG);
+       val &= ~SSI_GDD_LCH(lch);
+       writel_relaxed(val, omap_ssi->sys + SSI_GDD_MPU_IRQ_ENABLE_REG);
+
+       /* val now becomes the port interrupt bit to enable further below */
+       if (msg->ttype == HSI_MSG_READ) {
+               dir = DMA_FROM_DEVICE;
+               val = SSI_DATAAVAILABLE(msg->channel);
+               pm_runtime_put_sync(ssi->device.parent);
+       } else {
+               dir = DMA_TO_DEVICE;
+               val = SSI_DATAACCEPT(msg->channel);
+               /* Keep clocks reference for write pio event */
+       }
+       dma_unmap_sg(&ssi->device, msg->sgt.sgl, msg->sgt.nents, dir);
+       csr = readw(omap_ssi->gdd + SSI_GDD_CSR_REG(lch));
+       omap_ssi->gdd_trn[lch].msg = NULL; /* release GDD lch */
+       dev_dbg(&port->device, "DMA completed ch %d ttype %d\n",
+                               msg->channel, msg->ttype);
+       spin_unlock(&omap_ssi->lock);
+       if (csr & SSI_CSR_TOUR) { /* Timeout error */
+               msg->status = HSI_STATUS_ERROR;
+               msg->actual_len = 0;
+               spin_lock(&omap_port->lock);
+               list_del(&msg->link); /* Dequeue msg */
+               spin_unlock(&omap_port->lock);
+               msg->complete(msg);
+               return;
+       }
+       /* Enable the per-channel port interrupt selected above */
+       spin_lock(&omap_port->lock);
+       val |= readl(omap_ssi->sys + SSI_MPU_ENABLE_REG(port->num, 0));
+       writel_relaxed(val, omap_ssi->sys + SSI_MPU_ENABLE_REG(port->num, 0));
+       spin_unlock(&omap_port->lock);
+
+       msg->status = HSI_STATUS_COMPLETED;
+       msg->actual_len = sg_dma_len(msg->sgt.sgl);
+}
+
+/*
+ * Bottom half of the GDD interrupt. @dev is the hsi_controller pointer
+ * cast to unsigned long. Completes every logical channel whose bit is set
+ * in the GDD interrupt status register, acknowledges those bits and then
+ * either reschedules itself (new status bits showed up in the meantime) or
+ * re-enables the GDD interrupt line that ssi_gdd_isr() disabled.
+ */
+static void ssi_gdd_tasklet(unsigned long dev)
+{
+       struct hsi_controller *ssi = (struct hsi_controller *)dev;
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+       void __iomem *sys = omap_ssi->sys;
+       unsigned int lch;
+       u32 status_reg;
+
+       pm_runtime_get_sync(ssi->device.parent);
+
+       status_reg = readl(sys + SSI_GDD_MPU_IRQ_STATUS_REG);
+       for (lch = 0; lch < SSI_MAX_GDD_LCH; lch++) {
+               if (status_reg & SSI_GDD_LCH(lch))
+                       ssi_gdd_complete(ssi, lch);
+       }
+       /* Acknowledge the handled bits, then re-read for late arrivals */
+       writel_relaxed(status_reg, sys + SSI_GDD_MPU_IRQ_STATUS_REG);
+       status_reg = readl(sys + SSI_GDD_MPU_IRQ_STATUS_REG);
+
+       pm_runtime_put_sync(ssi->device.parent);
+
+       if (status_reg)
+               tasklet_hi_schedule(&omap_ssi->gdd_tasklet);
+       else
+               enable_irq(omap_ssi->gdd_irq);
+
+}
+
+/*
+ * GDD interrupt handler. @ssi is the hsi_controller passed at request
+ * time. The real work is deferred to the GDD tasklet; the interrupt line
+ * is disabled here and re-enabled by the tasklet once it has run out of
+ * pending status bits.
+ */
+static irqreturn_t ssi_gdd_isr(int irq, void *ssi)
+{
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+
+       tasklet_hi_schedule(&omap_ssi->gdd_tasklet);
+       disable_irq_nosync(irq);
+
+       return IRQ_HANDLED;
+}
+
+/* Return the current rate of the controller's functional clock (fck). */
+static unsigned long ssi_get_clk_rate(struct hsi_controller *ssi)
+{
+       struct omap_ssi_controller *ctrl = hsi_controller_drvdata(ssi);
+
+       return clk_get_rate(ctrl->fck);
+}
+
+/*
+ * ssi_clk_event - clock rate change notifier for the functional clock
+ * @nb: notifier block embedded in struct omap_ssi_controller (fck_nb)
+ * @event: PRE_RATE_CHANGE, ABORT_RATE_CHANGE or POST_RATE_CHANGE
+ * @data: struct clk_notifier_data with the old and new rate
+ *
+ * Before a rate change every registered port has its wake tasklet disabled
+ * and its pins switched to the idle pinctrl state. After the change (or an
+ * abort) the cached clock rate is refreshed, every port gets its divisor
+ * updated, its pins go back to the default state and its wake tasklet is
+ * re-enabled. Always returns NOTIFY_DONE.
+ */
+static int ssi_clk_event(struct notifier_block *nb, unsigned long event,
+                                                               void *data)
+{
+       struct omap_ssi_controller *omap_ssi = container_of(nb,
+                                       struct omap_ssi_controller, fck_nb);
+       struct hsi_controller *ssi = to_hsi_controller(omap_ssi->dev);
+       struct clk_notifier_data *clk_data = data;
+       struct omap_ssi_port *omap_port;
+       int i;
+
+       switch (event) {
+       case PRE_RATE_CHANGE:
+               dev_dbg(&ssi->device, "pre rate change\n");
+
+               for (i = 0; i < ssi->num_ports; i++) {
+                       omap_port = omap_ssi->port[i];
+
+                       /* skip port slots that have not been filled in */
+                       if (!omap_port)
+                               continue;
+
+                       /* Workaround for SWBREAK + CAwake down race in CMT */
+                       tasklet_disable(&omap_port->wake_tasklet);
+
+                       /* stop all ssi communication */
+                       pinctrl_pm_select_idle_state(omap_port->pdev);
+                       udelay(1); /* wait for racing frames */
+               }
+
+               break;
+       case ABORT_RATE_CHANGE:
+               dev_dbg(&ssi->device, "abort rate change\n");
+               /*
+                * Fall through.
+                * NOTE(review): the shared path below caches
+                * clk_data->new_rate even for an aborted change - confirm
+                * that this is the rate actually in effect after an abort.
+                */
+       case POST_RATE_CHANGE:
+               dev_dbg(&ssi->device, "post rate change (%lu -> %lu)\n",
+                       clk_data->old_rate, clk_data->new_rate);
+               omap_ssi->fck_rate = DIV_ROUND_CLOSEST(clk_data->new_rate, 1000); /* KHz */
+
+               for (i = 0; i < ssi->num_ports; i++) {
+                       omap_port = omap_ssi->port[i];
+
+                       /* skip port slots that have not been filled in */
+                       if (!omap_port)
+                               continue;
+
+                       omap_ssi_port_update_fclk(ssi, omap_port);
+
+                       /* resume ssi communication */
+                       pinctrl_pm_select_default_state(omap_port->pdev);
+                       tasklet_enable(&omap_port->wake_tasklet);
+               }
+
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
+/*
+ * ssi_get_iomem - map a named memory resource of the platform device
+ * @pd: platform device owning the resource
+ * @name: resource name
+ * @pbase: receives the mapped base address
+ * @phy: optional, receives the physical start address of the resource
+ *
+ * The mapping is device-managed and tied to the controller device stored
+ * as driver data of @pd. Returns 0 on success or the negative errno
+ * reported by devm_ioremap_resource().
+ */
+static int ssi_get_iomem(struct platform_device *pd,
+               const char *name, void __iomem **pbase, dma_addr_t *phy)
+{
+       struct hsi_controller *ssi = platform_get_drvdata(pd);
+       struct resource *res;
+       void __iomem *regs;
+
+       res = platform_get_resource_byname(pd, IORESOURCE_MEM, name);
+       regs = devm_ioremap_resource(&ssi->device, res);
+       if (IS_ERR(regs))
+               return PTR_ERR(regs);
+
+       *pbase = regs;
+       if (phy)
+               *phy = res->start;
+
+       return 0;
+}
+
+/*
+ * ssi_add_controller - set up and register the OMAP SSI controller
+ * @ssi: HSI controller allocated by the caller
+ * @pd: platform device backing the controller
+ *
+ * Allocates the driver private data, reserves a controller ID, maps the
+ * "sys" and "gdd" register windows, requests the GDD interrupt, allocates
+ * the port pointer array, acquires the functional clock, installs the
+ * clock rate notifier and finally registers the controller with the HSI
+ * core.
+ *
+ * Returns 0 on success or a negative errno. On failure the controller ID
+ * and the clock notifier are released again if they had been acquired.
+ */
+static int ssi_add_controller(struct hsi_controller *ssi,
+                                               struct platform_device *pd)
+{
+       struct omap_ssi_controller *omap_ssi;
+       int err;
+
+       omap_ssi = devm_kzalloc(&ssi->device, sizeof(*omap_ssi), GFP_KERNEL);
+       if (!omap_ssi) {
+               dev_err(&pd->dev, "not enough memory for omap ssi\n");
+               return -ENOMEM;
+       }
+
+       err = ida_simple_get(&platform_omap_ssi_ida, 0, 0, GFP_KERNEL);
+       if (err < 0)
+               return err; /* no ID was reserved, so there is none to release */
+       ssi->id = err;
+
+       ssi->owner = THIS_MODULE;
+       ssi->device.parent = &pd->dev;
+       dev_set_name(&ssi->device, "ssi%d", ssi->id);
+       hsi_controller_set_drvdata(ssi, omap_ssi);
+       omap_ssi->dev = &ssi->device;
+       err = ssi_get_iomem(pd, "sys", &omap_ssi->sys, NULL);
+       if (err < 0)
+               goto out_err;
+       err = ssi_get_iomem(pd, "gdd", &omap_ssi->gdd, NULL);
+       if (err < 0)
+               goto out_err;
+       err = platform_get_irq_byname(pd, "gdd_mpu");
+       if (err < 0) {
+               dev_err(&pd->dev, "GDD IRQ resource missing\n");
+               goto out_err;
+       }
+       omap_ssi->gdd_irq = err;
+       tasklet_init(&omap_ssi->gdd_tasklet, ssi_gdd_tasklet,
+                                                       (unsigned long)ssi);
+       err = devm_request_irq(&ssi->device, omap_ssi->gdd_irq, ssi_gdd_isr,
+                                               0, "gdd_mpu", ssi);
+       if (err < 0) {
+               dev_err(&ssi->device, "Request GDD IRQ %d failed (%d)\n",
+                                                       omap_ssi->gdd_irq, err);
+               goto out_err;
+       }
+
+       omap_ssi->port = devm_kzalloc(&ssi->device,
+               sizeof(struct omap_ssi_port *) * ssi->num_ports, GFP_KERNEL);
+       if (!omap_ssi->port) {
+               err = -ENOMEM;
+               goto out_err;
+       }
+
+       omap_ssi->fck = devm_clk_get(&ssi->device, "ssi_ssr_fck");
+       if (IS_ERR(omap_ssi->fck)) {
+               dev_err(&pd->dev, "Could not acquire clock \"ssi_ssr_fck\": %li\n",
+                       PTR_ERR(omap_ssi->fck));
+               err = -ENODEV;
+               goto out_err;
+       }
+
+       omap_ssi->fck_nb.notifier_call = ssi_clk_event;
+       omap_ssi->fck_nb.priority = INT_MAX;
+       clk_notifier_register(omap_ssi->fck, &omap_ssi->fck_nb);
+
+       /* TODO: find register, which can be used to detect context loss */
+       omap_ssi->get_loss = NULL;
+
+       omap_ssi->max_speed = UINT_MAX;
+       spin_lock_init(&omap_ssi->lock);
+       err = hsi_register_controller(ssi);
+
+       if (err < 0)
+               goto out_notifier;
+
+       return 0;
+
+out_notifier:
+       /* the notifier must not stay installed for a controller that failed */
+       clk_notifier_unregister(omap_ssi->fck, &omap_ssi->fck_nb);
+out_err:
+       ida_simple_remove(&platform_omap_ssi_ida, ssi->id);
+       return err;
+}
+
+/*
+ * ssi_hw_init - reset the SSI hardware and program its default settings
+ * @ssi: HSI controller
+ *
+ * Soft-resets the SSI controller and polls SYSSTATUS for SSI_RESETDONE
+ * (up to 20 polls, 20 ms apart), resets the GDD, caches the functional
+ * clock rate in KHz and writes the default SYSCONFIG and GDD GCR values,
+ * which are also remembered in the driver data.
+ *
+ * Returns 0 on success, the pm_runtime_get_sync() error if the device could
+ * not be resumed, or -EIO if the reset did not complete.
+ */
+static int ssi_hw_init(struct hsi_controller *ssi)
+{
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+       unsigned int i;
+       u32 val;
+       int err;
+
+       err = pm_runtime_get_sync(ssi->device.parent);
+       if (err < 0) {
+               /*
+                * NOTE(review): no runtime PM put on this path - confirm
+                * whether the usage count taken above needs to be dropped.
+                */
+               dev_err(&ssi->device, "runtime PM failed %d\n", err);
+               return err;
+       }
+       /* Resetting SSI controller */
+       writel_relaxed(SSI_SOFTRESET, omap_ssi->sys + SSI_SYSCONFIG_REG);
+       val = readl(omap_ssi->sys + SSI_SYSSTATUS_REG);
+       for (i = 0; ((i < 20) && !(val & SSI_RESETDONE)); i++) {
+               msleep(20);
+               val = readl(omap_ssi->sys + SSI_SYSSTATUS_REG);
+       }
+       if (!(val & SSI_RESETDONE)) {
+               dev_err(&ssi->device, "SSI HW reset failed\n");
+               pm_runtime_put_sync(ssi->device.parent);
+               return -EIO;
+       }
+       /* Resetting GDD */
+       writel_relaxed(SSI_SWRESET, omap_ssi->gdd + SSI_GDD_GRST_REG);
+       /* Get FCK rate in KHz */
+       omap_ssi->fck_rate = DIV_ROUND_CLOSEST(ssi_get_clk_rate(ssi), 1000);
+       dev_dbg(&ssi->device, "SSI fck rate %lu KHz\n", omap_ssi->fck_rate);
+       /* Set default PM settings */
+       val = SSI_AUTOIDLE | SSI_SIDLEMODE_SMART | SSI_MIDLEMODE_SMART;
+       writel_relaxed(val, omap_ssi->sys + SSI_SYSCONFIG_REG);
+       omap_ssi->sysconfig = val;
+       writel_relaxed(SSI_CLK_AUTOGATING_ON, omap_ssi->sys + SSI_GDD_GCR_REG);
+       omap_ssi->gdd_gcr = SSI_CLK_AUTOGATING_ON;
+       pm_runtime_put_sync(ssi->device.parent);
+
+       return 0;
+}
+
+/*
+ * Undo ssi_add_controller(): stop the GDD tasklet, unregister the
+ * controller from the HSI core, remove the clock notifier and release the
+ * controller ID. The ID is copied to a local before the controller is
+ * unregistered and only that copy is used afterwards.
+ */
+static void ssi_remove_controller(struct hsi_controller *ssi)
+{
+       struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
+       int id = ssi->id;
+       tasklet_kill(&omap_ssi->gdd_tasklet);
+       hsi_unregister_controller(ssi);
+       clk_notifier_unregister(omap_ssi->fck, &omap_ssi->fck_nb);
+       ida_simple_remove(&platform_omap_ssi_ida, id);
+}
+
+/*
+ * Count the available child nodes of @np that are compatible with
+ * "ti,omap3-ssi-port".
+ */
+static inline int ssi_of_get_available_ports_count(const struct device_node *np)
+{
+       struct device_node *port_np;
+       int count = 0;
+
+       for_each_available_child_of_node(np, port_np) {
+               if (!of_device_is_compatible(port_np, "ti,omap3-ssi-port"))
+                       continue;
+               count++;
+       }
+
+       return count;
+}
+
+/*
+ * device_for_each_child() callback: unregister a child platform device
+ * that was created from a device tree node and clear the node's
+ * OF_POPULATED flag. Children without a device tree node are left alone.
+ * Always returns 0 so that the iteration continues.
+ */
+static int ssi_remove_ports(struct device *dev, void *c)
+{
+       struct device_node *node = dev->of_node;
+
+       if (node) {
+               of_node_clear_flag(node, OF_POPULATED);
+               of_device_unregister(to_platform_device(dev));
+       }
+
+       return 0;
+}
+
+/*
+ * ssi_probe - bind the OMAP SSI controller platform device
+ * @pd: platform device, must carry a device tree node
+ *
+ * Allocates an HSI controller sized for the available "ti,omap3-ssi-port"
+ * child nodes, registers it, enables runtime PM, initializes the hardware,
+ * creates the debugfs entries (CONFIG_DEBUG_FS) and finally creates one
+ * platform device per port node.
+ *
+ * Returns 0 on success or a negative errno. Every error path undoes
+ * exactly the steps that had completed before the failure.
+ */
+static int ssi_probe(struct platform_device *pd)
+{
+       struct platform_device *childpdev;
+       struct device_node *np = pd->dev.of_node;
+       struct device_node *child;
+       struct hsi_controller *ssi;
+       int err;
+       int num_ports;
+
+       if (!np) {
+               dev_err(&pd->dev, "missing device tree data\n");
+               return -EINVAL;
+       }
+
+       num_ports = ssi_of_get_available_ports_count(np);
+
+       ssi = hsi_alloc_controller(num_ports, GFP_KERNEL);
+       if (!ssi) {
+               dev_err(&pd->dev, "No memory for controller\n");
+               return -ENOMEM;
+       }
+
+       platform_set_drvdata(pd, ssi);
+
+       err = ssi_add_controller(ssi, pd);
+       if (err < 0) {
+               /* not registered: drop the hsi_alloc_controller() reference */
+               hsi_put_controller(ssi);
+               goto out1;
+       }
+
+       pm_runtime_irq_safe(&pd->dev);
+       pm_runtime_enable(&pd->dev);
+
+       err = ssi_hw_init(ssi);
+       if (err < 0)
+               goto out2;
+#ifdef CONFIG_DEBUG_FS
+       err = ssi_debug_add_ctrl(ssi);
+       if (err < 0)
+               goto out2;
+#endif
+
+       for_each_available_child_of_node(np, child) {
+               if (!of_device_is_compatible(child, "ti,omap3-ssi-port"))
+                       continue;
+
+               childpdev = of_platform_device_create(child, NULL, &pd->dev);
+               if (!childpdev) {
+                       err = -ENODEV;
+                       dev_err(&pd->dev, "failed to create ssi controller port\n");
+                       /* leaving the iterator early: drop its node reference */
+                       of_node_put(child);
+                       goto out3;
+               }
+       }
+
+       dev_info(&pd->dev, "ssi controller %d initialized (%d ports)!\n",
+               ssi->id, num_ports);
+       return err;
+out3:
+       device_for_each_child(&pd->dev, NULL, ssi_remove_ports);
+#ifdef CONFIG_DEBUG_FS
+       /* the debugfs files reference the controller removed below */
+       ssi_debug_remove_ctrl(ssi);
+#endif
+out2:
+       ssi_remove_controller(ssi);
+       /* only reached after pm_runtime_enable(), keeping the pair balanced */
+       pm_runtime_disable(&pd->dev);
+out1:
+       platform_set_drvdata(pd, NULL);
+
+       return err;
+}
+
+/*
+ * ssi_remove - unbind the OMAP SSI controller platform device
+ * @pd: platform device
+ *
+ * Unregisters the child port devices, removes the debugfs entries
+ * (CONFIG_DEBUG_FS), removes the controller and disables runtime PM.
+ * Always returns 0.
+ */
+static int ssi_remove(struct platform_device *pd)
+{
+       struct hsi_controller *ssi = platform_get_drvdata(pd);
+
+       /* unregister the child devices that have a device tree node */
+       device_for_each_child(&pd->dev, NULL, ssi_remove_ports);
+
+#ifdef CONFIG_DEBUG_FS
+       ssi_debug_remove_ctrl(ssi);
+#endif
+       ssi_remove_controller(ssi);
+       platform_set_drvdata(pd, NULL);
+
+       pm_runtime_disable(&pd->dev);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM
+/*
+ * Runtime suspend callback: when a context loss counter callback is
+ * installed, remember its current value so that resume can tell whether
+ * the context was lost. Always returns 0.
+ */
+static int omap_ssi_runtime_suspend(struct device *dev)
+{
+       struct hsi_controller *ssi = dev_get_drvdata(dev);
+       struct omap_ssi_controller *ctrl = hsi_controller_drvdata(ssi);
+
+       dev_dbg(dev, "runtime suspend!\n");
+
+       if (!ctrl->get_loss)
+               return 0;
+
+       ctrl->loss_count = ctrl->get_loss(ssi->device.parent);
+
+       return 0;
+}
+
+/*
+ * Runtime resume callback: rewrite the saved GDD GCR value unless a
+ * context loss counter callback is installed and reports the same count
+ * that was recorded at suspend time. Always returns 0.
+ */
+static int omap_ssi_runtime_resume(struct device *dev)
+{
+       struct hsi_controller *ssi = dev_get_drvdata(dev);
+       struct omap_ssi_controller *ctrl = hsi_controller_drvdata(ssi);
+       bool restore = true;
+
+       dev_dbg(dev, "runtime resume!\n");
+
+       if (ctrl->get_loss &&
+           ctrl->loss_count == ctrl->get_loss(ssi->device.parent))
+               restore = false;
+
+       if (restore)
+               writel_relaxed(ctrl->gdd_gcr, ctrl->gdd + SSI_GDD_GCR_REG);
+
+       return 0;
+}
+
+/* Runtime PM callbacks of the controller; no idle callback is provided. */
+static const struct dev_pm_ops omap_ssi_pm_ops = {
+       SET_RUNTIME_PM_OPS(omap_ssi_runtime_suspend, omap_ssi_runtime_resume,
+               NULL)
+};
+
+#define DEV_PM_OPS     (&omap_ssi_pm_ops)
+#else
+#define DEV_PM_OPS     NULL
+#endif
+
+#ifdef CONFIG_OF
+/* Device tree match table: the controller binds to "ti,omap3-ssi" nodes. */
+static const struct of_device_id omap_ssi_of_match[] = {
+       { .compatible = "ti,omap3-ssi", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, omap_ssi_of_match);
+#else
+#define omap_ssi_of_match NULL
+#endif
+
+/* Platform driver of the SSI controller ("omap_ssi"). */
+static struct platform_driver ssi_pdriver = {
+       .probe = ssi_probe,
+       .remove = ssi_remove,
+       .driver = {
+               .name   = "omap_ssi",
+               .pm     = DEV_PM_OPS,
+               .of_match_table = omap_ssi_of_match,
+       },
+};
+
+/*
+ * Module init: register the controller platform driver, then the port
+ * platform driver. If the port driver cannot be registered, the controller
+ * driver is unregistered again so that a failed module load leaves no
+ * driver behind. Returns 0 on success or the registration error.
+ */
+static int __init ssi_init(void) {
+       int ret;
+
+       ret = platform_driver_register(&ssi_pdriver);
+       if (ret)
+               return ret;
+
+       ret = platform_driver_register(&ssi_port_pdriver);
+       if (ret)
+               platform_driver_unregister(&ssi_pdriver);
+
+       return ret;
+}
+module_init(ssi_init);
+
+/* Module exit: unregister both drivers in reverse order of registration. */
+static void __exit ssi_exit(void) {
+       platform_driver_unregister(&ssi_port_pdriver);
+       platform_driver_unregister(&ssi_pdriver);
+}
+module_exit(ssi_exit);
+
+MODULE_ALIAS("platform:omap_ssi");
+MODULE_AUTHOR("Carlos Chinea <carlos.chinea@nokia.com>");
+MODULE_AUTHOR("Sebastian Reichel <sre@kernel.org>");
+MODULE_DESCRIPTION("Synchronous Serial Interface Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/hsi/controllers/omap_ssi_port.c b/drivers/hsi/controllers/omap_ssi_port.c

index e80a66e20998697de466d6b5fe7bb773d7a77888..6b8f7739768aee770fcc0f82fa7e89078a5912f4 100644 (file)
--- a/drivers/hsi/controllers/omap_ssi_port.c
+++ b/drivers/hsi/controllers/omap_ssi_port.c
@@ -23,8 +23,10 @@
  #include <linux/platform_device.h>
  #include <linux/dma-mapping.h>
  #include <linux/pm_runtime.h>
+#include <linux/delay.h>
  
-#include <linux/of_gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/pinctrl/consumer.h>
  #include <linux/debugfs.h>
  
  #include "omap_ssi_regs.h"
@@ -43,7 +45,7 @@ static inline int hsi_dummy_cl(struct hsi_client *cl __maybe_unused)
  static inline unsigned int ssi_wakein(struct hsi_port *port)
  {
         struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
-       return gpio_get_value(omap_port->wake_gpio);
+       return gpiod_get_value(omap_port->wake_gpio);
  }
  
  #ifdef CONFIG_DEBUG_FS
@@ -171,7 +173,7 @@ static int ssi_div_set(void *data, u64 val)
  
  DEFINE_SIMPLE_ATTRIBUTE(ssi_sst_div_fops, ssi_div_get, ssi_div_set, "%llu\n");
  
-static int __init ssi_debug_add_port(struct omap_ssi_port *omap_port,
+static int ssi_debug_add_port(struct omap_ssi_port *omap_port,
                                      struct dentry *dir)
  {
         struct hsi_port *port = to_hsi_port(omap_port->dev);
@@ -514,6 +516,11 @@ static int ssi_flush(struct hsi_client *cl)
  
         pm_runtime_get_sync(omap_port->pdev);
         spin_lock_bh(&omap_port->lock);
+
+       /* stop all ssi communication */
+       pinctrl_pm_select_idle_state(omap_port->pdev);
+       udelay(1); /* wait for racing frames */
+
         /* Stop all DMA transfers */
         for (i = 0; i < SSI_MAX_GDD_LCH; i++) {
                 msg = omap_ssi->gdd_trn[i].msg;
@@ -550,6 +557,10 @@ static int ssi_flush(struct hsi_client *cl)
                 ssi_flush_queue(&omap_port->rxqueue[i], NULL);
         }
         ssi_flush_queue(&omap_port->brkqueue, NULL);
+
+       /* Resume SSI communication */
+       pinctrl_pm_select_default_state(omap_port->pdev);
+
         spin_unlock_bh(&omap_port->lock);
         pm_runtime_put_sync(omap_port->pdev);
  
@@ -1007,7 +1018,7 @@ static irqreturn_t ssi_wake_isr(int irq __maybe_unused, void *ssi_port)
         return IRQ_HANDLED;
  }
  
-static int __init ssi_port_irq(struct hsi_port *port,
+static int ssi_port_irq(struct hsi_port *port,
                                                 struct platform_device *pd)
  {
         struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
@@ -1029,19 +1040,19 @@ static int __init ssi_port_irq(struct hsi_port *port,
         return err;
  }
  
-static int __init ssi_wake_irq(struct hsi_port *port,
+static int ssi_wake_irq(struct hsi_port *port,
                                                 struct platform_device *pd)
  {
         struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
         int cawake_irq;
         int err;
  
-       if (omap_port->wake_gpio == -1) {
+       if (!omap_port->wake_gpio) {
                 omap_port->wake_irq = -1;
                 return 0;
         }
  
-       cawake_irq = gpio_to_irq(omap_port->wake_gpio);
+       cawake_irq = gpiod_to_irq(omap_port->wake_gpio);
  
         omap_port->wake_irq = cawake_irq;
         tasklet_init(&omap_port->wake_tasklet, ssi_wake_tasklet,
@@ -1060,7 +1071,7 @@ static int __init ssi_wake_irq(struct hsi_port *port,
         return err;
  }
  
-static void __init ssi_queues_init(struct omap_ssi_port *omap_port)
+static void ssi_queues_init(struct omap_ssi_port *omap_port)
  {
         unsigned int ch;
  
@@ -1071,7 +1082,7 @@ static void __init ssi_queues_init(struct omap_ssi_port *omap_port)
         INIT_LIST_HEAD(&omap_port->brkqueue);
  }
  
-static int __init ssi_port_get_iomem(struct platform_device *pd,
+static int ssi_port_get_iomem(struct platform_device *pd,
                 const char *name, void __iomem **pbase, dma_addr_t *phy)
  {
         struct hsi_port *port = platform_get_drvdata(pd);
@@ -1104,24 +1115,19 @@ static int __init ssi_port_get_iomem(struct platform_device *pd,
         return 0;
  }
  
-static int __init ssi_port_probe(struct platform_device *pd)
+static int ssi_port_probe(struct platform_device *pd)
  {
         struct device_node *np = pd->dev.of_node;
         struct hsi_port *port;
         struct omap_ssi_port *omap_port;
         struct hsi_controller *ssi = dev_get_drvdata(pd->dev.parent);
         struct omap_ssi_controller *omap_ssi = hsi_controller_drvdata(ssi);
-       int cawake_gpio = 0;
+       struct gpio_desc *cawake_gpio = NULL;
         u32 port_id;
         int err;
  
         dev_dbg(&pd->dev, "init ssi port...\n");
  
-       if (!try_module_get(ssi->owner)) {
-               dev_err(&pd->dev, "could not increment parent module refcount\n");
-               return -ENODEV;
-       }
-
         if (!ssi->port || !omap_ssi->port) {
                 dev_err(&pd->dev, "ssi controller not initialized!\n");
                 err = -ENODEV;
@@ -1147,20 +1153,10 @@ static int __init ssi_port_probe(struct platform_device *pd)
                 goto error;
         }
  
-       err = of_get_named_gpio(np, "ti,ssi-cawake-gpio", 0);
-       if (err < 0) {
-               dev_err(&pd->dev, "DT data is missing cawake gpio (err=%d)\n",
-                       err);
-               goto error;
-       }
-       cawake_gpio = err;
-
-       err = devm_gpio_request_one(&port->device, cawake_gpio, GPIOF_DIR_IN,
-               "cawake");
-       if (err) {
-               dev_err(&pd->dev, "could not request cawake gpio (err=%d)!\n",
-                       err);
-               err = -ENXIO;
+       cawake_gpio = devm_gpiod_get(&pd->dev, "ti,ssi-cawake", GPIOD_IN);
+       if (IS_ERR(cawake_gpio)) {
+               err = PTR_ERR(cawake_gpio);
+               dev_err(&pd->dev, "couldn't get cawake gpio (err=%d)!\n", err);
                 goto error;
         }
  
@@ -1219,8 +1215,7 @@ static int __init ssi_port_probe(struct platform_device *pd)
  
         hsi_add_clients_from_dt(port, np);
  
-       dev_info(&pd->dev, "ssi port %u successfully initialized (cawake=%d)\n",
-               port_id, cawake_gpio);
+       dev_info(&pd->dev, "ssi port %u successfully initialized\n", port_id);
  
         return 0;
  
@@ -1228,7 +1223,7 @@ error:
         return err;
  }
  
-static int __exit ssi_port_remove(struct platform_device *pd)
+static int ssi_port_remove(struct platform_device *pd)
  {
         struct hsi_port *port = platform_get_drvdata(pd);
         struct omap_ssi_port *omap_port = hsi_port_drvdata(port);
@@ -1253,12 +1248,28 @@ static int __exit ssi_port_remove(struct platform_device *pd)
  
         omap_ssi->port[omap_port->port_id] = NULL;
         platform_set_drvdata(pd, NULL);
-       module_put(ssi->owner);
         pm_runtime_disable(&pd->dev);
  
         return 0;
  }
  
+/*
+ * Write the divisor cached in omap_port->sst.divisor to the port's SST
+ * divisor register. Always returns 0.
+ */
+static int ssi_restore_divisor(struct omap_ssi_port *omap_port)
+{
+       writel_relaxed(omap_port->sst.divisor,
+                               omap_port->sst_base + SSI_SST_DIVISOR_REG);
+
+       return 0;
+}
+
+/*
+ * omap_ssi_port_update_fclk - refresh a port's SST divisor
+ * @ssi: HSI controller the port belongs to
+ * @omap_port: port to update
+ *
+ * Recomputes the divisor with ssi_calculate_div(), stores it in the port's
+ * cached SST context and writes it to the hardware.
+ */
+void omap_ssi_port_update_fclk(struct hsi_controller *ssi,
+                              struct omap_ssi_port *omap_port)
+{
+       /* update divisor */
+       u32 div = ssi_calculate_div(ssi);
+       omap_port->sst.divisor = div;
+       ssi_restore_divisor(omap_port);
+}
+
  #ifdef CONFIG_PM
  static int ssi_save_port_ctx(struct omap_ssi_port *omap_port)
  {
@@ -1311,14 +1322,6 @@ static int ssi_restore_port_mode(struct omap_ssi_port *omap_port)
         return 0;
  }
  
-static int ssi_restore_divisor(struct omap_ssi_port *omap_port)
-{
-       writel_relaxed(omap_port->sst.divisor,
-                               omap_port->sst_base + SSI_SST_DIVISOR_REG);
-
-       return 0;
-}
-
  static int omap_ssi_port_runtime_suspend(struct device *dev)
  {
         struct hsi_port *port = dev_get_drvdata(dev);
@@ -1380,19 +1383,12 @@ MODULE_DEVICE_TABLE(of, omap_ssi_port_of_match);
  #define omap_ssi_port_of_match NULL
  #endif
  
-static struct platform_driver ssi_port_pdriver = {
-       .remove = __exit_p(ssi_port_remove),
+struct platform_driver ssi_port_pdriver = {
+       .probe = ssi_port_probe,
+       .remove = ssi_port_remove,
         .driver = {
                 .name   = "omap_ssi_port",
                 .of_match_table = omap_ssi_port_of_match,
                 .pm     = DEV_PM_OPS,
         },
  };
-
-module_platform_driver_probe(ssi_port_pdriver, ssi_port_probe);
-
-MODULE_ALIAS("platform:omap_ssi_port");
-MODULE_AUTHOR("Carlos Chinea <carlos.chinea@nokia.com>");
-MODULE_AUTHOR("Sebastian Reichel <sre@kernel.org>");
-MODULE_DESCRIPTION("Synchronous Serial Interface Port Driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c

index d9d6022c5aca42bbf959d214abb76b5ee3fe8704..d2209147dc8912ae590584a5a0890171239f9ad2 100644 (file)
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma)
         if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
                 return;
  
-       /* drop page _counts */
+       /* drop page _refcounts */
         for (pg = 0; pg < msc->nr_pages; pg++) {
                 struct page *page = msc_buffer_get_page(msc, pg);
  
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile

index f818538a7f4e118b309052c3d43a1aa18d5dbe65..26987d9d7e1cdccd5922322e34fd31466d8c064d 100644 (file)
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,9 +8,9 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) +=    ib_umad.o
  obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=        ib_uverbs.o ib_ucm.o \
                                         $(user_access-y)
  
-ib_core-y :=                   packer.o ud_header.o verbs.o cq.o sysfs.o \
+ib_core-y :=                   packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
                                 device.o fmr_pool.o cache.o netlink.o \
-                               roce_gid_mgmt.o
+                               roce_gid_mgmt.o mr_pool.o
  ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
  
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c

index 93ab0ae9720889f9cf51d6629a72469b67d646d4..f0c91ba3178a12030a8cc4e38db7a651633759b8 100644 (file)
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -800,6 +800,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
         if (id->device != pd->device)
                 return -EINVAL;
  
+       qp_init_attr->port_num = id->port_num;
         qp = ib_create_qp(pd, qp_init_attr);
         if (IS_ERR(qp))
                 return PTR_ERR(qp);
@@ -4294,7 +4295,8 @@ static int __init cma_init(void)
         if (ret)
                 goto err;
  
-       if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
+       if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table),
+                           cma_cb_table))
                 pr_warn("RDMA CMA: failed to add netlink callback\n");
         cma_configfs_init();
  
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c

index e28a160cdab03650441cd0ff48b44e0cffce9a68..f0572049d291e8b862d88f5d45a63092cc61ffeb 100644 (file)
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -459,7 +459,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
         if (pm_addr->ss_family == AF_INET) {
                 struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr;
  
-               if (pm4_addr->sin_addr.s_addr == INADDR_ANY) {
+               if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) {
                         struct sockaddr_in *cm4_addr =
                                 (struct sockaddr_in *)cm_addr;
                         struct sockaddr_in *cm4_outaddr =
@@ -1175,7 +1175,7 @@ static int __init iw_cm_init(void)
         if (ret)
                 pr_err("iw_cm: couldn't init iwpm\n");
  
-       ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS,
+       ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table),
                               iwcm_nl_cb_table);
         if (ret)
                 pr_err("iw_cm: couldn't register netlink callbacks\n");
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c

index 9b2bf2fb2b00674287e74e7cb3048a5fa296b91e..b65e06c560d7ddafa8c53292a5f6e36c917a9244 100644 (file)
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -634,6 +634,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
         if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
                            RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
                 pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+               dev_kfree_skb(skb);
                 return -ENOMEM;
         }
         nlh->nlmsg_type = NLMSG_DONE;
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c

new file mode 100644 (file)

index 0000000..49d478b
--- /dev/null
+++ b/drivers/infiniband/core/mr_pool.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/mr_pool.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list)
+{
+       struct ib_mr *mr;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->mr_lock, flags);
+       mr = list_first_entry_or_null(list, struct ib_mr, qp_entry);
+       if (mr) {
+               list_del(&mr->qp_entry);
+               qp->mrs_used++;
+       }
+       spin_unlock_irqrestore(&qp->mr_lock, flags);
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_mr_pool_get);
+
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->mr_lock, flags);
+       list_add(&mr->qp_entry, list);
+       qp->mrs_used--;
+       spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_put);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+               enum ib_mr_type type, u32 max_num_sg)
+{
+       struct ib_mr *mr;
+       unsigned long flags;
+       int ret, i;
+
+       for (i = 0; i < nr; i++) {
+               mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+               if (IS_ERR(mr)) {
+                       ret = PTR_ERR(mr);
+                       goto out;
+               }
+
+               spin_lock_irqsave(&qp->mr_lock, flags);
+               list_add_tail(&mr->qp_entry, list);
+               spin_unlock_irqrestore(&qp->mr_lock, flags);
+       }
+
+       return 0;
+out:
+       ib_mr_pool_destroy(qp, list);
+       return ret;
+}
+EXPORT_SYMBOL(ib_mr_pool_init);
+
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list)
+{
+       struct ib_mr *mr;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->mr_lock, flags);
+       while (!list_empty(list)) {
+               mr = list_first_entry(list, struct ib_mr, qp_entry);
+               list_del(&mr->qp_entry);
+
+               spin_unlock_irqrestore(&qp->mr_lock, flags);
+               ib_dereg_mr(mr);
+               spin_lock_irqsave(&qp->mr_lock, flags);
+       }
+       spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_destroy);
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c

index d47df935677966b6b8ceaf3217036ee8a16f4507..9b8c20c8209bcfa6deb62a20a1f598592e4b7f9b 100644 (file)
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -151,12 +151,11 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
         struct ibnl_client *client;
         int type = nlh->nlmsg_type;
         int index = RDMA_NL_GET_CLIENT(type);
-       int op = RDMA_NL_GET_OP(type);
+       unsigned int op = RDMA_NL_GET_OP(type);
  
         list_for_each_entry(client, &client_list, list) {
                 if (client->index == index) {
-                       if (op < 0 || op >= client->nops ||
-                           !client->cb_table[op].dump)
+                       if (op >= client->nops || !client->cb_table[op].dump)
                                 return -EINVAL;
  
                         /*
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c

new file mode 100644 (file)

index 0000000..1eb9b12
--- /dev/null
+++ b/drivers/infiniband/core/rw.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <rdma/mr_pool.h>
+#include <rdma/rw.h>
+
+enum {
+       RDMA_RW_SINGLE_WR,
+       RDMA_RW_MULTI_WR,
+       RDMA_RW_MR,
+       RDMA_RW_SIG_MR,
+};
+
+static bool rdma_rw_force_mr;
+module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
+MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
+
+/*
+ * Check if the device might use memory registration.  This is currently only
+ * true for iWarp devices. In the future we can hopefully fine tune this based
+ * on HCA driver input.
+ */
+static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
+{
+       if (rdma_protocol_iwarp(dev, port_num))
+               return true;
+       if (unlikely(rdma_rw_force_mr))
+               return true;
+       return false;
+}
+
+/*
+ * Check if the device will use memory registration for this RW operation.
+ * We currently always use memory registrations for iWarp RDMA READs, and
+ * have a debug option to force usage of MRs.
+ *
+ * XXX: In the future we can hopefully fine tune this based on HCA driver
+ * input.
+ */
+static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
+               enum dma_data_direction dir, int dma_nents)
+{
+       if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
+               return true;
+       if (unlikely(rdma_rw_force_mr))
+               return true;
+       return false;
+}
+
+static inline u32 rdma_rw_max_sge(struct ib_device *dev,
+               enum dma_data_direction dir)
+{
+       return dir == DMA_TO_DEVICE ?
+               dev->attrs.max_sge : dev->attrs.max_sge_rd;
+}
+
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+{
+       /* arbitrary limit to avoid allocating gigantic resources */
+       return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+}
+
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
+               struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
+               u32 sg_cnt, u32 offset)
+{
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 nents = min(sg_cnt, pages_per_mr);
+       int count = 0, ret;     /* count: WRs set up here (inv and/or reg) */
+
+       reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
+       if (!reg->mr)
+               return -EAGAIN; /* pool is empty */
+
+       if (reg->mr->need_inval) {
+               reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+               reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+               reg->inv_wr.next = &reg->reg_wr.wr;
+               count++;
+       } else {
+               reg->inv_wr.next = NULL;
+       }
+       /* ret is signed, nents is u32: catch errors before comparing them. */
+       ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
+       if (ret < 0 || ret < nents) {
+               ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
+               return -EINVAL;
+       }
+
+       reg->reg_wr.wr.opcode = IB_WR_REG_MR;
+       reg->reg_wr.mr = reg->mr;
+       reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+       if (rdma_protocol_iwarp(qp->device, port_num))
+               reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+       count++;
+
+       reg->sge.addr = reg->mr->iova;
+       reg->sge.length = reg->mr->length;
+       return count;
+}
+
+static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
+               u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       int i, j, ret = 0, count = 0;
+
+       ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
+       ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+       if (!ctx->reg) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < ctx->nr_ops; i++) {
+               struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL;
+               struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+               u32 nents = min(sg_cnt, pages_per_mr);
+
+               ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
+                               offset);
+               if (ret < 0)
+                       goto out_free;
+               count += ret;
+
+               if (prev) {
+                       if (reg->mr->need_inval)
+                               prev->wr.wr.next = &reg->inv_wr;
+                       else
+                               prev->wr.wr.next = &reg->reg_wr.wr;
+               }
+
+               reg->reg_wr.wr.next = &reg->wr.wr;
+
+               reg->wr.wr.sg_list = &reg->sge;
+               reg->wr.wr.num_sge = 1;
+               reg->wr.remote_addr = remote_addr;
+               reg->wr.rkey = rkey;
+               if (dir == DMA_TO_DEVICE) {
+                       reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+               } else if (!rdma_cap_read_inv(qp->device, port_num)) {
+                       reg->wr.wr.opcode = IB_WR_RDMA_READ;
+               } else {
+                       reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+                       reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+               }
+               count++;
+
+               remote_addr += reg->sge.length;
+               sg_cnt -= nents;
+               for (j = 0; j < nents; j++)
+                       sg = sg_next(sg);
+               offset = 0;
+       }
+
+       ctx->type = RDMA_RW_MR;
+       return count;
+
+out_free:
+       while (--i >= 0)
+               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+       kfree(ctx->reg);
+out:
+       return ret;
+}
+
+static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               struct scatterlist *sg, u32 sg_cnt, u32 offset,
+               u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+       struct ib_device *dev = qp->pd->device;
+       u32 max_sge = rdma_rw_max_sge(dev, dir);
+       struct ib_sge *sge;
+       u32 total_len = 0, i, j;
+
+       ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
+
+       ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
+       if (!ctx->map.sges)
+               goto out;
+
+       ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+       if (!ctx->map.wrs)
+               goto out_free_sges;
+
+       for (i = 0; i < ctx->nr_ops; i++) {
+               struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+               u32 nr_sge = min(sg_cnt, max_sge);
+
+               if (dir == DMA_TO_DEVICE)
+                       rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+               else
+                       rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+               rdma_wr->remote_addr = remote_addr + total_len;
+               rdma_wr->rkey = rkey;
+               rdma_wr->wr.sg_list = sge;
+
+               for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
+                       rdma_wr->wr.num_sge++;
+
+                       sge->addr = ib_sg_dma_address(dev, sg) + offset;
+                       sge->length = ib_sg_dma_len(dev, sg) - offset;
+                       sge->lkey = qp->pd->local_dma_lkey;
+
+                       total_len += sge->length;
+                       sge++;
+                       sg_cnt--;
+                       offset = 0;
+               }
+
+               if (i + 1 < ctx->nr_ops)
+                       rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
+       }
+
+       ctx->type = RDMA_RW_MULTI_WR;
+       return ctx->nr_ops;
+
+out_free_sges:
+       kfree(ctx->map.sges);
+out:
+       return -ENOMEM;
+}
+
+static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
+               enum dma_data_direction dir)
+{
+       struct ib_device *dev = qp->pd->device;
+       struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+
+       ctx->nr_ops = 1;
+
+       ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+       ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
+       ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+
+       memset(rdma_wr, 0, sizeof(*rdma_wr));
+       if (dir == DMA_TO_DEVICE)
+               rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+       else
+               rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+       rdma_wr->wr.sg_list = &ctx->single.sge;
+       rdma_wr->wr.num_sge = 1;
+       rdma_wr->remote_addr = remote_addr;
+       rdma_wr->rkey = rkey;
+
+       ctx->type = RDMA_RW_SINGLE_WR;
+       return 1;
+}
+
+/**
+ * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
+ * @ctx:       context to initialize
+ * @qp:                queue pair to operate on
+ * @port_num:  port num to which the connection is bound
+ * @sg:                scatterlist to READ/WRITE from/to
+ * @sg_cnt:    number of entries in @sg
+ * @sg_offset: current byte offset into @sg
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey:      remote key to operate on
+ * @dir:       %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+               u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+       struct ib_device *dev = qp->pd->device;
+       struct scatterlist *orig_sg = sg;
+       u32 orig_sg_cnt = sg_cnt;
+       int ret;
+
+       ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+       if (!ret)
+               return -ENOMEM;
+       sg_cnt = ret;
+
+       /* Skip to the S/G entry that sg_offset falls into: */
+       while (sg_cnt) {
+               u32 len = ib_sg_dma_len(dev, sg);
+
+               if (sg_offset < len)
+                       break;
+               sg = sg_next(sg);
+               sg_offset -= len;
+               sg_cnt--;
+       }
+
+       ret = -EIO;
+       if (WARN_ON_ONCE(sg_cnt == 0))
+               goto out_unmap_sg;
+
+       if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
+               ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
+                               sg_offset, remote_addr, rkey, dir);
+       } else if (sg_cnt > 1) {
+               ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
+                               remote_addr, rkey, dir);
+       } else {
+               ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
+                               remote_addr, rkey, dir);
+       }
+
+       if (ret < 0)
+               goto out_unmap_sg;
+       return ret;
+
+out_unmap_sg:
+       /* Unmap what was mapped, not the sg/sg_cnt advanced past sg_offset. */
+       ib_dma_unmap_sg(dev, orig_sg, orig_sg_cnt, dir);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init);
+
+/**
+ * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
+ * @ctx:       context to initialize
+ * @qp:                queue pair to operate on
+ * @port_num:  port num to which the connection is bound
+ * @sg:                scatterlist to READ/WRITE from/to
+ * @sg_cnt:    number of entries in @sg
+ * @prot_sg:   scatterlist to READ/WRITE protection information from/to
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @sig_attrs: signature offloading algorithms
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey:      remote key to operate on
+ * @dir:       %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               struct scatterlist *prot_sg, u32 prot_sg_cnt,
+               struct ib_sig_attrs *sig_attrs,
+               u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+       struct ib_device *dev = qp->pd->device;
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       struct ib_rdma_wr *rdma_wr;
+       struct ib_send_wr *prev_wr = NULL;
+       int count = 0, ret;
+
+       if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
+               pr_err("SG count too large\n");
+               return -EINVAL;
+       }
+
+       ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+       if (!ret)
+               return -ENOMEM;
+       sg_cnt = ret;
+
+       ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+       if (!ret) {
+               ret = -ENOMEM;
+               goto out_unmap_sg;
+       }
+       prot_sg_cnt = ret;
+
+       ctx->type = RDMA_RW_SIG_MR;
+       ctx->nr_ops = 1;
+       ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
+       if (!ctx->sig) {
+               ret = -ENOMEM;
+               goto out_unmap_prot_sg;
+       }
+
+       ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
+       if (ret < 0)
+               goto out_free_ctx;
+       count += ret;
+       prev_wr = &ctx->sig->data.reg_wr.wr;
+
+       if (prot_sg_cnt) {
+               ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
+                               prot_sg, prot_sg_cnt, 0);
+               if (ret < 0)
+                       goto out_destroy_data_mr;
+               count += ret;
+
+               if (ctx->sig->prot.inv_wr.next)
+                       prev_wr->next = &ctx->sig->prot.inv_wr;
+               else
+                       prev_wr->next = &ctx->sig->prot.reg_wr.wr;
+               prev_wr = &ctx->sig->prot.reg_wr.wr;
+       } else {
+               ctx->sig->prot.mr = NULL;
+       }
+
+       ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+       if (!ctx->sig->sig_mr) {
+               ret = -EAGAIN;
+               goto out_destroy_prot_mr;
+       }
+
+       if (ctx->sig->sig_mr->need_inval) {
+               memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+
+               ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
+               ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+
+               prev_wr->next = &ctx->sig->sig_inv_wr;
+               prev_wr = &ctx->sig->sig_inv_wr;
+       }
+
+       ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
+       ctx->sig->sig_wr.wr.wr_cqe = NULL;
+       ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
+       ctx->sig->sig_wr.wr.num_sge = 1;
+       ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
+       ctx->sig->sig_wr.sig_attrs = sig_attrs;
+       ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
+       if (prot_sg_cnt)
+               ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
+       prev_wr->next = &ctx->sig->sig_wr.wr;
+       prev_wr = &ctx->sig->sig_wr.wr;
+       count++;
+
+       ctx->sig->sig_sge.addr = 0;
+       ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
+       if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
+               ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+
+       rdma_wr = &ctx->sig->data.wr;
+       rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+       rdma_wr->wr.num_sge = 1;
+       rdma_wr->remote_addr = remote_addr;
+       rdma_wr->rkey = rkey;
+       if (dir == DMA_TO_DEVICE)
+               rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+       else
+               rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+       prev_wr->next = &rdma_wr->wr;
+       prev_wr = &rdma_wr->wr;
+       count++;
+
+       return count;
+
+out_destroy_prot_mr:
+       if (prot_sg_cnt)
+               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+out_destroy_data_mr:
+       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_free_ctx:
+       kfree(ctx->sig);
+out_unmap_prot_sg:
+       ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+out_unmap_sg:
+       ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
+
+/*
+ * Now that we are going to post the WRs we can update the lkey and need_inval
+ * state on the MRs.  If we were doing this at init time, we would get double
+ * or missing invalidations if a context was initialized but not actually
+ * posted.
+ */
+static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
+{
+       reg->mr->need_inval = need_inval;
+       ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
+       reg->reg_wr.key = reg->mr->lkey;
+       reg->sge.lkey = reg->mr->lkey;
+}
+
+/**
+ * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
+ * @ctx:       context to operate on
+ * @qp:                queue pair to operate on
+ * @port_num:  port num to which the connection is bound
+ * @cqe:       completion queue entry for the last WR
+ * @chain_wr:  WR to append to the posted chain
+ *
+ * Return the WR chain for the set of RDMA READ/WRITE operations described by
+ * @ctx, as well as any memory registration operations needed.  If @chain_wr
+ * is non-NULL the WR it points to will be appended to the chain of WRs posted.
+ * If @chain_wr is not set @cqe must be set so that the caller gets a
+ * completion notification.
+ */
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+       struct ib_send_wr *first_wr, *last_wr;
+       int i;
+
+       switch (ctx->type) {
+       case RDMA_RW_SIG_MR:
+               rdma_rw_update_lkey(&ctx->sig->data, true);
+               if (ctx->sig->prot.mr)
+                       rdma_rw_update_lkey(&ctx->sig->prot, true);
+       
+               ctx->sig->sig_mr->need_inval = true;
+               ib_update_fast_reg_key(ctx->sig->sig_mr,
+                       ib_inc_rkey(ctx->sig->sig_mr->lkey));
+               ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
+
+               if (ctx->sig->data.inv_wr.next)
+                       first_wr = &ctx->sig->data.inv_wr;
+               else
+                       first_wr = &ctx->sig->data.reg_wr.wr;
+               last_wr = &ctx->sig->data.wr.wr;
+               break;
+       case RDMA_RW_MR:
+               for (i = 0; i < ctx->nr_ops; i++) {
+                       rdma_rw_update_lkey(&ctx->reg[i],
+                               ctx->reg[i].wr.wr.opcode !=
+                                       IB_WR_RDMA_READ_WITH_INV);
+               }
+
+               if (ctx->reg[0].inv_wr.next)
+                       first_wr = &ctx->reg[0].inv_wr;
+               else
+                       first_wr = &ctx->reg[0].reg_wr.wr;
+               last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
+               break;
+       case RDMA_RW_MULTI_WR:
+               first_wr = &ctx->map.wrs[0].wr;
+               last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
+               break;
+       case RDMA_RW_SINGLE_WR:
+               first_wr = &ctx->single.wr.wr;
+               last_wr = &ctx->single.wr.wr;
+               break;
+       default:
+               BUG();
+       }
+
+       if (chain_wr) {
+               last_wr->next = chain_wr;
+       } else {
+               last_wr->wr_cqe = cqe;
+               last_wr->send_flags |= IB_SEND_SIGNALED;
+       }
+
+       return first_wr;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_wrs);
+
+/**
+ * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
+ * @ctx:       context to operate on
+ * @qp:                queue pair to operate on
+ * @port_num:  port num to which the connection is bound
+ * @cqe:       completion queue entry for the last WR
+ * @chain_wr:  WR to append to the posted chain
+ *
+ * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
+ * any memory registration operations needed.  If @chain_wr is non-NULL the
+ * WR it points to will be appended to the chain of WRs posted.  If @chain_wr
+ * is not set @cqe must be set so that the caller gets a completion
+ * notification.
+ */
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+       struct ib_send_wr *first_wr, *bad_wr;
+
+       first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
+       return ib_post_send(qp, first_wr, &bad_wr);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_post);
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx:       context to release
+ * @qp:                queue pair to operate on
+ * @port_num:  port num to which the connection is bound
+ * @sg:                scatterlist that was used for the READ/WRITE
+ * @sg_cnt:    number of entries in @sg
+ * @dir:       %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+{
+       int i;
+
+       switch (ctx->type) {
+       case RDMA_RW_MR:
+               for (i = 0; i < ctx->nr_ops; i++)
+                       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+               kfree(ctx->reg);
+               break;
+       case RDMA_RW_MULTI_WR:
+               kfree(ctx->map.wrs);
+               kfree(ctx->map.sges);
+               break;
+       case RDMA_RW_SINGLE_WR:
+               break;
+       default:
+               BUG();
+               break;
+       }
+
+       ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+
+/**
+ * rdma_rw_ctx_destroy_signature - release all resources allocated by
+ *     rdma_rw_ctx_signature_init
+ * @ctx:       context to release
+ * @qp:                queue pair to operate on
+ * @port_num:  port num to which the connection is bound
+ * @sg:                scatterlist that was used for the READ/WRITE
+ * @sg_cnt:    number of entries in @sg
+ * @prot_sg:   scatterlist that was used for the READ/WRITE of the PI
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @dir:       %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               struct scatterlist *prot_sg, u32 prot_sg_cnt,
+               enum dma_data_direction dir)
+{
+       if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
+               return;
+
+       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+       ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+
+       if (ctx->sig->prot.mr) {
+               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+               ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+       }
+
+       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
+       kfree(ctx->sig);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
+
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
+{
+       u32 factor;
+
+       WARN_ON_ONCE(attr->port_num == 0);
+
+       /*
+        * Each context needs at least one RDMA READ or WRITE WR.
+        *
+        * For some hardware we might need more, eventually we should ask the
+        * HCA driver for a multiplier here.
+        */
+       factor = 1;
+
+       /*
+        * If the device needs MRs to perform RDMA READ or WRITE operations,
+        * we'll need two additional WRs per MR for the registration and the
+        * invalidation.
+        */
+       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+               factor += 6;    /* (inv + reg) * (data + prot + sig) */
+       else if (rdma_rw_can_use_mr(dev, attr->port_num))
+               factor += 2;    /* inv + reg */
+
+       attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
+
+       /*
+        * But maybe we were just too high in the sky and the device doesn't
+        * even support all we need, and we'll have to live with what we get.
+        */
+       attr->cap.max_send_wr =
+               min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
+}
+
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+       struct ib_device *dev = qp->pd->device;
+       u32 nr_mrs = 0, nr_sig_mrs = 0;
+       int ret = 0;
+
+       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+               nr_sig_mrs = attr->cap.max_rdma_ctxs;
+               nr_mrs = attr->cap.max_rdma_ctxs * 2;   /* data + prot */
+       } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
+               nr_mrs = attr->cap.max_rdma_ctxs;
+       }
+
+       if (nr_mrs) {
+               ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
+                               IB_MR_TYPE_MEM_REG,
+                               rdma_rw_fr_page_list_len(dev));
+               if (ret) {
+                       pr_err("%s: failed to allocated %d MRs\n",
+                               __func__, nr_mrs);
+                       return ret;
+               }
+       }
+
+       if (nr_sig_mrs) {
+               ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
+                               IB_MR_TYPE_SIGNATURE, 2);
+               if (ret) {
+                       pr_err("%s: failed to allocated %d SIG MRs\n",
+                               __func__, nr_sig_mrs);
+                       goto out_free_rdma_mrs; /* undo the rdma_mrs pool */
+               }
+       }
+
+       return 0;
+
+out_free_rdma_mrs:
+       ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+       return ret;
+}
+
+void rdma_rw_cleanup_mrs(struct ib_qp *qp)
+{
+       ib_mr_pool_destroy(qp, &qp->sig_mrs);
+       ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c

index 8a09c0fb268d8d89529f0f22249422ee3fe05320..3ebd108bcc5f272165d61e8da10eab916f8c2b41 100644 (file)
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -536,7 +536,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
         data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
                             RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
         if (!data) {
-               kfree_skb(skb);
+               nlmsg_free(skb);
                 return -EMSGSIZE;
         }
  
@@ -1820,7 +1820,7 @@ static int __init ib_sa_init(void)
                 goto err3;
         }
  
-       if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+       if (ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ib_sa_cb_table),
                             ib_sa_cb_table)) {
                 pr_err("Failed to add netlink callback\n");
                 ret = -EINVAL;
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c

index 6fdc7ecdaca0c3df64e471e9d7b4b3f8242f00ff..1a8babb8ee3c4d328af7e08063cef8f147086a10 100644 (file)
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1833,7 +1833,8 @@ static int create_qp(struct ib_uverbs_file *file,
         if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
                                 IB_QP_CREATE_CROSS_CHANNEL |
                                 IB_QP_CREATE_MANAGED_SEND |
-                               IB_QP_CREATE_MANAGED_RECV)) {
+                               IB_QP_CREATE_MANAGED_RECV |
+                               IB_QP_CREATE_SCATTER_FCS)) {
                 ret = -EINVAL;
                 goto err_put;
         }
@@ -3088,8 +3089,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
         if (cmd.comp_mask)
                 return -EINVAL;
  
-       if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
-            !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+       if (!capable(CAP_NET_RAW))
                 return -EPERM;
  
         if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
@@ -3655,6 +3655,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
         resp.hca_core_clock = attr.hca_core_clock;
         resp.response_length += sizeof(resp.hca_core_clock);
  
+       if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
+               goto end;
+
+       resp.device_cap_flags_ex = attr.device_cap_flags;
+       resp.response_length += sizeof(resp.device_cap_flags_ex);
  end:
         err = ib_copy_to_udata(ucore, &resp, resp.response_length);
         return err;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c

index b65b3541e7329e9d716dd40fb81189db97ec9700..1d7d4cf442e3c9646b6824e943e7e649c873d104 100644 (file)
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -48,6 +48,7 @@
  #include <rdma/ib_verbs.h>
  #include <rdma/ib_cache.h>
  #include <rdma/ib_addr.h>
+#include <rdma/rw.h>
  
  #include "core_priv.h"
  
@@ -723,59 +724,89 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
  }
  EXPORT_SYMBOL(ib_open_qp);
  
+/*
+ * Complete the setup of a newly created XRC target QP.
+ *
+ * The QP passed in is turned into the shared "real" QP: it gets the shared
+ * event handler, points at itself as context, has no PD/CQs/SRQ, and takes
+ * a usecnt reference on the XRC domain from @qp_init_attr.  A handle onto
+ * it is then opened with the caller's event handler and context.
+ *
+ * Returns the opened QP handle on success; the real QP is then also
+ * inserted into the XRC domain.  If opening fails, the real QP is
+ * destroyed through the device and the error pointer from __ib_open_qp()
+ * is returned.
+ */
+static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
+               struct ib_qp_init_attr *qp_init_attr)
+{
+       struct ib_xrcd *xrcd = qp_init_attr->xrcd;
+       struct ib_qp *real_qp = qp;
+       struct ib_qp *opened;
+
+       real_qp->event_handler = __ib_shared_qp_event_handler;
+       real_qp->qp_context = real_qp;
+       real_qp->pd = NULL;
+       real_qp->recv_cq = NULL;
+       real_qp->send_cq = NULL;
+       real_qp->srq = NULL;
+       real_qp->xrcd = xrcd;
+       atomic_inc(&xrcd->usecnt);
+       INIT_LIST_HEAD(&real_qp->open_list);
+
+       opened = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+                             qp_init_attr->qp_context);
+       if (IS_ERR(opened)) {
+               /*
+                * NOTE(review): the xrcd usecnt taken above is not dropped
+                * on this path - confirm whether that is intended.
+                */
+               real_qp->device->destroy_qp(real_qp);
+               return opened;
+       }
+
+       __ib_insert_xrcd_qp(xrcd, real_qp);
+       return opened;
+}
+
  struct ib_qp *ib_create_qp(struct ib_pd *pd,
                            struct ib_qp_init_attr *qp_init_attr)
  {
-       struct ib_qp *qp, *real_qp;
-       struct ib_device *device;
+       struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+       struct ib_qp *qp;
+       int ret;
+
+       /*
+        * If the caller is using the RDMA API, calculate the resources
+        * needed for the RDMA READ/WRITE operations.
+        *
+        * Note that these callers need to pass in a port number.
+        */
+       if (qp_init_attr->cap.max_rdma_ctxs)
+               rdma_rw_init_qp(device, qp_init_attr);
  
-       device = pd ? pd->device : qp_init_attr->xrcd->device;
         qp = device->create_qp(pd, qp_init_attr, NULL);
+       if (IS_ERR(qp))
+               return qp;
+
+       qp->device     = device;
+       qp->real_qp    = qp;
+       qp->uobject    = NULL;
+       qp->qp_type    = qp_init_attr->qp_type;
+
+       atomic_set(&qp->usecnt, 0);
+       qp->mrs_used = 0;
+       spin_lock_init(&qp->mr_lock);
+       INIT_LIST_HEAD(&qp->rdma_mrs);
+       INIT_LIST_HEAD(&qp->sig_mrs);
+
+       if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
+               return ib_create_xrc_qp(qp, qp_init_attr);
+
+       qp->event_handler = qp_init_attr->event_handler;
+       qp->qp_context = qp_init_attr->qp_context;
+       if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+               qp->recv_cq = NULL;
+               qp->srq = NULL;
+       } else {
+               qp->recv_cq = qp_init_attr->recv_cq;
+               atomic_inc(&qp_init_attr->recv_cq->usecnt);
+               qp->srq = qp_init_attr->srq;
+               if (qp->srq)
+                       atomic_inc(&qp_init_attr->srq->usecnt);
+       }
  
-       if (!IS_ERR(qp)) {
-               qp->device     = device;
-               qp->real_qp    = qp;
-               qp->uobject    = NULL;
-               qp->qp_type    = qp_init_attr->qp_type;
-
-               atomic_set(&qp->usecnt, 0);
-               if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
-                       qp->event_handler = __ib_shared_qp_event_handler;
-                       qp->qp_context = qp;
-                       qp->pd = NULL;
-                       qp->send_cq = qp->recv_cq = NULL;
-                       qp->srq = NULL;
-                       qp->xrcd = qp_init_attr->xrcd;
-                       atomic_inc(&qp_init_attr->xrcd->usecnt);
-                       INIT_LIST_HEAD(&qp->open_list);
-
-                       real_qp = qp;
-                       qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
-                                         qp_init_attr->qp_context);
-                       if (!IS_ERR(qp))
-                               __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
-                       else
-                               real_qp->device->destroy_qp(real_qp);
-               } else {
-                       qp->event_handler = qp_init_attr->event_handler;
-                       qp->qp_context = qp_init_attr->qp_context;
-                       if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
-                               qp->recv_cq = NULL;
-                               qp->srq = NULL;
-                       } else {
-                               qp->recv_cq = qp_init_attr->recv_cq;
-                               atomic_inc(&qp_init_attr->recv_cq->usecnt);
-                               qp->srq = qp_init_attr->srq;
-                               if (qp->srq)
-                                       atomic_inc(&qp_init_attr->srq->usecnt);
-                       }
+       qp->pd      = pd;
+       qp->send_cq = qp_init_attr->send_cq;
+       qp->xrcd    = NULL;
  
-                       qp->pd      = pd;
-                       qp->send_cq = qp_init_attr->send_cq;
-                       qp->xrcd    = NULL;
+       atomic_inc(&pd->usecnt);
+       atomic_inc(&qp_init_attr->send_cq->usecnt);
  
-                       atomic_inc(&pd->usecnt);
-                       atomic_inc(&qp_init_attr->send_cq->usecnt);
+       if (qp_init_attr->cap.max_rdma_ctxs) {
+               ret = rdma_rw_init_mrs(qp, qp_init_attr);
+               if (ret) {
+                       pr_err("failed to init MR pool ret= %d\n", ret);
+                       ib_destroy_qp(qp);
+                       qp = ERR_PTR(ret);
                 }
         }
  
@@ -1250,6 +1281,8 @@ int ib_destroy_qp(struct ib_qp *qp)
         struct ib_srq *srq;
         int ret;
  
+       WARN_ON_ONCE(qp->mrs_used > 0);
+
         if (atomic_read(&qp->usecnt))
                 return -EBUSY;
  
@@ -1261,6 +1294,9 @@ int ib_destroy_qp(struct ib_qp *qp)
         rcq  = qp->recv_cq;
         srq  = qp->srq;
  
+       if (!qp->uobject)
+               rdma_rw_cleanup_mrs(qp);
+
         ret = qp->device->destroy_qp(qp);
         if (!ret) {
                 if (pd)
@@ -1343,6 +1379,7 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
                 mr->pd      = pd;
                 mr->uobject = NULL;
                 atomic_inc(&pd->usecnt);
+               mr->need_inval = false;
         }
  
         return mr;
@@ -1389,6 +1426,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
                 mr->pd      = pd;
                 mr->uobject = NULL;
                 atomic_inc(&pd->usecnt);
+               mr->need_inval = false;
         }
  
         return mr;
@@ -1597,6 +1635,7 @@ EXPORT_SYMBOL(ib_set_vf_guid);
   * @mr:            memory region
   * @sg:            dma mapped scatterlist
   * @sg_nents:      number of entries in sg
+ * @sg_offset:     offset in bytes into sg
   * @page_size:     page vector desired page size
   *
   * Constraints:
@@ -1615,17 +1654,15 @@ EXPORT_SYMBOL(ib_set_vf_guid);
   * After this completes successfully, the  memory region
   * is ready for registration.
   */
-int ib_map_mr_sg(struct ib_mr *mr,
-                struct scatterlist *sg,
-                int sg_nents,
-                unsigned int page_size)
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+                unsigned int *sg_offset, unsigned int page_size)
  {
         if (unlikely(!mr->device->map_mr_sg))
                 return -ENOSYS;
  
         mr->page_size = page_size;
  
-       return mr->device->map_mr_sg(mr, sg, sg_nents);
+       return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
  }
  EXPORT_SYMBOL(ib_map_mr_sg);
  
@@ -1635,6 +1672,10 @@ EXPORT_SYMBOL(ib_map_mr_sg);
   * @mr:            memory region
   * @sgl:           dma mapped scatterlist
   * @sg_nents:      number of entries in sg
+ * @sg_offset_p:   IN:  start offset in bytes into sg
+ *                 OUT: offset in bytes for element n of the sg of the first
+ *                      byte that has not been processed where n is the return
+ *                      value of this function.
   * @set_page:      driver page assignment function pointer
   *
   * Core service helper for drivers to convert the largest
@@ -1645,23 +1686,26 @@ EXPORT_SYMBOL(ib_map_mr_sg);
   * Returns the number of sg elements that were assigned to
   * a page vector.
   */
-int ib_sg_to_pages(struct ib_mr *mr,
-                  struct scatterlist *sgl,
-                  int sg_nents,
-                  int (*set_page)(struct ib_mr *, u64))
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+               unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
  {
         struct scatterlist *sg;
         u64 last_end_dma_addr = 0;
+       unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
         unsigned int last_page_off = 0;
         u64 page_mask = ~((u64)mr->page_size - 1);
         int i, ret;
  
-       mr->iova = sg_dma_address(&sgl[0]);
+       if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
+               return -EINVAL;
+
+       mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
         mr->length = 0;
  
         for_each_sg(sgl, sg, sg_nents, i) {
-               u64 dma_addr = sg_dma_address(sg);
-               unsigned int dma_len = sg_dma_len(sg);
+               u64 dma_addr = sg_dma_address(sg) + sg_offset;
+               u64 prev_addr = dma_addr;
+               unsigned int dma_len = sg_dma_len(sg) - sg_offset;
                 u64 end_dma_addr = dma_addr + dma_len;
                 u64 page_addr = dma_addr & page_mask;
  
@@ -1685,8 +1729,14 @@ int ib_sg_to_pages(struct ib_mr *mr,
  
                 do {
                         ret = set_page(mr, page_addr);
-                       if (unlikely(ret < 0))
-                               return i ? : ret;
+                       if (unlikely(ret < 0)) {
+                               sg_offset = prev_addr - sg_dma_address(sg);
+                               mr->length += prev_addr - dma_addr;
+                               if (sg_offset_p)
+                                       *sg_offset_p = sg_offset;
+                               return i || sg_offset ? i : ret;
+                       }
+                       prev_addr = page_addr;
  next_page:
                         page_addr += mr->page_size;
                 } while (page_addr < end_dma_addr);
@@ -1694,8 +1744,12 @@ next_page:
                 mr->length += dma_len;
                 last_end_dma_addr = end_dma_addr;
                 last_page_off = end_dma_addr & ~page_mask;
+
+               sg_offset = 0;
         }
  
+       if (sg_offset_p)
+               *sg_offset_p = 0;
         return i;
  }
  EXPORT_SYMBOL(ib_sg_to_pages);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c

index 3234a8be16f6c53e4d155920cb766236b0f74306..47cb927a0dd665bd0dfb0f85508650aa97aaa7b7 100644 (file)
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -783,15 +783,14 @@ static int iwch_set_page(struct ib_mr *ibmr, u64 addr)
         return 0;
  }
  
-static int iwch_map_mr_sg(struct ib_mr *ibmr,
-                         struct scatterlist *sg,
-                         int sg_nents)
+static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                         int sg_nents, unsigned int *sg_offset)
  {
         struct iwch_mr *mhp = to_iwch_mr(ibmr);
  
         mhp->npages = 0;
  
-       return ib_sg_to_pages(ibmr, sg, sg_nents, iwch_set_page);
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page);
  }
  
  static int iwch_destroy_qp(struct ib_qp *ib_qp)
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c

index 651711370d557f1798fdac34e2b57b9f15c1b3cf..a3a67216bce6748423ef0c4b33ed32740d54e29c 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -119,7 +119,7 @@ MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
  static int mpa_rev = 2;
  module_param(mpa_rev, int, 0644);
  MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
-               "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
+               "1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft"
                 " compliant (default=2)");
  
  static int markers_enabled;
@@ -145,19 +145,35 @@ static struct sk_buff_head rxq;
  static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
  static void ep_timeout(unsigned long arg);
  static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb);
  
  static LIST_HEAD(timeout_list);
  static spinlock_t timeout_lock;
  
+/*
+ * Drop the endpoint's reference on its cm_id through the cm_id's rem_ref
+ * callback, clear the pointer, and record CM_ID_DEREFED in the history
+ * bits.
+ */
+static void deref_cm_id(struct c4iw_ep_common *epc)
+{
+       epc->cm_id->rem_ref(epc->cm_id);
+       epc->cm_id = NULL;
+       set_bit(CM_ID_DEREFED, &epc->history);
+}
+
+/*
+ * Record CM_ID_REFED in the history bits, then take a reference on the
+ * endpoint's cm_id through the cm_id's add_ref callback.
+ */
+static void ref_cm_id(struct c4iw_ep_common *epc)
+{
+       set_bit(CM_ID_REFED, &epc->history);
+       epc->cm_id->add_ref(epc->cm_id);
+}
+
  static void deref_qp(struct c4iw_ep *ep)
  {
         c4iw_qp_rem_ref(&ep->com.qp->ibqp);
         clear_bit(QP_REFERENCED, &ep->com.flags);
+       set_bit(QP_DEREFED, &ep->com.history);
  }
  
  static void ref_qp(struct c4iw_ep *ep)
  {
         set_bit(QP_REFERENCED, &ep->com.flags);
+       set_bit(QP_REFED, &ep->com.history);
         c4iw_qp_add_ref(&ep->com.qp->ibqp);
  }
  
@@ -201,6 +217,8 @@ static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
         error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e);
         if (error < 0)
                 kfree_skb(skb);
+       else if (error == NET_XMIT_DROP)
+               return -ENOMEM;
         return error < 0 ? error : 0;
  }
  
@@ -290,12 +308,63 @@ static void *alloc_ep(int size, gfp_t gfp)
         return epc;
  }
  
+/*
+ * Remove ep->hwtid from the device's hwtid_idr while holding dev->lock
+ * with interrupts saved.
+ */
+static void remove_ep_tid(struct c4iw_ep *ep)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ep->com.dev->lock, flags);
+       /* NOTE(review): final 0 presumably means "lock already held" - confirm */
+       _remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0);
+       spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+}
+
+/*
+ * Insert @ep into the device's hwtid_idr under the id ep->hwtid while
+ * holding dev->lock with interrupts saved.
+ */
+static void insert_ep_tid(struct c4iw_ep *ep)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ep->com.dev->lock, flags);
+       /* NOTE(review): final 0 presumably means "lock already held" - confirm */
+       _insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0);
+       spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+}
+
+/*
+ * Atomically look up the ep ptr given the tid and grab a reference on the
+ * ep.  Lookup and c4iw_get_ep() both happen under dev->lock.
+ *
+ * Returns the ep with the extra reference held, or NULL if the tid is not
+ * in hwtid_idr.  The caller is presumably expected to drop the reference
+ * again with c4iw_put_ep() - confirm against the callers.
+ */
+static struct c4iw_ep *get_ep_from_tid(struct c4iw_dev *dev, unsigned int tid)
+{
+       struct c4iw_ep *ep;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       ep = idr_find(&dev->hwtid_idr, tid);
+       if (ep)
+               c4iw_get_ep(&ep->com);
+       spin_unlock_irqrestore(&dev->lock, flags);
+       return ep;
+}
+
+/*
+ * Atomically look up the listening ep ptr given the stid and grab a
+ * reference on the ep.  Lookup and c4iw_get_ep() both happen under
+ * dev->lock.
+ *
+ * Returns the listening ep with the extra reference held, or NULL if the
+ * stid is not in stid_idr.
+ */
+static struct c4iw_listen_ep *get_ep_from_stid(struct c4iw_dev *dev,
+                                              unsigned int stid)
+{
+       struct c4iw_listen_ep *ep;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       ep = idr_find(&dev->stid_idr, stid);
+       if (ep)
+               c4iw_get_ep(&ep->com);
+       spin_unlock_irqrestore(&dev->lock, flags);
+       return ep;
+}
+
  void _c4iw_free_ep(struct kref *kref)
  {
         struct c4iw_ep *ep;
  
         ep = container_of(kref, struct c4iw_ep, com.kref);
-       PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+       PDBG("%s ep %p state %s\n", __func__, ep, states[ep->com.state]);
         if (test_bit(QP_REFERENCED, &ep->com.flags))
                 deref_qp(ep);
         if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
@@ -309,10 +378,11 @@ void _c4iw_free_ep(struct kref *kref)
                                         (const u32 *)&sin6->sin6_addr.s6_addr,
                                         1);
                 }
-               remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
                 cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
                 dst_release(ep->dst);
                 cxgb4_l2t_release(ep->l2t);
+               if (ep->mpa_skb)
+                       kfree_skb(ep->mpa_skb);
         }
         kfree(ep);
  }
@@ -320,6 +390,15 @@ void _c4iw_free_ep(struct kref *kref)
  static void release_ep_resources(struct c4iw_ep *ep)
  {
         set_bit(RELEASE_RESOURCES, &ep->com.flags);
+
+       /*
+        * If we have a hwtid, then remove it from the idr table
+        * so lookups will no longer find this endpoint.  Otherwise
+        * we have a race where one thread finds the ep ptr just
+        * before the other thread is freeing the ep memory.
+        */
+       if (ep->hwtid != -1)
+               remove_ep_tid(ep);
         c4iw_put_ep(&ep->com);
  }
  
@@ -432,10 +511,74 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
  
  static void arp_failure_discard(void *handle, struct sk_buff *skb)
  {
-       PDBG("%s c4iw_dev %p\n", __func__, handle);
+       pr_err(MOD "ARP failure\n");
         kfree_skb(skb);
  }
  
+/*
+ * ARP failure handler installed on MPA reject/reply skbs.  It only logs
+ * the failure; neither @handle nor @skb is touched, so the skb is not
+ * freed here.
+ * NOTE(review): presumably safe because ep->mpa_skb still references the
+ * skb - confirm.
+ */
+static void mpa_start_arp_failure(void *handle, struct sk_buff *skb)
+{
+       pr_err("ARP failure during MPA Negotiation - Closing Connection\n");
+}
+
+/*
+ * Driver-private pseudo CPL opcodes, numbered directly after the real
+ * ones (NUM_CPL_CMDS).  They are written into an skb's CPL header by
+ * queue_arp_failure_cpl().
+ */
+enum {
+       NUM_FAKE_CPLS = 2,
+       FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0,
+       FAKE_CPL_PASS_PUT_EP_SAFE = NUM_CPL_CMDS + 1,
+};
+
+/*
+ * Fetch the ep pointer that queue_arp_failure_cpl() stored in skb->cb
+ * (after the first two pointer-sized slots) and release the ep's
+ * resources.  @dev is unused.  Always returns 0.
+ */
+static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+       struct c4iw_ep *ep;
+
+       ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+       release_ep_resources(ep);
+       return 0;
+}
+
+/*
+ * Like _put_ep_safe(), but first drops a reference on the ep's parent
+ * (listening) endpoint.  The ep pointer is read from skb->cb after the
+ * first two pointer-sized slots.  @dev is unused.  Always returns 0.
+ */
+static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+       struct c4iw_ep *ep;
+
+       ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+       c4iw_put_ep(&ep->parent_ep->com);
+       release_ep_resources(ep);
+       return 0;
+}
+
+/*
+ * Fake up a special CPL opcode and call sched() so process_work() will call
+ * _put_ep_safe() in a safe context to free the ep resources.  This is needed
+ * because ARP error handlers are called in an ATOMIC context, and
+ * _c4iw_free_ep() needs to block.
+ *
+ * @ep:  endpoint whose release is being deferred
+ * @skb: skb reused as the carrier; its CPL opcode and cb area are
+ *       overwritten
+ * @cpl: fake opcode to store, e.g. FAKE_CPL_PUT_EP_SAFE or
+ *       FAKE_CPL_PASS_PUT_EP_SAFE
+ */
+static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb,
+                                 int cpl)
+{
+       struct cpl_act_establish *rpl = cplhdr(skb);
+
+       /* Set our special ARP_FAILURE opcode */
+       rpl->ot.opcode = cpl;
+
+       /*
+        * Save ep in the skb->cb area, after where sched() will save the dev
+        * ptr.
+        */
+       *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep;
+       sched(ep->com.dev, skb);
+}
+
+/*
+ * Handle an ARP failure for an accept.
+ *
+ * @handle is the c4iw_ep of the connection.  The failure is logged with
+ * the ep's hwtid, the ep is marked DEAD, and the skb is reused to queue
+ * FAKE_CPL_PASS_PUT_EP_SAFE via queue_arp_failure_cpl().
+ */
+static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb)
+{
+       struct c4iw_ep *ep = handle;
+
+       pr_err(MOD "ARP failure during accept - tid %u -dropping connection\n",
+              ep->hwtid);
+
+       __state_set(&ep->com, DEAD);
+       queue_arp_failure_cpl(ep, skb, FAKE_CPL_PASS_PUT_EP_SAFE);
+}
+
  /*
   * Handle an ARP failure for an active open.
   */
@@ -444,9 +587,8 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
         struct c4iw_ep *ep = handle;
  
         printk(KERN_ERR MOD "ARP failure during connect\n");
-       kfree_skb(skb);
         connect_reply_upcall(ep, -EHOSTUNREACH);
-       state_set(&ep->com, DEAD);
+       __state_set(&ep->com, DEAD);
         if (ep->com.remote_addr.ss_family == AF_INET6) {
                 struct sockaddr_in6 *sin6 =
                         (struct sockaddr_in6 *)&ep->com.local_addr;
@@ -455,9 +597,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
         }
         remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
         cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
-       dst_release(ep->dst);
-       cxgb4_l2t_release(ep->l2t);
-       c4iw_put_ep(&ep->com);
+       queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
  }
  
  /*
@@ -466,15 +606,21 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
   */
  static void abort_arp_failure(void *handle, struct sk_buff *skb)
  {
-       struct c4iw_rdev *rdev = handle;
+       int ret;
+       struct c4iw_ep *ep = handle;
+       struct c4iw_rdev *rdev = &ep->com.dev->rdev;
         struct cpl_abort_req *req = cplhdr(skb);
  
         PDBG("%s rdev %p\n", __func__, rdev);
         req->cmd = CPL_ABORT_NO_RST;
-       c4iw_ofld_send(rdev, skb);
+       ret = c4iw_ofld_send(rdev, skb);
+       if (ret) {
+               __state_set(&ep->com, DEAD);
+               queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
+       }
  }
  
-static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
  {
         unsigned int flowclen = 80;
         struct fw_flowc_wr *flowc;
@@ -530,7 +676,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
         }
  
         set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-       c4iw_ofld_send(&ep->com.dev->rdev, skb);
+       return c4iw_ofld_send(&ep->com.dev->rdev, skb);
  }
  
  static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
@@ -568,7 +714,7 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
                 return -ENOMEM;
         }
         set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-       t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure);
+       t4_set_arp_err_handler(skb, ep, abort_arp_failure);
         req = (struct cpl_abort_req *) skb_put(skb, wrlen);
         memset(req, 0, wrlen);
         INIT_TP_WR(req, ep->hwtid);
@@ -807,10 +953,10 @@ clip_release:
         return ret;
  }
  
-static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
-               u8 mpa_rev_to_use)
+static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
+                       u8 mpa_rev_to_use)
  {
-       int mpalen, wrlen;
+       int mpalen, wrlen, ret;
         struct fw_ofld_tx_data_wr *req;
         struct mpa_message *mpa;
         struct mpa_v2_conn_params mpa_v2_params;
@@ -826,7 +972,7 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
         skb = get_skb(skb, wrlen, GFP_KERNEL);
         if (!skb) {
                 connect_reply_upcall(ep, -ENOMEM);
-               return;
+               return -ENOMEM;
         }
         set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
  
@@ -894,12 +1040,14 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
         t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
         BUG_ON(ep->mpa_skb);
         ep->mpa_skb = skb;
-       c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       if (ret)
+               return ret;
         start_ep_timer(ep);
         __state_set(&ep->com, MPA_REQ_SENT);
         ep->mpa_attr.initiator = 1;
         ep->snd_seq += mpalen;
-       return;
+       return ret;
  }
  
  static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
@@ -975,7 +1123,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
          */
         skb_get(skb);
         set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
-       t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+       t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure);
         BUG_ON(ep->mpa_skb);
         ep->mpa_skb = skb;
         ep->snd_seq += mpalen;
@@ -1060,7 +1208,7 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
          * Function fw4_ack() will deref it.
          */
         skb_get(skb);
-       t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+       t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure);
         ep->mpa_skb = skb;
         __state_set(&ep->com, MPA_REP_SENT);
         ep->snd_seq += mpalen;
@@ -1074,6 +1222,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
         unsigned int tid = GET_TID(req);
         unsigned int atid = TID_TID_G(ntohl(req->tos_atid));
         struct tid_info *t = dev->rdev.lldi.tids;
+       int ret;
  
         ep = lookup_atid(t, atid);
  
@@ -1086,7 +1235,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
         /* setup the hwtid for this connection */
         ep->hwtid = tid;
         cxgb4_insert_tid(t, ep, tid);
-       insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
+       insert_ep_tid(ep);
  
         ep->snd_seq = be32_to_cpu(req->snd_isn);
         ep->rcv_seq = be32_to_cpu(req->rcv_isn);
@@ -1099,13 +1248,22 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
         set_bit(ACT_ESTAB, &ep->com.history);
  
         /* start MPA negotiation */
-       send_flowc(ep, NULL);
+       ret = send_flowc(ep, NULL);
+       if (ret)
+               goto err;
         if (ep->retry_with_mpa_v1)
-               send_mpa_req(ep, skb, 1);
+               ret = send_mpa_req(ep, skb, 1);
         else
-               send_mpa_req(ep, skb, mpa_rev);
+               ret = send_mpa_req(ep, skb, mpa_rev);
+       if (ret)
+               goto err;
         mutex_unlock(&ep->com.mutex);
         return 0;
+err:
+       mutex_unlock(&ep->com.mutex);
+       connect_reply_upcall(ep, -ENOMEM);
+       c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+       return 0;
  }
  
  static void close_complete_upcall(struct c4iw_ep *ep, int status)
@@ -1120,20 +1278,11 @@ static void close_complete_upcall(struct c4iw_ep *ep, int status)
                 PDBG("close complete delivered ep %p cm_id %p tid %u\n",
                      ep, ep->com.cm_id, ep->hwtid);
                 ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-               ep->com.cm_id->rem_ref(ep->com.cm_id);
-               ep->com.cm_id = NULL;
+               deref_cm_id(&ep->com);
                 set_bit(CLOSE_UPCALL, &ep->com.history);
         }
  }
  
-static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
-{
-       PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-       __state_set(&ep->com, ABORTING);
-       set_bit(ABORT_CONN, &ep->com.history);
-       return send_abort(ep, skb, gfp);
-}
-
  static void peer_close_upcall(struct c4iw_ep *ep)
  {
         struct iw_cm_event event;
@@ -1161,8 +1310,7 @@ static void peer_abort_upcall(struct c4iw_ep *ep)
                 PDBG("abort delivered ep %p cm_id %p tid %u\n", ep,
                      ep->com.cm_id, ep->hwtid);
                 ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-               ep->com.cm_id->rem_ref(ep->com.cm_id);
-               ep->com.cm_id = NULL;
+               deref_cm_id(&ep->com);
                 set_bit(ABORT_UPCALL, &ep->com.history);
         }
  }
@@ -1205,10 +1353,8 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status)
         set_bit(CONN_RPL_UPCALL, &ep->com.history);
         ep->com.cm_id->event_handler(ep->com.cm_id, &event);
  
-       if (status < 0) {
-               ep->com.cm_id->rem_ref(ep->com.cm_id);
-               ep->com.cm_id = NULL;
-       }
+       if (status < 0)
+               deref_cm_id(&ep->com);
  }
  
  static int connect_request_upcall(struct c4iw_ep *ep)
@@ -1301,6 +1447,18 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits)
  
  #define RELAXED_IRD_NEGOTIATION 1
  
+/*
+ * process_mpa_reply - process streaming mode MPA reply
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
  static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
  {
         struct mpa_message *mpa;
@@ -1315,21 +1473,13 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
  
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
  
-       /*
-        * Stop mpa timer.  If it expired, then
-        * we ignore the MPA reply.  process_timeout()
-        * will abort the connection.
-        */
-       if (stop_ep_timer(ep))
-               return 0;
-
         /*
          * If we get more than the supported amount of private data
          * then we must fail this connection.
          */
         if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
                 err = -EINVAL;
-               goto err;
+               goto err_stop_timer;
         }
  
         /*
@@ -1351,11 +1501,11 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
                 printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
                        " Received = %d\n", __func__, mpa_rev, mpa->revision);
                 err = -EPROTO;
-               goto err;
+               goto err_stop_timer;
         }
         if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
                 err = -EPROTO;
-               goto err;
+               goto err_stop_timer;
         }
  
         plen = ntohs(mpa->private_data_size);
@@ -1365,7 +1515,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
          */
         if (plen > MPA_MAX_PRIVATE_DATA) {
                 err = -EPROTO;
-               goto err;
+               goto err_stop_timer;
         }
  
         /*
@@ -1373,7 +1523,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
          */
         if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
                 err = -EPROTO;
-               goto err;
+               goto err_stop_timer;
         }
  
         ep->plen = (u8) plen;
@@ -1387,9 +1537,17 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
  
         if (mpa->flags & MPA_REJECT) {
                 err = -ECONNREFUSED;
-               goto err;
+               goto err_stop_timer;
         }
  
+       /*
+        * Stop mpa timer.  If it expired, then
+        * we ignore the MPA reply.  process_timeout()
+        * will abort the connection.
+        */
+       if (stop_ep_timer(ep))
+               return 0;
+
         /*
          * If we get here we have accumulated the entire mpa
          * start reply message including private data. And
@@ -1529,15 +1687,28 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
                 goto out;
         }
         goto out;
+err_stop_timer:
+       stop_ep_timer(ep);
  err:
-       __state_set(&ep->com, ABORTING);
-       send_abort(ep, skb, GFP_KERNEL);
+       disconnect = 2;
  out:
         connect_reply_upcall(ep, err);
         return disconnect;
  }
  
-static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+/*
+ * process_mpa_request - process streaming mode MPA request
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
+static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
  {
         struct mpa_message *mpa;
         struct mpa_v2_conn_params *mpa_v2_params;
@@ -1549,11 +1720,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
          * If we get more than the supported amount of private data
          * then we must fail this connection.
          */
-       if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-               (void)stop_ep_timer(ep);
-               abort_connection(ep, skb, GFP_KERNEL);
-               return;
-       }
+       if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt))
+               goto err_stop_timer;
  
         PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
  
@@ -1569,7 +1737,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
          * We'll continue process when more data arrives.
          */
         if (ep->mpa_pkt_len < sizeof(*mpa))
-               return;
+               return 0;
  
         PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
         mpa = (struct mpa_message *) ep->mpa_pkt;
@@ -1580,43 +1748,32 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
         if (mpa->revision > mpa_rev) {
                 printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
                        " Received = %d\n", __func__, mpa_rev, mpa->revision);
-               (void)stop_ep_timer(ep);
-               abort_connection(ep, skb, GFP_KERNEL);
-               return;
+               goto err_stop_timer;
         }
  
-       if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
-               (void)stop_ep_timer(ep);
-               abort_connection(ep, skb, GFP_KERNEL);
-               return;
-       }
+       if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)))
+               goto err_stop_timer;
  
         plen = ntohs(mpa->private_data_size);
  
         /*
          * Fail if there's too much private data.
          */
-       if (plen > MPA_MAX_PRIVATE_DATA) {
-               (void)stop_ep_timer(ep);
-               abort_connection(ep, skb, GFP_KERNEL);
-               return;
-       }
+       if (plen > MPA_MAX_PRIVATE_DATA)
+               goto err_stop_timer;
  
         /*
          * If plen does not account for pkt size
          */
-       if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-               (void)stop_ep_timer(ep);
-               abort_connection(ep, skb, GFP_KERNEL);
-               return;
-       }
+       if (ep->mpa_pkt_len > (sizeof(*mpa) + plen))
+               goto err_stop_timer;
         ep->plen = (u8) plen;
  
         /*
          * If we don't have all the pdata yet, then bail.
          */
         if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
-               return;
+               return 0;
  
         /*
          * If we get here we have accumulated the entire mpa
@@ -1665,26 +1822,26 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
              ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
              ep->mpa_attr.p2p_type);
  
-       /*
-        * If the endpoint timer already expired, then we ignore
-        * the start request.  process_timeout() will abort
-        * the connection.
-        */
-       if (!stop_ep_timer(ep)) {
-               __state_set(&ep->com, MPA_REQ_RCVD);
-
-               /* drive upcall */
-               mutex_lock_nested(&ep->parent_ep->com.mutex,
-                                 SINGLE_DEPTH_NESTING);
-               if (ep->parent_ep->com.state != DEAD) {
-                       if (connect_request_upcall(ep))
-                               abort_connection(ep, skb, GFP_KERNEL);
-               } else {
-                       abort_connection(ep, skb, GFP_KERNEL);
-               }
-               mutex_unlock(&ep->parent_ep->com.mutex);
+       __state_set(&ep->com, MPA_REQ_RCVD);
+
+       /* drive upcall */
+       mutex_lock_nested(&ep->parent_ep->com.mutex, SINGLE_DEPTH_NESTING);
+       if (ep->parent_ep->com.state != DEAD) {
+               if (connect_request_upcall(ep))
+                       goto err_unlock_parent;
+       } else {
+               goto err_unlock_parent;
         }
-       return;
+       mutex_unlock(&ep->parent_ep->com.mutex);
+       return 0;
+
+err_unlock_parent:
+       mutex_unlock(&ep->parent_ep->com.mutex);
+       goto err_out;
+err_stop_timer:
+       (void)stop_ep_timer(ep);
+err_out:
+       return 2;
  }
  
  static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
@@ -1693,11 +1850,10 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
         struct cpl_rx_data *hdr = cplhdr(skb);
         unsigned int dlen = ntohs(hdr->len);
         unsigned int tid = GET_TID(hdr);
-       struct tid_info *t = dev->rdev.lldi.tids;
         __u8 status = hdr->status;
         int disconnect = 0;
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
         if (!ep)
                 return 0;
         PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
@@ -1715,7 +1871,7 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
                 break;
         case MPA_REQ_WAIT:
                 ep->rcv_seq += dlen;
-               process_mpa_request(ep, skb);
+               disconnect = process_mpa_request(ep, skb);
                 break;
         case FPDU_MODE: {
                 struct c4iw_qp_attributes attrs;
@@ -1736,7 +1892,8 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
         }
         mutex_unlock(&ep->com.mutex);
         if (disconnect)
-               c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+               c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
@@ -1746,9 +1903,8 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
         struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
         int release = 0;
         unsigned int tid = GET_TID(rpl);
-       struct tid_info *t = dev->rdev.lldi.tids;
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
         if (!ep) {
                 printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n");
                 return 0;
@@ -1770,10 +1926,11 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
  
         if (release)
                 release_ep_resources(ep);
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
-static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
+static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
  {
         struct sk_buff *skb;
         struct fw_ofld_connection_wr *req;
@@ -1843,7 +2000,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
         req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2);
         set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
         set_bit(ACT_OFLD_CONN, &ep->com.history);
-       c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
  }
  
  /*
@@ -1986,6 +2143,7 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
  
         PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
         init_timer(&ep->timer);
+       c4iw_init_wr_wait(&ep->com.wr_wait);
  
         /*
          * Allocate an active TID to initiate a TCP connection.
@@ -2069,6 +2227,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
         struct sockaddr_in *ra;
         struct sockaddr_in6 *la6;
         struct sockaddr_in6 *ra6;
+       int ret = 0;
  
         ep = lookup_atid(t, atid);
         la = (struct sockaddr_in *)&ep->com.local_addr;
@@ -2104,9 +2263,10 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
                 mutex_unlock(&dev->rdev.stats.lock);
                 if (ep->com.local_addr.ss_family == AF_INET &&
                     dev->rdev.lldi.enable_fw_ofld_conn) {
-                       send_fw_act_open_req(ep,
-                                            TID_TID_G(AOPEN_ATID_G(
-                                            ntohl(rpl->atid_status))));
+                       ret = send_fw_act_open_req(ep, TID_TID_G(AOPEN_ATID_G(
+                                                  ntohl(rpl->atid_status))));
+                       if (ret)
+                               goto fail;
                         return 0;
                 }
                 break;
@@ -2146,6 +2306,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
                 break;
         }
  
+fail:
         connect_reply_upcall(ep, status2errno(status));
         state_set(&ep->com, DEAD);
  
@@ -2170,9 +2331,8 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
  static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
  {
         struct cpl_pass_open_rpl *rpl = cplhdr(skb);
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int stid = GET_TID(rpl);
-       struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+       struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid);
  
         if (!ep) {
                 PDBG("%s stid %d lookup failure!\n", __func__, stid);
@@ -2181,7 +2341,7 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
         PDBG("%s ep %p status %d error %d\n", __func__, ep,
              rpl->status, status2errno(rpl->status));
         c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
-
+       c4iw_put_ep(&ep->com);
  out:
         return 0;
  }
@@ -2189,17 +2349,17 @@ out:
  static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
  {
         struct cpl_close_listsvr_rpl *rpl = cplhdr(skb);
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int stid = GET_TID(rpl);
-       struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+       struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid);
  
         PDBG("%s ep %p\n", __func__, ep);
         c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
-static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
-                     struct cpl_pass_accept_req *req)
+static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
+                    struct cpl_pass_accept_req *req)
  {
         struct cpl_pass_accept_rpl *rpl;
         unsigned int mtu_idx;
@@ -2287,10 +2447,9 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
         rpl->opt0 = cpu_to_be64(opt0);
         rpl->opt2 = cpu_to_be32(opt2);
         set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
-       t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
-       c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure);
  
-       return;
+       return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
  }
  
  static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb)
@@ -2355,7 +2514,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
         unsigned short hdrs;
         u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
  
-       parent_ep = lookup_stid(t, stid);
+       parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
         if (!parent_ep) {
                 PDBG("%s connect request on invalid stid %d\n", __func__, stid);
                 goto reject;
@@ -2468,9 +2627,13 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
  
         init_timer(&child_ep->timer);
         cxgb4_insert_tid(t, child_ep, hwtid);
-       insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
-       accept_cr(child_ep, skb, req);
-       set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+       insert_ep_tid(child_ep);
+       if (accept_cr(child_ep, skb, req)) {
+               c4iw_put_ep(&parent_ep->com);
+               release_ep_resources(child_ep);
+       } else {
+               set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+       }
         if (iptype == 6) {
                 sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr;
                 cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0],
@@ -2479,6 +2642,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
         goto out;
  reject:
         reject_cr(dev, hwtid, skb);
+       if (parent_ep)
+               c4iw_put_ep(&parent_ep->com);
  out:
         return 0;
  }
@@ -2487,10 +2652,10 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
  {
         struct c4iw_ep *ep;
         struct cpl_pass_establish *req = cplhdr(skb);
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int tid = GET_TID(req);
+       int ret;
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
         ep->snd_seq = be32_to_cpu(req->snd_isn);
         ep->rcv_seq = be32_to_cpu(req->rcv_isn);
@@ -2501,10 +2666,15 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
         set_emss(ep, ntohs(req->tcp_opt));
  
         dst_confirm(ep->dst);
-       state_set(&ep->com, MPA_REQ_WAIT);
+       mutex_lock(&ep->com.mutex);
+       ep->com.state = MPA_REQ_WAIT;
         start_ep_timer(ep);
-       send_flowc(ep, skb);
         set_bit(PASS_ESTAB, &ep->com.history);
+       ret = send_flowc(ep, skb);
+       mutex_unlock(&ep->com.mutex);
+       if (ret)
+               c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
+       c4iw_put_ep(&ep->com);
  
         return 0;
  }
@@ -2516,11 +2686,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
         struct c4iw_qp_attributes attrs;
         int disconnect = 1;
         int release = 0;
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int tid = GET_TID(hdr);
         int ret;
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
+       if (!ep)
+               return 0;
+
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
         dst_confirm(ep->dst);
  
@@ -2592,6 +2764,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
                 c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
         if (release)
                 release_ep_resources(ep);
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
@@ -2604,10 +2777,12 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
         struct c4iw_qp_attributes attrs;
         int ret;
         int release = 0;
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int tid = GET_TID(req);
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
+       if (!ep)
+               return 0;
+
         if (is_neg_adv(req->status)) {
                 PDBG("%s Negative advice on abort- tid %u status %d (%s)\n",
                      __func__, ep->hwtid, req->status,
@@ -2616,7 +2791,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
                 mutex_lock(&dev->rdev.stats.lock);
                 dev->rdev.stats.neg_adv++;
                 mutex_unlock(&dev->rdev.stats.lock);
-               return 0;
+               goto deref_ep;
         }
         PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
              ep->com.state);
@@ -2633,6 +2808,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
         mutex_lock(&ep->com.mutex);
         switch (ep->com.state) {
         case CONNECTING:
+               c4iw_put_ep(&ep->parent_ep->com);
                 break;
         case MPA_REQ_WAIT:
                 (void)stop_ep_timer(ep);
@@ -2681,7 +2857,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
         case DEAD:
                 PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
                 mutex_unlock(&ep->com.mutex);
-               return 0;
+               goto deref_ep;
         default:
                 BUG_ON(1);
                 break;
@@ -2728,6 +2904,10 @@ out:
                 c4iw_reconnect(ep);
         }
  
+deref_ep:
+       c4iw_put_ep(&ep->com);
+       /* Drop the reference taken in peer_abort_intr() */
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
@@ -2737,16 +2917,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
         struct c4iw_qp_attributes attrs;
         struct cpl_close_con_rpl *rpl = cplhdr(skb);
         int release = 0;
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int tid = GET_TID(rpl);
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
+       if (!ep)
+               return 0;
  
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
         BUG_ON(!ep);
  
         /* The cm_id may be null if we failed to connect */
         mutex_lock(&ep->com.mutex);
+       set_bit(CLOSE_CON_RPL, &ep->com.history);
         switch (ep->com.state) {
         case CLOSING:
                 __state_set(&ep->com, MORIBUND);
@@ -2774,18 +2956,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
         mutex_unlock(&ep->com.mutex);
         if (release)
                 release_ep_resources(ep);
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
  static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
  {
         struct cpl_rdma_terminate *rpl = cplhdr(skb);
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int tid = GET_TID(rpl);
         struct c4iw_ep *ep;
         struct c4iw_qp_attributes attrs;
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
         BUG_ON(!ep);
  
         if (ep && ep->com.qp) {
@@ -2796,6 +2978,7 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
                                C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
         } else
                 printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid);
+       c4iw_put_ep(&ep->com);
  
         return 0;
  }
@@ -2811,15 +2994,16 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
         struct cpl_fw4_ack *hdr = cplhdr(skb);
         u8 credits = hdr->credits;
         unsigned int tid = GET_TID(hdr);
-       struct tid_info *t = dev->rdev.lldi.tids;
  
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
+       if (!ep)
+               return 0;
         PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
         if (credits == 0) {
                 PDBG("%s 0 credit ack ep %p tid %u state %u\n",
                      __func__, ep, ep->hwtid, state_read(&ep->com));
-               return 0;
+               goto out;
         }
  
         dst_confirm(ep->dst);
@@ -2829,7 +3013,13 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
                      state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
                 kfree_skb(ep->mpa_skb);
                 ep->mpa_skb = NULL;
+               mutex_lock(&ep->com.mutex);
+               if (test_bit(STOP_MPA_TIMER, &ep->com.flags))
+                       stop_ep_timer(ep);
+               mutex_unlock(&ep->com.mutex);
         }
+out:
+       c4iw_put_ep(&ep->com);
         return 0;
  }
  
@@ -2841,22 +3031,23 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
  
         mutex_lock(&ep->com.mutex);
-       if (ep->com.state == DEAD) {
+       if (ep->com.state != MPA_REQ_RCVD) {
                 mutex_unlock(&ep->com.mutex);
                 c4iw_put_ep(&ep->com);
                 return -ECONNRESET;
         }
         set_bit(ULP_REJECT, &ep->com.history);
-       BUG_ON(ep->com.state != MPA_REQ_RCVD);
         if (mpa_rev == 0)
-               abort_connection(ep, NULL, GFP_KERNEL);
+               disconnect = 2;
         else {
                 err = send_mpa_reject(ep, pdata, pdata_len);
                 disconnect = 1;
         }
         mutex_unlock(&ep->com.mutex);
-       if (disconnect)
-               err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+       if (disconnect) {
+               stop_ep_timer(ep);
+               err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
+       }
         c4iw_put_ep(&ep->com);
         return 0;
  }
@@ -2869,24 +3060,23 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         struct c4iw_ep *ep = to_ep(cm_id);
         struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
         struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
+       int abort = 0;
  
         PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
  
         mutex_lock(&ep->com.mutex);
-       if (ep->com.state == DEAD) {
+       if (ep->com.state != MPA_REQ_RCVD) {
                 err = -ECONNRESET;
-               goto err;
+               goto err_out;
         }
  
-       BUG_ON(ep->com.state != MPA_REQ_RCVD);
         BUG_ON(!qp);
  
         set_bit(ULP_ACCEPT, &ep->com.history);
         if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) ||
             (conn_param->ird > cur_max_read_depth(ep->com.dev))) {
-               abort_connection(ep, NULL, GFP_KERNEL);
                 err = -EINVAL;
-               goto err;
+               goto err_abort;
         }
  
         if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
@@ -2898,9 +3088,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                                 ep->ord = conn_param->ord;
                                 send_mpa_reject(ep, conn_param->private_data,
                                                 conn_param->private_data_len);
-                               abort_connection(ep, NULL, GFP_KERNEL);
                                 err = -ENOMEM;
-                               goto err;
+                               goto err_abort;
                         }
                 }
                 if (conn_param->ird < ep->ord) {
@@ -2908,9 +3097,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                             ep->ord <= h->rdev.lldi.max_ordird_qp) {
                                 conn_param->ird = ep->ord;
                         } else {
-                               abort_connection(ep, NULL, GFP_KERNEL);
                                 err = -ENOMEM;
-                               goto err;
+                               goto err_abort;
                         }
                 }
         }
@@ -2929,8 +3117,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
  
         PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
  
-       cm_id->add_ref(cm_id);
         ep->com.cm_id = cm_id;
+       ref_cm_id(&ep->com);
         ep->com.qp = qp;
         ref_qp(ep);
  
@@ -2951,23 +3139,27 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         err = c4iw_modify_qp(ep->com.qp->rhp,
                              ep->com.qp, mask, &attrs, 1);
         if (err)
-               goto err1;
+               goto err_deref_cm_id;
+
+       set_bit(STOP_MPA_TIMER, &ep->com.flags);
         err = send_mpa_reply(ep, conn_param->private_data,
                              conn_param->private_data_len);
         if (err)
-               goto err1;
+               goto err_deref_cm_id;
  
         __state_set(&ep->com, FPDU_MODE);
         established_upcall(ep);
         mutex_unlock(&ep->com.mutex);
         c4iw_put_ep(&ep->com);
         return 0;
-err1:
-       ep->com.cm_id = NULL;
-       abort_connection(ep, NULL, GFP_KERNEL);
-       cm_id->rem_ref(cm_id);
-err:
+err_deref_cm_id:
+       deref_cm_id(&ep->com);
+err_abort:
+       abort = 1;
+err_out:
         mutex_unlock(&ep->com.mutex);
+       if (abort)
+               c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
         c4iw_put_ep(&ep->com);
         return err;
  }
@@ -3067,9 +3259,9 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         if (peer2peer && ep->ord == 0)
                 ep->ord = 1;
  
-       cm_id->add_ref(cm_id);
-       ep->com.dev = dev;
         ep->com.cm_id = cm_id;
+       ref_cm_id(&ep->com);
+       ep->com.dev = dev;
         ep->com.qp = get_qhp(dev, conn_param->qpn);
         if (!ep->com.qp) {
                 PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);
@@ -3108,7 +3300,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                 /*
                  * Handle loopback requests to INADDR_ANY.
                  */
-               if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) {
+               if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) {
                         err = pick_local_ipaddrs(dev, cm_id);
                         if (err)
                                 goto fail1;
@@ -3176,7 +3368,7 @@ fail2:
         remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
         cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
  fail1:
-       cm_id->rem_ref(cm_id);
+       deref_cm_id(&ep->com);
         c4iw_put_ep(&ep->com);
  out:
         return err;
@@ -3270,8 +3462,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
                 goto fail1;
         }
         PDBG("%s ep %p\n", __func__, ep);
-       cm_id->add_ref(cm_id);
         ep->com.cm_id = cm_id;
+       ref_cm_id(&ep->com);
         ep->com.dev = dev;
         ep->backlog = backlog;
         memcpy(&ep->com.local_addr, &cm_id->m_local_addr,
@@ -3311,7 +3503,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
         cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
                         ep->com.local_addr.ss_family);
  fail2:
-       cm_id->rem_ref(cm_id);
+       deref_cm_id(&ep->com);
         c4iw_put_ep(&ep->com);
  fail1:
  out:
@@ -3350,7 +3542,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
         cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
                         ep->com.local_addr.ss_family);
  done:
-       cm_id->rem_ref(cm_id);
+       deref_cm_id(&ep->com);
         c4iw_put_ep(&ep->com);
         return err;
  }
@@ -3367,6 +3559,12 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
         PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
              states[ep->com.state], abrupt);
  
+       /*
+        * Ref the ep here in case we have fatal errors causing the
+        * ep to be released and freed.
+        */
+       c4iw_get_ep(&ep->com);
+
         rdev = &ep->com.dev->rdev;
         if (c4iw_fatal_error(rdev)) {
                 fatal = 1;
@@ -3418,10 +3616,30 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
                         set_bit(EP_DISC_CLOSE, &ep->com.history);
                         ret = send_halfclose(ep, gfp);
                 }
-               if (ret)
+               if (ret) {
+                       set_bit(EP_DISC_FAIL, &ep->com.history);
+                       if (!abrupt) {
+                               stop_ep_timer(ep);
+                               close_complete_upcall(ep, -EIO);
+                       }
+                       if (ep->com.qp) {
+                               struct c4iw_qp_attributes attrs;
+
+                               attrs.next_state = C4IW_QP_STATE_ERROR;
+                               ret = c4iw_modify_qp(ep->com.qp->rhp,
+                                                    ep->com.qp,
+                                                    C4IW_QP_ATTR_NEXT_STATE,
+                                                    &attrs, 1);
+                               if (ret)
+                                       pr_err(MOD
+                                              "%s - qp <- error failed!\n",
+                                              __func__);
+                       }
                         fatal = 1;
+               }
         }
         mutex_unlock(&ep->com.mutex);
+       c4iw_put_ep(&ep->com);
         if (fatal)
                 release_ep_resources(ep);
         return ret;
@@ -3676,7 +3894,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
         struct cpl_pass_accept_req *req = (void *)(rss + 1);
         struct l2t_entry *e;
         struct dst_entry *dst;
-       struct c4iw_ep *lep;
+       struct c4iw_ep *lep = NULL;
         u16 window;
         struct port_info *pi;
         struct net_device *pdev;
@@ -3701,7 +3919,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
          */
         stid = (__force int) cpu_to_be32((__force u32) rss->hash_val);
  
-       lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid);
+       lep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
         if (!lep) {
                 PDBG("%s connect request on invalid stid %d\n", __func__, stid);
                 goto reject;
@@ -3802,6 +4020,8 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
  free_dst:
         dst_release(dst);
  reject:
+       if (lep)
+               c4iw_put_ep(&lep->com);
         return 0;
  }
  
@@ -3809,7 +4029,7 @@ reject:
   * These are the real handlers that are called from a
   * work queue.
   */
-static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = {
         [CPL_ACT_ESTABLISH] = act_establish,
         [CPL_ACT_OPEN_RPL] = act_open_rpl,
         [CPL_RX_DATA] = rx_data,
@@ -3825,7 +4045,9 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
         [CPL_RDMA_TERMINATE] = terminate,
         [CPL_FW4_ACK] = fw4_ack,
         [CPL_FW6_MSG] = deferred_fw6_msg,
-       [CPL_RX_PKT] = rx_pkt
+       [CPL_RX_PKT] = rx_pkt,
+       [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe,
+       [FAKE_CPL_PASS_PUT_EP_SAFE] = _put_pass_ep_safe
  };
  
  static void process_timeout(struct c4iw_ep *ep)
@@ -3839,11 +4061,12 @@ static void process_timeout(struct c4iw_ep *ep)
         set_bit(TIMEDOUT, &ep->com.history);
         switch (ep->com.state) {
         case MPA_REQ_SENT:
-               __state_set(&ep->com, ABORTING);
                 connect_reply_upcall(ep, -ETIMEDOUT);
                 break;
         case MPA_REQ_WAIT:
-               __state_set(&ep->com, ABORTING);
+       case MPA_REQ_RCVD:
+       case MPA_REP_SENT:
+       case FPDU_MODE:
                 break;
         case CLOSING:
         case MORIBUND:
@@ -3853,7 +4076,6 @@ static void process_timeout(struct c4iw_ep *ep)
                                      ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
                                      &attrs, 1);
                 }
-               __state_set(&ep->com, ABORTING);
                 close_complete_upcall(ep, -ETIMEDOUT);
                 break;
         case ABORTING:
@@ -3871,9 +4093,9 @@ static void process_timeout(struct c4iw_ep *ep)
                         __func__, ep, ep->hwtid, ep->com.state);
                 abort = 0;
         }
-       if (abort)
-               abort_connection(ep, NULL, GFP_KERNEL);
         mutex_unlock(&ep->com.mutex);
+       if (abort)
+               c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
         c4iw_put_ep(&ep->com);
  }
  
@@ -4006,10 +4228,10 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
  {
         struct cpl_abort_req_rss *req = cplhdr(skb);
         struct c4iw_ep *ep;
-       struct tid_info *t = dev->rdev.lldi.tids;
         unsigned int tid = GET_TID(req);
  
-       ep = lookup_tid(t, tid);
+       ep = get_ep_from_tid(dev, tid);
+       /* This EP will be dereferenced in peer_abort() */
         if (!ep) {
                 printk(KERN_WARNING MOD
                        "Abort on non-existent endpoint, tid %d\n", tid);
@@ -4020,24 +4242,13 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
                 PDBG("%s Negative advice on abort- tid %u status %d (%s)\n",
                      __func__, ep->hwtid, req->status,
                      neg_adv_str(req->status));
-               ep->stats.abort_neg_adv++;
-               dev->rdev.stats.neg_adv++;
-               kfree_skb(skb);
-               return 0;
+               goto out;
         }
         PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
              ep->com.state);
  
-       /*
-        * Wake up any threads in rdma_init() or rdma_fini().
-        * However, if we are on MPAv2 and want to retry with MPAv1
-        * then, don't wake up yet.
-        */
-       if (mpa_rev == 2 && !ep->tried_with_mpa_v1) {
-               if (ep->com.state != MPA_REQ_SENT)
-                       c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
-       } else
-               c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+       c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+out:
         sched(dev, skb);
         return 0;
  }
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h

index df43f871ab61cba60d44986cc4ac17c7881a3777..f6f34a75af271f34a141d305021f2d5cf3318aca 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -755,6 +755,7 @@ enum c4iw_ep_flags {
         CLOSE_SENT              = 3,
         TIMEOUT                 = 4,
         QP_REFERENCED           = 5,
+       STOP_MPA_TIMER          = 7,
  };
  
  enum c4iw_ep_history {
@@ -779,7 +780,13 @@ enum c4iw_ep_history {
         EP_DISC_ABORT           = 18,
         CONN_RPL_UPCALL         = 19,
         ACT_RETRY_NOMEM         = 20,
-       ACT_RETRY_INUSE         = 21
+       ACT_RETRY_INUSE         = 21,
+       CLOSE_CON_RPL           = 22,
+       EP_DISC_FAIL            = 24,
+       QP_REFED                = 25,
+       QP_DEREFED              = 26,
+       CM_ID_REFED             = 27,
+       CM_ID_DEREFED           = 28,
  };
  
  struct c4iw_ep_common {
@@ -917,9 +924,8 @@ void c4iw_qp_rem_ref(struct ib_qp *qp);
  struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
                             enum ib_mr_type mr_type,
                             u32 max_num_sg);
-int c4iw_map_mr_sg(struct ib_mr *ibmr,
-                  struct scatterlist *sg,
-                  int sg_nents);
+int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                  unsigned int *sg_offset);
  int c4iw_dealloc_mw(struct ib_mw *mw);
  struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
                             struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c

index 008be07d560432b7a0954a0e7676e71b2ef61d04..55d0651ee4de58931dd76be90105ec6aadbf47ef 100644 (file)
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -86,8 +86,9 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
                         (wait ? FW_WR_COMPL_F : 0));
         req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L;
         req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16)));
-       req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE));
-       req->cmd |= cpu_to_be32(T5_ULP_MEMIO_ORDER_V(1));
+       req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE) |
+                              T5_ULP_MEMIO_ORDER_V(1) |
+                              T5_ULP_MEMIO_FID_V(rdev->lldi.rxq_ids[0]));
         req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5));
         req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16));
         req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr));
@@ -690,15 +691,14 @@ static int c4iw_set_page(struct ib_mr *ibmr, u64 addr)
         return 0;
  }
  
-int c4iw_map_mr_sg(struct ib_mr *ibmr,
-                  struct scatterlist *sg,
-                  int sg_nents)
+int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                  unsigned int *sg_offset)
  {
         struct c4iw_mr *mhp = to_c4iw_mr(ibmr);
  
         mhp->mpl_len = 0;
  
-       return ib_sg_to_pages(ibmr, sg, sg_nents, c4iw_set_page);
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page);
  }
  
  int c4iw_dereg_mr(struct ib_mr *ib_mr)
diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h

index 81976768144555a4c0f578c71ede72ff8644beaf..8b95320345583802c9263cfd308a877af54bc014 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -50,8 +50,6 @@
  #include <rdma/ib_pack.h>
  #include <rdma/rdma_cm.h>
  #include <rdma/iw_cm.h>
-#include <rdma/iw_portmap.h>
-#include <rdma/rdma_netlink.h>
  #include <crypto/hash.h>
  
  #include "i40iw_status.h"
@@ -254,6 +252,7 @@ struct i40iw_device {
         u32 arp_table_size;
         u32 next_arp_index;
         spinlock_t resource_lock; /* hw resource access */
+       spinlock_t qptable_lock;
         u32 vendor_id;
         u32 vendor_part_id;
         u32 of_device_registered;
@@ -392,7 +391,7 @@ void i40iw_flush_wqes(struct i40iw_device *iwdev,
  
  void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
                             unsigned char *mac_addr,
-                           __be32 *ip_addr,
+                           u32 *ip_addr,
                             bool ipv4,
                             u32 action);
  
@@ -550,7 +549,7 @@ enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev,
                                            struct i40iw_qp_flush_info *info,
                                            bool wait);
  
-void i40iw_copy_ip_ntohl(u32 *dst, u32 *src);
+void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src);
  struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd,
                                 u64 addr,
                                 u64 size,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c

index 38f917a6c7784101247c6a59af14b21a73d85d0a..d2fa7251696077a3fb3f7b8cec5955bd9348a603 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -771,6 +771,7 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node,
  {
         struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
         struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
+       u16 ctrl_ird, ctrl_ord;
  
         /* initialize the upper 5 bytes of the frame */
         i40iw_build_mpa_v1(cm_node, start_addr, mpa_key);
@@ -779,38 +780,38 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node,
  
         /* initialize RTR msg */
         if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
-               rtr_msg->ctrl_ird = IETF_NO_IRD_ORD;
-               rtr_msg->ctrl_ord = IETF_NO_IRD_ORD;
+               ctrl_ird = IETF_NO_IRD_ORD;
+               ctrl_ord = IETF_NO_IRD_ORD;
         } else {
-               rtr_msg->ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ?
+               ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ?
                         IETF_NO_IRD_ORD : cm_node->ird_size;
-               rtr_msg->ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ?
+               ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ?
                         IETF_NO_IRD_ORD : cm_node->ord_size;
         }
  
-       rtr_msg->ctrl_ird |= IETF_PEER_TO_PEER;
-       rtr_msg->ctrl_ird |= IETF_FLPDU_ZERO_LEN;
+       ctrl_ird |= IETF_PEER_TO_PEER;
+       ctrl_ird |= IETF_FLPDU_ZERO_LEN;
  
         switch (mpa_key) {
         case MPA_KEY_REQUEST:
-               rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE;
-               rtr_msg->ctrl_ord |= IETF_RDMA0_READ;
+               ctrl_ord |= IETF_RDMA0_WRITE;
+               ctrl_ord |= IETF_RDMA0_READ;
                 break;
         case MPA_KEY_REPLY:
                 switch (cm_node->send_rdma0_op) {
                 case SEND_RDMA_WRITE_ZERO:
-                       rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE;
+                       ctrl_ord |= IETF_RDMA0_WRITE;
                         break;
                 case SEND_RDMA_READ_ZERO:
-                       rtr_msg->ctrl_ord |= IETF_RDMA0_READ;
+                       ctrl_ord |= IETF_RDMA0_READ;
                         break;
                 }
                 break;
         default:
                 break;
         }
-       rtr_msg->ctrl_ird = htons(rtr_msg->ctrl_ird);
-       rtr_msg->ctrl_ord = htons(rtr_msg->ctrl_ord);
+       rtr_msg->ctrl_ird = htons(ctrl_ird);
+       rtr_msg->ctrl_ord = htons(ctrl_ord);
  }
  
  /**
@@ -2107,7 +2108,7 @@ static bool i40iw_ipv6_is_loopback(u32 *loc_addr, u32 *rem_addr)
         struct in6_addr raddr6;
  
         i40iw_copy_ip_htonl(raddr6.in6_u.u6_addr32, rem_addr);
-       return (!memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6));
+       return !memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6);
  }
  
  /**
@@ -2160,7 +2161,7 @@ static struct i40iw_cm_node *i40iw_make_cm_node(
         cm_node->tcp_cntxt.rcv_wnd =
                         I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE;
         ts = current_kernel_time();
-       cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec);
+       cm_node->tcp_cntxt.loc_seq_num = ts.tv_nsec;
         cm_node->tcp_cntxt.mss = iwdev->mss;
  
         cm_node->iwdev = iwdev;
@@ -2234,7 +2235,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node)
         if (cm_node->listener) {
                 i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true);
         } else {
-               if (!i40iw_listen_port_in_use(cm_core, htons(cm_node->loc_port)) &&
+               if (!i40iw_listen_port_in_use(cm_core, cm_node->loc_port) &&
                     cm_node->apbvt_set && cm_node->iwdev) {
                         i40iw_manage_apbvt(cm_node->iwdev,
                                            cm_node->loc_port,
@@ -2852,7 +2853,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
                                         void *private_data,
                                         struct i40iw_cm_info *cm_info)
  {
-       int ret;
         struct i40iw_cm_node *cm_node;
         struct i40iw_cm_listener *loopback_remotelistener;
         struct i40iw_cm_node *loopback_remotenode;
@@ -2922,30 +2922,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
         memcpy(cm_node->pdata_buf, private_data, private_data_len);
  
         cm_node->state = I40IW_CM_STATE_SYN_SENT;
-       ret = i40iw_send_syn(cm_node, 0);
-
-       if (ret) {
-               if (cm_node->ipv4)
-                       i40iw_debug(cm_node->dev,
-                                   I40IW_DEBUG_CM,
-                                   "Api - connect() FAILED: dest addr=%pI4",
-                                   cm_node->rem_addr);
-               else
-                       i40iw_debug(cm_node->dev, I40IW_DEBUG_CM,
-                                   "Api - connect() FAILED: dest addr=%pI6",
-                                   cm_node->rem_addr);
-               i40iw_rem_ref_cm_node(cm_node);
-               cm_node = NULL;
-       }
-
-       if (cm_node)
-               i40iw_debug(cm_node->dev,
-                           I40IW_DEBUG_CM,
-                           "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n",
-                           cm_node->rem_port,
-                           cm_node,
-                           cm_node->cm_id);
-
         return cm_node;
  }
  
@@ -3266,11 +3242,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node,
  
                 tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[0]);
                 tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[0]);
-               tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table(iwqp->iwdev,
-                                                               &tcp_info->dest_ip_addr3,
-                                                               true,
-                                                               NULL,
-                                                               I40IW_ARP_RESOLVE));
+               tcp_info->arp_idx =
+                       cpu_to_le16((u16)i40iw_arp_table(
+                                                        iwqp->iwdev,
+                                                        &tcp_info->dest_ip_addr3,
+                                                        true,
+                                                        NULL,
+                                                        I40IW_ARP_RESOLVE));
         } else {
                 tcp_info->src_port = cpu_to_le16(cm_node->loc_port);
                 tcp_info->dst_port = cpu_to_le16(cm_node->rem_port);
@@ -3282,12 +3260,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node,
                 tcp_info->local_ipaddr1 = cpu_to_le32(cm_node->loc_addr[1]);
                 tcp_info->local_ipaddr2 = cpu_to_le32(cm_node->loc_addr[2]);
                 tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[3]);
-               tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table(
-                                                       iwqp->iwdev,
-                                                       &tcp_info->dest_ip_addr0,
-                                                       false,
-                                                       NULL,
-                                                       I40IW_ARP_RESOLVE));
+               tcp_info->arp_idx =
+                       cpu_to_le16((u16)i40iw_arp_table(
+                                                        iwqp->iwdev,
+                                                        &tcp_info->dest_ip_addr0,
+                                                        false,
+                                                        NULL,
+                                                        I40IW_ARP_RESOLVE));
         }
  }
  
@@ -3564,7 +3543,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         struct i40iw_cm_node *cm_node;
         struct ib_qp_attr attr;
         int passive_state;
-       struct i40iw_ib_device *iwibdev;
         struct ib_mr *ibmr;
         struct i40iw_pd *iwpd;
         u16 buf_len = 0;
@@ -3627,7 +3605,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
              !i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) ||
             (!cm_node->ipv4 &&
              !i40iw_ipv6_is_loopback(cm_node->loc_addr, cm_node->rem_addr))) {
-               iwibdev = iwdev->iwibdev;
                 iwpd = iwqp->iwpd;
                 tagged_offset = (uintptr_t)iwqp->ietf_mem.va;
                 ibmr = i40iw_reg_phys_mr(&iwpd->ibpd,
@@ -3752,6 +3729,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
         struct sockaddr_in *raddr;
         struct sockaddr_in6 *laddr6;
         struct sockaddr_in6 *raddr6;
+       bool qhash_set = false;
         int apbvt_set = 0;
         enum i40iw_status_code status;
  
@@ -3810,6 +3788,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                                             true);
                 if (status)
                         return -EINVAL;
+               qhash_set = true;
         }
         status = i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD);
         if (status) {
@@ -3828,23 +3807,8 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                                        conn_param->private_data_len,
                                        (void *)conn_param->private_data,
                                        &cm_info);
-       if (!cm_node) {
-               i40iw_manage_qhash(iwdev,
-                                  &cm_info,
-                                  I40IW_QHASH_TYPE_TCP_ESTABLISHED,
-                                  I40IW_QHASH_MANAGE_TYPE_DELETE,
-                                  NULL,
-                                  false);
-
-               if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core,
-                                                          cm_info.loc_port))
-                       i40iw_manage_apbvt(iwdev,
-                                          cm_info.loc_port,
-                                          I40IW_MANAGE_APBVT_DEL);
-               cm_id->rem_ref(cm_id);
-               iwdev->cm_core.stats_connect_errs++;
-               return -ENOMEM;
-       }
+       if (!cm_node)
+               goto err;
  
         i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
         if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
@@ -3852,12 +3816,54 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                 cm_node->ord_size = 1;
  
         cm_node->apbvt_set = apbvt_set;
-       cm_node->qhash_set = true;
+       cm_node->qhash_set = qhash_set;
         iwqp->cm_node = cm_node;
         cm_node->iwqp = iwqp;
         iwqp->cm_id = cm_id;
         i40iw_add_ref(&iwqp->ibqp);
+
+       if (cm_node->state == I40IW_CM_STATE_SYN_SENT) {
+               if (i40iw_send_syn(cm_node, 0)) {
+                       i40iw_rem_ref_cm_node(cm_node);
+                       goto err;
+               }
+       }
+
+       i40iw_debug(cm_node->dev,
+                   I40IW_DEBUG_CM,
+                   "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n",
+                   cm_node->rem_port,
+                   cm_node,
+                   cm_node->cm_id);
         return 0;
+
+err:
+       if (cm_node) {
+               if (cm_node->ipv4)
+                       i40iw_debug(cm_node->dev,
+                                   I40IW_DEBUG_CM,
+                                   "Api - connect() FAILED: dest addr=%pI4",
+                                   cm_node->rem_addr);
+               else
+                       i40iw_debug(cm_node->dev, I40IW_DEBUG_CM,
+                                   "Api - connect() FAILED: dest addr=%pI6",
+                                   cm_node->rem_addr);
+       }
+       i40iw_manage_qhash(iwdev,
+                          &cm_info,
+                          I40IW_QHASH_TYPE_TCP_ESTABLISHED,
+                          I40IW_QHASH_MANAGE_TYPE_DELETE,
+                          NULL,
+                          false);
+
+       if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core,
+                                                  cm_info.loc_port))
+               i40iw_manage_apbvt(iwdev,
+                                  cm_info.loc_port,
+                                  I40IW_MANAGE_APBVT_DEL);
+       cm_id->rem_ref(cm_id);
+       iwdev->cm_core.stats_connect_errs++;
+       return -ENOMEM;
  }
  
  /**
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h

index 5f8ceb4a8e847923689f367a6ed7aed9796dabd0..e9046d9f96456c847e1df069b6a5dff9e0f0cfe2 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h
@@ -1,6 +1,6 @@
  /*******************************************************************************
  *
-* Copyright (c) 2015 Intel Corporation.  All rights reserved.
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -291,8 +291,6 @@ struct i40iw_cm_listener {
         u8 loc_mac[ETH_ALEN];
         u32 loc_addr[4];
         u16 loc_port;
-       u32 map_loc_addr[4];
-       u16 map_loc_port;
         struct iw_cm_id *cm_id;
         atomic_t ref_count;
         struct i40iw_device *iwdev;
@@ -317,8 +315,6 @@ struct i40iw_kmem_info {
  struct i40iw_cm_node {
         u32 loc_addr[4], rem_addr[4];
         u16 loc_port, rem_port;
-       u32 map_loc_addr[4], map_rem_addr[4];
-       u16 map_loc_port, map_rem_port;
         u16 vlan_id;
         enum i40iw_cm_node_state state;
         u8 loc_mac[ETH_ALEN];
@@ -370,10 +366,6 @@ struct i40iw_cm_info {
         u16 rem_port;
         u32 loc_addr[4];
         u32 rem_addr[4];
-       u16 map_loc_port;
-       u16 map_rem_port;
-       u32 map_loc_addr[4];
-       u32 map_rem_addr[4];
         u16 vlan_id;
         int backlog;
         u16 user_pri;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c

index f05802bf6ca04dbbe5e60bc00b044dd1faab4cb1..2c4b4d072d6ae230fb862fe89886ea820381562d 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
@@ -114,16 +114,21 @@ static enum i40iw_status_code i40iw_cqp_poll_registers(
   * i40iw_sc_parse_fpm_commit_buf - parse fpm commit buffer
   * @buf: ptr to fpm commit buffer
   * @info: ptr to i40iw_hmc_obj_info struct
+ * @sd: number of SDs for HMC objects
   *
   * parses fpm commit info and copy base value
   * of hmc objects in hmc_info
   */
  static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf(
                                 u64 *buf,
-                               struct i40iw_hmc_obj_info *info)
+                               struct i40iw_hmc_obj_info *info,
+                               u32 *sd)
  {
         u64 temp;
+       u64 size;
+       u64 base = 0;
         u32 i, j;
+       u32 k = 0;
         u32 low;
  
         /* copy base values in obj_info */
@@ -131,10 +136,20 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf(
                         i <= I40IW_HMC_IW_PBLE; i++, j += 8) {
                 get_64bit_val(buf, j, &temp);
                 info[i].base = RS_64_1(temp, 32) * 512;
+               if (info[i].base > base) {
+                       base = info[i].base;
+                       k = i;
+               }
                 low = (u32)(temp);
                 if (low)
                         info[i].cnt = low;
         }
+       size = info[k].cnt * info[k].size + info[k].base;
+       if (size & 0x1FFFFF)
+               *sd = (u32)((size >> 21) + 1); /* add 1 for remainder */
+       else
+               *sd = (u32)(size >> 21);
+
         return 0;
  }
  
@@ -2908,6 +2923,65 @@ static enum i40iw_status_code i40iw_sc_mw_alloc(
         return 0;
  }
  
+/**
+ * i40iw_sc_mr_fast_register - Posts RDMA fast register mr WR to iwarp qp
+ * @qp: sc qp struct
+ * @info: fast mr info
+ * @post_sq: flag for cqp db to ring
+ */
+enum i40iw_status_code i40iw_sc_mr_fast_register(
+                               struct i40iw_sc_qp *qp,
+                               struct i40iw_fast_reg_stag_info *info,
+                               bool post_sq)
+{
+       u64 temp, header;
+       u64 *wqe;
+       u32 wqe_idx;
+
+       wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE,
+                                        0, info->wr_id);
+       if (!wqe)
+               return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
+
+       i40iw_debug(qp->dev, I40IW_DEBUG_MR, "%s: wr_id[%llxh] wqe_idx[%04d] location[%p]\n",
+                   __func__, info->wr_id, wqe_idx,
+                   &qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid);
+       temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo;
+       set_64bit_val(wqe, 0, temp);
+
+       temp = RS_64(info->first_pm_pbl_index >> 16, I40IWQPSQ_FIRSTPMPBLIDXHI);
+       set_64bit_val(wqe,
+                     8,
+                     LS_64(temp, I40IWQPSQ_FIRSTPMPBLIDXHI) |
+                     LS_64(info->reg_addr_pa >> I40IWQPSQ_PBLADDR_SHIFT, I40IWQPSQ_PBLADDR));
+
+       set_64bit_val(wqe,
+                     16,
+                     info->total_len |
+                     LS_64(info->first_pm_pbl_index, I40IWQPSQ_FIRSTPMPBLIDXLO));
+
+       header = LS_64(info->stag_key, I40IWQPSQ_STAGKEY) |
+                LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) |
+                LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) |
+                LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) |
+                LS_64(info->page_size, I40IWQPSQ_HPAGESIZE) |
+                LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) |
+                LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) |
+                LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
+                LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
+                LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
+                LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
+
+       i40iw_insert_wqe_hdr(wqe, header);
+
+       i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "FAST_REG WQE",
+                       wqe, I40IW_QP_WQE_MIN_SIZE);
+
+       if (post_sq)
+               i40iw_qp_post_wr(&qp->qp_uk);
+       return 0;
+}
+
  /**
   * i40iw_sc_send_lsmm - send last streaming mode message
   * @qp: sc qp struct
@@ -3147,7 +3221,7 @@ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_
                 i40iw_cqp_commit_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id);
  
                 /* parse the fpm_commit_buf and fill hmc obj info */
-               i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj);
+               i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj, &hmc_info->sd_table.sd_cnt);
                 mem_size = sizeof(struct i40iw_hmc_sd_entry) *
                            (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index);
                 ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size);
@@ -3221,7 +3295,9 @@ static enum i40iw_status_code i40iw_sc_configure_iw_fpm(struct i40iw_sc_dev *dev
  
         /* parse the fpm_commit_buf and fill hmc obj info */
         if (!ret_code)
-               ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, hmc_info->hmc_obj);
+               ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf,
+                                                        hmc_info->hmc_obj,
+                                                        &hmc_info->sd_table.sd_cnt);
  
         i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "COMMIT FPM BUFFER",
                         commit_fpm_mem.va, I40IW_COMMIT_FPM_BUF_SIZE);
@@ -3468,6 +3544,40 @@ static bool i40iw_ring_full(struct i40iw_sc_cqp *cqp)
         return I40IW_RING_FULL_ERR(cqp->sq_ring);
  }
  
+/**
+ * i40iw_est_sd - returns approximate number of SDs for HMC
+ * @dev: sc device struct
+ * @hmc_info: hmc structure, size and count for HMC objects
+ */
+static u64 i40iw_est_sd(struct i40iw_sc_dev *dev, struct i40iw_hmc_info *hmc_info)
+{
+       int i;
+       u64 size = 0;
+       u64 sd;
+
+       for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_PBLE; i++)
+               size += hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size;
+
+       if (dev->is_pf)
+               size += hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size;
+
+       if (size & 0x1FFFFF)
+               sd = (size >> 21) + 1; /* add 1 for remainder */
+       else
+               sd = size >> 21;
+
+       if (!dev->is_pf) {
+               /* 2MB alignment for VF PBLE HMC */
+               size = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size;
+               if (size & 0x1FFFFF)
+                       sd += (size >> 21) + 1; /* add 1 for remainder */
+               else
+                       sd += size >> 21;
+       }
+
+       return sd;
+}
+
  /**
   * i40iw_config_fpm_values - configure HMC objects
   * @dev: sc device struct
@@ -3479,7 +3589,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
         u32 i, mem_size;
         u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted;
         u32 powerof2;
-       u64 sd_needed, bytes_needed;
+       u64 sd_needed;
         u32 loop_count = 0;
  
         struct i40iw_hmc_info *hmc_info;
@@ -3497,23 +3607,15 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
                 return ret_code;
         }
  
-       bytes_needed = 0;
-       for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) {
+       for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++)
                 hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt;
-               bytes_needed +=
-                   (hmc_info->hmc_obj[i].max_cnt) * (hmc_info->hmc_obj[i].size);
-               i40iw_debug(dev, I40IW_DEBUG_HMC,
-                           "%s i[%04d] max_cnt[0x%04X] size[0x%04llx]\n",
-                           __func__, i, hmc_info->hmc_obj[i].max_cnt,
-                           hmc_info->hmc_obj[i].size);
-       }
-       sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up */
+       sd_needed = i40iw_est_sd(dev, hmc_info);
         i40iw_debug(dev, I40IW_DEBUG_HMC,
                     "%s: FW initial max sd_count[%08lld] first_sd_index[%04d]\n",
                     __func__, sd_needed, hmc_info->first_sd_index);
         i40iw_debug(dev, I40IW_DEBUG_HMC,
-                   "%s: bytes_needed=0x%llx sd count %d where max sd is %d\n",
-                   __func__, bytes_needed, hmc_info->sd_table.sd_cnt,
+                   "%s: sd count %d where max sd is %d\n",
+                   __func__, hmc_info->sd_table.sd_cnt,
                     hmc_fpm_misc->max_sds);
  
         qpwanted = min(qp_count, hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt);
@@ -3555,11 +3657,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
                 hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt = pblewanted;
  
                 /* How much memory is needed for all the objects. */
-               bytes_needed = 0;
-               for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++)
-                       bytes_needed +=
-                           (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size);
-               sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1;
+               sd_needed = i40iw_est_sd(dev, hmc_info);
                 if ((loop_count > 1000) ||
                     ((!(loop_count % 10)) &&
                     (qpwanted > qpwantedoriginal * 2 / 3))) {
@@ -3580,15 +3678,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
                         pblewanted -= FPM_MULTIPLIER * 1000;
         } while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000);
  
-       bytes_needed = 0;
-       for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) {
-               bytes_needed += (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size);
-               i40iw_debug(dev, I40IW_DEBUG_HMC,
-                           "%s i[%04d] cnt[0x%04x] size[0x%04llx]\n",
-                           __func__, i, hmc_info->hmc_obj[i].cnt,
-                           hmc_info->hmc_obj[i].size);
-       }
-       sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1;    /* round up not truncate. */
+       sd_needed = i40iw_est_sd(dev, hmc_info);
  
         i40iw_debug(dev, I40IW_DEBUG_HMC,
                     "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n",
@@ -3606,8 +3696,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
                 return ret_code;
         }
  
-       hmc_info->sd_table.sd_cnt = (u32)sd_needed;
-
         mem_size = sizeof(struct i40iw_hmc_sd_entry) *
                    (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1);
         ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size);
@@ -3911,11 +3999,11 @@ enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev)
   */
  static u32 i40iw_iwarp_opcode(struct i40iw_aeqe_info *info, u8 *pkt)
  {
-       u16 *mpa;
+       __be16 *mpa;
         u32 opcode = 0xffffffff;
  
         if (info->q2_data_written) {
-               mpa = (u16 *)pkt;
+               mpa = (__be16 *)pkt;
                 opcode = ntohs(mpa[1]) & 0xf;
         }
         return opcode;
@@ -3977,7 +4065,7 @@ static int i40iw_bld_terminate_hdr(struct i40iw_sc_qp *qp,
         if (info->q2_data_written) {
                 /* Use data from offending packet to fill in ddp & rdma hdrs */
                 pkt = i40iw_locate_mpa(pkt);
-               ddp_seg_len = ntohs(*(u16 *)pkt);
+               ddp_seg_len = ntohs(*(__be16 *)pkt);
                 if (ddp_seg_len) {
                         copy_len = 2;
                         termhdr->hdrct = DDP_LEN_FLAG;
@@ -4188,13 +4276,13 @@ void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *
  void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info)
  {
         u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET;
-       u32 *mpa;
+       __be32 *mpa;
         u8 ddp_ctl;
         u8 rdma_ctl;
         u16 aeq_id = 0;
         struct i40iw_terminate_hdr *termhdr;
  
-       mpa = (u32 *)i40iw_locate_mpa(pkt);
+       mpa = (__be32 *)i40iw_locate_mpa(pkt);
         if (info->q2_data_written) {
                 /* did not validate the frame - do it now */
                 ddp_ctl = (ntohl(mpa[0]) >> 8) & 0xff;
@@ -4559,17 +4647,18 @@ static struct i40iw_pd_ops iw_pd_ops = {
  };
  
  static struct i40iw_priv_qp_ops iw_priv_qp_ops = {
-       i40iw_sc_qp_init,
-       i40iw_sc_qp_create,
-       i40iw_sc_qp_modify,
-       i40iw_sc_qp_destroy,
-       i40iw_sc_qp_flush_wqes,
-       i40iw_sc_qp_upload_context,
-       i40iw_sc_qp_setctx,
-       i40iw_sc_send_lsmm,
-       i40iw_sc_send_lsmm_nostag,
-       i40iw_sc_send_rtt,
-       i40iw_sc_post_wqe0,
+       .qp_init = i40iw_sc_qp_init,
+       .qp_create = i40iw_sc_qp_create,
+       .qp_modify = i40iw_sc_qp_modify,
+       .qp_destroy = i40iw_sc_qp_destroy,
+       .qp_flush_wqes = i40iw_sc_qp_flush_wqes,
+       .qp_upload_context = i40iw_sc_qp_upload_context,
+       .qp_setctx = i40iw_sc_qp_setctx,
+       .qp_send_lsmm = i40iw_sc_send_lsmm,
+       .qp_send_lsmm_nostag = i40iw_sc_send_lsmm_nostag,
+       .qp_send_rtt = i40iw_sc_send_rtt,
+       .qp_post_wqe0 = i40iw_sc_post_wqe0,
+       .iw_mr_fast_register = i40iw_sc_mr_fast_register
  };
  
  static struct i40iw_priv_cq_ops iw_priv_cq_ops = {
diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h

index aab88d65f805655e01e7a0625007a81ac99b99d6..bd942da91a2797dcfac81dbadd4aa6dbc32b3643 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_d.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_d.h
@@ -1290,7 +1290,7 @@
  
  /* wqe size considering 32 bytes per wqe*/
  #define I40IWQP_SW_MIN_WQSIZE 4                /* 128 bytes */
-#define I40IWQP_SW_MAX_WQSIZE 16384    /* 524288 bytes */
+#define I40IWQP_SW_MAX_WQSIZE 2048     /* 65536 bytes */
  
  #define I40IWQP_OP_RDMA_WRITE 0
  #define I40IWQP_OP_RDMA_READ 1
@@ -1512,6 +1512,8 @@ enum i40iw_alignment {
         I40IW_SD_BUF_ALIGNMENT =        0x100
  };
  
+#define I40IW_WQE_SIZE_64      64
+
  #define I40IW_QP_WQE_MIN_SIZE  32
  #define I40IW_QP_WQE_MAX_SIZE  128
  
diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c

index 9fd302425563948ab9884e7fd12faec4db576726..3ee0cad96bc688457475a1f727969e7f05336e62 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_hw.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c
@@ -106,7 +106,9 @@ u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev)
         set_bit(2, iwdev->allocated_pds);
  
         spin_lock_init(&iwdev->resource_lock);
-       mrdrvbits = 24 - get_count_order(iwdev->max_mr);
+       spin_lock_init(&iwdev->qptable_lock);
+       /* stag index mask has a minimum of 14 bits */
+       mrdrvbits = 24 - max(get_count_order(iwdev->max_mr), 14);
         iwdev->mr_stagmask = ~(((1 << mrdrvbits) - 1) << (32 - mrdrvbits));
         return 0;
  }
@@ -301,11 +303,15 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
                             "%s ae_id = 0x%x bool qp=%d qp_id = %d\n",
                             __func__, info->ae_id, info->qp, info->qp_cq_id);
                 if (info->qp) {
+                       spin_lock_irqsave(&iwdev->qptable_lock, flags);
                         iwqp = iwdev->qp_table[info->qp_cq_id];
                         if (!iwqp) {
+                               spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
                                 i40iw_pr_err("qp_id %d is already freed\n", info->qp_cq_id);
                                 continue;
                         }
+                       i40iw_add_ref(&iwqp->ibqp);
+                       spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
                         qp = &iwqp->sc_qp;
                         spin_lock_irqsave(&iwqp->lock, flags);
                         iwqp->hw_tcp_state = info->tcp_state;
@@ -411,6 +417,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
                                 i40iw_terminate_connection(qp, info);
                                 break;
                 }
+               if (info->qp)
+                       i40iw_rem_ref(&iwqp->ibqp);
         } while (1);
  
         if (aeqcnt)
@@ -460,7 +468,7 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad
   */
  void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
                             unsigned char *mac_addr,
-                           __be32 *ip_addr,
+                           u32 *ip_addr,
                             bool ipv4,
                             u32 action)
  {
@@ -481,7 +489,7 @@ void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
                 cqp_info->cqp_cmd = OP_ADD_ARP_CACHE_ENTRY;
                 info = &cqp_info->in.u.add_arp_cache_entry.info;
                 memset(info, 0, sizeof(*info));
-               info->arp_index = cpu_to_le32(arp_index);
+               info->arp_index = cpu_to_le16((u16)arp_index);
                 info->permanent = true;
                 ether_addr_copy(info->mac_addr, mac_addr);
                 cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c

index e41fae2422ab8ce6459c27fcec8384969cdf1ac7..c963cad92f5a8eb061af74ae964e9222115e1815 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -270,7 +270,6 @@ static void i40iw_disable_irq(struct i40iw_sc_dev *dev,
                 i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0);
         else
                 i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0);
-       synchronize_irq(msix_vec->irq);
         free_irq(msix_vec->irq, dev_id);
  }
  
@@ -1147,10 +1146,7 @@ static enum i40iw_status_code i40iw_alloc_set_mac_ipaddr(struct i40iw_device *iw
         if (!status) {
                 status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr,
                                                     (u8)iwdev->mac_ip_table_idx);
-               if (!status)
-                       status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr,
-                                                           (u8)iwdev->mac_ip_table_idx);
-               else
+               if (status)
                         i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx);
         }
         return status;
@@ -1165,7 +1161,7 @@ static void i40iw_add_ipv6_addr(struct i40iw_device *iwdev)
         struct net_device *ip_dev;
         struct inet6_dev *idev;
         struct inet6_ifaddr *ifp;
-       __be32 local_ipaddr6[4];
+       u32 local_ipaddr6[4];
  
         rcu_read_lock();
         for_each_netdev_rcu(&init_net, ip_dev) {
@@ -1512,6 +1508,7 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl,
             I40IW_HMC_PROFILE_DEFAULT;
         iwdev->max_rdma_vfs =
                 (iwdev->resource_profile != I40IW_HMC_PROFILE_DEFAULT) ?  max_rdma_vfs : 0;
+       iwdev->max_enabled_vfs = iwdev->max_rdma_vfs;
         iwdev->netdev = ldev->netdev;
         hdl->client = client;
         iwdev->mss = (!ldev->params.mtu) ? I40IW_DEFAULT_MSS : ldev->params.mtu - I40IW_MTU_TO_MSS;
@@ -1531,7 +1528,10 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl,
                 goto exit;
         iwdev->obj_next = iwdev->obj_mem;
         iwdev->push_mode = push_mode;
+
         init_waitqueue_head(&iwdev->vchnl_waitq);
+       init_waitqueue_head(&dev->vf_reqs);
+
         status = i40iw_initialize_dev(iwdev, ldev);
  exit:
         if (status) {
@@ -1710,7 +1710,6 @@ static void i40iw_vf_reset(struct i40e_info *ldev, struct i40e_client *client, u
         for (i = 0; i < I40IW_MAX_PE_ENABLED_VF_COUNT; i++) {
                 if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id != vf_id))
                         continue;
-
                 /* free all resources allocated on behalf of vf */
                 tmp_vfdev = dev->vf_dev[i];
                 spin_lock_irqsave(&dev->dev_pestat.stats_lock, flags);
@@ -1819,8 +1818,6 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev,
         dev = &hdl->device.sc_dev;
         iwdev = dev->back_dev;
  
-       i40iw_debug(dev, I40IW_DEBUG_VIRT, "msg %p, message length %u\n", msg, len);
-
         if (dev->vchnl_if.vchnl_recv) {
                 ret_code = dev->vchnl_if.vchnl_recv(dev, vf_id, msg, len);
                 if (!dev->is_pf) {
@@ -1831,6 +1828,39 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev,
         return ret_code;
  }
  
+/**
+ * i40iw_vf_clear_to_send - wait to send virtual channel message
+ * @dev: iwarp device
+ *
+ * Wait until the virtual channel is clear (vchnl_msgs reads 0),
+ * for at most I40IW_VCHNL_EVENT_TIMEOUT, before sending the next message.
+ *
+ * Returns true if already clear; otherwise dev->vchnl_up (false on timeout)
+ */
+bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev)
+{
+       struct i40iw_device *iwdev;
+       wait_queue_t wait;
+
+       iwdev = dev->back_dev;
+
+       if (!wq_has_sleeper(&dev->vf_reqs) &&
+           (atomic_read(&iwdev->vchnl_msgs) == 0))
+               return true; /* virtual channel is clear */
+
+       init_wait(&wait);
+       add_wait_queue_exclusive(&dev->vf_reqs, &wait);
+
+       if (!wait_event_timeout(dev->vf_reqs,
+                               (atomic_read(&iwdev->vchnl_msgs) == 0),
+                               I40IW_VCHNL_EVENT_TIMEOUT))
+               dev->vchnl_up = false; /* timed out with messages still pending */
+
+       remove_wait_queue(&dev->vf_reqs, &wait);
+
+       return dev->vchnl_up;
+}
+}
+
  /**
   * i40iw_virtchnl_send - send a message through the virtual channel
   * @dev: iwarp device
@@ -1848,18 +1878,16 @@ static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev,
  {
         struct i40iw_device *iwdev;
         struct i40e_info *ldev;
-       enum i40iw_status_code ret_code = I40IW_ERR_BAD_PTR;
  
         if (!dev || !dev->back_dev)
-               return ret_code;
+               return I40IW_ERR_BAD_PTR;
  
         iwdev = dev->back_dev;
         ldev = iwdev->ldev;
  
         if (ldev && ldev->ops && ldev->ops->virtchnl_send)
-               ret_code = ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len);
-
-       return ret_code;
+               return ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len);
+       return I40IW_ERR_BAD_PTR;
  }
  
  /* client interface functions */
diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h

index 7e20493510e84170231ae60a52ae05f492d1ef7c..80f422bf3967f4c723865dea334a12dea7e4004d 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_osdep.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_osdep.h
@@ -172,6 +172,7 @@ struct i40iw_hw;
  u8 __iomem *i40iw_get_hw_addr(void *dev);
  void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
  enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev);
+bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev);
  enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, void *addr,
                                               u32 length, u32 value);
  struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, struct i40iw_puda_buf *buf);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c

index ded853d2fad8ee91af5d152a8f8ed78e9787ccd5..85993dc44f6e687d4920ae85676035e34e07cf71 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_pble.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c
@@ -404,13 +404,14 @@ static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
                         sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa;
         if (sd_entry->valid)
                 return 0;
-       if (dev->is_pf)
+       if (dev->is_pf) {
                 ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id,
                                             sd_reg_val, idx->sd_idx,
                                             sd_entry->entry_type, true);
-       if (ret_code) {
-               i40iw_pr_err("cqp cmd failed for sd (pbles)\n");
-               goto error;
+               if (ret_code) {
+                       i40iw_pr_err("cqp cmd failed for sd (pbles)\n");
+                       goto error;
+               }
         }
  
         sd_entry->valid = true;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c

index 8eb400d8a7a04ab95a15645d8919514c08946cda..e9c6e82af9c7a07a8bb3d256007f297fcd61aa15 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c
@@ -1194,7 +1194,7 @@ static enum i40iw_status_code i40iw_ieq_process_buf(struct i40iw_puda_rsrc *ieq,
  
         ioffset = (u16)(buf->data - (u8 *)buf->mem.va);
         while (datalen) {
-               fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(u16 *)datap));
+               fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(__be16 *)datap));
                 if (fpdu_len > pfpdu->max_fpdu_data) {
                         i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ,
                                     "%s: error bad fpdu_len\n", __func__);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h

index b0110c15e04483b72faf9f9e8d5efa9303851b8f..91c421762f06797be844f0834d7ff09ff36bb5f9 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_status.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_status.h
@@ -95,6 +95,7 @@ enum i40iw_status_code {
         I40IW_ERR_INVALID_MAC_ADDR = -65,
         I40IW_ERR_BAD_STAG      = -66,
         I40IW_ERR_CQ_COMPL_ERROR = -67,
+       I40IW_ERR_QUEUE_DESTROYED = -68
  
  };
  #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h

index edb3a8c8267ab4893e56b8919355bfa18df5749a..16cc61720b530e5879ba509ab8ad43e714fc738c 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_type.h
@@ -479,16 +479,17 @@ struct i40iw_sc_dev {
         struct i40iw_virt_mem ieq_mem;
         struct i40iw_puda_rsrc *ieq;
  
-       struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
+       const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
  
         struct i40iw_hmc_fpm_misc hmc_fpm_misc;
         u16 qs_handle;
-       u32     debug_mask;
+       u32 debug_mask;
         u16 exception_lan_queue;
         u8 hmc_fn_id;
         bool is_pf;
         bool vchnl_up;
         u8 vf_id;
+       wait_queue_head_t vf_reqs;
         u64 cqp_cmd_stats[OP_SIZE_CQP_STAT_ARRAY];
         struct i40iw_vchnl_vf_msg_buffer vchnl_vf_msg_buf;
         u8 hw_rev;
@@ -889,8 +890,8 @@ struct i40iw_qhash_table_info {
         u32 qp_num;
         u32 dest_ip[4];
         u32 src_ip[4];
-       u32 dest_port;
-       u32 src_port;
+       u16 dest_port;
+       u16 src_port;
  };
  
  struct i40iw_local_mac_ipaddr_entry_info {
@@ -1040,6 +1041,9 @@ struct i40iw_priv_qp_ops {
         void (*qp_send_lsmm_nostag)(struct i40iw_sc_qp *, void *, u32);
         void (*qp_send_rtt)(struct i40iw_sc_qp *, bool);
         enum i40iw_status_code (*qp_post_wqe0)(struct i40iw_sc_qp *, u8);
+       enum i40iw_status_code (*iw_mr_fast_register)(struct i40iw_sc_qp *,
+                                                     struct i40iw_fast_reg_stag_info *,
+                                                     bool);
  };
  
  struct i40iw_priv_cq_ops {
@@ -1108,7 +1112,7 @@ struct i40iw_hmc_ops {
         enum i40iw_status_code (*parse_fpm_query_buf)(u64 *, struct i40iw_hmc_info *,
                                                       struct i40iw_hmc_fpm_misc *);
         enum i40iw_status_code (*configure_iw_fpm)(struct i40iw_sc_dev *, u8);
-       enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *);
+       enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *, u32 *sd);
         enum i40iw_status_code (*create_hmc_object)(struct i40iw_sc_dev *dev,
                                                     struct i40iw_hmc_create_obj_info *);
         enum i40iw_status_code (*del_hmc_object)(struct i40iw_sc_dev *dev,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c

index f78c3dc8bdb22d56e22e6a76fb34021480296fa6..e35faea88c134ca777c2914a2dcfaf87afc77a25 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_uk.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c
@@ -56,6 +56,9 @@ static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp)
  
         wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
         wqe = qp->sq_base[wqe_idx].elem;
+
+       qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE;
+
         peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size;
         wqe_0 = qp->sq_base[peek_head].elem;
         if (peek_head)
@@ -130,7 +133,10 @@ static void i40iw_qp_ring_push_db(struct i40iw_qp_uk *qp, u32 wqe_idx)
   */
  u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
                                 u32 *wqe_idx,
-                               u8 wqe_size)
+                               u8 wqe_size,
+                               u32 total_size,
+                               u64 wr_id
+                               )
  {
         u64 *wqe = NULL;
         u64 wqe_ptr;
@@ -159,6 +165,17 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
                 if (!*wqe_idx)
                         qp->swqe_polarity = !qp->swqe_polarity;
         }
+
+       if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) {
+               i40iw_nop_1(qp);
+               I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
+               if (ret_code)
+                       return NULL;
+               *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
+               if (!*wqe_idx)
+                       qp->swqe_polarity = !qp->swqe_polarity;
+       }
+
         for (i = 0; i < wqe_size / I40IW_QP_WQE_MIN_SIZE; i++) {
                 I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
                 if (ret_code)
@@ -169,8 +186,15 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
  
         peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
         wqe_0 = qp->sq_base[peek_head].elem;
-       if (peek_head & 0x3)
-               wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
+
+       if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) {
+               if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity)
+                       wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
+       }
+
+       qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id;
+       qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
+       qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size;
         return wqe;
  }
  
@@ -249,12 +273,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
         if (ret_code)
                 return ret_code;
  
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = total_size;
         set_64bit_val(wqe, 16,
                       LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
         if (!op_info->rem_addr.stag)
@@ -309,12 +330,9 @@ static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp,
         ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size);
         if (ret_code)
                 return ret_code;
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->lo_addr.len, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->lo_addr.len;
         local_fence |= info->local_fence;
  
         set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
@@ -366,13 +384,11 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
         if (ret_code)
                 return ret_code;
  
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
  
         read_fence |= info->read_fence;
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = total_size;
         set_64bit_val(wqe, 16, 0);
         header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) |
                  LS_64(info->op_type, I40IWQPSQ_OPCODE) |
@@ -427,13 +443,11 @@ static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp,
         if (ret_code)
                 return ret_code;
  
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
  
         read_fence |= info->read_fence;
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len;
         set_64bit_val(wqe, 16,
                       LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
  
@@ -507,14 +521,11 @@ static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp,
         if (ret_code)
                 return ret_code;
  
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
  
         read_fence |= info->read_fence;
-
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len;
         header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) |
             LS_64(info->op_type, I40IWQPSQ_OPCODE) |
             LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) |
@@ -574,12 +585,9 @@ static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp
         op_info = &info->op.inv_local_stag;
         local_fence = info->local_fence;
  
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = 0;
         set_64bit_val(wqe, 0, 0);
         set_64bit_val(wqe, 8,
                       LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG));
@@ -619,12 +627,9 @@ static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp,
         op_info = &info->op.bind_window;
  
         local_fence |= info->local_fence;
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-       qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = 0;
         set_64bit_val(wqe, 0, (uintptr_t)op_info->va);
         set_64bit_val(wqe, 8,
                       LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) |
@@ -760,7 +765,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
         enum i40iw_status_code ret_code2 = 0;
         bool move_cq_head = true;
         u8 polarity;
-       u8 addl_frag_cnt, addl_wqes = 0;
+       u8 addl_wqes = 0;
  
         if (cq->avoid_mem_cflct)
                 cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq);
@@ -797,6 +802,10 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
         info->is_srq = (bool)RS_64(qword3, I40IWCQ_SRQ);
  
         qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx;
+       if (!qp) {
+               ret_code = I40IW_ERR_QUEUE_DESTROYED;
+               goto exit;
+       }
         wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX);
         info->qp_handle = (i40iw_qp_handle)(unsigned long)qp;
  
@@ -827,11 +836,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
                         info->op_type = (u8)RS_64(qword3, I40IWCQ_OP);
                         sw_wqe = qp->sq_base[wqe_idx].elem;
                         get_64bit_val(sw_wqe, 24, &wqe_qword);
-                       addl_frag_cnt =
-                           (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT);
-                       i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes);
  
-                       addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE);
+                       addl_wqes = qp->sq_wrtrk_array[wqe_idx].wqe_size / I40IW_QP_WQE_MIN_SIZE;
                         I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes));
                 } else {
                         do {
@@ -843,9 +849,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
                                 get_64bit_val(sw_wqe, 24, &wqe_qword);
                                 op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE);
                                 info->op_type = op_type;
-                               addl_frag_cnt = (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT);
-                               i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes);
-                               addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE);
+                               addl_wqes = qp->sq_wrtrk_array[tail].wqe_size / I40IW_QP_WQE_MIN_SIZE;
                                 I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes));
                                 if (op_type != I40IWQP_OP_NOP) {
                                         info->wr_id = qp->sq_wrtrk_array[tail].wrid;
@@ -859,6 +863,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
  
         ret_code = 0;
  
+exit:
         if (!ret_code &&
             (info->comp_status == I40IW_COMPL_STATUS_FLUSHED))
                 if (pring && (I40IW_RING_MORE_WORK(*pring)))
@@ -893,19 +898,21 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
   * i40iw_get_wqe_shift - get shift count for maximum wqe size
   * @wqdepth: depth of wq required.
   * @sge: Maximum Scatter Gather Elements wqe
+ * @inline_data: Maximum inline data size
   * @shift: Returns the shift needed based on sge
   *
- * Shift can be used to left shift the wqe size based on sge.
- * If sge, == 1, shift =0 (wqe_size of 32 bytes), for sge=2 and 3, shift =1
- * (64 bytes wqes) and 2 otherwise (128 bytes wqe).
+ * Shift can be used to left shift the wqe size based on number of SGEs and inline data size.
+ * For at most 1 SGE and inline data <= 16, shift = 0 (wqe size of 32 bytes).
+ * Else for at most 3 SGEs and inline data <= 48, shift = 1 (wqe size of 64 bytes).
+ * Shift of 2 otherwise (wqe size of 128 bytes).
   */
-enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift)
+enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift)
  {
         u32 size;
  
         *shift = 0;
-       if (sge > 1)
-               *shift = (sge < 4) ? 1 : 2;
+       if (sge > 1 || inline_data > 16)
+               *shift = (sge < 4 && inline_data <= 48) ? 1 : 2;
  
         /* check if wqdepth is multiple of 2 or not */
  
@@ -968,11 +975,11 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
  
         if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
                 return I40IW_ERR_INVALID_FRAG_COUNT;
-       ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, &sqshift);
+       ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, info->max_inline_data, &sqshift);
         if (ret_code)
                 return ret_code;
  
-       ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, &rqshift);
+       ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, 0, &rqshift);
         if (ret_code)
                 return ret_code;
  
@@ -1097,12 +1104,9 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
         u64 header, *wqe;
         u32 wqe_idx;
  
-       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE);
+       wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, wr_id);
         if (!wqe)
                 return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-       qp->sq_wrtrk_array[wqe_idx].wrid = wr_id;
-       qp->sq_wrtrk_array[wqe_idx].wr_len = 0;
         set_64bit_val(wqe, 0, 0);
         set_64bit_val(wqe, 8, 0);
         set_64bit_val(wqe, 16, 0);
@@ -1125,7 +1129,7 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
   * @frag_cnt: number of fragments
   * @wqe_size: size of sq wqe returned
   */
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size)
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size)
  {
         switch (frag_cnt) {
         case 0:
@@ -1156,7 +1160,7 @@ enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size)
   * @frag_cnt: number of fragments
   * @wqe_size: size of rq wqe returned
   */
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size)
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size)
  {
         switch (frag_cnt) {
         case 0:
diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h

index 5cd971bb8cc7d5483e602de158b1754b50eb74b6..4627646fe8cde4976681df7cfecdfe103aab80a2 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_user.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_user.h
@@ -61,7 +61,7 @@ enum i40iw_device_capabilities_const {
         I40IW_MAX_CQ_SIZE =                     1048575,
         I40IW_MAX_AEQ_ALLOCATE_COUNT =          255,
         I40IW_DB_ID_ZERO =                      0,
-       I40IW_MAX_WQ_FRAGMENT_COUNT =           6,
+       I40IW_MAX_WQ_FRAGMENT_COUNT =           3,
         I40IW_MAX_SGE_RD =                      1,
         I40IW_MAX_OUTBOUND_MESSAGE_SIZE =       2147483647,
         I40IW_MAX_INBOUND_MESSAGE_SIZE =        2147483647,
@@ -70,8 +70,8 @@ enum i40iw_device_capabilities_const {
         I40IW_MAX_VF_FPM_ID =                   47,
         I40IW_MAX_VF_PER_PF =                   127,
         I40IW_MAX_SQ_PAYLOAD_SIZE =             2145386496,
-       I40IW_MAX_INLINE_DATA_SIZE =            112,
-       I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE =   112,
+       I40IW_MAX_INLINE_DATA_SIZE =            48,
+       I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE =   48,
         I40IW_MAX_IRD_SIZE =                    32,
         I40IW_QPCTX_ENCD_MAXIRD =               3,
         I40IW_MAX_WQ_ENTRIES =                  2048,
@@ -102,6 +102,8 @@ enum i40iw_device_capabilities_const {
  
  #define I40IW_STAG_INDEX_FROM_STAG(stag)    (((stag) && 0xFFFFFF00) >> 8)
  
+#define        I40IW_MAX_MR_SIZE       0x10000000000L
+
  struct i40iw_qp_uk;
  struct i40iw_cq_uk;
  struct i40iw_srq_uk;
@@ -198,7 +200,7 @@ enum i40iw_completion_notify {
  
  struct i40iw_post_send {
         i40iw_sgl sg_list;
-       u8 num_sges;
+       u32 num_sges;
  };
  
  struct i40iw_post_inline_send {
@@ -220,7 +222,7 @@ struct i40iw_post_inline_send_w_inv {
  
  struct i40iw_rdma_write {
         i40iw_sgl lo_sg_list;
-       u8 num_lo_sges;
+       u32 num_lo_sges;
         struct i40iw_sge rem_addr;
  };
  
@@ -345,7 +347,9 @@ struct i40iw_dev_uk {
  
  struct i40iw_sq_uk_wr_trk_info {
         u64 wrid;
-       u64 wr_len;
+       u32 wr_len;
+       u8 wqe_size;
+       u8 reserved[3];
  };
  
  struct i40iw_qp_quanta {
@@ -367,6 +371,8 @@ struct i40iw_qp_uk {
         u32 qp_id;
         u32 sq_size;
         u32 rq_size;
+       u32 max_sq_frag_cnt;
+       u32 max_rq_frag_cnt;
         struct i40iw_qp_uk_ops ops;
         bool use_srq;
         u8 swqe_polarity;
@@ -374,8 +380,6 @@ struct i40iw_qp_uk {
         u8 rwqe_polarity;
         u8 rq_wqe_size;
         u8 rq_wqe_size_multiplier;
-       u8 max_sq_frag_cnt;
-       u8 max_rq_frag_cnt;
         bool deferred_flag;
  };
  
@@ -404,8 +408,9 @@ struct i40iw_qp_uk_init_info {
         u32 qp_id;
         u32 sq_size;
         u32 rq_size;
-       u8 max_sq_frag_cnt;
-       u8 max_rq_frag_cnt;
+       u32 max_sq_frag_cnt;
+       u32 max_rq_frag_cnt;
+       u32 max_inline_data;
  
  };
  
@@ -422,7 +427,10 @@ void i40iw_device_init_uk(struct i40iw_dev_uk *dev);
  
  void i40iw_qp_post_wr(struct i40iw_qp_uk *qp);
  u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx,
-                               u8 wqe_size);
+                               u8 wqe_size,
+                               u32 total_size,
+                               u64 wr_id
+                               );
  u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx);
  u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx);
  
@@ -434,9 +442,9 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
  void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq);
  enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 wr_id,
                                  bool signaled, bool post_sq);
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size);
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size);
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size);
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size);
  enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size,
                                                          u8 *wqe_size);
-enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift);
+enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift);
  #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c

index 1ceec81bd8eb3ff95428d36a1eeefc86295c576b..0e8db0a3514153d7659977ff31d4dbaf45a103d5 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -59,7 +59,7 @@
   * @action: modify, delete or add
   */
  int i40iw_arp_table(struct i40iw_device *iwdev,
-                   __be32 *ip_addr,
+                   u32 *ip_addr,
                     bool ipv4,
                     u8 *mac_addr,
                     u32 action)
@@ -152,7 +152,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
         struct net_device *upper_dev;
         struct i40iw_device *iwdev;
         struct i40iw_handler *hdl;
-       __be32 local_ipaddr;
+       u32 local_ipaddr;
  
         hdl = i40iw_find_netdev(event_netdev);
         if (!hdl)
@@ -167,11 +167,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
         switch (event) {
         case NETDEV_DOWN:
                 if (upper_dev)
-                       local_ipaddr =
-                               ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+                       local_ipaddr = ntohl(
+                               ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address);
                 else
-                       local_ipaddr = ifa->ifa_address;
-               local_ipaddr = ntohl(local_ipaddr);
+                       local_ipaddr = ntohl(ifa->ifa_address);
                 i40iw_manage_arp_cache(iwdev,
                                        netdev->dev_addr,
                                        &local_ipaddr,
@@ -180,11 +179,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
                 return NOTIFY_OK;
         case NETDEV_UP:
                 if (upper_dev)
-                       local_ipaddr =
-                               ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+                       local_ipaddr = ntohl(
+                               ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address);
                 else
-                       local_ipaddr = ifa->ifa_address;
-               local_ipaddr = ntohl(local_ipaddr);
+                       local_ipaddr = ntohl(ifa->ifa_address);
                 i40iw_manage_arp_cache(iwdev,
                                        netdev->dev_addr,
                                        &local_ipaddr,
@@ -194,12 +192,11 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
         case NETDEV_CHANGEADDR:
                 /* Add the address to the IP table */
                 if (upper_dev)
-                       local_ipaddr =
-                               ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+                       local_ipaddr = ntohl(
+                               ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address);
                 else
-                       local_ipaddr = ifa->ifa_address;
+                       local_ipaddr = ntohl(ifa->ifa_address);
  
-               local_ipaddr = ntohl(local_ipaddr);
                 i40iw_manage_arp_cache(iwdev,
                                        netdev->dev_addr,
                                        &local_ipaddr,
@@ -227,7 +224,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier,
         struct net_device *netdev;
         struct i40iw_device *iwdev;
         struct i40iw_handler *hdl;
-       __be32 local_ipaddr6[4];
+       u32 local_ipaddr6[4];
  
         hdl = i40iw_find_netdev(event_netdev);
         if (!hdl)
@@ -506,14 +503,19 @@ void i40iw_rem_ref(struct ib_qp *ibqp)
         struct cqp_commands_info *cqp_info;
         struct i40iw_device *iwdev;
         u32 qp_num;
+       unsigned long flags;
  
         iwqp = to_iwqp(ibqp);
-       if (!atomic_dec_and_test(&iwqp->refcount))
+       iwdev = iwqp->iwdev;
+       spin_lock_irqsave(&iwdev->qptable_lock, flags);
+       if (!atomic_dec_and_test(&iwqp->refcount)) {
+               spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
                 return;
+       }
  
-       iwdev = iwqp->iwdev;
         qp_num = iwqp->ibqp.qp_num;
         iwdev->qp_table[qp_num] = NULL;
+       spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
         cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false);
         if (!cqp_request)
                 return;
@@ -985,21 +987,24 @@ enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev,
  enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev)
  {
         struct i40iw_device *iwdev = dev->back_dev;
-       enum i40iw_status_code err_code = 0;
         int timeout_ret;
  
         i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s[%u] dev %p, iwdev %p\n",
                     __func__, __LINE__, dev, iwdev);
-       atomic_add(2, &iwdev->vchnl_msgs);
+
+       atomic_set(&iwdev->vchnl_msgs, 2);
         timeout_ret = wait_event_timeout(iwdev->vchnl_waitq,
                                          (atomic_read(&iwdev->vchnl_msgs) == 1),
                                          I40IW_VCHNL_EVENT_TIMEOUT);
         atomic_dec(&iwdev->vchnl_msgs);
         if (!timeout_ret) {
                 i40iw_pr_err("virt channel completion timeout = 0x%x\n", timeout_ret);
-               err_code = I40IW_ERR_TIMEOUT;
+               atomic_set(&iwdev->vchnl_msgs, 0);
+               dev->vchnl_up = false;
+               return I40IW_ERR_TIMEOUT;
         }
-       return err_code;
+       wake_up(&dev->vf_reqs);
+       return 0;
  }
  
  /**
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c

index 1fe3b84a06e433a9b434f77cbab8f4b05870d08c..4a740f7a0519bcac97d35b86844ae844727daeb4 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -63,8 +63,8 @@ static int i40iw_query_device(struct ib_device *ibdev,
         ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr);
         props->fw_ver = I40IW_FW_VERSION;
         props->device_cap_flags = iwdev->device_cap_flags;
-       props->vendor_id = iwdev->vendor_id;
-       props->vendor_part_id = iwdev->vendor_part_id;
+       props->vendor_id = iwdev->ldev->pcidev->vendor;
+       props->vendor_part_id = iwdev->ldev->pcidev->device;
         props->hw_ver = (u32)iwdev->sc_dev.hw_rev;
         props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
         props->max_qp = iwdev->max_qp;
@@ -74,7 +74,7 @@ static int i40iw_query_device(struct ib_device *ibdev,
         props->max_cqe = iwdev->max_cqe;
         props->max_mr = iwdev->max_mr;
         props->max_pd = iwdev->max_pd;
-       props->max_sge_rd = 1;
+       props->max_sge_rd = I40IW_MAX_SGE_RD;
         props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE;
         props->max_qp_init_rd_atom = props->max_qp_rd_atom;
         props->atomic_cap = IB_ATOMIC_NONE;
@@ -120,7 +120,7 @@ static int i40iw_query_port(struct ib_device *ibdev,
         props->pkey_tbl_len = 1;
         props->active_width = IB_WIDTH_4X;
         props->active_speed = 1;
-       props->max_msg_sz = 0x80000000;
+       props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
         return 0;
  }
  
@@ -437,7 +437,6 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev,
         kfree(iwqp->kqp.wrid_mem);
         iwqp->kqp.wrid_mem = NULL;
         kfree(iwqp->allocated_buffer);
-       iwqp->allocated_buffer = NULL;
  }
  
  /**
@@ -521,14 +520,12 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev,
         enum i40iw_status_code status;
         struct i40iw_qp_uk_init_info *ukinfo = &info->qp_uk_init_info;
  
-       ukinfo->max_sq_frag_cnt = I40IW_MAX_WQ_FRAGMENT_COUNT;
-
         sq_size = i40iw_qp_roundup(ukinfo->sq_size + 1);
         rq_size = i40iw_qp_roundup(ukinfo->rq_size + 1);
  
-       status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, &sqshift);
+       status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, ukinfo->max_inline_data, &sqshift);
         if (!status)
-               status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, &rqshift);
+               status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift);
  
         if (status)
                 return -ENOSYS;
@@ -609,6 +606,9 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
         if (init_attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE)
                 init_attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE;
  
+       if (init_attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT)
+               init_attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
+
         memset(&init_info, 0, sizeof(init_info));
  
         sq_size = init_attr->cap.max_send_wr;
@@ -618,6 +618,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
         init_info.qp_uk_init_info.rq_size = rq_size;
         init_info.qp_uk_init_info.max_sq_frag_cnt = init_attr->cap.max_send_sge;
         init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge;
+       init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data;
  
         mem = kzalloc(sizeof(*iwqp), GFP_KERNEL);
         if (!mem)
@@ -722,8 +723,10 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
         iwarp_info = &iwqp->iwarp_info;
         iwarp_info->rd_enable = true;
         iwarp_info->wr_rdresp_en = true;
-       if (!iwqp->user_mode)
+       if (!iwqp->user_mode) {
+               iwarp_info->fast_reg_en = true;
                 iwarp_info->priv_mode_en = true;
+       }
         iwarp_info->ddp_ver = 1;
         iwarp_info->rdmap_ver = 1;
  
@@ -784,6 +787,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
                         return ERR_PTR(err_code);
                 }
         }
+       init_completion(&iwqp->sq_drained);
+       init_completion(&iwqp->rq_drained);
  
         return &iwqp->ibqp;
  error:
@@ -1443,6 +1448,166 @@ static int i40iw_handle_q_mem(struct i40iw_device *iwdev,
         return err;
  }
  
+/**
+ * i40iw_hw_alloc_stag - cqp command to allocate stag (0 on success, -ENOMEM on failure)
+ * @iwdev: iwarp device
+ * @iwmr: iwarp mr pointer; supplies the stag, the pd and the length
+ */
+static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr)
+{
+       struct i40iw_allocate_stag_info *info;
+       struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd);
+       enum i40iw_status_code status;
+       int err = 0;
+       struct i40iw_cqp_request *cqp_request;
+       struct cqp_commands_info *cqp_info;
+
+       cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
+       if (!cqp_request)
+               return -ENOMEM;
+
+       cqp_info = &cqp_request->info;
+       info = &cqp_info->in.u.alloc_stag.info;
+       memset(info, 0, sizeof(*info)); /* every field not set below stays zero */
+       info->page_size = PAGE_SIZE;
+       info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT;
+       info->pd_id = iwpd->sc_pd.pd_id;
+       info->total_len = iwmr->length;
+       cqp_info->cqp_cmd = OP_ALLOC_STAG;
+       cqp_info->post_sq = 1;
+       cqp_info->in.u.alloc_stag.dev = &iwdev->sc_dev;
+       cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request;
+
+       status = i40iw_handle_cqp_op(iwdev, cqp_request);
+       if (status) {
+               err = -ENOMEM; /* any cqp failure is reported as -ENOMEM */
+               i40iw_pr_err("CQP-OP MR Reg fail\n");
+       }
+       return err;
+}
+
+/**
+ * i40iw_alloc_mr - register stag for fast memory registration
+ * @pd: ibpd pointer
+ * @mr_type: memory type for stag registration (not examined here)
+ * @max_num_sg: max number of pages; becomes the page count of the mr
+ */
+static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd,
+                                   enum ib_mr_type mr_type,
+                                   u32 max_num_sg)
+{
+       struct i40iw_pd *iwpd = to_iwpd(pd);
+       struct i40iw_device *iwdev = to_iwdev(pd->device);
+       struct i40iw_pble_alloc *palloc;
+       struct i40iw_pbl *iwpbl;
+       struct i40iw_mr *iwmr;
+       enum i40iw_status_code status;
+       u32 stag;
+       int err_code = -ENOMEM;
+
+       iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
+       if (!iwmr)
+               return ERR_PTR(-ENOMEM);
+
+       stag = i40iw_create_stag(iwdev);
+       if (!stag) {
+               err_code = -EOVERFLOW;
+               goto err;
+       }
+       iwmr->stag = stag;
+       iwmr->ibmr.rkey = stag;
+       iwmr->ibmr.lkey = stag;
+       iwmr->ibmr.pd = pd;
+       iwmr->ibmr.device = pd->device;
+       iwpbl = &iwmr->iwpbl;
+       iwpbl->iwmr = iwmr;
+       iwmr->type = IW_MEMREG_TYPE_MEM;
+       palloc = &iwpbl->pble_alloc;
+       iwmr->page_cnt = max_num_sg;
+       mutex_lock(&iwdev->pbl_mutex);
+       status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt);
+       mutex_unlock(&iwdev->pbl_mutex);
+       if (status) /* non-zero status: pble allocation failed, nothing to free but the stag */
+               goto err1;
+
+       if (palloc->level != I40IW_LEVEL_1) /* err_code is still -ENOMEM on this path */
+               goto err2;
+       err_code = i40iw_hw_alloc_stag(iwdev, iwmr);
+       if (err_code)
+               goto err2;
+       iwpbl->pbl_allocated = true;
+       i40iw_add_pdusecount(iwpd);
+       return &iwmr->ibmr;
+err2:
+       i40iw_free_pble(iwdev->pble_rsrc, palloc);
+err1:
+       i40iw_free_stag(iwdev, stag);
+err:
+       kfree(iwmr);
+       return ERR_PTR(err_code);
+}
+
+/**
+ * i40iw_set_page - append one page address to the mr's level-1 pbl list
+ * @ibmr: ib mem to access iwarp mr pointer
+ * @addr: page dma address for pbl list; returns 0, or -ENOMEM when the list is full
+ */
+static int i40iw_set_page(struct ib_mr *ibmr, u64 addr)
+{
+       struct i40iw_mr *iwmr = to_iwmr(ibmr);
+       struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
+       struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
+       u64 *pbl;
+
+       if (unlikely(iwmr->npages == iwmr->page_cnt))
+               return -ENOMEM; /* list already holds page_cnt entries */
+
+       pbl = (u64 *)palloc->level1.addr;
+       pbl[iwmr->npages++] = cpu_to_le64(addr); /* stored little-endian, then advance the fill index */
+       return 0;
+}
+
+/**
+ * i40iw_map_mr_sg - map of sg list for fmr
+ * @ibmr: ib mem to access iwarp mr pointer
+ * @sg: scatter gather list for fmr
+ * @sg_nents: number of sg pages (sg_offset is handed to ib_sg_to_pages() unchanged)
+ */
+static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                          int sg_nents, unsigned int *sg_offset)
+{
+       struct i40iw_mr *iwmr = to_iwmr(ibmr);
+
+       iwmr->npages = 0; /* restart the page list; i40iw_set_page() fills it from index 0 */
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page);
+}
+
+/**
+ * i40iw_drain_sq - drain the send queue; blocks only if the sq ring reports more work
+ * @ibqp: ib qp pointer
+ */
+static void i40iw_drain_sq(struct ib_qp *ibqp)
+{
+       struct i40iw_qp *iwqp = to_iwqp(ibqp);
+       struct i40iw_sc_qp *qp = &iwqp->sc_qp;
+
+       if (I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring))
+               wait_for_completion(&iwqp->sq_drained); /* sq_drained is completed from i40iw_poll_cq() */
+}
+
+/**
+ * i40iw_drain_rq - drain the receive queue; blocks only if the rq ring reports more work
+ * @ibqp: ib qp pointer
+ */
+static void i40iw_drain_rq(struct ib_qp *ibqp)
+{
+       struct i40iw_qp *iwqp = to_iwqp(ibqp);
+       struct i40iw_sc_qp *qp = &iwqp->sc_qp;
+
+       if (I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring))
+               wait_for_completion(&iwqp->rq_drained); /* rq_drained is completed from i40iw_poll_cq() */
+}
+
  /**
   * i40iw_hwreg_mr - send cqp command for memory registration
   * @iwdev: iwarp device
@@ -1526,14 +1691,16 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
         struct i40iw_mr *iwmr;
         struct ib_umem *region;
         struct i40iw_mem_reg_req req;
-       u32 pbl_depth = 0;
+       u64 pbl_depth = 0;
         u32 stag = 0;
         u16 access;
-       u32 region_length;
+       u64 region_length;
         bool use_pbles = false;
         unsigned long flags;
         int err = -ENOSYS;
  
+       if (length > I40IW_MAX_MR_SIZE)
+               return ERR_PTR(-EINVAL);
         region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
         if (IS_ERR(region))
                 return (struct ib_mr *)region;
@@ -1564,7 +1731,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
         palloc = &iwpbl->pble_alloc;
  
         iwmr->type = req.reg_type;
-       iwmr->page_cnt = pbl_depth;
+       iwmr->page_cnt = (u32)pbl_depth;
  
         switch (req.reg_type) {
         case IW_MEMREG_TYPE_QP:
@@ -1881,12 +2048,14 @@ static int i40iw_post_send(struct ib_qp *ibqp,
         enum i40iw_status_code ret;
         int err = 0;
         unsigned long flags;
+       bool inv_stag;
  
         iwqp = (struct i40iw_qp *)ibqp;
         ukqp = &iwqp->sc_qp.qp_uk;
  
         spin_lock_irqsave(&iwqp->lock, flags);
         while (ib_wr) {
+               inv_stag = false;
                 memset(&info, 0, sizeof(info));
                 info.wr_id = (u64)(ib_wr->wr_id);
                 if ((ib_wr->send_flags & IB_SEND_SIGNALED) || iwqp->sig_all)
@@ -1896,19 +2065,28 @@ static int i40iw_post_send(struct ib_qp *ibqp,
  
                 switch (ib_wr->opcode) {
                 case IB_WR_SEND:
-                       if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                               info.op_type = I40IW_OP_TYPE_SEND_SOL;
-                       else
-                               info.op_type = I40IW_OP_TYPE_SEND;
+                       /* fall-through */
+               case IB_WR_SEND_WITH_INV:
+                       if (ib_wr->opcode == IB_WR_SEND) {
+                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
+                                       info.op_type = I40IW_OP_TYPE_SEND_SOL;
+                               else
+                                       info.op_type = I40IW_OP_TYPE_SEND;
+                       } else {
+                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
+                                       info.op_type = I40IW_OP_TYPE_SEND_SOL_INV;
+                               else
+                                       info.op_type = I40IW_OP_TYPE_SEND_INV;
+                       }
  
                         if (ib_wr->send_flags & IB_SEND_INLINE) {
                                 info.op.inline_send.data = (void *)(unsigned long)ib_wr->sg_list[0].addr;
                                 info.op.inline_send.len = ib_wr->sg_list[0].length;
-                               ret = ukqp->ops.iw_inline_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false);
+                               ret = ukqp->ops.iw_inline_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
                         } else {
                                 info.op.send.num_sges = ib_wr->num_sge;
                                 info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list;
-                               ret = ukqp->ops.iw_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false);
+                               ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
                         }
  
                         if (ret)
@@ -1936,7 +2114,14 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                         if (ret)
                                 err = -EIO;
                         break;
+               case IB_WR_RDMA_READ_WITH_INV:
+                       inv_stag = true;
+                       /* fall-through*/
                 case IB_WR_RDMA_READ:
+                       if (ib_wr->num_sge > I40IW_MAX_SGE_RD) {
+                               err = -EINVAL;
+                               break;
+                       }
                         info.op_type = I40IW_OP_TYPE_RDMA_READ;
                         info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
                         info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey;
@@ -1944,10 +2129,47 @@ static int i40iw_post_send(struct ib_qp *ibqp,
                         info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr;
                         info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey;
                         info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length;
-                       ret = ukqp->ops.iw_rdma_read(ukqp, &info, false, false);
+                       ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false);
                         if (ret)
                                 err = -EIO;
                         break;
+               case IB_WR_LOCAL_INV:
+                       info.op_type = I40IW_OP_TYPE_INV_STAG;
+                       info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
+                       ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true);
+                       if (ret)
+                               err = -EIO;
+                       break;
+               case IB_WR_REG_MR:
+               {
+                       struct i40iw_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr);
+                       int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size);
+                       int flags = reg_wr(ib_wr)->access;
+                       struct i40iw_pble_alloc *palloc = &iwmr->iwpbl.pble_alloc;
+                       struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev;
+                       struct i40iw_fast_reg_stag_info info;
+
+                       info.access_rights = I40IW_ACCESS_FLAGS_LOCALREAD;
+                       info.access_rights |= i40iw_get_user_access(flags);
+                       info.stag_key = reg_wr(ib_wr)->key & 0xff;
+                       info.stag_idx = reg_wr(ib_wr)->key >> 8;
+                       info.wr_id = ib_wr->wr_id;
+
+                       info.addr_type = I40IW_ADDR_TYPE_VA_BASED;
+                       info.va = (void *)(uintptr_t)iwmr->ibmr.iova;
+                       info.total_len = iwmr->ibmr.length;
+                       info.first_pm_pbl_index = palloc->level1.idx;
+                       info.local_fence = ib_wr->send_flags & IB_SEND_FENCE;
+                       info.signaled = ib_wr->send_flags & IB_SEND_SIGNALED;
+
+                       if (page_shift == 21)
+                               info.page_size = 1; /* 2M page */
+
+                       ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true);
+                       if (ret)
+                               err = -EIO;
+                       break;
+               }
                 default:
                         err = -EINVAL;
                         i40iw_pr_err(" upost_send bad opcode = 0x%x\n",
@@ -2027,6 +2249,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
         enum i40iw_status_code ret;
         struct i40iw_cq_uk *ukcq;
         struct i40iw_sc_qp *qp;
+       struct i40iw_qp *iwqp;
         unsigned long flags;
  
         iwcq = (struct i40iw_cq *)ibcq;
@@ -2037,6 +2260,8 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
                 ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true);
                 if (ret == I40IW_ERR_QUEUE_EMPTY) {
                         break;
+               } else if (ret == I40IW_ERR_QUEUE_DESTROYED) {
+                       continue;
                 } else if (ret) {
                         if (!cqe_count)
                                 cqe_count = -1;
@@ -2044,10 +2269,12 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
                 }
                 entry->wc_flags = 0;
                 entry->wr_id = cq_poll_info.wr_id;
-               if (!cq_poll_info.error)
-                       entry->status = IB_WC_SUCCESS;
-               else
+               if (cq_poll_info.error) {
                         entry->status = IB_WC_WR_FLUSH_ERR;
+                       entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err;
+               } else {
+                       entry->status = IB_WC_SUCCESS;
+               }
  
                 switch (cq_poll_info.op_type) {
                 case I40IW_OP_TYPE_RDMA_WRITE:
@@ -2071,12 +2298,17 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
                         break;
                 }
  
-               entry->vendor_err =
-                   cq_poll_info.major_err << 16 | cq_poll_info.minor_err;
                 entry->ex.imm_data = 0;
                 qp = (struct i40iw_sc_qp *)cq_poll_info.qp_handle;
                 entry->qp = (struct ib_qp *)qp->back_qp;
                 entry->src_qp = cq_poll_info.qp_id;
+               iwqp = (struct i40iw_qp *)qp->back_qp;
+               if (iwqp->iwarp_state > I40IW_QP_STATE_RTS) {
+                       if (!I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring))
+                               complete(&iwqp->sq_drained);
+                       if (!I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring))
+                               complete(&iwqp->rq_drained);
+               }
                 entry->byte_len = cq_poll_info.bytes_xfered;
                 entry++;
                 cqe_count++;
@@ -2143,7 +2375,6 @@ static int i40iw_get_protocol_stats(struct ib_device *ibdev,
         struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats;
         struct timespec curr_time;
         static struct timespec last_rd_time = {0, 0};
-       enum i40iw_status_code status = 0;
         unsigned long flags;
  
         curr_time = current_kernel_time();
@@ -2156,11 +2387,8 @@ static int i40iw_get_protocol_stats(struct ib_device *ibdev,
                 spin_unlock_irqrestore(&devstat->stats_lock, flags);
         } else {
                 if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1)
-                       status = i40iw_vchnl_vf_get_pe_stats(dev,
-                                                            &devstat->hw_stats);
-
-               if (status)
-                       return -ENOSYS;
+                       if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
+                               return -ENOSYS;
         }
  
         stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] +
@@ -2327,6 +2555,10 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
         iwibdev->ibdev.query_device = i40iw_query_device;
         iwibdev->ibdev.create_ah = i40iw_create_ah;
         iwibdev->ibdev.destroy_ah = i40iw_destroy_ah;
+       iwibdev->ibdev.drain_sq = i40iw_drain_sq;
+       iwibdev->ibdev.drain_rq = i40iw_drain_rq;
+       iwibdev->ibdev.alloc_mr = i40iw_alloc_mr;
+       iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg;
         iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL);
         if (!iwibdev->ibdev.iwcm) {
                 ib_dealloc_device(&iwibdev->ibdev);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h

index 1101f77080e62fd98aff799c8c90c19f175d953b..0069be8a5a38b77ded23699d9adf6e979435ffa6 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
@@ -92,6 +92,7 @@ struct i40iw_mr {
         struct ib_umem *region;
         u16 type;
         u32 page_cnt;
+       u32 npages;
         u32 stag;
         u64 length;
         u64 pgaddrmem[MAX_SAVE_PAGE_ADDRS];
@@ -169,5 +170,7 @@ struct i40iw_qp {
         struct i40iw_pbl *iwpbl;
         struct i40iw_dma_mem q2_ctx_mem;
         struct i40iw_dma_mem ietf_mem;
+       struct completion sq_drained;
+       struct completion rq_drained;
  };
  #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c

index cb0f18340e144bb7b4e426152f91ef2bda3d817f..e33d4810965c853a3e132114ebbefaafaaf62f0c 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_vf.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c
@@ -80,6 +80,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
         return 0;
  }
  
-struct i40iw_vf_cqp_ops iw_vf_cqp_ops = {
+const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = {
         i40iw_manage_vf_pble_bp
  };
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h

index f649f3a62e13adde83a622c094b4718318bc8a89..4359559ece9c1edebef59ebbac1884246176d51a 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_vf.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h
@@ -57,6 +57,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
                                                u64 scratch,
                                                bool post_sq);
  
-extern struct i40iw_vf_cqp_ops iw_vf_cqp_ops;
+extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops;
  
  #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c

index 6b68f7890b76252180c85a94c58bcad80cdaf766..3041003c94d2c53ed52beb93dca53440577ecfde 100644 (file)
--- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
@@ -254,7 +254,7 @@ static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev,
  static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev,
                                             u32 vf_id,
                                             struct i40iw_virtchnl_op_buf *vchnl_msg,
-                                           struct i40iw_dev_hw_stats hw_stats)
+                                           struct i40iw_dev_hw_stats *hw_stats)
  {
         enum i40iw_status_code ret_code;
         u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1];
@@ -264,7 +264,7 @@ static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev,
         vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx;
         vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer);
         vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS;
-       *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = hw_stats;
+       *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats;
         ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer));
         if (ret_code)
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
@@ -437,11 +437,9 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev,
                         vchnl_pf_send_get_ver_resp(dev, vf_id, vchnl_msg);
                 return I40IW_SUCCESS;
         }
-       for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT;
-            iw_vf_idx++) {
+       for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) {
                 if (!dev->vf_dev[iw_vf_idx]) {
-                       if (first_avail_iw_vf ==
-                           I40IW_MAX_PE_ENABLED_VF_COUNT)
+                       if (first_avail_iw_vf == I40IW_MAX_PE_ENABLED_VF_COUNT)
                                 first_avail_iw_vf = iw_vf_idx;
                         continue;
                 }
@@ -541,7 +539,7 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev,
                 devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats);
                 spin_unlock_irqrestore(&dev->dev_pestat.stats_lock, flags);
                 vf_dev->msg_count--;
-               vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, devstat->hw_stats);
+               vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &devstat->hw_stats);
                 break;
         default:
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
@@ -596,23 +594,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev,
         struct i40iw_virtchnl_req vchnl_req;
         enum i40iw_status_code ret_code;
  
+       if (!i40iw_vf_clear_to_send(dev))
+               return I40IW_ERR_TIMEOUT;
         memset(&vchnl_req, 0, sizeof(vchnl_req));
         vchnl_req.dev = dev;
         vchnl_req.parm = vchnl_ver;
         vchnl_req.parm_len = sizeof(*vchnl_ver);
         vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
         ret_code = vchnl_vf_send_get_ver_req(dev, &vchnl_req);
-       if (!ret_code) {
-               ret_code = i40iw_vf_wait_vchnl_resp(dev);
-               if (!ret_code)
-                       ret_code = vchnl_req.ret_code;
-               else
-                       dev->vchnl_up = false;
-       } else {
+       if (ret_code) {
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
                             "%s Send message failed 0x%0x\n", __func__, ret_code);
+               return ret_code;
         }
-       return ret_code;
+       ret_code = i40iw_vf_wait_vchnl_resp(dev);
+       if (ret_code)
+               return ret_code;
+       else
+               return vchnl_req.ret_code;
  }
  
  /**
@@ -626,23 +626,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev,
         struct i40iw_virtchnl_req vchnl_req;
         enum i40iw_status_code ret_code;
  
+       if (!i40iw_vf_clear_to_send(dev))
+               return I40IW_ERR_TIMEOUT;
         memset(&vchnl_req, 0, sizeof(vchnl_req));
         vchnl_req.dev = dev;
         vchnl_req.parm = hmc_fcn;
         vchnl_req.parm_len = sizeof(*hmc_fcn);
         vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
         ret_code = vchnl_vf_send_get_hmc_fcn_req(dev, &vchnl_req);
-       if (!ret_code) {
-               ret_code = i40iw_vf_wait_vchnl_resp(dev);
-               if (!ret_code)
-                       ret_code = vchnl_req.ret_code;
-               else
-                       dev->vchnl_up = false;
-       } else {
+       if (ret_code) {
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
                             "%s Send message failed 0x%0x\n", __func__, ret_code);
+               return ret_code;
         }
-       return ret_code;
+       ret_code = i40iw_vf_wait_vchnl_resp(dev);
+       if (ret_code)
+               return ret_code;
+       else
+               return vchnl_req.ret_code;
  }
  
  /**
@@ -660,25 +662,27 @@ enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev,
         struct i40iw_virtchnl_req vchnl_req;
         enum i40iw_status_code ret_code;
  
+       if (!i40iw_vf_clear_to_send(dev))
+               return I40IW_ERR_TIMEOUT;
         memset(&vchnl_req, 0, sizeof(vchnl_req));
         vchnl_req.dev = dev;
         vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
         ret_code = vchnl_vf_send_add_hmc_objs_req(dev,
                                                   &vchnl_req,
                                                   rsrc_type,
                                                   start_index,
                                                   rsrc_count);
-       if (!ret_code) {
-               ret_code = i40iw_vf_wait_vchnl_resp(dev);
-               if (!ret_code)
-                       ret_code = vchnl_req.ret_code;
-               else
-                       dev->vchnl_up = false;
-       } else {
+       if (ret_code) {
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
                             "%s Send message failed 0x%0x\n", __func__, ret_code);
+               return ret_code;
         }
-       return ret_code;
+       ret_code = i40iw_vf_wait_vchnl_resp(dev);
+       if (ret_code)
+               return ret_code;
+       else
+               return vchnl_req.ret_code;
  }
  
  /**
@@ -696,25 +700,27 @@ enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev,
         struct i40iw_virtchnl_req vchnl_req;
         enum i40iw_status_code ret_code;
  
+       if (!i40iw_vf_clear_to_send(dev))
+               return I40IW_ERR_TIMEOUT;
         memset(&vchnl_req, 0, sizeof(vchnl_req));
         vchnl_req.dev = dev;
         vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
         ret_code = vchnl_vf_send_del_hmc_objs_req(dev,
                                                   &vchnl_req,
                                                   rsrc_type,
                                                   start_index,
                                                   rsrc_count);
-       if (!ret_code) {
-               ret_code = i40iw_vf_wait_vchnl_resp(dev);
-               if (!ret_code)
-                       ret_code = vchnl_req.ret_code;
-               else
-                       dev->vchnl_up = false;
-       } else {
+       if (ret_code) {
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
                             "%s Send message failed 0x%0x\n", __func__, ret_code);
+               return ret_code;
         }
-       return ret_code;
+       ret_code = i40iw_vf_wait_vchnl_resp(dev);
+       if (ret_code)
+               return ret_code;
+       else
+               return vchnl_req.ret_code;
  }
  
  /**
@@ -728,21 +734,23 @@ enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev,
         struct i40iw_virtchnl_req  vchnl_req;
         enum i40iw_status_code ret_code;
  
+       if (!i40iw_vf_clear_to_send(dev))
+               return I40IW_ERR_TIMEOUT;
         memset(&vchnl_req, 0, sizeof(vchnl_req));
         vchnl_req.dev = dev;
         vchnl_req.parm = hw_stats;
         vchnl_req.parm_len = sizeof(*hw_stats);
         vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
         ret_code = vchnl_vf_send_get_pe_stats_req(dev, &vchnl_req);
-       if (!ret_code) {
-               ret_code = i40iw_vf_wait_vchnl_resp(dev);
-               if (!ret_code)
-                       ret_code = vchnl_req.ret_code;
-               else
-                       dev->vchnl_up = false;
-       } else {
+       if (ret_code) {
                 i40iw_debug(dev, I40IW_DEBUG_VIRT,
                             "%s Send message failed 0x%0x\n", __func__, ret_code);
+               return ret_code;
         }
-       return ret_code;
+       ret_code = i40iw_vf_wait_vchnl_resp(dev);
+       if (ret_code)
+               return ret_code;
+       else
+               return vchnl_req.ret_code;
  }
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c

index 99451d887266d2bffee04e2a71347f69b4883305..8f7ad07915b07e27ed7f512eabc07757269332c8 100644 (file)
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -96,7 +96,7 @@ struct ib_sa_mcmember_data {
         u8              scope_join_state;
         u8              proxy_join;
         u8              reserved[2];
-};
+} __packed __aligned(4);
  
  struct mcast_group {
         struct ib_sa_mcmember_data rec;
@@ -747,14 +747,11 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx
                                                        __be64 tid,
                                                        union ib_gid *new_mgid)
  {
-       struct mcast_group *group = NULL, *cur_group;
+       struct mcast_group *group = NULL, *cur_group, *n;
         struct mcast_req *req;
-       struct list_head *pos;
-       struct list_head *n;
  
         mutex_lock(&ctx->mcg_table_lock);
-       list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
-               group = list_entry(pos, struct mcast_group, mgid0_list);
+       list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) {
                 mutex_lock(&group->lock);
                 if (group->last_req_tid == tid) {
                         if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h

index 1eca01cebe51f6cc044a8e3430d25bfe114f13e0..6c5ac5d8f32ffcfafbb4dd7a4b9202b4049b95b5 100644 (file)
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -717,9 +717,8 @@ int mlx4_ib_dealloc_mw(struct ib_mw *mw);
  struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
                                enum ib_mr_type mr_type,
                                u32 max_num_sg);
-int mlx4_ib_map_mr_sg(struct ib_mr *ibmr,
-                     struct scatterlist *sg,
-                     int sg_nents);
+int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                     unsigned int *sg_offset);
  int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
  int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
  struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c

index ce0b5aa8eb9b3a5695f9a9ae02d3f153f7dcc145..631272172a0b9793c0074e6ab9c00a4b9f2e4665 100644 (file)
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -528,9 +528,8 @@ static int mlx4_set_page(struct ib_mr *ibmr, u64 addr)
         return 0;
  }
  
-int mlx4_ib_map_mr_sg(struct ib_mr *ibmr,
-                     struct scatterlist *sg,
-                     int sg_nents)
+int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                     unsigned int *sg_offset)
  {
         struct mlx4_ib_mr *mr = to_mmr(ibmr);
         int rc;
@@ -541,7 +540,7 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr,
                                    sizeof(u64) * mr->max_pages,
                                    DMA_TO_DEVICE);
  
-       rc = ib_sg_to_pages(ibmr, sg, sg_nents, mlx4_set_page);
+       rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page);
  
         ib_dma_sync_single_for_device(ibmr->device, mr->page_map,
                                       sizeof(u64) * mr->max_pages,
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c

index a00ba4418de9ba79875c5b17ce9192c3fff6ba9f..dabcc65bd65e3b6a93adb47b9b53bbd67d08e15c 100644 (file)
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -879,7 +879,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
  
         mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
         cq->mcq.irqn = irqn;
-       cq->mcq.comp  = mlx5_ib_cq_comp;
+       if (context)
+               cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
+       else
+               cq->mcq.comp  = mlx5_ib_cq_comp;
         cq->mcq.event = mlx5_ib_cq_event;
  
         INIT_LIST_HEAD(&cq->wc_list);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c

index 4cb81f68d85090f43cc694eca6f400b99ab3adfd..c72797cd9e4f199302ededac309b133d4ddd1b27 100644 (file)
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -38,6 +38,9 @@
  #include <linux/dma-mapping.h>
  #include <linux/slab.h>
  #include <linux/io-mapping.h>
+#if defined(CONFIG_X86)
+#include <asm/pat.h>
+#endif
  #include <linux/sched.h>
  #include <rdma/ib_user_verbs.h>
  #include <rdma/ib_addr.h>
@@ -517,6 +520,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
         }
  
+       if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+           MLX5_CAP_ETH(dev->mdev, scatter_fcs))
+               props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
+
         props->vendor_part_id      = mdev->pdev->device;
         props->hw_ver              = mdev->pdev->revision;
  
@@ -1068,38 +1075,89 @@ static int get_index(unsigned long offset)
         return get_arg(offset);
  }
  
+static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
+{
+       switch (cmd) {
+       case MLX5_IB_MMAP_WC_PAGE:
+               return "WC";
+       case MLX5_IB_MMAP_REGULAR_PAGE:
+               return "best effort WC";
+       case MLX5_IB_MMAP_NC_PAGE:
+               return "NC";
+       default:
+               return NULL;
+       }
+}
+
+static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
+                   struct vm_area_struct *vma, struct mlx5_uuar_info *uuari)
+{
+       int err;
+       unsigned long idx;
+       phys_addr_t pfn, pa;
+       pgprot_t prot;
+
+       switch (cmd) {
+       case MLX5_IB_MMAP_WC_PAGE:
+/* Some architectures don't support WC memory */
+#if defined(CONFIG_X86)
+               if (!pat_enabled())
+                       return -EPERM;
+#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
+                       return -EPERM;
+#endif
+       /* fall through */
+       case MLX5_IB_MMAP_REGULAR_PAGE:
+               /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
+               prot = pgprot_writecombine(vma->vm_page_prot);
+               break;
+       case MLX5_IB_MMAP_NC_PAGE:
+               prot = pgprot_noncached(vma->vm_page_prot);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+               return -EINVAL;
+
+       idx = get_index(vma->vm_pgoff);
+       if (idx >= uuari->num_uars)
+               return -EINVAL;
+
+       pfn = uar_index2pfn(dev, uuari->uars[idx].index);
+       mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
+
+       vma->vm_page_prot = prot;
+       err = io_remap_pfn_range(vma, vma->vm_start, pfn,
+                                PAGE_SIZE, vma->vm_page_prot);
+       if (err) {
+               mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
+                           err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
+               return -EAGAIN;
+       }
+
+       pa = pfn << PAGE_SHIFT;
+       mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
+                   vma->vm_start, &pa);
+
+       return 0;
+}
+
  static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
  {
         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
         struct mlx5_uuar_info *uuari = &context->uuari;
         unsigned long command;
-       unsigned long idx;
         phys_addr_t pfn;
  
         command = get_command(vma->vm_pgoff);
         switch (command) {
+       case MLX5_IB_MMAP_WC_PAGE:
+       case MLX5_IB_MMAP_NC_PAGE:
         case MLX5_IB_MMAP_REGULAR_PAGE:
-               if (vma->vm_end - vma->vm_start != PAGE_SIZE)
-                       return -EINVAL;
-
-               idx = get_index(vma->vm_pgoff);
-               if (idx >= uuari->num_uars)
-                       return -EINVAL;
-
-               pfn = uar_index2pfn(dev, uuari->uars[idx].index);
-               mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
-                           (unsigned long long)pfn);
-
-               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-               if (io_remap_pfn_range(vma, vma->vm_start, pfn,
-                                      PAGE_SIZE, vma->vm_page_prot))
-                       return -EAGAIN;
-
-               mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
-                           vma->vm_start,
-                           (unsigned long long)pfn << PAGE_SHIFT);
-               break;
+               return uar_mmap(dev, command, vma, uuari);
  
         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
                 return -ENOSYS;
@@ -1108,7 +1166,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
                         return -EINVAL;
  
-               if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+               if (vma->vm_flags & VM_WRITE)
                         return -EPERM;
  
                 /* Don't expose to user-space information it shouldn't have */
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h

index b46c25542a7c6285d6c5088f2006889d59eaf10f..c4a9825828bcfa4fe7144a515c797ec279832e3e 100644 (file)
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -70,6 +70,8 @@ enum {
  enum mlx5_ib_mmap_cmd {
         MLX5_IB_MMAP_REGULAR_PAGE               = 0,
         MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES       = 1,
+       MLX5_IB_MMAP_WC_PAGE                    = 2,
+       MLX5_IB_MMAP_NC_PAGE                    = 3,
         /* 5 is chosen in order to be compatible with old versions of libmlx5 */
         MLX5_IB_MMAP_CORE_CLOCK                 = 5,
  };
@@ -356,6 +358,7 @@ enum mlx5_ib_qp_flags {
         MLX5_IB_QP_SIGNATURE_HANDLING           = 1 << 5,
         /* QP uses 1 as its source QP number */
         MLX5_IB_QP_SQPN_QP1                     = 1 << 6,
+       MLX5_IB_QP_CAP_SCATTER_FCS              = 1 << 7,
  };
  
  struct mlx5_umr_wr {
@@ -712,9 +715,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
  struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
                                enum ib_mr_type mr_type,
                                u32 max_num_sg);
-int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
-                     struct scatterlist *sg,
-                     int sg_nents);
+int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                     unsigned int *sg_offset);
  int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
                         const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                         const struct ib_mad_hdr *in, size_t in_mad_size,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c

index 4d5bff151cdf09957942f35255d84df2286f572b..8cf2ce50511f935e6043bea4ebaa4af3a980661c 100644 (file)
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1751,26 +1751,33 @@ done:
  static int
  mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
                    struct scatterlist *sgl,
-                  unsigned short sg_nents)
+                  unsigned short sg_nents,
+                  unsigned int *sg_offset_p)
  {
         struct scatterlist *sg = sgl;
         struct mlx5_klm *klms = mr->descs;
+       unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
         u32 lkey = mr->ibmr.pd->local_dma_lkey;
         int i;
  
-       mr->ibmr.iova = sg_dma_address(sg);
+       mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
         mr->ibmr.length = 0;
         mr->ndescs = sg_nents;
  
         for_each_sg(sgl, sg, sg_nents, i) {
                 if (unlikely(i > mr->max_descs))
                         break;
-               klms[i].va = cpu_to_be64(sg_dma_address(sg));
-               klms[i].bcount = cpu_to_be32(sg_dma_len(sg));
+               klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
+               klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
                 klms[i].key = cpu_to_be32(lkey);
                 mr->ibmr.length += sg_dma_len(sg);
+
+               sg_offset = 0;
         }
  
+       if (sg_offset_p)
+               *sg_offset_p = sg_offset;
+
         return i;
  }
  
@@ -1788,9 +1795,8 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
         return 0;
  }
  
-int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
-                     struct scatterlist *sg,
-                     int sg_nents)
+int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                     unsigned int *sg_offset)
  {
         struct mlx5_ib_mr *mr = to_mmr(ibmr);
         int n;
@@ -1802,9 +1808,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
                                    DMA_TO_DEVICE);
  
         if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
-               n = mlx5_ib_sg_to_klms(mr, sg, sg_nents);
+               n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
         else
-               n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page);
+               n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
+                               mlx5_set_page);
  
         ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
                                       mr->desc_size * mr->max_descs,
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c

index 8dee8bc1e0fe9aa7139cfbd7aa4d9415127e52ca..504117657d41ffccd9d65467b748bede40fb1054 100644 (file)
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1028,6 +1028,7 @@ static int get_rq_pas_size(void *qpc)
  static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
                                    struct mlx5_ib_rq *rq, void *qpin)
  {
+       struct mlx5_ib_qp *mqp = rq->base.container_mibqp;
         __be64 *pas;
         __be64 *qp_pas;
         void *in;
@@ -1051,6 +1052,9 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
         MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
         MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));
  
+       if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS)
+               MLX5_SET(rqc, rqc, scatter_fcs, 1);
+
         wq = MLX5_ADDR_OF(rqc, rqc, wq);
         MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
         MLX5_SET(wq, wq, end_padding_mode,
@@ -1136,11 +1140,12 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
         }
  
         if (qp->rq.wqe_cnt) {
+               rq->base.container_mibqp = qp;
+
                 err = create_raw_packet_qp_rq(dev, rq, in);
                 if (err)
                         goto err_destroy_sq;
  
-               rq->base.container_mibqp = qp;
  
                 err = create_raw_packet_qp_tir(dev, rq, tdn);
                 if (err)
@@ -1252,6 +1257,19 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                         return -EOPNOTSUPP;
                 }
  
+       if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
+               if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+                       mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs");
+                       return -EOPNOTSUPP;
+               }
+               if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) ||
+                   !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
+                       mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n");
+                       return -EOPNOTSUPP;
+               }
+               qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS;
+       }
+
         if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
                 qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
  
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c

index 6d3a169c049b315d764dc1eec9dd47436c53b1a4..37331e2fdc5fde5bc075582e353bb81a6ace99b6 100644 (file)
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -44,6 +44,7 @@
  #include <linux/ip.h>
  #include <linux/tcp.h>
  #include <linux/init.h>
+#include <linux/kernel.h>
  
  #include <asm/io.h>
  #include <asm/irq.h>
@@ -903,70 +904,15 @@ void nes_clc(unsigned long parm)
   */
  void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
  {
-       char  xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
-               'a', 'b', 'c', 'd', 'e', 'f'};
-       char  *ptr;
-       char  hex_buf[80];
-       char  ascii_buf[20];
-       int   num_char;
-       int   num_ascii;
-       int   num_hex;
-
         if (!(nes_debug_level & dump_debug_level)) {
                 return;
         }
  
-       ptr = addr;
         if (length > 0x100) {
                 nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
                 length = 0x100;
         }
-       nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length);
-
-       memset(ascii_buf, 0, 20);
-       memset(hex_buf, 0, 80);
-
-       num_ascii = 0;
-       num_hex = 0;
-       for (num_char = 0; num_char < length; num_char++) {
-               if (num_ascii == 8) {
-                       ascii_buf[num_ascii++] = ' ';
-                       hex_buf[num_hex++] = '-';
-                       hex_buf[num_hex++] = ' ';
-               }
-
-               if (*ptr < 0x20 || *ptr > 0x7e)
-                       ascii_buf[num_ascii++] = '.';
-               else
-                       ascii_buf[num_ascii++] = *ptr;
-               hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)];
-               hex_buf[num_hex++] = xlate[*ptr & 0x0f];
-               hex_buf[num_hex++] = ' ';
-               ptr++;
-
-               if (num_ascii >= 17) {
-                       /* output line and reset */
-                       nes_debug(dump_debug_level, "   %s |  %s\n", hex_buf, ascii_buf);
-                       memset(ascii_buf, 0, 20);
-                       memset(hex_buf, 0, 80);
-                       num_ascii = 0;
-                       num_hex = 0;
-               }
-       }
+       nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length);
  
-       /* output the rest */
-       if (num_ascii) {
-               while (num_ascii < 17) {
-                       if (num_ascii == 8) {
-                               hex_buf[num_hex++] = ' ';
-                               hex_buf[num_hex++] = ' ';
-                       }
-                       hex_buf[num_hex++] = ' ';
-                       hex_buf[num_hex++] = ' ';
-                       hex_buf[num_hex++] = ' ';
-                       num_ascii++;
-               }
-
-               nes_debug(dump_debug_level, "   %s |  %s\n", hex_buf, ascii_buf);
-       }
+       print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true);
  }
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c

index fba69a39a7eb496cb80ae819c231917e48203c17..464d6da5fe913707f18ff95cc3df93a3b36c4ee7 100644 (file)
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -402,15 +402,14 @@ static int nes_set_page(struct ib_mr *ibmr, u64 addr)
         return 0;
  }
  
-static int nes_map_mr_sg(struct ib_mr *ibmr,
-                        struct scatterlist *sg,
-                        int sg_nents)
+static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                        int sg_nents, unsigned int *sg_offset)
  {
         struct nes_mr *nesmr = to_nesmr(ibmr);
  
         nesmr->npages = 0;
  
-       return ib_sg_to_pages(ibmr, sg, sg_nents, nes_set_page);
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page);
  }
  
  /**
@@ -981,7 +980,7 @@ static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
  /**
   * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory.
   */
-static inline void nes_free_qp_mem(struct nes_device *nesdev,
+static void nes_free_qp_mem(struct nes_device *nesdev,
                 struct nes_qp *nesqp, int virt_wqs)
  {
         unsigned long flags;
@@ -1315,6 +1314,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
                         nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
                         return ERR_PTR(-EINVAL);
         }
+       init_completion(&nesqp->sq_drained);
+       init_completion(&nesqp->rq_drained);
  
         nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR);
         init_timer(&nesqp->terminate_timer);
@@ -3452,6 +3453,29 @@ out:
         return err;
  }
  
+/**
+ * nes_drain_sq - drain sq
+ * @ibqp: pointer to ibqp
+ */
+static void nes_drain_sq(struct ib_qp *ibqp)
+{
+       struct nes_qp *nesqp = to_nesqp(ibqp);
+
+       if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)
+               wait_for_completion(&nesqp->sq_drained);
+}
+
+/**
+ * nes_drain_rq - drain rq
+ * @ibqp: pointer to ibqp
+ */
+static void nes_drain_rq(struct ib_qp *ibqp)
+{
+       struct nes_qp *nesqp = to_nesqp(ibqp);
+
+       if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)
+               wait_for_completion(&nesqp->rq_drained);
+}
  
  /**
   * nes_poll_cq
@@ -3582,6 +3606,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
                                 }
                         }
  
+                       if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) {
+                               if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head)
+                                       complete(&nesqp->sq_drained);
+                               if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head)
+                                       complete(&nesqp->rq_drained);
+                       }
+
                         entry->wr_id = wrid;
                         entry++;
                         cqe_count++;
@@ -3754,6 +3785,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
         nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
         nesibdev->ibdev.post_send = nes_post_send;
         nesibdev->ibdev.post_recv = nes_post_recv;
+       nesibdev->ibdev.drain_sq = nes_drain_sq;
+       nesibdev->ibdev.drain_rq = nes_drain_rq;
  
         nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
         if (nesibdev->ibdev.iwcm == NULL) {
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h

index 70290883d06769f3eee19ed4b36750808c2fa3d8..e02a5662dc209738a640b8793f0f9cafd9d180ce 100644 (file)
--- a/drivers/infiniband/hw/nes/nes_verbs.h
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -189,6 +189,8 @@ struct nes_qp {
         u8                    pau_pending;
         u8                    pau_state;
         __u64                 nesuqp_addr;
+       struct completion     sq_drained;
+       struct completion     rq_drained;
  };
  
  struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c

index a8496a18e20d667614cf7b2c71408cc046c22b3d..b1a3d91fe8b94fe292fbe09a4c3820a8592e9f6e 100644 (file)
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -3081,13 +3081,12 @@ static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr)
         return 0;
  }
  
-int ocrdma_map_mr_sg(struct ib_mr *ibmr,
-                    struct scatterlist *sg,
-                    int sg_nents)
+int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                    unsigned int *sg_offset)
  {
         struct ocrdma_mr *mr = get_ocrdma_mr(ibmr);
  
         mr->npages = 0;
  
-       return ib_sg_to_pages(ibmr, sg, sg_nents, ocrdma_set_page);
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ocrdma_set_page);
  }
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h

index 8b517fd3677924099a1d08663035de8f19a09d79..704ef1e9271b74b62ec14e5cf7178014fed8df72 100644 (file)
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -122,8 +122,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
  struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
                               enum ib_mr_type mr_type,
                               u32 max_num_sg);
-int ocrdma_map_mr_sg(struct ib_mr *ibmr,
-                    struct scatterlist *sg,
-                    int sg_nents);
+int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+                    unsigned int *sg_offset);
  
  #endif                         /* __OCRDMA_VERBS_H__ */
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c

index 24f4a782e0f431282bb8f79fc496a0b998a7a55a..ff946d5f59e4071606ebc00fff43df5a56dde2f6 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -824,10 +824,7 @@ static int mmap_piobufs(struct vm_area_struct *vma,
         phys = dd->physaddr + piobufs;
  
  #if defined(__powerpc__)
-       /* There isn't a generic way to specify writethrough mappings */
-       pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
-       pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
-       pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
  #endif
  
         /*
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c

index 3f062f0dd9d8082f9fb2986ee13b61c4a867a362..f253111e682ea7494782bfa2a12c97ead940affb 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -1090,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd)
         qib_dbg_ibdev_exit(&dd->verbs_dev);
  #endif
         free_percpu(dd->int_counter);
-       ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
  }
  
  u64 qib_int_counter(struct qib_devdata *dd)
@@ -1183,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
  bail:
         if (!list_empty(&dd->list))
                 list_del_init(&dd->list);
-       ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
         return ERR_PTR(ret);
  }
  
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c

index 4758a3801ae8f916b6a96988b6c88ce5c344c588..6abe1c621aa4220cb8ca554ef6fee42508915ea7 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -144,13 +144,7 @@ int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev,
         addr = pci_resource_start(pdev, 0);
         len = pci_resource_len(pdev, 0);
  
-#if defined(__powerpc__)
-       /* There isn't a generic way to specify writethrough mappings */
-       dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU);
-#else
         dd->kregbase = ioremap_nocache(addr, len);
-#endif
-
         if (!dd->kregbase)
                 return -ENOMEM;
  
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c

index 9088e26d3ac8b0aef86424d244c0465c4f33b87e..444028a3582a3898f2ddcedcdb69614904ab6f0d 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -230,7 +230,7 @@ bail:
   *
   * Return 1 if constructed; otherwise, return 0.
   */
-int qib_make_rc_req(struct rvt_qp *qp)
+int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags)
  {
         struct qib_qp_priv *priv = qp->priv;
         struct qib_ibdev *dev = to_idev(qp->ibqp.device);
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c

index a5f07a64b228d2e8f5e1796f8892ddeda6aa37fd..b6777925629704d3a2c3c7bc51a5475a725ca2c3 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -739,7 +739,7 @@ void qib_do_send(struct rvt_qp *qp)
         struct qib_qp_priv *priv = qp->priv;
         struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
         struct qib_pportdata *ppd = ppd_from_ibp(ibp);
-       int (*make_req)(struct rvt_qp *qp);
+       int (*make_req)(struct rvt_qp *qp, unsigned long *flags);
         unsigned long flags;
  
         if ((qp->ibqp.qp_type == IB_QPT_RC ||
@@ -781,7 +781,7 @@ void qib_do_send(struct rvt_qp *qp)
                         qp->s_hdrwords = 0;
                         spin_lock_irqsave(&qp->s_lock, flags);
                 }
-       } while (make_req(qp));
+       } while (make_req(qp, &flags));
  
         spin_unlock_irqrestore(&qp->s_lock, flags);
  }
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c

index 7bdbc79ceaa3bbd7a25a87ea107b06f1460dd7ad..1d61bd04f4494bf12a6c020bad53a6fe1c6f8e81 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -45,7 +45,7 @@
   *
   * Return 1 if constructed; otherwise, return 0.
   */
-int qib_make_uc_req(struct rvt_qp *qp)
+int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags)
  {
         struct qib_qp_priv *priv = qp->priv;
         struct qib_other_headers *ohdr;
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c

index d9502137de626b80966cfd200e3806c65c36b21a..846e6c726df7c67204f1dc1d5f1bfb066c36e149 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -238,7 +238,7 @@ drop:
   *
   * Return 1 if constructed; otherwise, return 0.
   */
-int qib_make_ud_req(struct rvt_qp *qp)
+int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
  {
         struct qib_qp_priv *priv = qp->priv;
         struct qib_other_headers *ohdr;
@@ -294,7 +294,7 @@ int qib_make_ud_req(struct rvt_qp *qp)
                 this_cpu_inc(ibp->pmastats->n_unicast_xmit);
                 lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
                 if (unlikely(lid == ppd->lid)) {
-                       unsigned long flags;
+                       unsigned long tflags = *flags;
                         /*
                          * If DMAs are in progress, we can't generate
                          * a completion for the loopback packet since
@@ -307,10 +307,10 @@ int qib_make_ud_req(struct rvt_qp *qp)
                                 goto bail;
                         }
                         qp->s_cur = next_cur;
-                       local_irq_save(flags);
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       spin_unlock_irqrestore(&qp->s_lock, tflags);
                         qib_ud_loopback(qp, wqe);
-                       spin_lock_irqsave(&qp->s_lock, flags);
+                       spin_lock_irqsave(&qp->s_lock, tflags);
+                       *flags = tflags;
                         qib_send_complete(qp, wqe, IB_WC_SUCCESS);
                         goto done;
                 }
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h

index 4b76a8d593371b3929faf86437890ff8191ecf1c..6888f03c6d61007b6f4d86c1f73b403908e08279 100644 (file)
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -430,11 +430,11 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
  
  void qib_send_rc_ack(struct rvt_qp *qp);
  
-int qib_make_rc_req(struct rvt_qp *qp);
+int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags);
  
-int qib_make_uc_req(struct rvt_qp *qp);
+int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags);
  
-int qib_make_ud_req(struct rvt_qp *qp);
+int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags);
  
  int qib_register_ib_device(struct qib_devdata *);
  
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c

index a9e3bcc522c40167abc3e22caf18a2669b12e73d..0f12c211c3853f771c7eca89504388461ef6d6ee 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -829,13 +829,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
         case IB_QPT_SMI:
         case IB_QPT_GSI:
         case IB_QPT_UD:
-               qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK;
+               qp->allowed_ops = IB_OPCODE_UD;
                 break;
         case IB_QPT_RC:
-               qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK;
+               qp->allowed_ops = IB_OPCODE_RC;
                 break;
         case IB_QPT_UC:
-               qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK;
+               qp->allowed_ops = IB_OPCODE_UC;
                 break;
         default:
                 ret = ERR_PTR(-EINVAL);
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c

index 6caf5272ba1f70b96ab19a7a4df1e51f08dfb41c..e1cc2cc42f2550e1a2113b8c471f4456b182dd14 100644 (file)
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -106,6 +106,19 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
  }
  EXPORT_SYMBOL(rvt_alloc_device);
  
+/**
+ * rvt_dealloc_device - deallocate rdi
+ * @rdi: structure to free
+ *
+ * Free a structure allocated with rvt_alloc_device()
+ */
+void rvt_dealloc_device(struct rvt_dev_info *rdi)
+{
+       kfree(rdi->ports);
+       ib_dealloc_device(&rdi->ibdev);
+}
+EXPORT_SYMBOL(rvt_dealloc_device);
+
  static int rvt_query_device(struct ib_device *ibdev,
                             struct ib_device_attr *props,
                             struct ib_udata *uhw)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c

index a53fa5fc0dec7d65900cce1133b2d4e2326183cc..1502199c8e56272584e7fb5797031f369a67fcd2 100644 (file)
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -36,6 +36,27 @@
  
  #include "ipoib.h"
  
+struct ipoib_stats {
+       char stat_string[ETH_GSTRING_LEN];
+       int stat_offset;
+};
+
+#define IPOIB_NETDEV_STAT(m) { \
+               .stat_string = #m, \
+               .stat_offset = offsetof(struct rtnl_link_stats64, m) }
+
+static const struct ipoib_stats ipoib_gstrings_stats[] = {
+       IPOIB_NETDEV_STAT(rx_packets),
+       IPOIB_NETDEV_STAT(tx_packets),
+       IPOIB_NETDEV_STAT(rx_bytes),
+       IPOIB_NETDEV_STAT(tx_bytes),
+       IPOIB_NETDEV_STAT(tx_errors),
+       IPOIB_NETDEV_STAT(rx_dropped),
+       IPOIB_NETDEV_STAT(tx_dropped)
+};
+
+#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats)
+
  static void ipoib_get_drvinfo(struct net_device *netdev,
                               struct ethtool_drvinfo *drvinfo)
  {
@@ -92,11 +113,57 @@ static int ipoib_set_coalesce(struct net_device *dev,
  
         return 0;
  }
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+                                   struct ethtool_stats __always_unused *stats,
+                                   u64 *data)
+{
+       int i;
+       struct net_device_stats *net_stats = &dev->stats;
+       u8 *p = (u8 *)net_stats;
+
+       for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++)
+               data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset);
+
+}
+static void ipoib_get_strings(struct net_device __always_unused *dev,
+                             u32 stringset, u8 *data)
+{
+       u8 *p = data;
+       int i;
+
+       switch (stringset) {
+       case ETH_SS_STATS:
+               for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) {
+                       memcpy(p, ipoib_gstrings_stats[i].stat_string,
+                               ETH_GSTRING_LEN);
+                       p += ETH_GSTRING_LEN;
+               }
+               break;
+       case ETH_SS_TEST:
+       default:
+               break;
+       }
+}
+static int ipoib_get_sset_count(struct net_device __always_unused *dev,
+                                int sset)
+{
+       switch (sset) {
+       case ETH_SS_STATS:
+               return IPOIB_GLOBAL_STATS_LEN;
+       case ETH_SS_TEST:
+       default:
+               break;
+       }
+       return -EOPNOTSUPP;
+}
  
  static const struct ethtool_ops ipoib_ethtool_ops = {
         .get_drvinfo            = ipoib_get_drvinfo,
         .get_coalesce           = ipoib_get_coalesce,
         .set_coalesce           = ipoib_set_coalesce,
+       .get_strings            = ipoib_get_strings,
+       .get_ethtool_stats      = ipoib_get_ethtool_stats,
+       .get_sset_count         = ipoib_get_sset_count,
  };
  
  void ipoib_set_ethtool_ops(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c

index 3643d559ba316960716809f93007d00a2d46fe0e..418e5a1c8744d5a03e7b22e87e5907bbef601ea2 100644 (file)
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -51,8 +51,6 @@ MODULE_PARM_DESC(data_debug_level,
                  "Enable data path debug tracing if > 0");
  #endif
  
-static DEFINE_MUTEX(pkey_mutex);
-
  struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
                                  struct ib_pd *pd, struct ib_ah_attr *attr)
  {
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c

index 9a391cc5b9b3f45f16f0a49138aeda5b96030b01..90be568934146aaf55bd01bd801f177c12bcfd2a 100644 (file)
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -236,7 +236,7 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
         page_vec->npages = 0;
         page_vec->fake_mr.page_size = SIZE_4K;
         plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
-                             mem->size, iser_set_page);
+                             mem->size, NULL, iser_set_page);
         if (unlikely(plen < mem->size)) {
                 iser_err("page vec too short to hold this SG\n");
                 iser_data_buf_dump(mem, device->ib_device);
@@ -446,7 +446,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
  
         ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
  
-       n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K);
+       n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K);
         if (unlikely(n != mem->size)) {
                 iser_err("failed to map sg (%d/%d)\n",
                          n, mem->size);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c

index 411e4464ca235bc984d5bc34c723a52e06071f31..897b5a4993e868563f36a164dea007f787d12c02 100644 (file)
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -33,7 +33,8 @@
  
  #define        ISERT_MAX_CONN          8
  #define ISER_MAX_RX_CQ_LEN     (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN)
-#define ISER_MAX_TX_CQ_LEN     (ISERT_QP_MAX_REQ_DTOS  * ISERT_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN \
+       ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN)
  #define ISER_MAX_CQ_LEN                (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \
                                  ISERT_MAX_CONN)
  
@@ -46,14 +47,6 @@ static LIST_HEAD(device_list);
  static struct workqueue_struct *isert_comp_wq;
  static struct workqueue_struct *isert_release_wq;
  
-static void
-isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
-static int
-isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn);
-static void
-isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
-static int
-isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn);
  static int
  isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd);
  static int
@@ -142,6 +135,7 @@ isert_create_qp(struct isert_conn *isert_conn,
         attr.recv_cq = comp->cq;
         attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1;
         attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
+       attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX;
         attr.cap.max_send_sge = device->ib_device->attrs.max_sge;
         isert_conn->max_sge = min(device->ib_device->attrs.max_sge,
                                   device->ib_device->attrs.max_sge_rd);
@@ -270,9 +264,9 @@ isert_alloc_comps(struct isert_device *device)
                                  device->ib_device->num_comp_vectors));
  
         isert_info("Using %d CQs, %s supports %d vectors support "
-                  "Fast registration %d pi_capable %d\n",
+                  "pi_capable %d\n",
                    device->comps_used, device->ib_device->name,
-                  device->ib_device->num_comp_vectors, device->use_fastreg,
+                  device->ib_device->num_comp_vectors,
                    device->pi_capable);
  
         device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp),
@@ -313,18 +307,6 @@ isert_create_device_ib_res(struct isert_device *device)
         isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge);
         isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
  
-       /* asign function handlers */
-       if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
-           ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
-               device->use_fastreg = 1;
-               device->reg_rdma_mem = isert_reg_rdma;
-               device->unreg_rdma_mem = isert_unreg_rdma;
-       } else {
-               device->use_fastreg = 0;
-               device->reg_rdma_mem = isert_map_rdma;
-               device->unreg_rdma_mem = isert_unmap_cmd;
-       }
-
         ret = isert_alloc_comps(device);
         if (ret)
                 goto out;
@@ -416,146 +398,6 @@ isert_device_get(struct rdma_cm_id *cma_id)
         return device;
  }
  
-static void
-isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
-{
-       struct fast_reg_descriptor *fr_desc, *tmp;
-       int i = 0;
-
-       if (list_empty(&isert_conn->fr_pool))
-               return;
-
-       isert_info("Freeing conn %p fastreg pool", isert_conn);
-
-       list_for_each_entry_safe(fr_desc, tmp,
-                                &isert_conn->fr_pool, list) {
-               list_del(&fr_desc->list);
-               ib_dereg_mr(fr_desc->data_mr);
-               if (fr_desc->pi_ctx) {
-                       ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
-                       ib_dereg_mr(fr_desc->pi_ctx->sig_mr);
-                       kfree(fr_desc->pi_ctx);
-               }
-               kfree(fr_desc);
-               ++i;
-       }
-
-       if (i < isert_conn->fr_pool_size)
-               isert_warn("Pool still has %d regions registered\n",
-                       isert_conn->fr_pool_size - i);
-}
-
-static int
-isert_create_pi_ctx(struct fast_reg_descriptor *desc,
-                   struct ib_device *device,
-                   struct ib_pd *pd)
-{
-       struct pi_context *pi_ctx;
-       int ret;
-
-       pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
-       if (!pi_ctx) {
-               isert_err("Failed to allocate pi context\n");
-               return -ENOMEM;
-       }
-
-       pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
-                                     ISCSI_ISER_SG_TABLESIZE);
-       if (IS_ERR(pi_ctx->prot_mr)) {
-               isert_err("Failed to allocate prot frmr err=%ld\n",
-                         PTR_ERR(pi_ctx->prot_mr));
-               ret = PTR_ERR(pi_ctx->prot_mr);
-               goto err_pi_ctx;
-       }
-       desc->ind |= ISERT_PROT_KEY_VALID;
-
-       pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
-       if (IS_ERR(pi_ctx->sig_mr)) {
-               isert_err("Failed to allocate signature enabled mr err=%ld\n",
-                         PTR_ERR(pi_ctx->sig_mr));
-               ret = PTR_ERR(pi_ctx->sig_mr);
-               goto err_prot_mr;
-       }
-
-       desc->pi_ctx = pi_ctx;
-       desc->ind |= ISERT_SIG_KEY_VALID;
-       desc->ind &= ~ISERT_PROTECTED;
-
-       return 0;
-
-err_prot_mr:
-       ib_dereg_mr(pi_ctx->prot_mr);
-err_pi_ctx:
-       kfree(pi_ctx);
-
-       return ret;
-}
-
-static int
-isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
-                    struct fast_reg_descriptor *fr_desc)
-{
-       fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
-                                      ISCSI_ISER_SG_TABLESIZE);
-       if (IS_ERR(fr_desc->data_mr)) {
-               isert_err("Failed to allocate data frmr err=%ld\n",
-                         PTR_ERR(fr_desc->data_mr));
-               return PTR_ERR(fr_desc->data_mr);
-       }
-       fr_desc->ind |= ISERT_DATA_KEY_VALID;
-
-       isert_dbg("Created fr_desc %p\n", fr_desc);
-
-       return 0;
-}
-
-static int
-isert_conn_create_fastreg_pool(struct isert_conn *isert_conn)
-{
-       struct fast_reg_descriptor *fr_desc;
-       struct isert_device *device = isert_conn->device;
-       struct se_session *se_sess = isert_conn->conn->sess->se_sess;
-       struct se_node_acl *se_nacl = se_sess->se_node_acl;
-       int i, ret, tag_num;
-       /*
-        * Setup the number of FRMRs based upon the number of tags
-        * available to session in iscsi_target_locate_portal().
-        */
-       tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth);
-       tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;
-
-       isert_conn->fr_pool_size = 0;
-       for (i = 0; i < tag_num; i++) {
-               fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL);
-               if (!fr_desc) {
-                       isert_err("Failed to allocate fast_reg descriptor\n");
-                       ret = -ENOMEM;
-                       goto err;
-               }
-
-               ret = isert_create_fr_desc(device->ib_device,
-                                          device->pd, fr_desc);
-               if (ret) {
-                       isert_err("Failed to create fastreg descriptor err=%d\n",
-                              ret);
-                       kfree(fr_desc);
-                       goto err;
-               }
-
-               list_add_tail(&fr_desc->list, &isert_conn->fr_pool);
-               isert_conn->fr_pool_size++;
-       }
-
-       isert_dbg("Creating conn %p fastreg pool size=%d",
-                isert_conn, isert_conn->fr_pool_size);
-
-       return 0;
-
-err:
-       isert_conn_free_fastreg_pool(isert_conn);
-       return ret;
-}
-
  static void
  isert_init_conn(struct isert_conn *isert_conn)
  {
@@ -565,8 +407,6 @@ isert_init_conn(struct isert_conn *isert_conn)
         init_completion(&isert_conn->login_req_comp);
         kref_init(&isert_conn->kref);
         mutex_init(&isert_conn->mutex);
-       spin_lock_init(&isert_conn->pool_lock);
-       INIT_LIST_HEAD(&isert_conn->fr_pool);
         INIT_WORK(&isert_conn->release_work, isert_release_work);
  }
  
@@ -739,9 +579,6 @@ isert_connect_release(struct isert_conn *isert_conn)
  
         BUG_ON(!device);
  
-       if (device->use_fastreg)
-               isert_conn_free_fastreg_pool(isert_conn);
-
         isert_free_rx_descriptors(isert_conn);
         if (isert_conn->cm_id)
                 rdma_destroy_id(isert_conn->cm_id);
@@ -1080,7 +917,6 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
  {
         struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc;
  
-       isert_cmd->iser_ib_op = ISER_IB_SEND;
         tx_desc->tx_cqe.done = isert_send_done;
         send_wr->wr_cqe = &tx_desc->tx_cqe;
  
@@ -1160,16 +996,6 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
         }
         if (!login->login_failed) {
                 if (login->login_complete) {
-                       if (!conn->sess->sess_ops->SessionType &&
-                           isert_conn->device->use_fastreg) {
-                               ret = isert_conn_create_fastreg_pool(isert_conn);
-                               if (ret) {
-                                       isert_err("Conn: %p failed to create"
-                                              " fastreg pool\n", isert_conn);
-                                       return ret;
-                               }
-                       }
-
                         ret = isert_alloc_rx_descriptors(isert_conn);
                         if (ret)
                                 return ret;
@@ -1633,97 +1459,26 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc)
                                 ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
  }
  
-static int
-isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
-                  struct scatterlist *sg, u32 nents, u32 length, u32 offset,
-                  enum iser_ib_op_code op, struct isert_data_buf *data)
-{
-       struct ib_device *ib_dev = isert_conn->cm_id->device;
-
-       data->dma_dir = op == ISER_IB_RDMA_WRITE ?
-                             DMA_TO_DEVICE : DMA_FROM_DEVICE;
-
-       data->len = length - offset;
-       data->offset = offset;
-       data->sg_off = data->offset / PAGE_SIZE;
-
-       data->sg = &sg[data->sg_off];
-       data->nents = min_t(unsigned int, nents - data->sg_off,
-                                         ISCSI_ISER_SG_TABLESIZE);
-       data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE *
-                                       PAGE_SIZE);
-
-       data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents,
-                                       data->dma_dir);
-       if (unlikely(!data->dma_nents)) {
-               isert_err("Cmd: unable to dma map SGs %p\n", sg);
-               return -EINVAL;
-       }
-
-       isert_dbg("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n",
-                 isert_cmd, data->dma_nents, data->sg, data->nents, data->len);
-
-       return 0;
-}
-
  static void
-isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data)
+isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn)
  {
-       struct ib_device *ib_dev = isert_conn->cm_id->device;
-
-       ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir);
-       memset(data, 0, sizeof(*data));
-}
-
-
-
-static void
-isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
-{
-       isert_dbg("Cmd %p\n", isert_cmd);
+       struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd;
+       enum dma_data_direction dir = target_reverse_dma_direction(se_cmd);
  
-       if (isert_cmd->data.sg) {
-               isert_dbg("Cmd %p unmap_sg op\n", isert_cmd);
-               isert_unmap_data_buf(isert_conn, &isert_cmd->data);
-       }
-
-       if (isert_cmd->rdma_wr) {
-               isert_dbg("Cmd %p free send_wr\n", isert_cmd);
-               kfree(isert_cmd->rdma_wr);
-               isert_cmd->rdma_wr = NULL;
-       }
-
-       if (isert_cmd->ib_sge) {
-               isert_dbg("Cmd %p free ib_sge\n", isert_cmd);
-               kfree(isert_cmd->ib_sge);
-               isert_cmd->ib_sge = NULL;
-       }
-}
-
-static void
-isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
-{
-       isert_dbg("Cmd %p\n", isert_cmd);
-
-       if (isert_cmd->fr_desc) {
-               isert_dbg("Cmd %p free fr_desc %p\n", isert_cmd, isert_cmd->fr_desc);
-               if (isert_cmd->fr_desc->ind & ISERT_PROTECTED) {
-                       isert_unmap_data_buf(isert_conn, &isert_cmd->prot);
-                       isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED;
-               }
-               spin_lock_bh(&isert_conn->pool_lock);
-               list_add_tail(&isert_cmd->fr_desc->list, &isert_conn->fr_pool);
-               spin_unlock_bh(&isert_conn->pool_lock);
-               isert_cmd->fr_desc = NULL;
-       }
+       if (!cmd->rw.nr_ops)
+               return;
  
-       if (isert_cmd->data.sg) {
-               isert_dbg("Cmd %p unmap_sg op\n", isert_cmd);
-               isert_unmap_data_buf(isert_conn, &isert_cmd->data);
+       if (isert_prot_cmd(conn, se_cmd)) {
+               rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp,
+                               conn->cm_id->port_num, se_cmd->t_data_sg,
+                               se_cmd->t_data_nents, se_cmd->t_prot_sg,
+                               se_cmd->t_prot_nents, dir);
+       } else {
+               rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num,
+                               se_cmd->t_data_sg, se_cmd->t_data_nents, dir);
         }
  
-       isert_cmd->ib_sge = NULL;
-       isert_cmd->rdma_wr = NULL;
+       cmd->rw.nr_ops = 0;
  }
  
  static void
@@ -1732,7 +1487,6 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
         struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
         struct isert_conn *isert_conn = isert_cmd->conn;
         struct iscsi_conn *conn = isert_conn->conn;
-       struct isert_device *device = isert_conn->device;
         struct iscsi_text_rsp *hdr;
  
         isert_dbg("Cmd %p\n", isert_cmd);
@@ -1760,7 +1514,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
                         }
                 }
  
-               device->unreg_rdma_mem(isert_cmd, isert_conn);
+               isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
                 transport_generic_free_cmd(&cmd->se_cmd, 0);
                 break;
         case ISCSI_OP_SCSI_TMFUNC:
@@ -1894,14 +1648,9 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
  
         isert_dbg("Cmd %p\n", isert_cmd);
  
-       if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) {
-               ret = isert_check_pi_status(cmd,
-                               isert_cmd->fr_desc->pi_ctx->sig_mr);
-               isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED;
-       }
+       ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
+       isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
  
-       device->unreg_rdma_mem(isert_cmd, isert_conn);
-       isert_cmd->rdma_wr_num = 0;
         if (ret)
                 transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0);
         else
@@ -1929,16 +1678,12 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
  
         isert_dbg("Cmd %p\n", isert_cmd);
  
-       if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) {
-               ret = isert_check_pi_status(se_cmd,
-                                           isert_cmd->fr_desc->pi_ctx->sig_mr);
-               isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED;
-       }
-
         iscsit_stop_dataout_timer(cmd);
-       device->unreg_rdma_mem(isert_cmd, isert_conn);
-       cmd->write_data_done = isert_cmd->data.len;
-       isert_cmd->rdma_wr_num = 0;
+
+       if (isert_prot_cmd(isert_conn, se_cmd))
+               ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr);
+       isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
+       cmd->write_data_done = 0;
  
         isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd);
         spin_lock_bh(&cmd->istate_lock);
@@ -2111,7 +1856,6 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
  {
         struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
         struct isert_conn *isert_conn = conn->context;
-       struct isert_device *device = isert_conn->device;
  
         spin_lock_bh(&conn->cmd_lock);
         if (!list_empty(&cmd->i_conn_node))
@@ -2120,8 +1864,7 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
  
         if (cmd->data_direction == DMA_TO_DEVICE)
                 iscsit_stop_dataout_timer(cmd);
-
-       device->unreg_rdma_mem(isert_cmd, isert_conn);
+       isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
  }
  
  static enum target_prot_op
@@ -2274,234 +2017,6 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
         return isert_post_response(isert_conn, isert_cmd);
  }
  
-static int
-isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
-                   struct ib_sge *ib_sge, struct ib_rdma_wr *rdma_wr,
-                   u32 data_left, u32 offset)
-{
-       struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
-       struct scatterlist *sg_start, *tmp_sg;
-       struct isert_device *device = isert_conn->device;
-       struct ib_device *ib_dev = device->ib_device;
-       u32 sg_off, page_off;
-       int i = 0, sg_nents;
-
-       sg_off = offset / PAGE_SIZE;
-       sg_start = &cmd->se_cmd.t_data_sg[sg_off];
-       sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge);
-       page_off = offset % PAGE_SIZE;
-
-       rdma_wr->wr.sg_list = ib_sge;
-       rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe;
-
-       /*
-        * Perform mapping of TCM scatterlist memory ib_sge dma_addr.
-        */
-       for_each_sg(sg_start, tmp_sg, sg_nents, i) {
-               isert_dbg("RDMA from SGL dma_addr: 0x%llx dma_len: %u, "
-                         "page_off: %u\n",
-                         (unsigned long long)tmp_sg->dma_address,
-                         tmp_sg->length, page_off);
-
-               ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
-               ib_sge->length = min_t(u32, data_left,
-                               ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
-               ib_sge->lkey = device->pd->local_dma_lkey;
-
-               isert_dbg("RDMA ib_sge: addr: 0x%llx  length: %u lkey: %x\n",
-                         ib_sge->addr, ib_sge->length, ib_sge->lkey);
-               page_off = 0;
-               data_left -= ib_sge->length;
-               if (!data_left)
-                       break;
-               ib_sge++;
-               isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge);
-       }
-
-       rdma_wr->wr.num_sge = ++i;
-       isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n",
-                 rdma_wr->wr.sg_list, rdma_wr->wr.num_sge);
-
-       return rdma_wr->wr.num_sge;
-}
-
-static int
-isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn)
-{
-       struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
-       struct se_cmd *se_cmd = &cmd->se_cmd;
-       struct isert_conn *isert_conn = conn->context;
-       struct isert_data_buf *data = &isert_cmd->data;
-       struct ib_rdma_wr *rdma_wr;
-       struct ib_sge *ib_sge;
-       u32 offset, data_len, data_left, rdma_write_max, va_offset = 0;
-       int ret = 0, i, ib_sge_cnt;
-
-       offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ?
-                       cmd->write_data_done : 0;
-       ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
-                                se_cmd->t_data_nents, se_cmd->data_length,
-                                offset, isert_cmd->iser_ib_op,
-                                &isert_cmd->data);
-       if (ret)
-               return ret;
-
-       data_left = data->len;
-       offset = data->offset;
-
-       ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL);
-       if (!ib_sge) {
-               isert_warn("Unable to allocate ib_sge\n");
-               ret = -ENOMEM;
-               goto unmap_cmd;
-       }
-       isert_cmd->ib_sge = ib_sge;
-
-       isert_cmd->rdma_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge);
-       isert_cmd->rdma_wr = kzalloc(sizeof(struct ib_rdma_wr) *
-                       isert_cmd->rdma_wr_num, GFP_KERNEL);
-       if (!isert_cmd->rdma_wr) {
-               isert_dbg("Unable to allocate isert_cmd->rdma_wr\n");
-               ret = -ENOMEM;
-               goto unmap_cmd;
-       }
-
-       rdma_write_max = isert_conn->max_sge * PAGE_SIZE;
-
-       for (i = 0; i < isert_cmd->rdma_wr_num; i++) {
-               rdma_wr = &isert_cmd->rdma_wr[i];
-               data_len = min(data_left, rdma_write_max);
-
-               rdma_wr->wr.send_flags = 0;
-               if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) {
-                       isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done;
-
-                       rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
-                       rdma_wr->remote_addr = isert_cmd->read_va + offset;
-                       rdma_wr->rkey = isert_cmd->read_stag;
-                       if (i + 1 == isert_cmd->rdma_wr_num)
-                               rdma_wr->wr.next = &isert_cmd->tx_desc.send_wr;
-                       else
-                               rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr;
-               } else {
-                       isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
-
-                       rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-                       rdma_wr->remote_addr = isert_cmd->write_va + va_offset;
-                       rdma_wr->rkey = isert_cmd->write_stag;
-                       if (i + 1 == isert_cmd->rdma_wr_num)
-                               rdma_wr->wr.send_flags = IB_SEND_SIGNALED;
-                       else
-                               rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr;
-               }
-
-               ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge,
-                                       rdma_wr, data_len, offset);
-               ib_sge += ib_sge_cnt;
-
-               offset += data_len;
-               va_offset += data_len;
-               data_left -= data_len;
-       }
-
-       return 0;
-unmap_cmd:
-       isert_unmap_data_buf(isert_conn, data);
-
-       return ret;
-}
-
-static inline void
-isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
-{
-       u32 rkey;
-
-       memset(inv_wr, 0, sizeof(*inv_wr));
-       inv_wr->wr_cqe = NULL;
-       inv_wr->opcode = IB_WR_LOCAL_INV;
-       inv_wr->ex.invalidate_rkey = mr->rkey;
-
-       /* Bump the key */
-       rkey = ib_inc_rkey(mr->rkey);
-       ib_update_fast_reg_key(mr, rkey);
-}
-
-static int
-isert_fast_reg_mr(struct isert_conn *isert_conn,
-                 struct fast_reg_descriptor *fr_desc,
-                 struct isert_data_buf *mem,
-                 enum isert_indicator ind,
-                 struct ib_sge *sge)
-{
-       struct isert_device *device = isert_conn->device;
-       struct ib_device *ib_dev = device->ib_device;
-       struct ib_mr *mr;
-       struct ib_reg_wr reg_wr;
-       struct ib_send_wr inv_wr, *bad_wr, *wr = NULL;
-       int ret, n;
-
-       if (mem->dma_nents == 1) {
-               sge->lkey = device->pd->local_dma_lkey;
-               sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
-               sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
-               isert_dbg("sge: addr: 0x%llx  length: %u lkey: %x\n",
-                        sge->addr, sge->length, sge->lkey);
-               return 0;
-       }
-
-       if (ind == ISERT_DATA_KEY_VALID)
-               /* Registering data buffer */
-               mr = fr_desc->data_mr;
-       else
-               /* Registering protection buffer */
-               mr = fr_desc->pi_ctx->prot_mr;
-
-       if (!(fr_desc->ind & ind)) {
-               isert_inv_rkey(&inv_wr, mr);
-               wr = &inv_wr;
-       }
-
-       n = ib_map_mr_sg(mr, mem->sg, mem->nents, PAGE_SIZE);
-       if (unlikely(n != mem->nents)) {
-               isert_err("failed to map mr sg (%d/%d)\n",
-                        n, mem->nents);
-               return n < 0 ? n : -EINVAL;
-       }
-
-       isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
-                 fr_desc, mem->nents, mem->offset);
-
-       reg_wr.wr.next = NULL;
-       reg_wr.wr.opcode = IB_WR_REG_MR;
-       reg_wr.wr.wr_cqe = NULL;
-       reg_wr.wr.send_flags = 0;
-       reg_wr.wr.num_sge = 0;
-       reg_wr.mr = mr;
-       reg_wr.key = mr->lkey;
-       reg_wr.access = IB_ACCESS_LOCAL_WRITE;
-
-       if (!wr)
-               wr = &reg_wr.wr;
-       else
-               wr->next = &reg_wr.wr;
-
-       ret = ib_post_send(isert_conn->qp, wr, &bad_wr);
-       if (ret) {
-               isert_err("fast registration failed, ret:%d\n", ret);
-               return ret;
-       }
-       fr_desc->ind &= ~ind;
-
-       sge->lkey = mr->lkey;
-       sge->addr = mr->iova;
-       sge->length = mr->length;
-
-       isert_dbg("sge: addr: 0x%llx  length: %u lkey: %x\n",
-                 sge->addr, sge->length, sge->lkey);
-
-       return ret;
-}
-
  static inline void
  isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
                      struct ib_sig_domain *domain)
@@ -2526,6 +2041,8 @@ isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
  static int
  isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
  {
+       memset(sig_attrs, 0, sizeof(*sig_attrs));
+
         switch (se_cmd->prot_op) {
         case TARGET_PROT_DIN_INSERT:
         case TARGET_PROT_DOUT_STRIP:
@@ -2547,228 +2064,59 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
                 return -EINVAL;
         }
  
+       sig_attrs->check_mask =
+              (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD  ? 0xc0 : 0) |
+              (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) |
+              (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0);
         return 0;
  }
  
-static inline u8
-isert_set_prot_checks(u8 prot_checks)
-{
-       return (prot_checks & TARGET_DIF_CHECK_GUARD  ? 0xc0 : 0) |
-              (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) |
-              (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0);
-}
-
-static int
-isert_reg_sig_mr(struct isert_conn *isert_conn,
-                struct isert_cmd *isert_cmd,
-                struct fast_reg_descriptor *fr_desc)
-{
-       struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd;
-       struct ib_sig_handover_wr sig_wr;
-       struct ib_send_wr inv_wr, *bad_wr, *wr = NULL;
-       struct pi_context *pi_ctx = fr_desc->pi_ctx;
-       struct ib_sig_attrs sig_attrs;
-       int ret;
-
-       memset(&sig_attrs, 0, sizeof(sig_attrs));
-       ret = isert_set_sig_attrs(se_cmd, &sig_attrs);
-       if (ret)
-               goto err;
-
-       sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks);
-
-       if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) {
-               isert_inv_rkey(&inv_wr, pi_ctx->sig_mr);
-               wr = &inv_wr;
-       }
-
-       memset(&sig_wr, 0, sizeof(sig_wr));
-       sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
-       sig_wr.wr.wr_cqe = NULL;
-       sig_wr.wr.sg_list = &isert_cmd->ib_sg[DATA];
-       sig_wr.wr.num_sge = 1;
-       sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
-       sig_wr.sig_attrs = &sig_attrs;
-       sig_wr.sig_mr = pi_ctx->sig_mr;
-       if (se_cmd->t_prot_sg)
-               sig_wr.prot = &isert_cmd->ib_sg[PROT];
-
-       if (!wr)
-               wr = &sig_wr.wr;
-       else
-               wr->next = &sig_wr.wr;
-
-       ret = ib_post_send(isert_conn->qp, wr, &bad_wr);
-       if (ret) {
-               isert_err("fast registration failed, ret:%d\n", ret);
-               goto err;
-       }
-       fr_desc->ind &= ~ISERT_SIG_KEY_VALID;
-
-       isert_cmd->ib_sg[SIG].lkey = pi_ctx->sig_mr->lkey;
-       isert_cmd->ib_sg[SIG].addr = 0;
-       isert_cmd->ib_sg[SIG].length = se_cmd->data_length;
-       if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP &&
-           se_cmd->prot_op != TARGET_PROT_DOUT_INSERT)
-               /*
-                * We have protection guards on the wire
-                * so we need to set a larget transfer
-                */
-               isert_cmd->ib_sg[SIG].length += se_cmd->prot_length;
-
-       isert_dbg("sig_sge: addr: 0x%llx  length: %u lkey: %x\n",
-                 isert_cmd->ib_sg[SIG].addr, isert_cmd->ib_sg[SIG].length,
-                 isert_cmd->ib_sg[SIG].lkey);
-err:
-       return ret;
-}
-
  static int
-isert_handle_prot_cmd(struct isert_conn *isert_conn,
-                     struct isert_cmd *isert_cmd)
-{
-       struct isert_device *device = isert_conn->device;
-       struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd;
+isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn,
+               struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+       struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd;
+       enum dma_data_direction dir = target_reverse_dma_direction(se_cmd);
+       u8 port_num = conn->cm_id->port_num;
+       u64 addr;
+       u32 rkey, offset;
         int ret;
  
-       if (!isert_cmd->fr_desc->pi_ctx) {
-               ret = isert_create_pi_ctx(isert_cmd->fr_desc,
-                                         device->ib_device,
-                                         device->pd);
-               if (ret) {
-                       isert_err("conn %p failed to allocate pi_ctx\n",
-                                 isert_conn);
-                       return ret;
-               }
-       }
-
-       if (se_cmd->t_prot_sg) {
-               ret = isert_map_data_buf(isert_conn, isert_cmd,
-                                        se_cmd->t_prot_sg,
-                                        se_cmd->t_prot_nents,
-                                        se_cmd->prot_length,
-                                        0,
-                                        isert_cmd->iser_ib_op,
-                                        &isert_cmd->prot);
-               if (ret) {
-                       isert_err("conn %p failed to map protection buffer\n",
-                                 isert_conn);
-                       return ret;
-               }
-
-               memset(&isert_cmd->ib_sg[PROT], 0, sizeof(isert_cmd->ib_sg[PROT]));
-               ret = isert_fast_reg_mr(isert_conn, isert_cmd->fr_desc,
-                                       &isert_cmd->prot,
-                                       ISERT_PROT_KEY_VALID,
-                                       &isert_cmd->ib_sg[PROT]);
-               if (ret) {
-                       isert_err("conn %p failed to fast reg mr\n",
-                                 isert_conn);
-                       goto unmap_prot_cmd;
-               }
-       }
-
-       ret = isert_reg_sig_mr(isert_conn, isert_cmd, isert_cmd->fr_desc);
-       if (ret) {
-               isert_err("conn %p failed to fast reg mr\n",
-                         isert_conn);
-               goto unmap_prot_cmd;
-       }
-       isert_cmd->fr_desc->ind |= ISERT_PROTECTED;
-
-       return 0;
-
-unmap_prot_cmd:
-       if (se_cmd->t_prot_sg)
-               isert_unmap_data_buf(isert_conn, &isert_cmd->prot);
-
-       return ret;
-}
-
-static int
-isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn)
-{
-       struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
-       struct se_cmd *se_cmd = &cmd->se_cmd;
-       struct isert_conn *isert_conn = conn->context;
-       struct fast_reg_descriptor *fr_desc = NULL;
-       struct ib_rdma_wr *rdma_wr;
-       struct ib_sge *ib_sg;
-       u32 offset;
-       int ret = 0;
-       unsigned long flags;
-
-       offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ?
-                       cmd->write_data_done : 0;
-       ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
-                                se_cmd->t_data_nents, se_cmd->data_length,
-                                offset, isert_cmd->iser_ib_op,
-                                &isert_cmd->data);
-       if (ret)
-               return ret;
-
-       if (isert_cmd->data.dma_nents != 1 ||
-           isert_prot_cmd(isert_conn, se_cmd)) {
-               spin_lock_irqsave(&isert_conn->pool_lock, flags);
-               fr_desc = list_first_entry(&isert_conn->fr_pool,
-                                          struct fast_reg_descriptor, list);
-               list_del(&fr_desc->list);
-               spin_unlock_irqrestore(&isert_conn->pool_lock, flags);
-               isert_cmd->fr_desc = fr_desc;
-       }
-
-       ret = isert_fast_reg_mr(isert_conn, fr_desc, &isert_cmd->data,
-                               ISERT_DATA_KEY_VALID, &isert_cmd->ib_sg[DATA]);
-       if (ret)
-               goto unmap_cmd;
-
-       if (isert_prot_cmd(isert_conn, se_cmd)) {
-               ret = isert_handle_prot_cmd(isert_conn, isert_cmd);
-               if (ret)
-                       goto unmap_cmd;
-
-               ib_sg = &isert_cmd->ib_sg[SIG];
+       if (dir == DMA_FROM_DEVICE) {
+               addr = cmd->write_va;
+               rkey = cmd->write_stag;
+               offset = cmd->iscsi_cmd->write_data_done;
         } else {
-               ib_sg = &isert_cmd->ib_sg[DATA];
+               addr = cmd->read_va;
+               rkey = cmd->read_stag;
+               offset = 0;
         }
  
-       memcpy(&isert_cmd->s_ib_sge, ib_sg, sizeof(*ib_sg));
-       isert_cmd->ib_sge = &isert_cmd->s_ib_sge;
-       isert_cmd->rdma_wr_num = 1;
-       memset(&isert_cmd->s_rdma_wr, 0, sizeof(isert_cmd->s_rdma_wr));
-       isert_cmd->rdma_wr = &isert_cmd->s_rdma_wr;
+       if (isert_prot_cmd(conn, se_cmd)) {
+               struct ib_sig_attrs sig_attrs;
  
-       rdma_wr = &isert_cmd->s_rdma_wr;
-       rdma_wr->wr.sg_list = &isert_cmd->s_ib_sge;
-       rdma_wr->wr.num_sge = 1;
-       rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe;
-       if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) {
-               isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done;
+               ret = isert_set_sig_attrs(se_cmd, &sig_attrs);
+               if (ret)
+                       return ret;
  
-               rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
-               rdma_wr->remote_addr = isert_cmd->read_va;
-               rdma_wr->rkey = isert_cmd->read_stag;
-               rdma_wr->wr.send_flags = !isert_prot_cmd(isert_conn, se_cmd) ?
-                                     0 : IB_SEND_SIGNALED;
+               WARN_ON_ONCE(offset);
+               ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num,
+                               se_cmd->t_data_sg, se_cmd->t_data_nents,
+                               se_cmd->t_prot_sg, se_cmd->t_prot_nents,
+                               &sig_attrs, addr, rkey, dir);
         } else {
-               isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
-
-               rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-               rdma_wr->remote_addr = isert_cmd->write_va;
-               rdma_wr->rkey = isert_cmd->write_stag;
-               rdma_wr->wr.send_flags = IB_SEND_SIGNALED;
+               ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num,
+                               se_cmd->t_data_sg, se_cmd->t_data_nents,
+                               offset, addr, rkey, dir);
         }
-
-       return 0;
-
-unmap_cmd:
-       if (fr_desc) {
-               spin_lock_irqsave(&isert_conn->pool_lock, flags);
-               list_add_tail(&fr_desc->list, &isert_conn->fr_pool);
-               spin_unlock_irqrestore(&isert_conn->pool_lock, flags);
+       if (ret < 0) {
+               isert_err("Cmd: %p failed to prepare RDMA res\n", cmd);
+               return ret;
         }
-       isert_unmap_data_buf(isert_conn, &isert_cmd->data);
  
+       ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr);
+       if (ret < 0)
+               isert_err("Cmd: %p failed to post RDMA res\n", cmd);
         return ret;
  }
  
@@ -2778,21 +2126,17 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
         struct se_cmd *se_cmd = &cmd->se_cmd;
         struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
         struct isert_conn *isert_conn = conn->context;
-       struct isert_device *device = isert_conn->device;
-       struct ib_send_wr *wr_failed;
+       struct ib_cqe *cqe = NULL;
+       struct ib_send_wr *chain_wr = NULL;
         int rc;
  
         isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n",
                  isert_cmd, se_cmd->data_length);
  
-       isert_cmd->iser_ib_op = ISER_IB_RDMA_WRITE;
-       rc = device->reg_rdma_mem(isert_cmd, conn);
-       if (rc) {
-               isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
-               return rc;
-       }
-
-       if (!isert_prot_cmd(isert_conn, se_cmd)) {
+       if (isert_prot_cmd(isert_conn, se_cmd)) {
+               isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done;
+               cqe = &isert_cmd->tx_desc.tx_cqe;
+       } else {
                 /*
                  * Build isert_conn->tx_desc for iSCSI response PDU and attach
                  */
@@ -2803,56 +2147,35 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
                 isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
                 isert_init_send_wr(isert_conn, isert_cmd,
                                    &isert_cmd->tx_desc.send_wr);
-               isert_cmd->s_rdma_wr.wr.next = &isert_cmd->tx_desc.send_wr;
-               isert_cmd->rdma_wr_num += 1;
  
                 rc = isert_post_recv(isert_conn, isert_cmd->rx_desc);
                 if (rc) {
                         isert_err("ib_post_recv failed with %d\n", rc);
                         return rc;
                 }
-       }
  
-       rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed);
-       if (rc)
-               isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n");
-
-       if (!isert_prot_cmd(isert_conn, se_cmd))
-               isert_dbg("Cmd: %p posted RDMA_WRITE + Response for iSER Data "
-                        "READ\n", isert_cmd);
-       else
-               isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n",
-                        isert_cmd);
+               chain_wr = &isert_cmd->tx_desc.send_wr;
+       }
  
+       isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr);
+       isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd);
         return 1;
  }
  
  static int
  isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
  {
-       struct se_cmd *se_cmd = &cmd->se_cmd;
         struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
-       struct isert_conn *isert_conn = conn->context;
-       struct isert_device *device = isert_conn->device;
-       struct ib_send_wr *wr_failed;
-       int rc;
  
         isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n",
-                isert_cmd, se_cmd->data_length, cmd->write_data_done);
-       isert_cmd->iser_ib_op = ISER_IB_RDMA_READ;
-       rc = device->reg_rdma_mem(isert_cmd, conn);
-       if (rc) {
-               isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
-               return rc;
-       }
+                isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done);
  
-       rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed);
-       if (rc)
-               isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n");
+       isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
+       isert_rdma_rw_ctx_post(isert_cmd, conn->context,
+                       &isert_cmd->tx_desc.tx_cqe, NULL);
  
         isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n",
                  isert_cmd);
-
         return 0;
  }
  
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h

index 147900cbb5788209d952f2f394c2991dd1ea8084..e512ba941f2f980d659b3e75ec31fb708964ff01 100644 (file)
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -3,6 +3,7 @@
  #include <linux/in6.h>
  #include <rdma/ib_verbs.h>
  #include <rdma/rdma_cm.h>
+#include <rdma/rw.h>
  #include <scsi/iser.h>
  
  
@@ -53,10 +54,7 @@
  
  #define ISERT_MIN_POSTED_RX    (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
  
-#define ISERT_INFLIGHT_DATAOUTS        8
-
-#define ISERT_QP_MAX_REQ_DTOS  (ISCSI_DEF_XMIT_CMDS_MAX *    \
-                               (1 + ISERT_INFLIGHT_DATAOUTS) + \
+#define ISERT_QP_MAX_REQ_DTOS  (ISCSI_DEF_XMIT_CMDS_MAX +    \
                                 ISERT_MAX_TX_MISC_PDUS  + \
                                 ISERT_MAX_RX_MISC_PDUS)
  
@@ -71,13 +69,6 @@ enum isert_desc_type {
         ISCSI_TX_DATAIN
  };
  
-enum iser_ib_op_code {
-       ISER_IB_RECV,
-       ISER_IB_SEND,
-       ISER_IB_RDMA_WRITE,
-       ISER_IB_RDMA_READ,
-};
-
  enum iser_conn_state {
         ISER_CONN_INIT,
         ISER_CONN_UP,
@@ -118,42 +109,6 @@ static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe)
         return container_of(cqe, struct iser_tx_desc, tx_cqe);
  }
  
-
-enum isert_indicator {
-       ISERT_PROTECTED         = 1 << 0,
-       ISERT_DATA_KEY_VALID    = 1 << 1,
-       ISERT_PROT_KEY_VALID    = 1 << 2,
-       ISERT_SIG_KEY_VALID     = 1 << 3,
-};
-
-struct pi_context {
-       struct ib_mr                   *prot_mr;
-       struct ib_mr                   *sig_mr;
-};
-
-struct fast_reg_descriptor {
-       struct list_head                list;
-       struct ib_mr                   *data_mr;
-       u8                              ind;
-       struct pi_context              *pi_ctx;
-};
-
-struct isert_data_buf {
-       struct scatterlist     *sg;
-       int                     nents;
-       u32                     sg_off;
-       u32                     len; /* cur_rdma_length */
-       u32                     offset;
-       unsigned int            dma_nents;
-       enum dma_data_direction dma_dir;
-};
-
-enum {
-       DATA = 0,
-       PROT = 1,
-       SIG = 2,
-};
-
  struct isert_cmd {
         uint32_t                read_stag;
         uint32_t                write_stag;
@@ -166,16 +121,7 @@ struct isert_cmd {
         struct iscsi_cmd        *iscsi_cmd;
         struct iser_tx_desc     tx_desc;
         struct iser_rx_desc     *rx_desc;
-       enum iser_ib_op_code    iser_ib_op;
-       struct ib_sge           *ib_sge;
-       struct ib_sge           s_ib_sge;
-       int                     rdma_wr_num;
-       struct ib_rdma_wr       *rdma_wr;
-       struct ib_rdma_wr       s_rdma_wr;
-       struct ib_sge           ib_sg[3];
-       struct isert_data_buf   data;
-       struct isert_data_buf   prot;
-       struct fast_reg_descriptor *fr_desc;
+       struct rdma_rw_ctx      rw;
         struct work_struct      comp_work;
         struct scatterlist      sg;
  };
@@ -210,10 +156,6 @@ struct isert_conn {
         struct isert_device     *device;
         struct mutex            mutex;
         struct kref             kref;
-       struct list_head        fr_pool;
-       int                     fr_pool_size;
-       /* lock to protect fastreg pool */
-       spinlock_t              pool_lock;
         struct work_struct      release_work;
         bool                    logout_posted;
         bool                    snd_w_inv;
@@ -236,7 +178,6 @@ struct isert_comp {
  };
  
  struct isert_device {
-       int                     use_fastreg;
         bool                    pi_capable;
         int                     refcount;
         struct ib_device        *ib_device;
@@ -244,10 +185,6 @@ struct isert_device {
         struct isert_comp       *comps;
         int                     comps_used;
         struct list_head        dev_node;
-       int                     (*reg_rdma_mem)(struct isert_cmd *isert_cmd,
-                                               struct iscsi_conn *conn);
-       void                    (*unreg_rdma_mem)(struct isert_cmd *isert_cmd,
-                                                 struct isert_conn *isert_conn);
  };
  
  struct isert_np {
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c

index 369a75e1f44ecbb671af1b24a6ad124f33fed13c..646de170ec12e44a9eb20661fa4614e54dd0afe2 100644 (file)
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -70,6 +70,7 @@ static unsigned int indirect_sg_entries;
  static bool allow_ext_sg;
  static bool prefer_fr = true;
  static bool register_always = true;
+static bool never_register;
  static int topspin_workarounds = 1;
  
  module_param(srp_sg_tablesize, uint, 0444);
@@ -99,6 +100,9 @@ module_param(register_always, bool, 0444);
  MODULE_PARM_DESC(register_always,
                  "Use memory registration even for contiguous memory regions");
  
+module_param(never_register, bool, 0444);
+MODULE_PARM_DESC(never_register, "Never register memory");
+
  static const struct kernel_param_ops srp_tmo_ops;
  
  static int srp_reconnect_delay = 10;
@@ -316,7 +320,7 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
         struct ib_fmr_pool_param fmr_param;
  
         memset(&fmr_param, 0, sizeof(fmr_param));
-       fmr_param.pool_size         = target->scsi_host->can_queue;
+       fmr_param.pool_size         = target->mr_pool_size;
         fmr_param.dirty_watermark   = fmr_param.pool_size / 4;
         fmr_param.cache             = 1;
         fmr_param.max_pages_per_fmr = dev->max_pages_per_mr;
@@ -441,23 +445,22 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
  {
         struct srp_device *dev = target->srp_host->srp_dev;
  
-       return srp_create_fr_pool(dev->dev, dev->pd,
-                                 target->scsi_host->can_queue,
+       return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size,
                                   dev->max_pages_per_mr);
  }
  
  /**
   * srp_destroy_qp() - destroy an RDMA queue pair
- * @ch: SRP RDMA channel.
+ * @qp: RDMA queue pair.
   *
   * Drain the qp before destroying it.  This avoids that the receive
   * completion handler can access the queue pair while it is
   * being destroyed.
   */
-static void srp_destroy_qp(struct srp_rdma_ch *ch)
+static void srp_destroy_qp(struct ib_qp *qp)
  {
-       ib_drain_rq(ch->qp);
-       ib_destroy_qp(ch->qp);
+       ib_drain_rq(qp);
+       ib_destroy_qp(qp);
  }
  
  static int srp_create_ch_ib(struct srp_rdma_ch *ch)
@@ -469,7 +472,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
         struct ib_qp *qp;
         struct ib_fmr_pool *fmr_pool = NULL;
         struct srp_fr_pool *fr_pool = NULL;
-       const int m = dev->use_fast_reg ? 3 : 1;
+       const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2;
         int ret;
  
         init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
@@ -530,7 +533,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
         }
  
         if (ch->qp)
-               srp_destroy_qp(ch);
+               srp_destroy_qp(ch->qp);
         if (ch->recv_cq)
                 ib_free_cq(ch->recv_cq);
         if (ch->send_cq)
@@ -554,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
         return 0;
  
  err_qp:
-       srp_destroy_qp(ch);
+       srp_destroy_qp(qp);
  
  err_send_cq:
         ib_free_cq(send_cq);
@@ -597,7 +600,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
                         ib_destroy_fmr_pool(ch->fmr_pool);
         }
  
-       srp_destroy_qp(ch);
+       srp_destroy_qp(ch->qp);
         ib_free_cq(ch->send_cq);
         ib_free_cq(ch->recv_cq);
  
@@ -850,7 +853,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch)
  
         for (i = 0; i < target->req_ring_size; ++i) {
                 req = &ch->req_ring[i];
-               mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+               mr_list = kmalloc(target->mr_per_cmd * sizeof(void *),
                                   GFP_KERNEL);
                 if (!mr_list)
                         goto out;
@@ -1112,7 +1115,7 @@ static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch,
  }
  
  /**
- * srp_free_req() - Unmap data and add request to the free request list.
+ * srp_free_req() - Unmap data and adjust ch->req_lim.
   * @ch:     SRP RDMA channel.
   * @req:    Request to be freed.
   * @scmnd:  SCSI command associated with @req.
@@ -1299,9 +1302,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
         srp_handle_qp_err(cq, wc, "FAST REG");
  }
  
+/*
+ * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset
+ * where to start in the first element. If sg_offset_p != NULL then
+ * *sg_offset_p is updated to the offset in state->sg[retval] of the first
+ * byte that has not yet been mapped.
+ */
  static int srp_map_finish_fr(struct srp_map_state *state,
                              struct srp_request *req,
-                            struct srp_rdma_ch *ch, int sg_nents)
+                            struct srp_rdma_ch *ch, int sg_nents,
+                            unsigned int *sg_offset_p)
  {
         struct srp_target_port *target = ch->target;
         struct srp_device *dev = target->srp_host->srp_dev;
@@ -1316,13 +1326,14 @@ static int srp_map_finish_fr(struct srp_map_state *state,
  
         WARN_ON_ONCE(!dev->use_fast_reg);
  
-       if (sg_nents == 0)
-               return 0;
-
         if (sg_nents == 1 && target->global_mr) {
-               srp_map_desc(state, sg_dma_address(state->sg),
-                            sg_dma_len(state->sg),
+               unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+
+               srp_map_desc(state, sg_dma_address(state->sg) + sg_offset,
+                            sg_dma_len(state->sg) - sg_offset,
                              target->global_mr->rkey);
+               if (sg_offset_p)
+                       *sg_offset_p = 0;
                 return 1;
         }
  
@@ -1333,9 +1344,17 @@ static int srp_map_finish_fr(struct srp_map_state *state,
         rkey = ib_inc_rkey(desc->mr->rkey);
         ib_update_fast_reg_key(desc->mr, rkey);
  
-       n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size);
-       if (unlikely(n < 0))
+       n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p,
+                        dev->mr_page_size);
+       if (unlikely(n < 0)) {
+               srp_fr_pool_put(ch->fr_pool, &desc, 1);
+               pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n",
+                        dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
+                        sg_offset_p ? *sg_offset_p : -1, n);
                 return n;
+       }
+
+       WARN_ON_ONCE(desc->mr->length == 0);
  
         req->reg_cqe.done = srp_reg_mr_err_done;
  
@@ -1357,8 +1376,10 @@ static int srp_map_finish_fr(struct srp_map_state *state,
                      desc->mr->length, desc->mr->rkey);
  
         err = ib_post_send(ch->qp, &wr.wr, &bad_wr);
-       if (unlikely(err))
+       if (unlikely(err)) {
+               WARN_ON_ONCE(err == -ENOMEM);
                 return err;
+       }
  
         return n;
  }
@@ -1398,7 +1419,7 @@ static int srp_map_sg_entry(struct srp_map_state *state,
         /*
          * If the last entry of the MR wasn't a full page, then we need to
          * close it out and start a new one -- we can only merge at page
-        * boundries.
+        * boundaries.
          */
         ret = 0;
         if (len != dev->mr_page_size)
@@ -1413,10 +1434,9 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
         struct scatterlist *sg;
         int i, ret;
  
-       state->desc = req->indirect_desc;
         state->pages = req->map_page;
         state->fmr.next = req->fmr_list;
-       state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt;
+       state->fmr.end = req->fmr_list + ch->target->mr_per_cmd;
  
         for_each_sg(scat, sg, count, i) {
                 ret = srp_map_sg_entry(state, ch, sg, i);
@@ -1428,8 +1448,6 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
         if (ret)
                 return ret;
  
-       req->nmdesc = state->nmdesc;
-
         return 0;
  }
  
@@ -1437,15 +1455,20 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
                          struct srp_request *req, struct scatterlist *scat,
                          int count)
  {
+       unsigned int sg_offset = 0;
+
         state->desc = req->indirect_desc;
         state->fr.next = req->fr_list;
-       state->fr.end = req->fr_list + ch->target->cmd_sg_cnt;
+       state->fr.end = req->fr_list + ch->target->mr_per_cmd;
         state->sg = scat;
  
+       if (count == 0)
+               return 0;
+
         while (count) {
                 int i, n;
  
-               n = srp_map_finish_fr(state, req, ch, count);
+               n = srp_map_finish_fr(state, req, ch, count, &sg_offset);
                 if (unlikely(n < 0))
                         return n;
  
@@ -1454,8 +1477,6 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
                         state->sg = sg_next(state->sg);
         }
  
-       req->nmdesc = state->nmdesc;
-
         return 0;
  }
  
@@ -1475,8 +1496,6 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch,
                              target->global_mr->rkey);
         }
  
-       req->nmdesc = state->nmdesc;
-
         return 0;
  }
  
@@ -1509,14 +1528,15 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
  
         if (dev->use_fast_reg) {
                 state.sg = idb_sg;
-               sg_set_buf(idb_sg, req->indirect_desc, idb_len);
+               sg_init_one(idb_sg, req->indirect_desc, idb_len);
                 idb_sg->dma_address = req->indirect_dma_addr; /* hack! */
  #ifdef CONFIG_NEED_SG_DMA_LENGTH
                 idb_sg->dma_length = idb_sg->length;          /* hack^2 */
  #endif
-               ret = srp_map_finish_fr(&state, req, ch, 1);
+               ret = srp_map_finish_fr(&state, req, ch, 1, NULL);
                 if (ret < 0)
                         return ret;
+               WARN_ON_ONCE(ret < 1);
         } else if (dev->use_fmr) {
                 state.pages = idb_pages;
                 state.pages[0] = (req->indirect_dma_addr &
@@ -1534,6 +1554,41 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
         return 0;
  }
  
+#if defined(DYNAMIC_DATA_DEBUG)
+static void srp_check_mapping(struct srp_map_state *state,
+                             struct srp_rdma_ch *ch, struct srp_request *req,
+                             struct scatterlist *scat, int count)
+{
+       struct srp_device *dev = ch->target->srp_host->srp_dev;
+       struct srp_fr_desc **pfr;
+       u64 desc_len = 0, mr_len = 0;
+       int i;
+
+       for (i = 0; i < state->ndesc; i++)
+               desc_len += be32_to_cpu(req->indirect_desc[i].len);
+       if (dev->use_fast_reg)
+               for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
+                       mr_len += (*pfr)->mr->length;
+       else if (dev->use_fmr)
+               for (i = 0; i < state->nmdesc; i++)
+                       mr_len += be32_to_cpu(req->indirect_desc[i].len);
+       if (desc_len != scsi_bufflen(req->scmnd) ||
+           mr_len > scsi_bufflen(req->scmnd))
+               pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
+                      scsi_bufflen(req->scmnd), desc_len, mr_len,
+                      state->ndesc, state->nmdesc);
+}
+#endif
+
+/**
+ * srp_map_data() - map SCSI data buffer onto an SRP request
+ * @scmnd: SCSI command to map
+ * @ch: SRP RDMA channel
+ * @req: SRP request
+ *
+ * Returns the length in bytes of the SRP_CMD IU or a negative value if
+ * mapping failed.
+ */
  static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                         struct srp_request *req)
  {
@@ -1601,11 +1656,23 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
  
         memset(&state, 0, sizeof(state));
         if (dev->use_fast_reg)
-               srp_map_sg_fr(&state, ch, req, scat, count);
+               ret = srp_map_sg_fr(&state, ch, req, scat, count);
         else if (dev->use_fmr)
-               srp_map_sg_fmr(&state, ch, req, scat, count);
+               ret = srp_map_sg_fmr(&state, ch, req, scat, count);
         else
-               srp_map_sg_dma(&state, ch, req, scat, count);
+               ret = srp_map_sg_dma(&state, ch, req, scat, count);
+       req->nmdesc = state.nmdesc;
+       if (ret < 0)
+               goto unmap;
+
+#if defined(DYNAMIC_DEBUG)
+       {
+               DEFINE_DYNAMIC_DEBUG_METADATA(ddm,
+                       "Memory mapping consistency check");
+               if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT))
+                       srp_check_mapping(&state, ch, req, scat, count);
+       }
+#endif
  
         /* We've mapped the request, now pull as much of the indirect
          * descriptor table as we can into the command buffer. If this
@@ -1628,7 +1695,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                                                 !target->allow_ext_sg)) {
                 shost_printk(KERN_ERR, target->scsi_host,
                              "Could not fit S/G list into SRP_CMD\n");
-               return -EIO;
+               ret = -EIO;
+               goto unmap;
         }
  
         count = min(state.ndesc, target->cmd_sg_cnt);
@@ -1646,7 +1714,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                 ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
                                   idb_len, &idb_rkey);
                 if (ret < 0)
-                       return ret;
+                       goto unmap;
                 req->nmdesc++;
         } else {
                 idb_rkey = cpu_to_be32(target->global_mr->rkey);
@@ -1672,6 +1740,12 @@ map_complete:
                 cmd->buf_fmt = fmt;
  
         return len;
+
+unmap:
+       srp_unmap_data(scmnd, ch, req);
+       if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size)
+               ret = -E2BIG;
+       return ret;
  }
  
  /*
@@ -2564,6 +2638,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
         return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
  }
  
+static int srp_slave_alloc(struct scsi_device *sdev)
+{
+       struct Scsi_Host *shost = sdev->host;
+       struct srp_target_port *target = host_to_target(shost);
+       struct srp_device *srp_dev = target->srp_host->srp_dev;
+       struct ib_device *ibdev = srp_dev->dev;
+
+       if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+               blk_queue_virt_boundary(sdev->request_queue,
+                                       ~srp_dev->mr_page_mask);
+
+       return 0;
+}
+
  static int srp_slave_configure(struct scsi_device *sdev)
  {
         struct Scsi_Host *shost = sdev->host;
@@ -2755,6 +2843,7 @@ static struct scsi_host_template srp_template = {
         .module                         = THIS_MODULE,
         .name                           = "InfiniBand SRP initiator",
         .proc_name                      = DRV_NAME,
+       .slave_alloc                    = srp_slave_alloc,
         .slave_configure                = srp_slave_configure,
         .info                           = srp_target_info,
         .queuecommand                   = srp_queuecommand,
@@ -2829,7 +2918,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
                 goto out;
         }
  
-       pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n",
+       pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n",
                  dev_name(&target->scsi_host->shost_gendev),
                  srp_sdev_count(target->scsi_host));
  
@@ -3161,6 +3250,7 @@ static ssize_t srp_create_target(struct device *dev,
         struct srp_device *srp_dev = host->srp_dev;
         struct ib_device *ibdev = srp_dev->dev;
         int ret, node_idx, node, cpu, i;
+       unsigned int max_sectors_per_mr, mr_per_cmd = 0;
         bool multich = false;
  
         target_host = scsi_host_alloc(&srp_template,
@@ -3217,7 +3307,33 @@ static ssize_t srp_create_target(struct device *dev,
                 target->sg_tablesize = target->cmd_sg_cnt;
         }
  
+       if (srp_dev->use_fast_reg || srp_dev->use_fmr) {
+               /*
+                * FR and FMR can only map one HCA page per entry. If the
+                * start address is not aligned on a HCA page boundary two
+                * entries will be used for the head and the tail although
+                * these two entries combined contain at most one HCA page of
+                * data. Hence the "+ 1" in the calculation below.
+                *
+                * The indirect data buffer descriptor is contiguous so the
+                * memory for that buffer will only be registered if
+                * register_always is true. Hence add one to mr_per_cmd if
+                * register_always has been set.
+                */
+               max_sectors_per_mr = srp_dev->max_pages_per_mr <<
+                                 (ilog2(srp_dev->mr_page_size) - 9);
+               mr_per_cmd = register_always +
+                       (target->scsi_host->max_sectors + 1 +
+                        max_sectors_per_mr - 1) / max_sectors_per_mr;
+               pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n",
+                        target->scsi_host->max_sectors,
+                        srp_dev->max_pages_per_mr, srp_dev->mr_page_size,
+                        max_sectors_per_mr, mr_per_cmd);
+       }
+
         target_host->sg_tablesize = target->sg_tablesize;
+       target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd;
+       target->mr_per_cmd = mr_per_cmd;
         target->indirect_size = target->sg_tablesize *
                                 sizeof (struct srp_direct_buf);
         target->max_iu_len = sizeof (struct srp_cmd) +
@@ -3414,17 +3530,6 @@ static void srp_add_one(struct ib_device *device)
         if (!srp_dev)
                 return;
  
-       srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
-                           device->map_phys_fmr && device->unmap_fmr);
-       srp_dev->has_fr = (device->attrs.device_cap_flags &
-                          IB_DEVICE_MEM_MGT_EXTENSIONS);
-       if (!srp_dev->has_fmr && !srp_dev->has_fr)
-               dev_warn(&device->dev, "neither FMR nor FR is supported\n");
-
-       srp_dev->use_fast_reg = (srp_dev->has_fr &&
-                                (!srp_dev->has_fmr || prefer_fr));
-       srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
-
         /*
          * Use the smallest page size supported by the HCA, down to a
          * minimum of 4096 bytes. We're unlikely to build large sglists
@@ -3435,8 +3540,25 @@ static void srp_add_one(struct ib_device *device)
         srp_dev->mr_page_mask   = ~((u64) srp_dev->mr_page_size - 1);
         max_pages_per_mr        = device->attrs.max_mr_size;
         do_div(max_pages_per_mr, srp_dev->mr_page_size);
+       pr_debug("%s: %llu / %u = %llu <> %u\n", __func__,
+                device->attrs.max_mr_size, srp_dev->mr_page_size,
+                max_pages_per_mr, SRP_MAX_PAGES_PER_MR);
         srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
                                           max_pages_per_mr);
+
+       srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+                           device->map_phys_fmr && device->unmap_fmr);
+       srp_dev->has_fr = (device->attrs.device_cap_flags &
+                          IB_DEVICE_MEM_MGT_EXTENSIONS);
+       if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) {
+               dev_warn(&device->dev, "neither FMR nor FR is supported\n");
+       } else if (!never_register &&
+                  device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) {
+               srp_dev->use_fast_reg = (srp_dev->has_fr &&
+                                        (!srp_dev->has_fmr || prefer_fr));
+               srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+       }
+
         if (srp_dev->use_fast_reg) {
                 srp_dev->max_pages_per_mr =
                         min_t(u32, srp_dev->max_pages_per_mr,
@@ -3456,7 +3578,8 @@ static void srp_add_one(struct ib_device *device)
         if (IS_ERR(srp_dev->pd))
                 goto free_dev;
  
-       if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) {
+       if (never_register || !register_always ||
+           (!srp_dev->has_fmr && !srp_dev->has_fr)) {
                 srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd,
                                                    IB_ACCESS_LOCAL_WRITE |
                                                    IB_ACCESS_REMOTE_READ |
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h

index 9e05ce4a04fd08b0a450f5a546b7fb51901c5afe..26bb9b0a7a639d8f7e3aac136376dbca786ba2d6 100644 (file)
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -202,6 +202,8 @@ struct srp_target_port {
         char                    target_name[32];
         unsigned int            scsi_id;
         unsigned int            sg_tablesize;
+       int                     mr_pool_size;
+       int                     mr_per_cmd;
         int                     queue_size;
         int                     req_ring_size;
         int                     comp_vector;
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c

index 8b42401d4795646019f4498ed909b306630f19ec..2843f1ae75bdf50bd647b026a68e077cb884723a 100644 (file)
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -764,52 +764,6 @@ static int srpt_post_recv(struct srpt_device *sdev,
         return ib_post_srq_recv(sdev->srq, &wr, &bad_wr);
  }
  
-/**
- * srpt_post_send() - Post an IB send request.
- *
- * Returns zero upon success and a non-zero value upon failure.
- */
-static int srpt_post_send(struct srpt_rdma_ch *ch,
-                         struct srpt_send_ioctx *ioctx, int len)
-{
-       struct ib_sge list;
-       struct ib_send_wr wr, *bad_wr;
-       struct srpt_device *sdev = ch->sport->sdev;
-       int ret;
-
-       atomic_inc(&ch->req_lim);
-
-       ret = -ENOMEM;
-       if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) {
-               pr_warn("IB send queue full (needed 1)\n");
-               goto out;
-       }
-
-       ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len,
-                                     DMA_TO_DEVICE);
-
-       list.addr = ioctx->ioctx.dma;
-       list.length = len;
-       list.lkey = sdev->pd->local_dma_lkey;
-
-       ioctx->ioctx.cqe.done = srpt_send_done;
-       wr.next = NULL;
-       wr.wr_cqe = &ioctx->ioctx.cqe;
-       wr.sg_list = &list;
-       wr.num_sge = 1;
-       wr.opcode = IB_WR_SEND;
-       wr.send_flags = IB_SEND_SIGNALED;
-
-       ret = ib_post_send(ch->qp, &wr, &bad_wr);
-
-out:
-       if (ret < 0) {
-               atomic_inc(&ch->sq_wr_avail);
-               atomic_dec(&ch->req_lim);
-       }
-       return ret;
-}
-
  /**
   * srpt_zerolength_write() - Perform a zero-length RDMA write.
   *
@@ -843,6 +797,110 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
         }
  }
  
+static int srpt_alloc_rw_ctxs(struct srpt_send_ioctx *ioctx,
+               struct srp_direct_buf *db, int nbufs, struct scatterlist **sg,
+               unsigned *sg_cnt)
+{
+       enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd);
+       struct srpt_rdma_ch *ch = ioctx->ch;
+       struct scatterlist *prev = NULL;
+       unsigned prev_nents;
+       int ret, i;
+
+       if (nbufs == 1) {
+               ioctx->rw_ctxs = &ioctx->s_rw_ctx;
+       } else {
+               ioctx->rw_ctxs = kmalloc_array(nbufs, sizeof(*ioctx->rw_ctxs),
+                       GFP_KERNEL);
+               if (!ioctx->rw_ctxs)
+                       return -ENOMEM;
+       }
+
+       for (i = ioctx->n_rw_ctx; i < nbufs; i++, db++) {
+               struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+               u64 remote_addr = be64_to_cpu(db->va);
+               u32 size = be32_to_cpu(db->len);
+               u32 rkey = be32_to_cpu(db->key);
+
+               ret = target_alloc_sgl(&ctx->sg, &ctx->nents, size, false,
+                               i < nbufs - 1);
+               if (ret)
+                       goto unwind;
+
+               ret = rdma_rw_ctx_init(&ctx->rw, ch->qp, ch->sport->port,
+                               ctx->sg, ctx->nents, 0, remote_addr, rkey, dir);
+               if (ret < 0) {
+                       target_free_sgl(ctx->sg, ctx->nents);
+                       goto unwind;
+               }
+
+               ioctx->n_rdma += ret;
+               ioctx->n_rw_ctx++;
+
+               if (prev) {
+                       sg_unmark_end(&prev[prev_nents - 1]);
+                       sg_chain(prev, prev_nents + 1, ctx->sg);
+               } else {
+                       *sg = ctx->sg;
+               }
+
+               prev = ctx->sg;
+               prev_nents = ctx->nents;
+
+               *sg_cnt += ctx->nents;
+       }
+
+       return 0;
+
+unwind:
+       while (--i >= 0) {
+               struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+               rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port,
+                               ctx->sg, ctx->nents, dir);
+               target_free_sgl(ctx->sg, ctx->nents);
+       }
+       if (ioctx->rw_ctxs != &ioctx->s_rw_ctx)
+               kfree(ioctx->rw_ctxs);
+       return ret;
+}
+
+static void srpt_free_rw_ctxs(struct srpt_rdma_ch *ch,
+                                   struct srpt_send_ioctx *ioctx)
+{
+       enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd);
+       int i;
+
+       for (i = 0; i < ioctx->n_rw_ctx; i++) {
+               struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+               rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port,
+                               ctx->sg, ctx->nents, dir);
+               target_free_sgl(ctx->sg, ctx->nents);
+       }
+
+       if (ioctx->rw_ctxs != &ioctx->s_rw_ctx)
+               kfree(ioctx->rw_ctxs);
+}
+
+static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd)
+{
+       /*
+        * The pointer computations below will only be compiled correctly
+        * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
+        * whether srp_cmd::add_data has been declared as a byte pointer.
+        */
+       BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) &&
+                    !__same_type(srp_cmd->add_data[0], (u8)0));
+
+       /*
+        * According to the SRP spec, the lower two bits of the 'ADDITIONAL
+        * CDB LENGTH' field are reserved and the size in bytes of this field
+        * is four times the value specified in bits 3..7. Hence the "& ~3".
+        */
+       return srp_cmd->add_data + (srp_cmd->add_cdb_len & ~3);
+}
+
  /**
   * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request.
   * @ioctx: Pointer to the I/O context associated with the request.
@@ -858,94 +916,59 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
   * -ENOMEM when memory allocation fails and zero upon success.
   */
  static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
-                            struct srp_cmd *srp_cmd,
-                            enum dma_data_direction *dir, u64 *data_len)
+               struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
+               struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len)
  {
-       struct srp_indirect_buf *idb;
-       struct srp_direct_buf *db;
-       unsigned add_cdb_offset;
-       int ret;
-
-       /*
-        * The pointer computations below will only be compiled correctly
-        * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
-        * whether srp_cmd::add_data has been declared as a byte pointer.
-        */
-       BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0)
-                    && !__same_type(srp_cmd->add_data[0], (u8)0));
-
         BUG_ON(!dir);
         BUG_ON(!data_len);
  
-       ret = 0;
-       *data_len = 0;
-
         /*
          * The lower four bits of the buffer format field contain the DATA-IN
          * buffer descriptor format, and the highest four bits contain the
          * DATA-OUT buffer descriptor format.
          */
-       *dir = DMA_NONE;
         if (srp_cmd->buf_fmt & 0xf)
                 /* DATA-IN: transfer data from target to initiator (read). */
                 *dir = DMA_FROM_DEVICE;
         else if (srp_cmd->buf_fmt >> 4)
                 /* DATA-OUT: transfer data from initiator to target (write). */
                 *dir = DMA_TO_DEVICE;
+       else
+               *dir = DMA_NONE;
+
+       /* initialize data_direction early as srpt_alloc_rw_ctxs needs it */
+       ioctx->cmd.data_direction = *dir;
  
-       /*
-        * According to the SRP spec, the lower two bits of the 'ADDITIONAL
-        * CDB LENGTH' field are reserved and the size in bytes of this field
-        * is four times the value specified in bits 3..7. Hence the "& ~3".
-        */
-       add_cdb_offset = srp_cmd->add_cdb_len & ~3;
         if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
             ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
-               ioctx->n_rbuf = 1;
-               ioctx->rbufs = &ioctx->single_rbuf;
+               struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
  
-               db = (struct srp_direct_buf *)(srp_cmd->add_data
-                                              + add_cdb_offset);
-               memcpy(ioctx->rbufs, db, sizeof(*db));
                 *data_len = be32_to_cpu(db->len);
+               return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt);
         } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) ||
                    ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) {
-               idb = (struct srp_indirect_buf *)(srp_cmd->add_data
-                                                 + add_cdb_offset);
+               struct srp_indirect_buf *idb = srpt_get_desc_buf(srp_cmd);
+               int nbufs = be32_to_cpu(idb->table_desc.len) /
+                               sizeof(struct srp_direct_buf);
  
-               ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db);
-
-               if (ioctx->n_rbuf >
+               if (nbufs >
                     (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
                         pr_err("received unsupported SRP_CMD request"
                                " type (%u out + %u in != %u / %zu)\n",
                                srp_cmd->data_out_desc_cnt,
                                srp_cmd->data_in_desc_cnt,
                                be32_to_cpu(idb->table_desc.len),
-                              sizeof(*db));
-                       ioctx->n_rbuf = 0;
-                       ret = -EINVAL;
-                       goto out;
-               }
-
-               if (ioctx->n_rbuf == 1)
-                       ioctx->rbufs = &ioctx->single_rbuf;
-               else {
-                       ioctx->rbufs =
-                               kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC);
-                       if (!ioctx->rbufs) {
-                               ioctx->n_rbuf = 0;
-                               ret = -ENOMEM;
-                               goto out;
-                       }
+                              sizeof(struct srp_direct_buf));
+                       return -EINVAL;
                 }
  
-               db = idb->desc_list;
-               memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db));
                 *data_len = be32_to_cpu(idb->len);
+               return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs,
+                               sg, sg_cnt);
+       } else {
+               *data_len = 0;
+               return 0;
         }
-out:
-       return ret;
  }
  
  /**
@@ -1048,217 +1071,6 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch)
         return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE);
  }
  
-/**
- * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list.
- */
-static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
-                                   struct srpt_send_ioctx *ioctx)
-{
-       struct scatterlist *sg;
-       enum dma_data_direction dir;
-
-       BUG_ON(!ch);
-       BUG_ON(!ioctx);
-       BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs);
-
-       while (ioctx->n_rdma)
-               kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list);
-
-       kfree(ioctx->rdma_wrs);
-       ioctx->rdma_wrs = NULL;
-
-       if (ioctx->mapped_sg_count) {
-               sg = ioctx->sg;
-               WARN_ON(!sg);
-               dir = ioctx->cmd.data_direction;
-               BUG_ON(dir == DMA_NONE);
-               ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt,
-                               target_reverse_dma_direction(&ioctx->cmd));
-               ioctx->mapped_sg_count = 0;
-       }
-}
-
-/**
- * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list.
- */
-static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
-                                struct srpt_send_ioctx *ioctx)
-{
-       struct ib_device *dev = ch->sport->sdev->device;
-       struct se_cmd *cmd;
-       struct scatterlist *sg, *sg_orig;
-       int sg_cnt;
-       enum dma_data_direction dir;
-       struct ib_rdma_wr *riu;
-       struct srp_direct_buf *db;
-       dma_addr_t dma_addr;
-       struct ib_sge *sge;
-       u64 raddr;
-       u32 rsize;
-       u32 tsize;
-       u32 dma_len;
-       int count, nrdma;
-       int i, j, k;
-
-       BUG_ON(!ch);
-       BUG_ON(!ioctx);
-       cmd = &ioctx->cmd;
-       dir = cmd->data_direction;
-       BUG_ON(dir == DMA_NONE);
-
-       ioctx->sg = sg = sg_orig = cmd->t_data_sg;
-       ioctx->sg_cnt = sg_cnt = cmd->t_data_nents;
-
-       count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt,
-                             target_reverse_dma_direction(cmd));
-       if (unlikely(!count))
-               return -EAGAIN;
-
-       ioctx->mapped_sg_count = count;
-
-       if (ioctx->rdma_wrs && ioctx->n_rdma_wrs)
-               nrdma = ioctx->n_rdma_wrs;
-       else {
-               nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
-                       + ioctx->n_rbuf;
-
-               ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs),
-                               GFP_KERNEL);
-               if (!ioctx->rdma_wrs)
-                       goto free_mem;
-
-               ioctx->n_rdma_wrs = nrdma;
-       }
-
-       db = ioctx->rbufs;
-       tsize = cmd->data_length;
-       dma_len = ib_sg_dma_len(dev, &sg[0]);
-       riu = ioctx->rdma_wrs;
-
-       /*
-        * For each remote desc - calculate the #ib_sge.
-        * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then
-        *      each remote desc rdma_iu is required a rdma wr;
-        * else
-        *      we need to allocate extra rdma_iu to carry extra #ib_sge in
-        *      another rdma wr
-        */
-       for (i = 0, j = 0;
-            j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
-               rsize = be32_to_cpu(db->len);
-               raddr = be64_to_cpu(db->va);
-               riu->remote_addr = raddr;
-               riu->rkey = be32_to_cpu(db->key);
-               riu->wr.num_sge = 0;
-
-               /* calculate how many sge required for this remote_buf */
-               while (rsize > 0 && tsize > 0) {
-
-                       if (rsize >= dma_len) {
-                               tsize -= dma_len;
-                               rsize -= dma_len;
-                               raddr += dma_len;
-
-                               if (tsize > 0) {
-                                       ++j;
-                                       if (j < count) {
-                                               sg = sg_next(sg);
-                                               dma_len = ib_sg_dma_len(
-                                                               dev, sg);
-                                       }
-                               }
-                       } else {
-                               tsize -= rsize;
-                               dma_len -= rsize;
-                               rsize = 0;
-                       }
-
-                       ++riu->wr.num_sge;
-
-                       if (rsize > 0 &&
-                           riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
-                               ++ioctx->n_rdma;
-                               riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
-                                               sizeof(*riu->wr.sg_list),
-                                               GFP_KERNEL);
-                               if (!riu->wr.sg_list)
-                                       goto free_mem;
-
-                               ++riu;
-                               riu->wr.num_sge = 0;
-                               riu->remote_addr = raddr;
-                               riu->rkey = be32_to_cpu(db->key);
-                       }
-               }
-
-               ++ioctx->n_rdma;
-               riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
-                                       sizeof(*riu->wr.sg_list),
-                                       GFP_KERNEL);
-               if (!riu->wr.sg_list)
-                       goto free_mem;
-       }
-
-       db = ioctx->rbufs;
-       tsize = cmd->data_length;
-       riu = ioctx->rdma_wrs;
-       sg = sg_orig;
-       dma_len = ib_sg_dma_len(dev, &sg[0]);
-       dma_addr = ib_sg_dma_address(dev, &sg[0]);
-
-       /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */
-       for (i = 0, j = 0;
-            j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
-               rsize = be32_to_cpu(db->len);
-               sge = riu->wr.sg_list;
-               k = 0;
-
-               while (rsize > 0 && tsize > 0) {
-                       sge->addr = dma_addr;
-                       sge->lkey = ch->sport->sdev->pd->local_dma_lkey;
-
-                       if (rsize >= dma_len) {
-                               sge->length =
-                                       (tsize < dma_len) ? tsize : dma_len;
-                               tsize -= dma_len;
-                               rsize -= dma_len;
-
-                               if (tsize > 0) {
-                                       ++j;
-                                       if (j < count) {
-                                               sg = sg_next(sg);
-                                               dma_len = ib_sg_dma_len(
-                                                               dev, sg);
-                                               dma_addr = ib_sg_dma_address(
-                                                               dev, sg);
-                                       }
-                               }
-                       } else {
-                               sge->length = (tsize < rsize) ? tsize : rsize;
-                               tsize -= rsize;
-                               dma_len -= rsize;
-                               dma_addr += rsize;
-                               rsize = 0;
-                       }
-
-                       ++k;
-                       if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) {
-                               ++riu;
-                               sge = riu->wr.sg_list;
-                               k = 0;
-                       } else if (rsize > 0 && tsize > 0)
-                               ++sge;
-               }
-       }
-
-       return 0;
-
-free_mem:
-       srpt_unmap_sg_to_ib_sge(ch, ioctx);
-
-       return -ENOMEM;
-}
-
  /**
   * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator.
   */
@@ -1284,12 +1096,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
         BUG_ON(ioctx->ch != ch);
         spin_lock_init(&ioctx->spinlock);
         ioctx->state = SRPT_STATE_NEW;
-       ioctx->n_rbuf = 0;
-       ioctx->rbufs = NULL;
         ioctx->n_rdma = 0;
-       ioctx->n_rdma_wrs = 0;
-       ioctx->rdma_wrs = NULL;
-       ioctx->mapped_sg_count = 0;
+       ioctx->n_rw_ctx = 0;
         init_completion(&ioctx->tx_done);
         ioctx->queue_status_only = false;
         /*
@@ -1359,7 +1167,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
                  * SRP_RSP sending failed or the SRP_RSP send completion has
                  * not been received in time.
                  */
-               srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
                 transport_generic_free_cmd(&ioctx->cmd, 0);
                 break;
         case SRPT_STATE_MGMT_RSP_SENT:
@@ -1387,6 +1194,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
  
         WARN_ON(ioctx->n_rdma <= 0);
         atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+       ioctx->n_rdma = 0;
  
         if (unlikely(wc->status != IB_WC_SUCCESS)) {
                 pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n",
@@ -1403,23 +1211,6 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
                        __LINE__, srpt_get_cmd_state(ioctx));
  }
  
-static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
-{
-       struct srpt_send_ioctx *ioctx =
-               container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
-
-       if (unlikely(wc->status != IB_WC_SUCCESS)) {
-               /*
-                * Note: if an RDMA write error completion is received that
-                * means that a SEND also has been posted. Defer further
-                * processing of the associated command until the send error
-                * completion has been received.
-                */
-               pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n",
-                       ioctx, wc->status);
-       }
-}
-
  /**
   * srpt_build_cmd_rsp() - Build an SRP_RSP response.
   * @ch: RDMA channel through which the request has been received.
@@ -1537,6 +1328,8 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
  {
         struct se_cmd *cmd;
         struct srp_cmd *srp_cmd;
+       struct scatterlist *sg = NULL;
+       unsigned sg_cnt = 0;
         u64 data_len;
         enum dma_data_direction dir;
         int rc;
@@ -1563,16 +1356,21 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
                 break;
         }
  
-       if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) {
-               pr_err("0x%llx: parsing SRP descriptor table failed.\n",
-                      srp_cmd->tag);
+       rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt,
+                       &data_len);
+       if (rc) {
+               if (rc != -EAGAIN) {
+                       pr_err("0x%llx: parsing SRP descriptor table failed.\n",
+                              srp_cmd->tag);
+               }
                 goto release_ioctx;
         }
  
-       rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb,
+       rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb,
                                &send_ioctx->sense_data[0],
                                scsilun_to_int(&srp_cmd->lun), data_len,
-                              TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF);
+                              TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF,
+                              sg, sg_cnt, NULL, 0, NULL, 0);
         if (rc != 0) {
                 pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc,
                          srp_cmd->tag);
@@ -1664,23 +1462,21 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
                                    recv_ioctx->ioctx.dma, srp_max_req_size,
                                    DMA_FROM_DEVICE);
  
-       if (unlikely(ch->state == CH_CONNECTING)) {
-               list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
-               goto out;
-       }
+       if (unlikely(ch->state == CH_CONNECTING))
+               goto out_wait;
  
         if (unlikely(ch->state != CH_LIVE))
-               goto out;
+               return;
  
         srp_cmd = recv_ioctx->ioctx.buf;
         if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) {
-               if (!send_ioctx)
+               if (!send_ioctx) {
+                       if (!list_empty(&ch->cmd_wait_list))
+                               goto out_wait;
                         send_ioctx = srpt_get_send_ioctx(ch);
-               if (unlikely(!send_ioctx)) {
-                       list_add_tail(&recv_ioctx->wait_list,
-                                     &ch->cmd_wait_list);
-                       goto out;
                 }
+               if (unlikely(!send_ioctx))
+                       goto out_wait;
         }
  
         switch (srp_cmd->opcode) {
@@ -1709,8 +1505,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
         }
  
         srpt_post_recv(ch->sport->sdev, recv_ioctx);
-out:
         return;
+
+out_wait:
+       list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
  }
  
  static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1779,14 +1577,13 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
         WARN_ON(state != SRPT_STATE_CMD_RSP_SENT &&
                 state != SRPT_STATE_MGMT_RSP_SENT);
  
-       atomic_inc(&ch->sq_wr_avail);
+       atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
  
         if (wc->status != IB_WC_SUCCESS)
                 pr_info("sending response for ioctx 0x%p failed"
                         " with status %d\n", ioctx, wc->status);
  
         if (state != SRPT_STATE_DONE) {
-               srpt_unmap_sg_to_ib_sge(ch, ioctx);
                 transport_generic_free_cmd(&ioctx->cmd, 0);
         } else {
                 pr_err("IB completion has been received too late for"
@@ -1832,8 +1629,18 @@ retry:
         qp_init->srq = sdev->srq;
         qp_init->sq_sig_type = IB_SIGNAL_REQ_WR;
         qp_init->qp_type = IB_QPT_RC;
-       qp_init->cap.max_send_wr = srp_sq_size;
-       qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+       /*
+        * We divide up our send queue size into half SEND WRs to send the
+        * completions, and half R/W contexts to actually do the RDMA
+        * READ/WRITE transfers.  Note that we need to allocate CQ slots for
+        * both, as RDMA contexts will also post completions for the
+        * RDMA READ case.
+        */
+       qp_init->cap.max_send_wr = srp_sq_size / 2;
+       qp_init->cap.max_rdma_ctxs = srp_sq_size / 2;
+       qp_init->cap.max_send_sge = max(sdev->device->attrs.max_sge_rd,
+                                       sdev->device->attrs.max_sge);
+       qp_init->port_num = ch->sport->port;
  
         ch->qp = ib_create_qp(sdev->pd, qp_init);
         if (IS_ERR(ch->qp)) {
@@ -2386,95 +2193,6 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
         return ret;
  }
  
-/**
- * srpt_perform_rdmas() - Perform IB RDMA.
- *
- * Returns zero upon success or a negative number upon failure.
- */
-static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
-                             struct srpt_send_ioctx *ioctx)
-{
-       struct ib_send_wr *bad_wr;
-       int sq_wr_avail, ret, i;
-       enum dma_data_direction dir;
-       const int n_rdma = ioctx->n_rdma;
-
-       dir = ioctx->cmd.data_direction;
-       if (dir == DMA_TO_DEVICE) {
-               /* write */
-               ret = -ENOMEM;
-               sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail);
-               if (sq_wr_avail < 0) {
-                       pr_warn("IB send queue full (needed %d)\n",
-                               n_rdma);
-                       goto out;
-               }
-       }
-
-       for (i = 0; i < n_rdma; i++) {
-               struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr;
-
-               wr->opcode = (dir == DMA_FROM_DEVICE) ?
-                               IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
-
-               if (i == n_rdma - 1) {
-                       /* only get completion event for the last rdma read */
-                       if (dir == DMA_TO_DEVICE) {
-                               wr->send_flags = IB_SEND_SIGNALED;
-                               ioctx->rdma_cqe.done = srpt_rdma_read_done;
-                       } else {
-                               ioctx->rdma_cqe.done = srpt_rdma_write_done;
-                       }
-                       wr->wr_cqe = &ioctx->rdma_cqe;
-                       wr->next = NULL;
-               } else {
-                       wr->wr_cqe = NULL;
-                       wr->next = &ioctx->rdma_wrs[i + 1].wr;
-               }
-       }
-
-       ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr);
-       if (ret)
-               pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
-                                __func__, __LINE__, ret, i, n_rdma);
-out:
-       if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
-               atomic_add(n_rdma, &ch->sq_wr_avail);
-       return ret;
-}
-
-/**
- * srpt_xfer_data() - Start data transfer from initiator to target.
- */
-static int srpt_xfer_data(struct srpt_rdma_ch *ch,
-                         struct srpt_send_ioctx *ioctx)
-{
-       int ret;
-
-       ret = srpt_map_sg_to_ib_sge(ch, ioctx);
-       if (ret) {
-               pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret);
-               goto out;
-       }
-
-       ret = srpt_perform_rdmas(ch, ioctx);
-       if (ret) {
-               if (ret == -EAGAIN || ret == -ENOMEM)
-                       pr_info("%s[%d] queue full -- ret=%d\n",
-                               __func__, __LINE__, ret);
-               else
-                       pr_err("%s[%d] fatal error -- ret=%d\n",
-                              __func__, __LINE__, ret);
-               goto out_unmap;
-       }
-
-out:
-       return ret;
-out_unmap:
-       srpt_unmap_sg_to_ib_sge(ch, ioctx);
-       goto out;
-}
-
  static int srpt_write_pending_status(struct se_cmd *se_cmd)
  {
         struct srpt_send_ioctx *ioctx;
@@ -2491,11 +2209,42 @@ static int srpt_write_pending(struct se_cmd *se_cmd)
         struct srpt_send_ioctx *ioctx =
                 container_of(se_cmd, struct srpt_send_ioctx, cmd);
         struct srpt_rdma_ch *ch = ioctx->ch;
+       struct ib_send_wr *first_wr = NULL, *bad_wr;
+       struct ib_cqe *cqe = &ioctx->rdma_cqe;
         enum srpt_command_state new_state;
+       int ret, i;
  
         new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
         WARN_ON(new_state == SRPT_STATE_DONE);
-       return srpt_xfer_data(ch, ioctx);
+
+       if (atomic_sub_return(ioctx->n_rdma, &ch->sq_wr_avail) < 0) {
+               pr_warn("%s: IB send queue full (needed %d)\n",
+                               __func__, ioctx->n_rdma);
+               ret = -ENOMEM;
+               goto out_undo;
+       }
+
+       cqe->done = srpt_rdma_read_done;
+       for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) {
+               struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+               first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, ch->sport->port,
+                               cqe, first_wr);
+               cqe = NULL;
+       }
+       
+       ret = ib_post_send(ch->qp, first_wr, &bad_wr);
+       if (ret) {
+               pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n",
+                        __func__, ret, ioctx->n_rdma,
+                        atomic_read(&ch->sq_wr_avail));
+               goto out_undo;
+       }
+
+       return 0;
+out_undo:
+       atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+       return ret;
  }
  
  static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
@@ -2517,17 +2266,17 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
   */
  static void srpt_queue_response(struct se_cmd *cmd)
  {
-       struct srpt_rdma_ch *ch;
-       struct srpt_send_ioctx *ioctx;
+       struct srpt_send_ioctx *ioctx =
+               container_of(cmd, struct srpt_send_ioctx, cmd);
+       struct srpt_rdma_ch *ch = ioctx->ch;
+       struct srpt_device *sdev = ch->sport->sdev;
+       struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr;
+       struct ib_sge sge;
         enum srpt_command_state state;
         unsigned long flags;
-       int ret;
-       enum dma_data_direction dir;
-       int resp_len;
+       int resp_len, ret, i;
         u8 srp_tm_status;
  
-       ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
-       ch = ioctx->ch;
         BUG_ON(!ch);
  
         spin_lock_irqsave(&ioctx->spinlock, flags);
@@ -2554,17 +2303,19 @@ static void srpt_queue_response(struct se_cmd *cmd)
                 return;
         }
  
-       dir = ioctx->cmd.data_direction;
-
         /* For read commands, transfer the data to the initiator. */
-       if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length &&
+       if (ioctx->cmd.data_direction == DMA_FROM_DEVICE &&
+           ioctx->cmd.data_length &&
             !ioctx->queue_status_only) {
-               ret = srpt_xfer_data(ch, ioctx);
-               if (ret) {
-                       pr_err("xfer_data failed for tag %llu\n",
-                              ioctx->cmd.tag);
-                       return;
+               for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) {
+                       struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+                       first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp,
+                                       ch->sport->port, NULL,
+                                       first_wr ? first_wr : &send_wr);
                 }
+       } else {
+               first_wr = &send_wr;
         }
  
         if (state != SRPT_STATE_MGMT)
@@ -2576,14 +2327,46 @@ static void srpt_queue_response(struct se_cmd *cmd)
                 resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status,
                                                  ioctx->cmd.tag);
         }
-       ret = srpt_post_send(ch, ioctx, resp_len);
-       if (ret) {
-               pr_err("sending cmd response failed for tag %llu\n",
-                      ioctx->cmd.tag);
-               srpt_unmap_sg_to_ib_sge(ch, ioctx);
-               srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
-               target_put_sess_cmd(&ioctx->cmd);
+
+       atomic_inc(&ch->req_lim);
+
+       if (unlikely(atomic_sub_return(1 + ioctx->n_rdma,
+                       &ch->sq_wr_avail) < 0)) {
+               pr_warn("%s: IB send queue full (needed %d)\n",
+                               __func__, ioctx->n_rdma);
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, resp_len,
+                                     DMA_TO_DEVICE);
+
+       sge.addr = ioctx->ioctx.dma;
+       sge.length = resp_len;
+       sge.lkey = sdev->pd->local_dma_lkey;
+
+       ioctx->ioctx.cqe.done = srpt_send_done;
+       send_wr.next = NULL;
+       send_wr.wr_cqe = &ioctx->ioctx.cqe;
+       send_wr.sg_list = &sge;
+       send_wr.num_sge = 1;
+       send_wr.opcode = IB_WR_SEND;
+       send_wr.send_flags = IB_SEND_SIGNALED;
+
+       ret = ib_post_send(ch->qp, first_wr, &bad_wr);
+       if (ret < 0) {
+               pr_err("%s: sending cmd response failed for tag %llu (%d)\n",
+                       __func__, ioctx->cmd.tag, ret);
+               goto out;
         }
+
+       return;
+
+out:
+       atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
+       atomic_dec(&ch->req_lim);
+       srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+       target_put_sess_cmd(&ioctx->cmd);
  }
  
  static int srpt_queue_data_in(struct se_cmd *cmd)
@@ -2599,10 +2382,6 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd)
  
  static void srpt_aborted_task(struct se_cmd *cmd)
  {
-       struct srpt_send_ioctx *ioctx = container_of(cmd,
-                               struct srpt_send_ioctx, cmd);
-
-       srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
  }
  
  static int srpt_queue_status(struct se_cmd *cmd)
@@ -2903,12 +2682,10 @@ static void srpt_release_cmd(struct se_cmd *se_cmd)
         unsigned long flags;
  
         WARN_ON(ioctx->state != SRPT_STATE_DONE);
-       WARN_ON(ioctx->mapped_sg_count != 0);
  
-       if (ioctx->n_rbuf > 1) {
-               kfree(ioctx->rbufs);
-               ioctx->rbufs = NULL;
-               ioctx->n_rbuf = 0;
+       if (ioctx->n_rw_ctx) {
+               srpt_free_rw_ctxs(ch, ioctx);
+               ioctx->n_rw_ctx = 0;
         }
  
         spin_lock_irqsave(&ch->spinlock, flags);
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h

index af9b8b527340c80f4c8af515cc4aa641a5c5b426..fee6bfd7ca218f138e38f58318bbc56f12402f5b 100644 (file)
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -42,6 +42,7 @@
  #include <rdma/ib_verbs.h>
  #include <rdma/ib_sa.h>
  #include <rdma/ib_cm.h>
+#include <rdma/rw.h>
  
  #include <scsi/srp.h>
  
@@ -105,7 +106,6 @@ enum {
         SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
  
         SRPT_DEF_SG_TABLESIZE = 128,
-       SRPT_DEF_SG_PER_WQE = 16,
  
         MIN_SRPT_SQ_SIZE = 16,
         DEF_SRPT_SQ_SIZE = 4096,
@@ -174,21 +174,17 @@ struct srpt_recv_ioctx {
         struct srpt_ioctx       ioctx;
         struct list_head        wait_list;
  };
+       
+struct srpt_rw_ctx {
+       struct rdma_rw_ctx      rw;
+       struct scatterlist      *sg;
+       unsigned int            nents;
+};
  
  /**
   * struct srpt_send_ioctx - SRPT send I/O context.
   * @ioctx:       See above.
   * @ch:          Channel pointer.
- * @free_list:   Node in srpt_rdma_ch.free_list.
- * @n_rbuf:      Number of data buffers in the received SRP command.
- * @rbufs:       Pointer to SRP data buffer array.
- * @single_rbuf: SRP data buffer if the command has only a single buffer.
- * @sg:          Pointer to sg-list associated with this I/O context.
- * @sg_cnt:      SG-list size.
- * @mapped_sg_count: ib_dma_map_sg() return value.
- * @n_rdma_wrs:  Number of elements in the rdma_wrs array.
- * @rdma_wrs:    Array with information about the RDMA mapping.
- * @tag:         Tag of the received SRP information unit.
   * @spinlock:    Protects 'state'.
   * @state:       I/O context state.
   * @cmd:         Target core command data structure.
@@ -197,21 +193,18 @@ struct srpt_recv_ioctx {
  struct srpt_send_ioctx {
         struct srpt_ioctx       ioctx;
         struct srpt_rdma_ch     *ch;
-       struct ib_rdma_wr       *rdma_wrs;
+
+       struct srpt_rw_ctx      s_rw_ctx;
+       struct srpt_rw_ctx      *rw_ctxs;
+
         struct ib_cqe           rdma_cqe;
-       struct srp_direct_buf   *rbufs;
-       struct srp_direct_buf   single_rbuf;
-       struct scatterlist      *sg;
         struct list_head        free_list;
         spinlock_t              spinlock;
         enum srpt_command_state state;
         struct se_cmd           cmd;
         struct completion       tx_done;
-       int                     sg_cnt;
-       int                     mapped_sg_count;
-       u16                     n_rdma_wrs;
         u8                      n_rdma;
-       u8                      n_rbuf;
+       u8                      n_rw_ctx;
         bool                    queue_status_only;
         u8                      sense_data[TRANSPORT_SENSE_BUFFER];
  };
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c

index 0360919a5737afcdcca8d741b93f4cd087ac4a59..e206ce7a4e4bdb7ed2fb1a7d60dd1ea90e15cb0c 100644 (file)
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -50,7 +50,7 @@
  #include "io-pgtable.h"
  
  /* Maximum number of stream IDs assigned to a single device */
-#define MAX_MASTER_STREAMIDS           MAX_PHANDLE_ARGS
+#define MAX_MASTER_STREAMIDS           128
  
  /* Maximum number of context banks per SMMU */
  #define ARM_SMMU_MAX_CBS               128
@@ -397,6 +397,12 @@ struct arm_smmu_domain {
         struct iommu_domain             domain;
  };
  
+struct arm_smmu_phandle_args {
+       struct device_node *np;
+       int args_count;
+       uint32_t args[MAX_MASTER_STREAMIDS];
+};
+
  static DEFINE_SPINLOCK(arm_smmu_devices_lock);
  static LIST_HEAD(arm_smmu_devices);
  
@@ -506,7 +512,7 @@ static int insert_smmu_master(struct arm_smmu_device *smmu,
  
  static int register_smmu_master(struct arm_smmu_device *smmu,
                                 struct device *dev,
-                               struct of_phandle_args *masterspec)
+                               struct arm_smmu_phandle_args *masterspec)
  {
         int i;
         struct arm_smmu_master *master;
@@ -1875,7 +1881,8 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev)
         struct arm_smmu_device *smmu;
         struct device *dev = &pdev->dev;
         struct rb_node *node;
-       struct of_phandle_args masterspec;
+       struct of_phandle_iterator it;
+       struct arm_smmu_phandle_args *masterspec;
         int num_irqs, i, err;
  
         smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL);
@@ -1938,20 +1945,35 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev)
  
         i = 0;
         smmu->masters = RB_ROOT;
-       while (!of_parse_phandle_with_args(dev->of_node, "mmu-masters",
-                                          "#stream-id-cells", i,
-                                          &masterspec)) {
-               err = register_smmu_master(smmu, dev, &masterspec);
+
+       err = -ENOMEM;
+       /* No need to zero the memory for masterspec */
+       masterspec = kmalloc(sizeof(*masterspec), GFP_KERNEL);
+       if (!masterspec)
+               goto out_put_masters;
+
+       of_for_each_phandle(&it, err, dev->of_node,
+                           "mmu-masters", "#stream-id-cells", 0) {
+               int count = of_phandle_iterator_args(&it, masterspec->args,
+                                                    MAX_MASTER_STREAMIDS);
+               masterspec->np          = of_node_get(it.node);
+               masterspec->args_count  = count;
+
+               err = register_smmu_master(smmu, dev, masterspec);
                 if (err) {
                         dev_err(dev, "failed to add master %s\n",
-                               masterspec.np->name);
+                               masterspec->np->name);
+                       kfree(masterspec);
                         goto out_put_masters;
                 }
  
                 i++;
         }
+
         dev_notice(dev, "registered %d master devices\n", i);
  
+       kfree(masterspec);
+
         parse_driver_options(smmu);
  
         if (smmu->version == ARM_SMMU_V2 &&
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig

index 46f10ec17d5cfba7427ea8784fd736ff396b0071..fa33c50b0e5a00cef1e560b0aa105034dd30e816 100644 (file)
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -256,6 +256,7 @@ config PARTITION_PERCPU
  
  config EZNPS_GIC
         bool "NPS400 Global Interrupt Manager (GIM)"
+       depends on ARC || (COMPILE_TEST && !64BIT)
         select IRQ_DOMAIN
         help
           Support the EZchip NPS400 global interrupt controller
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c

index caaec654d7ea028e001c0bbdaee4bd01c3a2a292..465c52219639a0e5324499892addf1a3a6df6739 100644 (file)
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -154,8 +154,8 @@ static void rackmeter_do_pause(struct rackmeter *rm, int pause)
                 DBDMA_DO_STOP(rm->dma_regs);
                 return;
         }
-       memset(rdma->buf1, 0, SAMPLE_COUNT & sizeof(u32));
-       memset(rdma->buf2, 0, SAMPLE_COUNT & sizeof(u32));
+       memset(rdma->buf1, 0, ARRAY_SIZE(rdma->buf1));
+       memset(rdma->buf2, 0, ARRAY_SIZE(rdma->buf2));
  
         rm->dma_buf_v->mark = 0;
  
@@ -227,6 +227,7 @@ static void rackmeter_do_timer(struct work_struct *work)
  
         total_idle_ticks = get_cpu_idle_time(cpu);
         idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle);
+       idle_ticks = min(idle_ticks, total_ticks);
         rcpu->prev_idle = total_idle_ticks;
  
         /* We do a very dumb calculation to update the LEDs for now,
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c

index 01ee736fe0efc820f980f1e2acf1e20a453e26b2..f8b6d1403c160b12f1011df82dd32a6ed8b21290 100644 (file)
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1851,7 +1851,7 @@ static int powerbook_sleep_grackle(void)
                 _set_L2CR(save_l2cr);
         
         /* Restore userland MMU context */
-       switch_mmu_context(NULL, current->active_mm);
+       switch_mmu_context(NULL, current->active_mm, NULL);
  
         /* Power things up */
         pmu_unlock();
@@ -1940,7 +1940,7 @@ powerbook_sleep_Core99(void)
                 _set_L3CR(save_l3cr);
         
         /* Restore userland MMU context */
-       switch_mmu_context(NULL, current->active_mm);
+       switch_mmu_context(NULL, current->active_mm, NULL);
  
         /* Tell PMU we are ready */
         pmu_unlock();
diff --git a/drivers/media/usb/dvb-usb/dib0700_core.c b/drivers/media/usb/dvb-usb/dib0700_core.c

index c16f999b9d7c9de8c2cc75526256ba30fa69a33c..bf890c3d9cda021a74079a0013b792858db1c45b 100644 (file)
--- a/drivers/media/usb/dvb-usb/dib0700_core.c
+++ b/drivers/media/usb/dvb-usb/dib0700_core.c
@@ -517,7 +517,7 @@ int dib0700_download_firmware(struct usb_device *udev, const struct firmware *fw
         if (nb_packet_buffer_size < 1)
                 nb_packet_buffer_size = 1;
  
-       /* get the fimware version */
+       /* get the firmware version */
         usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
                                   REQUEST_GET_VERSION,
                                   USB_TYPE_VENDOR | USB_DIR_IN, 0, 0,
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig

index eea61e349e26afe4b0f9d52a3f036877f0e2403a..1bcf601de5bcea35c6844362811213bfd9b406c7 100644 (file)
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -134,7 +134,7 @@ config MFD_CROS_EC
         select MFD_CORE
         select CHROME_PLATFORMS
         select CROS_EC_PROTO
-       depends on X86 || ARM || COMPILE_TEST
+       depends on X86 || ARM || ARM64 || COMPILE_TEST
         help
           If you say Y here you get support for the ChromeOS Embedded
           Controller (EC) providing keyboard, battery and power services.
@@ -319,6 +319,16 @@ config MFD_HI6421_PMIC
           menus in order to enable them.
           We communicate with the Hi6421 via memory-mapped I/O.
  
+config MFD_HI655X_PMIC
+       tristate "HiSilicon Hi655X series PMU/Codec IC"
+       depends on ARCH_HISI || COMPILE_TEST
+       depends on OF
+       select MFD_CORE
+       select REGMAP_MMIO
+       select REGMAP_IRQ
+       help
+         Select this option to enable the HiSilicon Hi655x series PMIC driver.
+
  config HTC_EGPIO
         bool "HTC EGPIO support"
         depends on GPIOLIB && ARM
@@ -527,6 +537,21 @@ config MFD_MAX14577
           additional drivers must be enabled in order to use the functionality
           of the device.
  
+config MFD_MAX77620
+       bool "Maxim Semiconductor MAX77620 and MAX20024 PMIC Support"
+       depends on I2C=y
+       depends on OF
+       select MFD_CORE
+       select REGMAP_I2C
+       select REGMAP_IRQ
+       select IRQ_DOMAIN
+       help
+         Say yes here to add support for the Maxim Semiconductor MAX77620 and
+         MAX20024, which are Power Management ICs with general-purpose pins,
+         RTC, regulators, clock generator, watchdog, etc. This driver
+         provides common support for accessing the device; additional drivers
+         must be enabled in order to use the functionality of the device.
+
  config MFD_MAX77686
         tristate "Maxim Semiconductor MAX77686/802 PMIC Support"
         depends on I2C
@@ -543,8 +568,8 @@ config MFD_MAX77686
           of the device.
  
  config MFD_MAX77693
-       bool "Maxim Semiconductor MAX77693 PMIC Support"
-       depends on I2C=y
+       tristate "Maxim Semiconductor MAX77693 PMIC Support"
+       depends on I2C
         select MFD_CORE
         select REGMAP_I2C
         select REGMAP_IRQ
@@ -1568,7 +1593,7 @@ endmenu
  
  config MFD_VEXPRESS_SYSREG
         bool "Versatile Express System Registers"
-       depends on VEXPRESS_CONFIG && GPIOLIB
+       depends on VEXPRESS_CONFIG && GPIOLIB && !ARCH_USES_GETTIMEOFFSET
         default y
         select CLKSRC_MMIO
         select GPIO_GENERIC_PLATFORM
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile

index 5eaa6465d0a6e2bf20df77951b573b4c0ccc477f..42a66e19e191da93b5036cd9fd8bb9e1cb1340bd 100644 (file)
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -128,6 +128,7 @@ obj-$(CONFIG_MFD_DA9063)    += da9063.o
  obj-$(CONFIG_MFD_DA9150)       += da9150-core.o
  
  obj-$(CONFIG_MFD_MAX14577)     += max14577.o
+obj-$(CONFIG_MFD_MAX77620)     += max77620.o
  obj-$(CONFIG_MFD_MAX77686)     += max77686.o
  obj-$(CONFIG_MFD_MAX77693)     += max77693.o
  obj-$(CONFIG_MFD_MAX77843)     += max77843.o
@@ -195,6 +196,7 @@ obj-$(CONFIG_MFD_STW481X)   += stw481x.o
  obj-$(CONFIG_MFD_IPAQ_MICRO)   += ipaq-micro.o
  obj-$(CONFIG_MFD_MENF21BMC)    += menf21bmc.o
  obj-$(CONFIG_MFD_HI6421_PMIC)  += hi6421-pmic-core.o
+obj-$(CONFIG_MFD_HI655X_PMIC)   += hi655x-pmic.o
  obj-$(CONFIG_MFD_DLN2)         += dln2.o
  obj-$(CONFIG_MFD_RT5033)       += rt5033.o
  obj-$(CONFIG_MFD_SKY81452)     += sky81452.o
diff --git a/drivers/mfd/act8945a.c b/drivers/mfd/act8945a.c

index 525b546ba42f6a6e2f78e087312bb0ebbe73439d..10c6d2da88226e02102cac7bf0c6bf216049ad2f 100644 (file)
--- a/drivers/mfd/act8945a.c
+++ b/drivers/mfd/act8945a.c
@@ -46,8 +46,9 @@ static int act8945a_i2c_probe(struct i2c_client *i2c,
  
         i2c_set_clientdata(i2c, regmap);
  
-       ret = mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE, act8945a_devs,
-                             ARRAY_SIZE(act8945a_devs), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE,
+                                  act8945a_devs, ARRAY_SIZE(act8945a_devs),
+                                  NULL, 0, NULL);
         if (ret) {
                 dev_err(&i2c->dev, "Failed to add sub devices\n");
                 return ret;
@@ -56,13 +57,6 @@ static int act8945a_i2c_probe(struct i2c_client *i2c,
         return 0;
  }
  
-static int act8945a_i2c_remove(struct i2c_client *i2c)
-{
-       mfd_remove_devices(&i2c->dev);
-
-       return 0;
-}
-
  static const struct i2c_device_id act8945a_i2c_id[] = {
         { "act8945a", 0 },
         {}
@@ -81,7 +75,6 @@ static struct i2c_driver act8945a_i2c_driver = {
                    .of_match_table = of_match_ptr(act8945a_of_match),
         },
         .probe = act8945a_i2c_probe,
-       .remove = act8945a_i2c_remove,
         .id_table = act8945a_i2c_id,
  };
  
diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c

index 5319f252790be97e84a4f0777904f70de6b93ac3..bf2717967597435cbca89ff132f7d605a18f8e7a 100644 (file)
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -908,12 +908,12 @@ static const char * const wm5102_supplies[] = {
  
  static const struct mfd_cell wm5102_devs[] = {
         { .name = "arizona-micsupp" },
+       { .name = "arizona-gpio" },
         {
                 .name = "arizona-extcon",
                 .parent_supplies = wm5102_supplies,
                 .num_parent_supplies = 1, /* We only need MICVDD */
         },
-       { .name = "arizona-gpio" },
         { .name = "arizona-haptics" },
         { .name = "arizona-pwm" },
         {
@@ -925,12 +925,12 @@ static const struct mfd_cell wm5102_devs[] = {
  
  static const struct mfd_cell wm5110_devs[] = {
         { .name = "arizona-micsupp" },
+       { .name = "arizona-gpio" },
         {
                 .name = "arizona-extcon",
                 .parent_supplies = wm5102_supplies,
                 .num_parent_supplies = 1, /* We only need MICVDD */
         },
-       { .name = "arizona-gpio" },
         { .name = "arizona-haptics" },
         { .name = "arizona-pwm" },
         {
@@ -966,12 +966,12 @@ static const char * const wm8997_supplies[] = {
  
  static const struct mfd_cell wm8997_devs[] = {
         { .name = "arizona-micsupp" },
+       { .name = "arizona-gpio" },
         {
                 .name = "arizona-extcon",
                 .parent_supplies = wm8997_supplies,
                 .num_parent_supplies = 1, /* We only need MICVDD */
         },
-       { .name = "arizona-gpio" },
         { .name = "arizona-haptics" },
         { .name = "arizona-pwm" },
         {
@@ -982,12 +982,13 @@ static const struct mfd_cell wm8997_devs[] = {
  };
  
  static const struct mfd_cell wm8998_devs[] = {
+       { .name = "arizona-micsupp" },
+       { .name = "arizona-gpio" },
         {
                 .name = "arizona-extcon",
                 .parent_supplies = wm5102_supplies,
                 .num_parent_supplies = 1, /* We only need MICVDD */
         },
-       { .name = "arizona-gpio" },
         { .name = "arizona-haptics" },
         { .name = "arizona-pwm" },
         {
@@ -995,7 +996,6 @@ static const struct mfd_cell wm8998_devs[] = {
                 .parent_supplies = wm5102_supplies,
                 .num_parent_supplies = ARRAY_SIZE(wm5102_supplies),
         },
-       { .name = "arizona-micsupp" },
  };
  
  int arizona_dev_init(struct arizona *arizona)
diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c

index 5fef014920a308f930d1f6b5d9bc4a2ebeae9614..edeb4951366a0d1db3bf3a14c49c03a50d469ed1 100644 (file)
--- a/drivers/mfd/arizona-irq.c
+++ b/drivers/mfd/arizona-irq.c
@@ -168,12 +168,15 @@ static struct irq_chip arizona_irq_chip = {
         .irq_set_wake           = arizona_irq_set_wake,
  };
  
+static struct lock_class_key arizona_irq_lock_class;
+
  static int arizona_irq_map(struct irq_domain *h, unsigned int virq,
                               irq_hw_number_t hw)
  {
         struct arizona *data = h->host_data;
  
         irq_set_chip_data(virq, data);
+       irq_set_lockdep_class(virq, &arizona_irq_lock_class);
         irq_set_chip_and_handler(virq, &arizona_irq_chip, handle_simple_irq);
         irq_set_nested_thread(virq, 1);
         irq_set_noprobe(virq);
diff --git a/drivers/mfd/as3711.c b/drivers/mfd/as3711.c

index 09e1483b99bc921ffbb81b3bd5d6cc8fc159d3ec..67b12417585d18301170289a50858292e1af807e 100644 (file)
--- a/drivers/mfd/as3711.c
+++ b/drivers/mfd/as3711.c
@@ -189,22 +189,14 @@ static int as3711_i2c_probe(struct i2c_client *client,
                 as3711_subdevs[AS3711_BACKLIGHT].pdata_size = 0;
         }
  
-       ret = mfd_add_devices(as3711->dev, -1, as3711_subdevs,
-                             ARRAY_SIZE(as3711_subdevs), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(as3711->dev, -1, as3711_subdevs,
+                                  ARRAY_SIZE(as3711_subdevs), NULL, 0, NULL);
         if (ret < 0)
                 dev_err(&client->dev, "add mfd devices failed: %d\n", ret);
  
         return ret;
  }
  
-static int as3711_i2c_remove(struct i2c_client *client)
-{
-       struct as3711 *as3711 = i2c_get_clientdata(client);
-
-       mfd_remove_devices(as3711->dev);
-       return 0;
-}
-
  static const struct i2c_device_id as3711_i2c_id[] = {
         {.name = "as3711", .driver_data = 0},
         {}
@@ -218,7 +210,6 @@ static struct i2c_driver as3711_i2c_driver = {
                    .of_match_table = of_match_ptr(as3711_of_match),
         },
         .probe = as3711_i2c_probe,
-       .remove = as3711_i2c_remove,
         .id_table = as3711_i2c_id,
  };
  
diff --git a/drivers/mfd/as3722.c b/drivers/mfd/as3722.c

index e1f597f97f869e65297c3343b38aad0c2a48490f..f87342c211bcd53deb9b982545af7ce877bfc36a 100644 (file)
--- a/drivers/mfd/as3722.c
+++ b/drivers/mfd/as3722.c
@@ -385,9 +385,10 @@ static int as3722_i2c_probe(struct i2c_client *i2c,
                 return ret;
  
         irq_flags = as3722->irq_flags | IRQF_ONESHOT;
-       ret = regmap_add_irq_chip(as3722->regmap, as3722->chip_irq,
-                       irq_flags, -1, &as3722_irq_chip,
-                       &as3722->irq_data);
+       ret = devm_regmap_add_irq_chip(as3722->dev, as3722->regmap,
+                                      as3722->chip_irq,
+                                      irq_flags, -1, &as3722_irq_chip,
+                                      &as3722->irq_data);
         if (ret < 0) {
                 dev_err(as3722->dev, "Failed to add regmap irq: %d\n", ret);
                 return ret;
@@ -395,33 +396,20 @@ static int as3722_i2c_probe(struct i2c_client *i2c,
  
         ret = as3722_configure_pullups(as3722);
         if (ret < 0)
-               goto scrub;
+               return ret;
  
-       ret = mfd_add_devices(&i2c->dev, -1, as3722_devs,
-                       ARRAY_SIZE(as3722_devs), NULL, 0,
-                       regmap_irq_get_domain(as3722->irq_data));
+       ret = devm_mfd_add_devices(&i2c->dev, -1, as3722_devs,
+                                  ARRAY_SIZE(as3722_devs), NULL, 0,
+                                  regmap_irq_get_domain(as3722->irq_data));
         if (ret) {
                 dev_err(as3722->dev, "Failed to add MFD devices: %d\n", ret);
-               goto scrub;
+               return ret;
         }
  
         device_init_wakeup(as3722->dev, true);
  
         dev_dbg(as3722->dev, "AS3722 core driver initialized successfully\n");
         return 0;
-
-scrub:
-       regmap_del_irq_chip(as3722->chip_irq, as3722->irq_data);
-       return ret;
-}
-
-static int as3722_i2c_remove(struct i2c_client *i2c)
-{
-       struct as3722 *as3722 = i2c_get_clientdata(i2c);
-
-       mfd_remove_devices(as3722->dev);
-       regmap_del_irq_chip(as3722->chip_irq, as3722->irq_data);
-       return 0;
  }
  
  static int __maybe_unused as3722_i2c_suspend(struct device *dev)
@@ -470,7 +458,6 @@ static struct i2c_driver as3722_i2c_driver = {
                 .pm = &as3722_pm_ops,
         },
         .probe = as3722_i2c_probe,
-       .remove = as3722_i2c_remove,
         .id_table = as3722_i2c_id,
  };
  
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c

index 4dca6bc61f5b2b4881956f14d60eccd95269b928..0413c8159551efb86a61db1ef5d39317c92c1486 100644 (file)
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -446,7 +446,7 @@ static int asic3_gpio_direction(struct gpio_chip *chip,
         unsigned long flags;
         struct asic3 *asic;
  
-       asic = container_of(chip, struct asic3, gpio);
+       asic = gpiochip_get_data(chip);
         gpio_base = ASIC3_GPIO_TO_BASE(offset);
  
         if (gpio_base > ASIC3_GPIO_D_BASE) {
@@ -492,7 +492,7 @@ static int asic3_gpio_get(struct gpio_chip *chip,
         u32 mask = ASIC3_GPIO_TO_MASK(offset);
         struct asic3 *asic;
  
-       asic = container_of(chip, struct asic3, gpio);
+       asic = gpiochip_get_data(chip);
         gpio_base = ASIC3_GPIO_TO_BASE(offset);
  
         if (gpio_base > ASIC3_GPIO_D_BASE) {
@@ -513,7 +513,7 @@ static void asic3_gpio_set(struct gpio_chip *chip,
         unsigned long flags;
         struct asic3 *asic;
  
-       asic = container_of(chip, struct asic3, gpio);
+       asic = gpiochip_get_data(chip);
         gpio_base = ASIC3_GPIO_TO_BASE(offset);
  
         if (gpio_base > ASIC3_GPIO_D_BASE) {
@@ -540,7 +540,7 @@ static void asic3_gpio_set(struct gpio_chip *chip,
  
  static int asic3_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
  {
-       struct asic3 *asic = container_of(chip, struct asic3, gpio);
+       struct asic3 *asic = gpiochip_get_data(chip);
  
         return asic->irq_base + offset;
  }
@@ -595,7 +595,7 @@ static __init int asic3_gpio_probe(struct platform_device *pdev,
                                      alt_reg[i]);
         }
  
-       return gpiochip_add(&asic->gpio);
+       return gpiochip_add_data(&asic->gpio, asic);
  }
  
  static int asic3_gpio_remove(struct platform_device *pdev)
diff --git a/drivers/mfd/atmel-hlcdc.c b/drivers/mfd/atmel-hlcdc.c

index 06c205868573e8d7d7b98e1c142156a2cc572454..eca7ea69b81c8c16085f98a803da511589599b37 100644 (file)
--- a/drivers/mfd/atmel-hlcdc.c
+++ b/drivers/mfd/atmel-hlcdc.c
@@ -128,16 +128,9 @@ static int atmel_hlcdc_probe(struct platform_device *pdev)
  
         dev_set_drvdata(dev, hlcdc);
  
-       return mfd_add_devices(dev, -1, atmel_hlcdc_cells,
-                              ARRAY_SIZE(atmel_hlcdc_cells),
-                              NULL, 0, NULL);
-}
-
-static int atmel_hlcdc_remove(struct platform_device *pdev)
-{
-       mfd_remove_devices(&pdev->dev);
-
-       return 0;
+       return devm_mfd_add_devices(dev, -1, atmel_hlcdc_cells,
+                                   ARRAY_SIZE(atmel_hlcdc_cells),
+                                   NULL, 0, NULL);
  }
  
  static const struct of_device_id atmel_hlcdc_match[] = {
@@ -152,7 +145,6 @@ MODULE_DEVICE_TABLE(of, atmel_hlcdc_match);
  
  static struct platform_driver atmel_hlcdc_driver = {
         .probe = atmel_hlcdc_probe,
-       .remove = atmel_hlcdc_remove,
         .driver = {
                 .name = "atmel-hlcdc",
                 .of_match_table = atmel_hlcdc_match,
diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c

index 28c20247c112391270ad1c6a056c000ffd3e33c2..a407527bcd0958ba79999446138a849cf03a0166 100644 (file)
--- a/drivers/mfd/axp20x-rsb.c
+++ b/drivers/mfd/axp20x-rsb.c
@@ -61,6 +61,7 @@ static int axp20x_rsb_remove(struct sunxi_rsb_device *rdev)
  
  static const struct of_device_id axp20x_rsb_of_match[] = {
         { .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
+       { .compatible = "x-powers,axp809", .data = (void *)AXP809_ID },
         { },
  };
  MODULE_DEVICE_TABLE(of, axp20x_rsb_of_match);
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c

index a57d6e94061010022c5d41ad9b1f396ca444ce98..e4e32978c37752b46ee1a631b820768477591619 100644 (file)
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -37,6 +37,7 @@ static const char * const axp20x_model_names[] = {
         "AXP221",
         "AXP223",
         "AXP288",
+       "AXP809",
  };
  
  static const struct regmap_range axp152_writeable_ranges[] = {
@@ -85,6 +86,7 @@ static const struct regmap_access_table axp20x_volatile_table = {
         .n_yes_ranges   = ARRAY_SIZE(axp20x_volatile_ranges),
  };
  
+/* AXP22x ranges are shared with the AXP809, as they cover the same range */
  static const struct regmap_range axp22x_writeable_ranges[] = {
         regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ5_STATE),
         regmap_reg_range(AXP20X_DCDC_MODE, AXP22X_BATLOW_THRES1),
@@ -128,6 +130,12 @@ static struct resource axp152_pek_resources[] = {
         DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
  };
  
+static struct resource axp20x_ac_power_supply_resources[] = {
+       DEFINE_RES_IRQ_NAMED(AXP20X_IRQ_ACIN_PLUGIN, "ACIN_PLUGIN"),
+       DEFINE_RES_IRQ_NAMED(AXP20X_IRQ_ACIN_REMOVAL, "ACIN_REMOVAL"),
+       DEFINE_RES_IRQ_NAMED(AXP20X_IRQ_ACIN_OVER_V, "ACIN_OVER_V"),
+};
+
  static struct resource axp20x_pek_resources[] = {
         {
                 .name   = "PEK_DBR",
@@ -211,6 +219,20 @@ static struct resource axp288_fuel_gauge_resources[] = {
         },
  };
  
+static struct resource axp809_pek_resources[] = {
+       {
+               .name   = "PEK_DBR",
+               .start  = AXP809_IRQ_PEK_RIS_EDGE,
+               .end    = AXP809_IRQ_PEK_RIS_EDGE,
+               .flags  = IORESOURCE_IRQ,
+       }, {
+               .name   = "PEK_DBF",
+               .start  = AXP809_IRQ_PEK_FAL_EDGE,
+               .end    = AXP809_IRQ_PEK_FAL_EDGE,
+               .flags  = IORESOURCE_IRQ,
+       },
+};
+
  static const struct regmap_config axp152_regmap_config = {
         .reg_bits       = 8,
         .val_bits       = 8,
@@ -378,6 +400,41 @@ static const struct regmap_irq axp288_regmap_irqs[] = {
         INIT_REGMAP_IRQ(AXP288, BC_USB_CHNG,            5, 1),
  };
  
+static const struct regmap_irq axp809_regmap_irqs[] = {
+       INIT_REGMAP_IRQ(AXP809, ACIN_OVER_V,            0, 7),
+       INIT_REGMAP_IRQ(AXP809, ACIN_PLUGIN,            0, 6),
+       INIT_REGMAP_IRQ(AXP809, ACIN_REMOVAL,           0, 5),
+       INIT_REGMAP_IRQ(AXP809, VBUS_OVER_V,            0, 4),
+       INIT_REGMAP_IRQ(AXP809, VBUS_PLUGIN,            0, 3),
+       INIT_REGMAP_IRQ(AXP809, VBUS_REMOVAL,           0, 2),
+       INIT_REGMAP_IRQ(AXP809, VBUS_V_LOW,             0, 1),
+       INIT_REGMAP_IRQ(AXP809, BATT_PLUGIN,            1, 7),
+       INIT_REGMAP_IRQ(AXP809, BATT_REMOVAL,           1, 6),
+       INIT_REGMAP_IRQ(AXP809, BATT_ENT_ACT_MODE,      1, 5),
+       INIT_REGMAP_IRQ(AXP809, BATT_EXIT_ACT_MODE,     1, 4),
+       INIT_REGMAP_IRQ(AXP809, CHARG,                  1, 3),
+       INIT_REGMAP_IRQ(AXP809, CHARG_DONE,             1, 2),
+       INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_HIGH,     2, 7),
+       INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_HIGH_END, 2, 6),
+       INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_LOW,      2, 5),
+       INIT_REGMAP_IRQ(AXP809, BATT_CHG_TEMP_LOW_END,  2, 4),
+       INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_HIGH,     2, 3),
+       INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_HIGH_END, 2, 2),
+       INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_LOW,      2, 1),
+       INIT_REGMAP_IRQ(AXP809, BATT_ACT_TEMP_LOW_END,  2, 0),
+       INIT_REGMAP_IRQ(AXP809, DIE_TEMP_HIGH,          3, 7),
+       INIT_REGMAP_IRQ(AXP809, LOW_PWR_LVL1,           3, 1),
+       INIT_REGMAP_IRQ(AXP809, LOW_PWR_LVL2,           3, 0),
+       INIT_REGMAP_IRQ(AXP809, TIMER,                  4, 7),
+       INIT_REGMAP_IRQ(AXP809, PEK_RIS_EDGE,           4, 6),
+       INIT_REGMAP_IRQ(AXP809, PEK_FAL_EDGE,           4, 5),
+       INIT_REGMAP_IRQ(AXP809, PEK_SHORT,              4, 4),
+       INIT_REGMAP_IRQ(AXP809, PEK_LONG,               4, 3),
+       INIT_REGMAP_IRQ(AXP809, PEK_OVER_OFF,           4, 2),
+       INIT_REGMAP_IRQ(AXP809, GPIO1_INPUT,            4, 1),
+       INIT_REGMAP_IRQ(AXP809, GPIO0_INPUT,            4, 0),
+};
+
  static const struct regmap_irq_chip axp152_regmap_irq_chip = {
         .name                   = "axp152_irq_chip",
         .status_base            = AXP152_IRQ1_STATE,
@@ -428,6 +485,18 @@ static const struct regmap_irq_chip axp288_regmap_irq_chip = {
  
  };
  
+static const struct regmap_irq_chip axp809_regmap_irq_chip = {
+       .name                   = "axp809",
+       .status_base            = AXP20X_IRQ1_STATE,
+       .ack_base               = AXP20X_IRQ1_STATE,
+       .mask_base              = AXP20X_IRQ1_EN,
+       .mask_invert            = true,
+       .init_ack_masked        = true,
+       .irqs                   = axp809_regmap_irqs,
+       .num_irqs               = ARRAY_SIZE(axp809_regmap_irqs),
+       .num_regs               = 5,
+};
+
  static struct mfd_cell axp20x_cells[] = {
         {
                 .name           = "axp20x-pek",
@@ -435,6 +504,11 @@ static struct mfd_cell axp20x_cells[] = {
                 .resources      = axp20x_pek_resources,
         }, {
                 .name           = "axp20x-regulator",
+       }, {
+               .name           = "axp20x-ac-power-supply",
+               .of_compatible  = "x-powers,axp202-ac-power-supply",
+               .num_resources  = ARRAY_SIZE(axp20x_ac_power_supply_resources),
+               .resources      = axp20x_ac_power_supply_resources,
         }, {
                 .name           = "axp20x-usb-power-supply",
                 .of_compatible  = "x-powers,axp202-usb-power-supply",
@@ -572,6 +646,16 @@ static struct mfd_cell axp288_cells[] = {
         },
  };
  
+static struct mfd_cell axp809_cells[] = {
+       {
+               .name                   = "axp20x-pek",
+               .num_resources          = ARRAY_SIZE(axp809_pek_resources),
+               .resources              = axp809_pek_resources,
+       }, {
+               .name                   = "axp20x-regulator",
+       },
+};
+
  static struct axp20x_dev *axp20x_pm_power_off;
  static void axp20x_power_off(void)
  {
@@ -631,6 +715,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
                 axp20x->regmap_cfg = &axp288_regmap_config;
                 axp20x->regmap_irq_chip = &axp288_regmap_irq_chip;
                 break;
+       case AXP809_ID:
+               axp20x->nr_cells = ARRAY_SIZE(axp809_cells);
+               axp20x->cells = axp809_cells;
+               axp20x->regmap_cfg = &axp22x_regmap_config;
+               axp20x->regmap_irq_chip = &axp809_regmap_irq_chip;
+               break;
         default:
                 dev_err(dev, "unsupported AXP20X ID %lu\n", axp20x->variant);
                 return -EINVAL;
diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c

index 320aaefee7187f10da1d1735d15c7375daba38d3..0d76d690176b4efd46c04246a5ccfff0a223b0c0 100644 (file)
--- a/drivers/mfd/bcm590xx.c
+++ b/drivers/mfd/bcm590xx.c
@@ -82,8 +82,8 @@ static int bcm590xx_i2c_probe(struct i2c_client *i2c_pri,
                 goto err;
         }
  
-       ret = mfd_add_devices(&i2c_pri->dev, -1, bcm590xx_devs,
-                             ARRAY_SIZE(bcm590xx_devs), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(&i2c_pri->dev, -1, bcm590xx_devs,
+                                  ARRAY_SIZE(bcm590xx_devs), NULL, 0, NULL);
         if (ret < 0) {
                 dev_err(&i2c_pri->dev, "failed to add sub-devices: %d\n", ret);
                 goto err;
@@ -96,12 +96,6 @@ err:
         return ret;
  }
  
-static int bcm590xx_i2c_remove(struct i2c_client *i2c)
-{
-       mfd_remove_devices(&i2c->dev);
-       return 0;
-}
-
  static const struct of_device_id bcm590xx_of_match[] = {
         { .compatible = "brcm,bcm59056" },
         { }
@@ -120,7 +114,6 @@ static struct i2c_driver bcm590xx_i2c_driver = {
                    .of_match_table = of_match_ptr(bcm590xx_of_match),
         },
         .probe = bcm590xx_i2c_probe,
-       .remove = bcm590xx_i2c_remove,
         .id_table = bcm590xx_i2c_id,
  };
  module_i2c_driver(bcm590xx_i2c_driver);
diff --git a/drivers/mfd/da9063-irq.c b/drivers/mfd/da9063-irq.c

index 26302634633c73383526eab7cb62fa2550961183..7e903fcb88131c71db1c107cf1346b349eb66c17 100644 (file)
--- a/drivers/mfd/da9063-irq.c
+++ b/drivers/mfd/da9063-irq.c
@@ -25,14 +25,6 @@
  #define        DA9063_REG_EVENT_B_OFFSET       1
  #define        DA9063_REG_EVENT_C_OFFSET       2
  #define        DA9063_REG_EVENT_D_OFFSET       3
-#define EVENTS_BUF_LEN                 4
-
-static const u8 mask_events_buf[] = { [0 ... (EVENTS_BUF_LEN - 1)] = ~0 };
-
-struct da9063_irq_data {
-       u16 reg;
-       u8 mask;
-};
  
  static const struct regmap_irq da9063_irqs[] = {
         /* DA9063 event A register */
diff --git a/drivers/mfd/dm355evm_msp.c b/drivers/mfd/dm355evm_msp.c

index ec4438ed2fafd73415b3efb40b131e587ddbf45a..14661ec5ef7fe46beb514ff8b3a6e5822e989019 100644 (file)
--- a/drivers/mfd/dm355evm_msp.c
+++ b/drivers/mfd/dm355evm_msp.c
@@ -33,25 +33,25 @@
   * This driver was tested with firmware revision A4.
   */
  
-#if defined(CONFIG_INPUT_DM355EVM) || defined(CONFIG_INPUT_DM355EVM_MODULE)
+#if IS_ENABLED(CONFIG_INPUT_DM355EVM)
  #define msp_has_keyboard()     true
  #else
  #define msp_has_keyboard()     false
  #endif
  
-#if defined(CONFIG_LEDS_GPIO) || defined(CONFIG_LEDS_GPIO_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_GPIO)
  #define msp_has_leds()         true
  #else
  #define msp_has_leds()         false
  #endif
  
-#if defined(CONFIG_RTC_DRV_DM355EVM) || defined(CONFIG_RTC_DRV_DM355EVM_MODULE)
+#if IS_ENABLED(CONFIG_RTC_DRV_DM355EVM)
  #define msp_has_rtc()          true
  #else
  #define msp_has_rtc()          false
  #endif
  
-#if defined(CONFIG_VIDEO_TVP514X) || defined(CONFIG_VIDEO_TVP514X_MODULE)
+#if IS_ENABLED(CONFIG_VIDEO_TVP514X)
  #define msp_has_tvp()          true
  #else
  #define msp_has_tvp()          false
@@ -260,7 +260,7 @@ static int add_children(struct i2c_client *client)
  
         /* GPIO-ish stuff */
         dm355evm_msp_gpio.parent = &client->dev;
-       status = gpiochip_add(&dm355evm_msp_gpio);
+       status = gpiochip_add_data(&dm355evm_msp_gpio, NULL);
         if (status < 0)
                 return status;
  
diff --git a/drivers/mfd/hi6421-pmic-core.c b/drivers/mfd/hi6421-pmic-core.c

index f9ded45a992de31a1fa5c33a6da6be3cf376fa1f..3fd703fe3abad287cca295c63eee6addf8146bd9 100644 (file)
--- a/drivers/mfd/hi6421-pmic-core.c
+++ b/drivers/mfd/hi6421-pmic-core.c
@@ -76,8 +76,8 @@ static int hi6421_pmic_probe(struct platform_device *pdev)
  
         platform_set_drvdata(pdev, pmic);
  
-       ret = mfd_add_devices(&pdev->dev, 0, hi6421_devs,
-                       ARRAY_SIZE(hi6421_devs), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(&pdev->dev, 0, hi6421_devs,
+                                  ARRAY_SIZE(hi6421_devs), NULL, 0, NULL);
         if (ret) {
                 dev_err(&pdev->dev, "add mfd devices failed: %d\n", ret);
                 return ret;
@@ -86,13 +86,6 @@ static int hi6421_pmic_probe(struct platform_device *pdev)
         return 0;
  }
  
-static int hi6421_pmic_remove(struct platform_device *pdev)
-{
-       mfd_remove_devices(&pdev->dev);
-
-       return 0;
-}
-
  static const struct of_device_id of_hi6421_pmic_match_tbl[] = {
         { .compatible = "hisilicon,hi6421-pmic", },
         { },
@@ -105,7 +98,6 @@ static struct platform_driver hi6421_pmic_driver = {
                 .of_match_table = of_hi6421_pmic_match_tbl,
         },
         .probe  = hi6421_pmic_probe,
-       .remove = hi6421_pmic_remove,
  };
  module_platform_driver(hi6421_pmic_driver);
  
diff --git a/drivers/mfd/hi655x-pmic.c b/drivers/mfd/hi655x-pmic.c

new file mode 100644 (file)

index 0000000..05ddc78
--- /dev/null
+++ b/drivers/mfd/hi655x-pmic.c
@@ -0,0 +1,162 @@
+/*
+ * Device driver for MFD hi655x PMIC
+ *
+ * Copyright (c) 2016 Hisilicon.
+ *
+ * Authors:
+ * Chen Feng <puck.chen@hisilicon.com>
+ * Fei  Wang <w.f@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/hi655x-pmic.h>
+#include <linux/module.h>
+#include <linux/of_gpio.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+/* Child devices registered by hi655x_pmic_probe() via mfd_add_devices(). */
+static const struct mfd_cell hi655x_pmic_devs[] = {
+       { .name = "hi655x-regulator", },
+};
+
+/*
+ * Interrupt sources described to regmap-irq. Every entry uses register
+ * offset 0 and differs only in its mask.
+ */
+static const struct regmap_irq hi655x_irqs[] = {
+       { .reg_offset = 0, .mask = OTMP_D1R_INT },
+       { .reg_offset = 0, .mask = VSYS_2P5_R_INT },
+       { .reg_offset = 0, .mask = VSYS_UV_D3R_INT },
+       { .reg_offset = 0, .mask = VSYS_6P0_D200UR_INT },
+       { .reg_offset = 0, .mask = PWRON_D4SR_INT },
+       { .reg_offset = 0, .mask = PWRON_D20F_INT },
+       { .reg_offset = 0, .mask = PWRON_D20R_INT },
+       { .reg_offset = 0, .mask = RESERVE_INT },
+};
+
+/*
+ * regmap-irq chip description handed to regmap_add_irq_chip() in probe:
+ * a single register (num_regs = 1) with its status at
+ * HI655X_IRQ_STAT_BASE and its mask at HI655X_IRQ_MASK_BASE.
+ */
+static const struct regmap_irq_chip hi655x_irq_chip = {
+       .name = "hi655x-pmic",
+       .irqs = hi655x_irqs,
+       .num_regs = 1,
+       .num_irqs = ARRAY_SIZE(hi655x_irqs),
+       .status_base = HI655X_IRQ_STAT_BASE,
+       .mask_base = HI655X_IRQ_MASK_BASE,
+};
+
+/*
+ * Layout of the MMIO regmap created in probe: 32-bit register addresses
+ * spaced HI655X_STRIDE apart, 8-bit values, and HI655X_BUS_ADDR(0xFFF) as
+ * the highest register.
+ */
+static struct regmap_config hi655x_regmap_config = {
+       .reg_bits = 32,
+       .reg_stride = HI655X_STRIDE,
+       .val_bits = 8,
+       .max_register = HI655X_BUS_ADDR(0xFFF),
+};
+
+/*
+ * hi655x_local_irq_clear: write HI655X_IRQ_CLR to HI655X_ANA_IRQM_BASE and
+ * then to each of the HI655X_IRQ_ARRAY registers starting at
+ * HI655X_IRQ_STAT_BASE, spaced HI655X_STRIDE apart.
+ *
+ * The results of the individual regmap_write() calls are not checked.
+ */
+static void hi655x_local_irq_clear(struct regmap *map)
+{
+       int idx = 0;
+
+       regmap_write(map, HI655X_ANA_IRQM_BASE, HI655X_IRQ_CLR);
+
+       while (idx < HI655X_IRQ_ARRAY) {
+               unsigned int reg = HI655X_IRQ_STAT_BASE + idx * HI655X_STRIDE;
+
+               regmap_write(map, reg, HI655X_IRQ_CLR);
+               idx++;
+       }
+}
+
+/*
+ * hi655x_pmic_probe: set up the PMIC core and register its child devices.
+ *
+ * Maps the first memory resource, wraps it in an MMIO regmap, checks that
+ * the version register lies within PMU_VER_START..PMU_VER_END, clears the
+ * interrupt registers, requests the "pmic-gpios" GPIO as an input, installs
+ * the regmap IRQ chip on that GPIO's interrupt and finally adds the MFD
+ * cells.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int hi655x_pmic_probe(struct platform_device *pdev)
+{
+       int ret;
+       struct hi655x_pmic *pmic;
+       struct device *dev = &pdev->dev;
+       struct device_node *np = dev->of_node;
+       void __iomem *base;
+
+       pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL);
+       if (!pmic)
+               return -ENOMEM;
+       pmic->dev = dev;
+
+       pmic->res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!pmic->res)
+               return -ENOENT;
+
+       /* devm_ioremap_resource() reports failure with ERR_PTR(), not NULL */
+       base = devm_ioremap_resource(dev, pmic->res);
+       if (IS_ERR(base))
+               return PTR_ERR(base);
+
+       pmic->regmap = devm_regmap_init_mmio_clk(dev, NULL, base,
+                                                &hi655x_regmap_config);
+       /* Never access the hardware through an ERR_PTR() regmap */
+       if (IS_ERR(pmic->regmap))
+               return PTR_ERR(pmic->regmap);
+
+       regmap_read(pmic->regmap, HI655X_BUS_ADDR(HI655X_VER_REG), &pmic->ver);
+       if ((pmic->ver < PMU_VER_START) || (pmic->ver > PMU_VER_END)) {
+               dev_warn(dev, "PMU version %d unsupported\n", pmic->ver);
+               return -EINVAL;
+       }
+
+       hi655x_local_irq_clear(pmic->regmap);
+
+       pmic->gpio = of_get_named_gpio(np, "pmic-gpios", 0);
+       if (!gpio_is_valid(pmic->gpio)) {
+               dev_err(dev, "Failed to get the pmic-gpios\n");
+               return -ENODEV;
+       }
+
+       ret = devm_gpio_request_one(dev, pmic->gpio, GPIOF_IN,
+                                   "hi655x_pmic_irq");
+       if (ret < 0) {
+               dev_err(dev, "Failed to request gpio %d  ret = %d\n",
+                       pmic->gpio, ret);
+               return ret;
+       }
+
+       ret = regmap_add_irq_chip(pmic->regmap, gpio_to_irq(pmic->gpio),
+                                 IRQF_TRIGGER_LOW | IRQF_NO_SUSPEND, 0,
+                                 &hi655x_irq_chip, &pmic->irq_data);
+       if (ret) {
+               dev_err(dev, "Failed to obtain 'hi655x_pmic_irq' %d\n", ret);
+               return ret;
+       }
+
+       platform_set_drvdata(pdev, pmic);
+
+       ret = mfd_add_devices(dev, PLATFORM_DEVID_AUTO, hi655x_pmic_devs,
+                             ARRAY_SIZE(hi655x_pmic_devs), NULL, 0, NULL);
+       if (ret) {
+               dev_err(dev, "Failed to register device %d\n", ret);
+               /* The IRQ chip is not device-managed; undo it by hand */
+               regmap_del_irq_chip(gpio_to_irq(pmic->gpio), pmic->irq_data);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * hi655x_pmic_remove: delete the regmap IRQ chip installed on the PMIC
+ * GPIO's interrupt, then remove the MFD child devices. Always returns 0.
+ */
+static int hi655x_pmic_remove(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct hi655x_pmic *chip = platform_get_drvdata(pdev);
+       int irq = gpio_to_irq(chip->gpio);
+
+       regmap_del_irq_chip(irq, chip->irq_data);
+       mfd_remove_devices(dev);
+
+       return 0;
+}
+
+/* Device-tree compatible strings handled by this driver. */
+static const struct of_device_id hi655x_pmic_match[] = {
+       { .compatible = "hisilicon,hi655x-pmic", },
+       {},
+};
+/* Export the table so a modular build gets an OF alias for autoloading. */
+MODULE_DEVICE_TABLE(of, hi655x_pmic_match);
+
+/*
+ * Platform driver glue: binds by name "hi655x-pmic" or through the OF
+ * match table, and wires up the probe/remove callbacks defined above.
+ */
+static struct platform_driver hi655x_pmic_driver = {
+       .driver = {
+               .name = "hi655x-pmic",
+               .of_match_table = of_match_ptr(hi655x_pmic_match),
+       },
+       .probe  = hi655x_pmic_probe,
+       .remove = hi655x_pmic_remove,
+};
+module_platform_driver(hi655x_pmic_driver);
+
+MODULE_AUTHOR("Chen Feng <puck.chen@hisilicon.com>");
+MODULE_DESCRIPTION("Hisilicon hi655x PMIC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c

index c636b5f83cfbb89b2a04cd6a84d8e3d5308ef4c3..513cfc5c8fb62703529c2e714f70b7bbdf857306 100644 (file)
--- a/drivers/mfd/htc-egpio.c
+++ b/drivers/mfd/htc-egpio.c
@@ -155,7 +155,7 @@ static int egpio_get(struct gpio_chip *chip, unsigned offset)
  
         pr_debug("egpio_get_value(%d)\n", chip->base + offset);
  
-       egpio = container_of(chip, struct egpio_chip, chip);
+       egpio = gpiochip_get_data(chip);
         ei    = dev_get_drvdata(egpio->dev);
         bit   = egpio_bit(ei, offset);
         reg   = egpio->reg_start + egpio_pos(ei, offset);
@@ -170,7 +170,7 @@ static int egpio_direction_input(struct gpio_chip *chip, unsigned offset)
  {
         struct egpio_chip *egpio;
  
-       egpio = container_of(chip, struct egpio_chip, chip);
+       egpio = gpiochip_get_data(chip);
         return test_bit(offset, &egpio->is_out) ? -EINVAL : 0;
  }
  
@@ -192,7 +192,7 @@ static void egpio_set(struct gpio_chip *chip, unsigned offset, int value)
         pr_debug("egpio_set(%s, %d(%d), %d)\n",
                         chip->label, offset, offset+chip->base, value);
  
-       egpio = container_of(chip, struct egpio_chip, chip);
+       egpio = gpiochip_get_data(chip);
         ei    = dev_get_drvdata(egpio->dev);
         bit   = egpio_bit(ei, offset);
         pos   = egpio_pos(ei, offset);
@@ -216,7 +216,7 @@ static int egpio_direction_output(struct gpio_chip *chip,
  {
         struct egpio_chip *egpio;
  
-       egpio = container_of(chip, struct egpio_chip, chip);
+       egpio = gpiochip_get_data(chip);
         if (test_bit(offset, &egpio->is_out)) {
                 egpio_set(chip, offset, value);
                 return 0;
@@ -330,7 +330,7 @@ static int __init egpio_probe(struct platform_device *pdev)
                 chip->base            = pdata->chip[i].gpio_base;
                 chip->ngpio           = pdata->chip[i].num_gpios;
  
-               gpiochip_add(chip);
+               gpiochip_add_data(chip, &ei->chip[i]);
         }
  
         /* Set initial pin values */
diff --git a/drivers/mfd/htc-i2cpld.c b/drivers/mfd/htc-i2cpld.c

index bd6b96d07ab84fd0dd2722a496ba1223d564e7d3..3f9eee5f8fb9b60035252aea68d73f122d2d3437 100644 (file)
--- a/drivers/mfd/htc-i2cpld.c
+++ b/drivers/mfd/htc-i2cpld.c
@@ -227,8 +227,7 @@ static irqreturn_t htcpld_handler(int irq, void *dev)
  static void htcpld_chip_set(struct gpio_chip *chip, unsigned offset, int val)
  {
         struct i2c_client *client;
-       struct htcpld_chip *chip_data =
-               container_of(chip, struct htcpld_chip, chip_out);
+       struct htcpld_chip *chip_data = gpiochip_get_data(chip);
         unsigned long flags;
  
         client = chip_data->client;
@@ -257,14 +256,12 @@ static void htcpld_chip_set_ni(struct work_struct *work)
  
  static int htcpld_chip_get(struct gpio_chip *chip, unsigned offset)
  {
-       struct htcpld_chip *chip_data;
+       struct htcpld_chip *chip_data = gpiochip_get_data(chip);
         u8 cache;
  
         if (!strncmp(chip->label, "htcpld-out", 10)) {
-               chip_data = container_of(chip, struct htcpld_chip, chip_out);
                 cache = chip_data->cache_out;
         } else if (!strncmp(chip->label, "htcpld-in", 9)) {
-               chip_data = container_of(chip, struct htcpld_chip, chip_in);
                 cache = chip_data->cache_in;
         } else
                 return -EINVAL;
@@ -291,9 +288,7 @@ static int htcpld_direction_input(struct gpio_chip *chip,
  
  static int htcpld_chip_to_irq(struct gpio_chip *chip, unsigned offset)
  {
-       struct htcpld_chip *chip_data;
-
-       chip_data = container_of(chip, struct htcpld_chip, chip_in);
+       struct htcpld_chip *chip_data = gpiochip_get_data(chip);
  
         if (offset < chip_data->nirqs)
                 return chip_data->irq_start + offset;
@@ -451,14 +446,14 @@ static int htcpld_register_chip_gpio(
         gpio_chip->ngpio           = plat_chip_data->num_gpios;
  
         /* Add the GPIO chips */
-       ret = gpiochip_add(&(chip->chip_out));
+       ret = gpiochip_add_data(&(chip->chip_out), chip);
         if (ret) {
                 dev_warn(dev, "Unable to register output GPIOs for 0x%x: %d\n",
                          plat_chip_data->addr, ret);
                 return ret;
         }
  
-       ret = gpiochip_add(&(chip->chip_in));
+       ret = gpiochip_add_data(&(chip->chip_in), chip);
         if (ret) {
                 dev_warn(dev, "Unable to register input GPIOs for 0x%x: %d\n",
                          plat_chip_data->addr, ret);
diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c

index 6352aaba96a455095a33b806d9fffe9669e7c097..41b113875d6452acc545085ffbc4c52c23079338 100644 (file)
--- a/drivers/mfd/intel-lpss.c
+++ b/drivers/mfd/intel-lpss.c
@@ -34,6 +34,7 @@
  #define LPSS_DEV_SIZE          0x200
  #define LPSS_PRIV_OFFSET       0x200
  #define LPSS_PRIV_SIZE         0x100
+#define LPSS_PRIV_REG_COUNT    (LPSS_PRIV_SIZE / 4)
  #define LPSS_IDMA64_OFFSET     0x800
  #define LPSS_IDMA64_SIZE       0x800
  
@@ -76,6 +77,7 @@ struct intel_lpss {
         struct mfd_cell *cell;
         struct device *dev;
         void __iomem *priv;
+       u32 priv_ctx[LPSS_PRIV_REG_COUNT];
         int devid;
         u32 caps;
         u32 active_ltr;
@@ -336,8 +338,8 @@ static int intel_lpss_register_clock(struct intel_lpss *lpss)
                 return 0;
  
         /* Root clock */
-       clk = clk_register_fixed_rate(NULL, dev_name(lpss->dev), NULL,
-                                     CLK_IS_ROOT, lpss->info->clk_rate);
+       clk = clk_register_fixed_rate(NULL, dev_name(lpss->dev), NULL, 0,
+                                     lpss->info->clk_rate);
         if (IS_ERR(clk))
                 return PTR_ERR(clk);
  
@@ -493,6 +495,16 @@ EXPORT_SYMBOL_GPL(intel_lpss_prepare);
  
  int intel_lpss_suspend(struct device *dev)
  {
+       struct intel_lpss *lpss = dev_get_drvdata(dev);
+       unsigned int i;
+
+       /* Save device context */
+       for (i = 0; i < LPSS_PRIV_REG_COUNT; i++)
+               lpss->priv_ctx[i] = readl(lpss->priv + i * 4);
+
+       /* Put the device into reset state */
+       writel(0, lpss->priv + LPSS_PRIV_RESETS);
+
         return 0;
  }
  EXPORT_SYMBOL_GPL(intel_lpss_suspend);
@@ -500,8 +512,13 @@ EXPORT_SYMBOL_GPL(intel_lpss_suspend);
  int intel_lpss_resume(struct device *dev)
  {
         struct intel_lpss *lpss = dev_get_drvdata(dev);
+       unsigned int i;
  
-       intel_lpss_init_dev(lpss);
+       intel_lpss_deassert_reset(lpss);
+
+       /* Restore device context */
+       for (i = 0; i < LPSS_PRIV_REG_COUNT; i++)
+               writel(lpss->priv_ctx[i], lpss->priv + i * 4);
  
         return 0;
  }
diff --git a/drivers/mfd/intel_quark_i2c_gpio.c b/drivers/mfd/intel_quark_i2c_gpio.c

index a24b35fc2b5bf59464ea6a2b2f1e45e23b55f0a4..7946d6e38b87201e3904f3247d2807c2ed174ca4 100644 (file)
--- a/drivers/mfd/intel_quark_i2c_gpio.c
+++ b/drivers/mfd/intel_quark_i2c_gpio.c
@@ -53,7 +53,7 @@
  #define INTEL_QUARK_I2C_CLK_HZ 33000000
  
  struct intel_quark_mfd {
-       struct pci_dev          *pdev;
+       struct device           *dev;
         struct clk              *i2c_clk;
         struct clk_lookup       *i2c_clk_lookup;
  };
@@ -123,14 +123,14 @@ static const struct pci_device_id intel_quark_mfd_ids[] = {
  };
  MODULE_DEVICE_TABLE(pci, intel_quark_mfd_ids);
  
-static int intel_quark_register_i2c_clk(struct intel_quark_mfd *quark_mfd)
+static int intel_quark_register_i2c_clk(struct device *dev)
  {
-       struct pci_dev *pdev = quark_mfd->pdev;
+       struct intel_quark_mfd *quark_mfd = dev_get_drvdata(dev);
         struct clk *i2c_clk;
  
-       i2c_clk = clk_register_fixed_rate(&pdev->dev,
+       i2c_clk = clk_register_fixed_rate(dev,
                                           INTEL_QUARK_I2C_CONTROLLER_CLK, NULL,
-                                         CLK_IS_ROOT, INTEL_QUARK_I2C_CLK_HZ);
+                                         0, INTEL_QUARK_I2C_CLK_HZ);
         if (IS_ERR(i2c_clk))
                 return PTR_ERR(i2c_clk);
  
@@ -139,18 +139,19 @@ static int intel_quark_register_i2c_clk(struct intel_quark_mfd *quark_mfd)
                                                 INTEL_QUARK_I2C_CONTROLLER_CLK);
  
         if (!quark_mfd->i2c_clk_lookup) {
-               dev_err(&pdev->dev, "Fixed clk register failed\n");
+               clk_unregister(quark_mfd->i2c_clk);
+               dev_err(dev, "Fixed clk register failed\n");
                 return -ENOMEM;
         }
  
         return 0;
  }
  
-static void intel_quark_unregister_i2c_clk(struct pci_dev *pdev)
+static void intel_quark_unregister_i2c_clk(struct device *dev)
  {
-       struct intel_quark_mfd *quark_mfd = dev_get_drvdata(&pdev->dev);
+       struct intel_quark_mfd *quark_mfd = dev_get_drvdata(dev);
  
-       if (!quark_mfd->i2c_clk || !quark_mfd->i2c_clk_lookup)
+       if (!quark_mfd->i2c_clk_lookup)
                 return;
  
         clkdev_drop(quark_mfd->i2c_clk_lookup);
@@ -245,30 +246,38 @@ static int intel_quark_mfd_probe(struct pci_dev *pdev,
         quark_mfd = devm_kzalloc(&pdev->dev, sizeof(*quark_mfd), GFP_KERNEL);
         if (!quark_mfd)
                 return -ENOMEM;
-       quark_mfd->pdev = pdev;
  
-       ret = intel_quark_register_i2c_clk(quark_mfd);
+       quark_mfd->dev = &pdev->dev;
+       dev_set_drvdata(&pdev->dev, quark_mfd);
+
+       ret = intel_quark_register_i2c_clk(&pdev->dev);
         if (ret)
                 return ret;
  
-       dev_set_drvdata(&pdev->dev, quark_mfd);
-
         ret = intel_quark_i2c_setup(pdev, &intel_quark_mfd_cells[1]);
         if (ret)
-               return ret;
+               goto err_unregister_i2c_clk;
  
         ret = intel_quark_gpio_setup(pdev, &intel_quark_mfd_cells[0]);
         if (ret)
-               return ret;
+               goto err_unregister_i2c_clk;
+
+       ret = mfd_add_devices(&pdev->dev, 0, intel_quark_mfd_cells,
+                             ARRAY_SIZE(intel_quark_mfd_cells), NULL, 0,
+                             NULL);
+       if (ret)
+               goto err_unregister_i2c_clk;
+
+       return 0;
  
-       return mfd_add_devices(&pdev->dev, 0, intel_quark_mfd_cells,
-                              ARRAY_SIZE(intel_quark_mfd_cells), NULL, 0,
-                              NULL);
+err_unregister_i2c_clk:
+       intel_quark_unregister_i2c_clk(&pdev->dev);
+       return ret;
  }
  
  static void intel_quark_mfd_remove(struct pci_dev *pdev)
  {
-       intel_quark_unregister_i2c_clk(pdev);
+       intel_quark_unregister_i2c_clk(&pdev->dev);
         mfd_remove_devices(&pdev->dev);
  }
  
diff --git a/drivers/mfd/intel_soc_pmic_core.c b/drivers/mfd/intel_soc_pmic_core.c

index d9e15cf7c6c8839ba09b6e505730f62b5820798d..12d6ebb4ae5d5bfb5c9e1a753dbc8f81d5194e2d 100644 (file)
--- a/drivers/mfd/intel_soc_pmic_core.c
+++ b/drivers/mfd/intel_soc_pmic_core.c
@@ -35,6 +35,7 @@ static struct gpiod_lookup_table panel_gpio_table = {
         .table = {
                 /* Panel EN/DISABLE */
                 GPIO_LOOKUP("gpio_crystalcove", 94, "panel", GPIO_ACTIVE_HIGH),
+               { },
         },
  };
  
diff --git a/drivers/mfd/lp3943.c b/drivers/mfd/lp3943.c

index eecbb13de1bd36202b1e9def49b27ff17f2da73e..65a2a8f14e74d9f330092c9bbd7168d750ac1b38 100644 (file)
--- a/drivers/mfd/lp3943.c
+++ b/drivers/mfd/lp3943.c
@@ -123,16 +123,9 @@ static int lp3943_probe(struct i2c_client *cl, const struct i2c_device_id *id)
         lp3943->mux_cfg = lp3943_mux_cfg;
         i2c_set_clientdata(cl, lp3943);
  
-       return mfd_add_devices(dev, -1, lp3943_devs, ARRAY_SIZE(lp3943_devs),
-                              NULL, 0, NULL);
-}
-
-static int lp3943_remove(struct i2c_client *cl)
-{
-       struct lp3943 *lp3943 = i2c_get_clientdata(cl);
-
-       mfd_remove_devices(lp3943->dev);
-       return 0;
+       return devm_mfd_add_devices(dev, -1, lp3943_devs,
+                                   ARRAY_SIZE(lp3943_devs),
+                                   NULL, 0, NULL);
  }
  
  static const struct i2c_device_id lp3943_ids[] = {
@@ -151,7 +144,6 @@ MODULE_DEVICE_TABLE(of, lp3943_of_match);
  
  static struct i2c_driver lp3943_driver = {
         .probe = lp3943_probe,
-       .remove = lp3943_remove,
         .driver = {
                 .name = "lp3943",
                 .of_match_table = of_match_ptr(lp3943_of_match),
diff --git a/drivers/mfd/lp8788-irq.c b/drivers/mfd/lp8788-irq.c

index c7a9825aa4ce42a8f358834b529657374e2a47b8..792d51bae20f5109c5b42d99c652d004e8d332f9 100644 (file)
--- a/drivers/mfd/lp8788-irq.c
+++ b/drivers/mfd/lp8788-irq.c
@@ -112,7 +112,7 @@ static irqreturn_t lp8788_irq_handler(int irq, void *ptr)
         struct lp8788_irq_data *irqd = ptr;
         struct lp8788 *lp = irqd->lp;
         u8 status[NUM_REGS], addr, mask;
-       bool handled;
+       bool handled = false;
         int i;
  
         if (lp8788_read_multi_bytes(lp, LP8788_INT_1, status, NUM_REGS))
diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c

new file mode 100644 (file)

index 0000000..199d261
--- /dev/null
+++ b/drivers/mfd/max77620.c
@@ -0,0 +1,590 @@
+/*
+ * Maxim MAX77620 MFD Driver
+ *
+ * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved.
+ *
+ * Author:
+ *     Laxman Dewangan <ldewangan@nvidia.com>
+ *     Chaitanya Bandi <bandik@nvidia.com>
+ *     Mallikarjun Kasoju <mkasoju@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/****************** Terminology used in driver ********************
+ * Here is some terminology used from the datasheet for quick reference:
+ * Flexible Power Sequence (FPS):
+ * The Flexible Power Sequencer (FPS) allows each regulator to power up under
+ * hardware or software control. Additionally, each regulator can power on
+ * independently or among a group of other regulators with adjustable
+ * power-up and power-down delays (sequencing). GPIO1, GPIO2, and GPIO3 can
+ * be programmed to be part of a sequence allowing external regulators to be
+ * sequenced along with internal regulators. 32KHz clock can be programmed to
+ * be part of a sequence.
+ * There are 3 FPS configuration registers and all resources are configured to
+ * any of these FPS or no FPS.
+ */
+
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/max77620.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+/* IRQ resource attached to the "max77620-gpio" cell. */
+static struct resource gpio_resources[] = {
+       DEFINE_RES_IRQ(MAX77620_IRQ_TOP_GPIO),
+};
+
+/* IRQ resource attached to the "max77620-power" / "max20024-power" cells. */
+static struct resource power_resources[] = {
+       DEFINE_RES_IRQ(MAX77620_IRQ_LBT_MBATLOW),
+};
+
+/* IRQ resource attached to the "max77620-rtc" cell. */
+static struct resource rtc_resources[] = {
+       DEFINE_RES_IRQ(MAX77620_IRQ_TOP_RTC),
+};
+
+/* IRQ resources attached to the "max77620-thermal" cell. */
+static struct resource thermal_resources[] = {
+       DEFINE_RES_IRQ(MAX77620_IRQ_LBT_TJALRM1),
+       DEFINE_RES_IRQ(MAX77620_IRQ_LBT_TJALRM2),
+};
+
+/*
+ * Interrupts exposed by the top-level IRQ chip. The TOP_* entries use
+ * register index 0 and the LBT_* entries use register index 1, matching
+ * the two registers (num_regs = 2) of max77620_top_irq_chip.
+ */
+static const struct regmap_irq max77620_top_irqs[] = {
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_GLBL, 0, MAX77620_IRQ_TOP_GLBL_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_SD, 0, MAX77620_IRQ_TOP_SD_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_LDO, 0, MAX77620_IRQ_TOP_LDO_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_GPIO, 0, MAX77620_IRQ_TOP_GPIO_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_RTC, 0, MAX77620_IRQ_TOP_RTC_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_32K, 0, MAX77620_IRQ_TOP_32K_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_TOP_ONOFF, 0, MAX77620_IRQ_TOP_ONOFF_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_LBT_MBATLOW, 1, MAX77620_IRQ_LBM_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_LBT_TJALRM1, 1, MAX77620_IRQ_TJALRM1_MASK),
+       REGMAP_IRQ_REG(MAX77620_IRQ_LBT_TJALRM2, 1, MAX77620_IRQ_TJALRM2_MASK),
+};
+
+/*
+ * MFD cells selected by max77620_probe() when chip_id is MAX77620. The
+ * gpio, rtc, power and thermal cells carry IRQ resources; the others have
+ * none.
+ */
+static const struct mfd_cell max77620_children[] = {
+       { .name = "max77620-pinctrl", },
+       { .name = "max77620-clock", },
+       { .name = "max77620-pmic", },
+       { .name = "max77620-watchdog", },
+       {
+               .name = "max77620-gpio",
+               .resources = gpio_resources,
+               .num_resources = ARRAY_SIZE(gpio_resources),
+       }, {
+               .name = "max77620-rtc",
+               .resources = rtc_resources,
+               .num_resources = ARRAY_SIZE(rtc_resources),
+       }, {
+               .name = "max77620-power",
+               .resources = power_resources,
+               .num_resources = ARRAY_SIZE(power_resources),
+       }, {
+               .name = "max77620-thermal",
+               .resources = thermal_resources,
+               .num_resources = ARRAY_SIZE(thermal_resources),
+       },
+};
+
+/*
+ * MFD cells selected by max77620_probe() when chip_id is MAX20024. The
+ * pinctrl, pmic and power cells use max20024-specific names, the remaining
+ * cells reuse the max77620 names, and no thermal cell is listed.
+ */
+static const struct mfd_cell max20024_children[] = {
+       { .name = "max20024-pinctrl", },
+       { .name = "max77620-clock", },
+       { .name = "max20024-pmic", },
+       { .name = "max77620-watchdog", },
+       {
+               .name = "max77620-gpio",
+               .resources = gpio_resources,
+               .num_resources = ARRAY_SIZE(gpio_resources),
+       }, {
+               .name = "max77620-rtc",
+               .resources = rtc_resources,
+               .num_resources = ARRAY_SIZE(rtc_resources),
+       }, {
+               .name = "max20024-power",
+               .resources = power_resources,
+               .num_resources = ARRAY_SIZE(power_resources),
+       },
+};
+
+/*
+ * regmap-irq description of the top-level interrupt controller: two
+ * registers, with status starting at MAX77620_REG_IRQTOP and mask starting
+ * at MAX77620_REG_IRQTOPM.
+ */
+static struct regmap_irq_chip max77620_top_irq_chip = {
+       .name = "max77620-top",
+       .irqs = max77620_top_irqs,
+       .num_irqs = ARRAY_SIZE(max77620_top_irqs),
+       .num_regs = 2,
+       .status_base = MAX77620_REG_IRQTOP,
+       .mask_base = MAX77620_REG_IRQTOPM,
+};
+
+/* Readable registers on MAX77620: CNFGGLBL1 through DVSSD4. */
+static const struct regmap_range max77620_readable_ranges[] = {
+       regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4),
+};
+
+static const struct regmap_access_table max77620_readable_table = {
+       .yes_ranges = max77620_readable_ranges,
+       .n_yes_ranges = ARRAY_SIZE(max77620_readable_ranges),
+};
+
+/*
+ * Readable registers on MAX20024: the MAX77620 range plus the single
+ * register MAX20024_REG_MAX_ADD.
+ */
+static const struct regmap_range max20024_readable_ranges[] = {
+       regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4),
+       regmap_reg_range(MAX20024_REG_MAX_ADD, MAX20024_REG_MAX_ADD),
+};
+
+static const struct regmap_access_table max20024_readable_table = {
+       .yes_ranges = max20024_readable_ranges,
+       .n_yes_ranges = ARRAY_SIZE(max20024_readable_ranges),
+};
+
+/* Writable registers, shared by both regmap configs in this file. */
+static const struct regmap_range max77620_writable_ranges[] = {
+       regmap_reg_range(MAX77620_REG_CNFGGLBL1, MAX77620_REG_DVSSD4),
+};
+
+static const struct regmap_access_table max77620_writable_table = {
+       .yes_ranges = max77620_writable_ranges,
+       .n_yes_ranges = ARRAY_SIZE(max77620_writable_ranges),
+};
+
+/*
+ * Register ranges that may be cached. They are listed as the no_ranges of
+ * the volatile table below, i.e. they are excluded from being volatile.
+ */
+static const struct regmap_range max77620_cacheable_ranges[] = {
+       regmap_reg_range(MAX77620_REG_SD0_CFG, MAX77620_REG_LDO_CFG3),
+       regmap_reg_range(MAX77620_REG_FPS_CFG0, MAX77620_REG_FPS_SD3),
+};
+
+static const struct regmap_access_table max77620_volatile_table = {
+       .no_ranges = max77620_cacheable_ranges,
+       .n_no_ranges = ARRAY_SIZE(max77620_cacheable_ranges),
+};
+
+/*
+ * regmap configuration chosen by max77620_probe() for MAX77620: 8-bit
+ * registers and values with an rbtree register cache.
+ */
+static const struct regmap_config max77620_regmap_config = {
+       .name = "power-slave",
+       .reg_bits = 8,
+       .val_bits = 8,
+       .max_register = MAX77620_REG_DVSSD4 + 1,
+       .cache_type = REGCACHE_RBTREE,
+       .rd_table = &max77620_readable_table,
+       .wr_table = &max77620_writable_table,
+       .volatile_table = &max77620_volatile_table,
+};
+
+/*
+ * regmap configuration chosen by max77620_probe() for MAX20024. It differs
+ * from the MAX77620 one only in max_register and the readable table; the
+ * writable and volatile tables are shared.
+ */
+static const struct regmap_config max20024_regmap_config = {
+       .name = "power-slave",
+       .reg_bits = 8,
+       .val_bits = 8,
+       .max_register = MAX20024_REG_MAX_ADD + 1,
+       .cache_type = REGCACHE_RBTREE,
+       .rd_table = &max20024_readable_table,
+       .wr_table = &max77620_writable_table,
+       .volatile_table = &max77620_volatile_table,
+};
+
+/* max77620_get_fps_period_reg_value:  Get FPS bit field value from
+ *                                    requested periods.
+ * MAX77620 supports the FPS period of 40, 80, 160, 320, 640, 1280, 2560
+ * and 5120 microseconds. MAX20024 supports the FPS period of 20, 40, 80,
+ * 160, 320, 640, 1280 and 2560 microseconds.
+ * The FPS register has 3 bits field to set the FPS period as
+ * bits                max77620                max20024
+ * 000         40                      20
+ * 001         80                      40
+ * :::
+ *
+ * @chip:    driver state; chip_id selects the minimum period.
+ * @tperiod: requested period in microseconds.
+ *
+ * Starting from the chip's minimum period and doubling at each step,
+ * returns the index (0..7) of the first period that is not smaller than
+ * @tperiod, or 7 if none is. Returns -EINVAL for an unknown chip_id.
+ */
+static int max77620_get_fps_period_reg_value(struct max77620_chip *chip,
+                                            int tperiod)
+{
+       int fps_min_period;
+       int i;
+
+       switch (chip->chip_id) {
+       case MAX20024:
+               fps_min_period = MAX20024_FPS_PERIOD_MIN_US;
+               break;
+       case MAX77620:
+               fps_min_period = MAX77620_FPS_PERIOD_MIN_US;
+               break;  /* without this, MAX77620 fell into -EINVAL */
+       default:
+               return -EINVAL;
+       }
+
+       for (i = 0; i < 7; i++) {
+               if (fps_min_period >= tperiod)
+                       return i;
+               fps_min_period *= 2;
+       }
+
+       return i;
+}
+
+/* max77620_config_fps: Configure FPS configuration registers
+ *                     based on platform specific information.
+ *
+ * @chip:   driver state; chip_id selects the maximum FPS period.
+ * @fps_np: device-tree node whose name must be "fps<N>" with
+ *          0 <= N < MAX77620_FPS_COUNT.
+ *
+ * Reads the optional properties "maxim,shutdown-fps-time-period-us",
+ * "maxim,suspend-fps-time-period-us", "maxim,fps-event-source" and
+ * "maxim,device-state-on-disabled-event", records the periods in @chip and
+ * updates register MAX77620_REG_FPS_CFG0 + N with the collected bits.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int max77620_config_fps(struct max77620_chip *chip,
+                              struct device_node *fps_np)
+{
+       struct device *dev = chip->dev;
+       unsigned int mask = 0, config = 0;
+       u32 fps_max_period;
+       u32 param_val;
+       int tperiod, fps_id;
+       int ret;
+       char fps_name[10];
+
+       switch (chip->chip_id) {
+       case MAX20024:
+               fps_max_period = MAX20024_FPS_PERIOD_MAX_US;
+               break;
+       case MAX77620:
+               fps_max_period = MAX77620_FPS_PERIOD_MAX_US;
+               break;  /* without this, MAX77620 fell into -EINVAL */
+       default:
+               return -EINVAL;
+       }
+
+       /* Map the node name "fps<N>" to its index N */
+       for (fps_id = 0; fps_id < MAX77620_FPS_COUNT; fps_id++) {
+               snprintf(fps_name, sizeof(fps_name), "fps%d", fps_id);
+               if (!strcmp(fps_np->name, fps_name))
+                       break;
+       }
+
+       if (fps_id == MAX77620_FPS_COUNT) {
+               dev_err(dev, "FPS node name %s is not valid\n", fps_np->name);
+               return -EINVAL;
+       }
+
+       ret = of_property_read_u32(fps_np, "maxim,shutdown-fps-time-period-us",
+                                  &param_val);
+       if (!ret) {
+               mask |= MAX77620_FPS_TIME_PERIOD_MASK;
+               chip->shutdown_fps_period[fps_id] = min(param_val,
+                                                       fps_max_period);
+               tperiod = max77620_get_fps_period_reg_value(chip,
+                               chip->shutdown_fps_period[fps_id]);
+               /* Never shift an error code into the register value */
+               if (tperiod < 0)
+                       return tperiod;
+               config |= tperiod << MAX77620_FPS_TIME_PERIOD_SHIFT;
+       }
+
+       ret = of_property_read_u32(fps_np, "maxim,suspend-fps-time-period-us",
+                                  &param_val);
+       if (!ret)
+               chip->suspend_fps_period[fps_id] = min(param_val,
+                                                      fps_max_period);
+
+       ret = of_property_read_u32(fps_np, "maxim,fps-event-source",
+                                  &param_val);
+       if (!ret) {
+               if (param_val > 2) {
+                       dev_err(dev, "FPS%d event-source invalid\n", fps_id);
+                       return -EINVAL;
+               }
+               mask |= MAX77620_FPS_EN_SRC_MASK;
+               config |= param_val << MAX77620_FPS_EN_SRC_SHIFT;
+               /* Source value 2 additionally sets the ENFPS_SW bit */
+               if (param_val == 2) {
+                       mask |= MAX77620_FPS_ENFPS_SW_MASK;
+                       config |= MAX77620_FPS_ENFPS_SW;
+               }
+       }
+
+       /* Only the first FPS node that specifies a device state decides it */
+       if (!chip->sleep_enable && !chip->enable_global_lpm) {
+               ret = of_property_read_u32(fps_np,
+                               "maxim,device-state-on-disabled-event",
+                               &param_val);
+               if (!ret) {
+                       if (param_val == 0)
+                               chip->sleep_enable = true;
+                       else if (param_val == 1)
+                               chip->enable_global_lpm = true;
+               }
+       }
+
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_FPS_CFG0 + fps_id,
+                                mask, config);
+       if (ret < 0) {
+               dev_err(dev, "Failed to update FPS CFG: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+/* max77620_initialise_fps: Apply the FPS settings found in device tree.
+ *
+ * Marks all shutdown/suspend FPS periods as unset (-1), then, if the
+ * device node has an "fps" child, configures every child of that node via
+ * max77620_config_fps() and programs the SLP_LPM bit from
+ * chip->enable_global_lpm. In all cases the WK_EN0 bit is set, and for
+ * MAX20024 with sleep_enable the SLPEN and CLRSE bits are set as well.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int max77620_initialise_fps(struct max77620_chip *chip)
+{
+       struct device *dev = chip->dev;
+       struct device_node *fps_np, *fps_child;
+       u8 config;
+       int fps_id;
+       int ret;
+
+       for (fps_id = 0; fps_id < MAX77620_FPS_COUNT; fps_id++) {
+               chip->shutdown_fps_period[fps_id] = -1;
+               chip->suspend_fps_period[fps_id] = -1;
+       }
+
+       fps_np = of_get_child_by_name(dev->of_node, "fps");
+       if (!fps_np)
+               goto skip_fps;
+
+       for_each_child_of_node(fps_np, fps_child) {
+               ret = max77620_config_fps(chip, fps_child);
+               if (ret < 0) {
+                       /*
+                        * Leaving the iterator early keeps a reference on
+                        * the current child; drop it and the parent's.
+                        */
+                       of_node_put(fps_child);
+                       of_node_put(fps_np);
+                       return ret;
+               }
+       }
+       /* Drop the reference taken by of_get_child_by_name() */
+       of_node_put(fps_np);
+
+       config = chip->enable_global_lpm ? MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0;
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+                                MAX77620_ONOFFCNFG2_SLP_LPM_MSK, config);
+       if (ret < 0) {
+               dev_err(dev, "Failed to update SLP_LPM: %d\n", ret);
+               return ret;
+       }
+
+skip_fps:
+       /* Enable wake on EN0 pin */
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+                                MAX77620_ONOFFCNFG2_WK_EN0,
+                                MAX77620_ONOFFCNFG2_WK_EN0);
+       if (ret < 0) {
+               dev_err(dev, "Failed to update WK_EN0: %d\n", ret);
+               return ret;
+       }
+
+       /* For MAX20024, SLPEN will be POR reset if CLRSE is b11 */
+       if ((chip->chip_id == MAX20024) && chip->sleep_enable) {
+               config = MAX77620_ONOFFCNFG1_SLPEN | MAX20024_ONOFFCNFG1_CLRSE;
+               ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG1,
+                                        config, config);
+               if (ret < 0) {
+                       dev_err(dev, "Failed to update SLPEN: %d\n", ret);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static int max77620_read_es_version(struct max77620_chip *chip)
+{
+       unsigned int val;
+       u8 cid_val[6];
+       int i;
+       int ret;
+
+       for (i = MAX77620_REG_CID0; i <= MAX77620_REG_CID5; i++) {
+               ret = regmap_read(chip->rmap, i, &val);
+               if (ret < 0) {
+                       dev_err(chip->dev, "Failed to read CID: %d\n", ret);
+                       return ret;
+               }
+               dev_dbg(chip->dev, "CID%d: 0x%02x\n",
+                       i - MAX77620_REG_CID0, val);
+               cid_val[i - MAX77620_REG_CID0] = val;
+       }
+
+       /* CID4 is OTP Version and CID5 is ES version */
+       dev_info(chip->dev, "PMIC Version OTP:0x%02X and ES:0x%X\n",
+                cid_val[4], MAX77620_CID5_DIDM(cid_val[5]));
+
+       return ret;
+}
+
+static int max77620_probe(struct i2c_client *client,
+                         const struct i2c_device_id *id)
+{
+       const struct regmap_config *rmap_config;
+       struct max77620_chip *chip;
+       const struct mfd_cell *mfd_cells;
+       int n_mfd_cells;
+       int ret;
+
+       chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL);
+       if (!chip)
+               return -ENOMEM;
+
+       i2c_set_clientdata(client, chip);
+       chip->dev = &client->dev;
+       chip->irq_base = -1;
+       chip->chip_irq = client->irq;
+       chip->chip_id = (enum max77620_chip_id)id->driver_data;
+
+       switch (chip->chip_id) {
+       case MAX77620:
+               mfd_cells = max77620_children;
+               n_mfd_cells = ARRAY_SIZE(max77620_children);
+               rmap_config = &max77620_regmap_config;
+               break;
+       case MAX20024:
+               mfd_cells = max20024_children;
+               n_mfd_cells = ARRAY_SIZE(max20024_children);
+               rmap_config = &max20024_regmap_config;
+               break;
+       default:
+               dev_err(chip->dev, "ChipID is invalid %d\n", chip->chip_id);
+               return -EINVAL;
+       }
+
+       chip->rmap = devm_regmap_init_i2c(client, rmap_config);
+       if (IS_ERR(chip->rmap)) {
+               ret = PTR_ERR(chip->rmap);
+               dev_err(chip->dev, "Failed to intialise regmap: %d\n", ret);
+               return ret;
+       }
+
+       ret = max77620_read_es_version(chip);
+       if (ret < 0)
+               return ret;
+
+       ret = devm_regmap_add_irq_chip(chip->dev, chip->rmap, client->irq,
+                                      IRQF_ONESHOT | IRQF_SHARED,
+                                      chip->irq_base, &max77620_top_irq_chip,
+                                      &chip->top_irq_data);
+       if (ret < 0) {
+               dev_err(chip->dev, "Failed to add regmap irq: %d\n", ret);
+               return ret;
+       }
+
+       ret = max77620_initialise_fps(chip);
+       if (ret < 0)
+               return ret;
+
+       ret =  devm_mfd_add_devices(chip->dev, PLATFORM_DEVID_NONE,
+                                   mfd_cells, n_mfd_cells, NULL, 0,
+                                   regmap_irq_get_domain(chip->top_irq_data));
+       if (ret < 0) {
+               dev_err(chip->dev, "Failed to add MFD children: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int max77620_set_fps_period(struct max77620_chip *chip,
+                                  int fps_id, int time_period)
+{
+       int period = max77620_get_fps_period_reg_value(chip, time_period);
+       int ret;
+
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_FPS_CFG0 + fps_id,
+                                MAX77620_FPS_TIME_PERIOD_MASK,
+                                period << MAX77620_FPS_TIME_PERIOD_SHIFT);
+       if (ret < 0) {
+               dev_err(chip->dev, "Failed to update FPS period: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int max77620_i2c_suspend(struct device *dev)
+{
+       struct max77620_chip *chip = dev_get_drvdata(dev);
+       struct i2c_client *client = to_i2c_client(dev);
+       unsigned int config;
+       int fps;
+       int ret;
+
+       for (fps = 0; fps < MAX77620_FPS_COUNT; fps++) {
+               if (chip->suspend_fps_period[fps] < 0)
+                       continue;
+
+               ret = max77620_set_fps_period(chip, fps,
+                                             chip->suspend_fps_period[fps]);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /*
+        * For MAX20024: No need to configure SLPEN on suspend as
+        * it will be configured on Init.
+        */
+       if (chip->chip_id == MAX20024)
+               goto out;
+
+       config = (chip->sleep_enable) ? MAX77620_ONOFFCNFG1_SLPEN : 0;
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG1,
+                                MAX77620_ONOFFCNFG1_SLPEN,
+                                config);
+       if (ret < 0) {
+               dev_err(dev, "Failed to configure sleep in suspend: %d\n", ret);
+               return ret;
+       }
+
+       /* Disable WK_EN0 */
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+                                MAX77620_ONOFFCNFG2_WK_EN0, 0);
+       if (ret < 0) {
+               dev_err(dev, "Failed to configure WK_EN in suspend: %d\n", ret);
+               return ret;
+       }
+
+out:
+       disable_irq(client->irq);
+
+       return 0;
+}
+
+static int max77620_i2c_resume(struct device *dev)
+{
+       struct max77620_chip *chip = dev_get_drvdata(dev);
+       struct i2c_client *client = to_i2c_client(dev);
+       int ret;
+       int fps;
+
+       for (fps = 0; fps < MAX77620_FPS_COUNT; fps++) {
+               if (chip->shutdown_fps_period[fps] < 0)
+                       continue;
+
+               ret = max77620_set_fps_period(chip, fps,
+                                             chip->shutdown_fps_period[fps]);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /*
+        * For MAX20024: No need to configure WK_EN0 on resume as
+        * it is configured on Init.
+        */
+       if (chip->chip_id == MAX20024)
+               goto out;
+
+       /* Enable WK_EN0 */
+       ret = regmap_update_bits(chip->rmap, MAX77620_REG_ONOFFCNFG2,
+                                MAX77620_ONOFFCNFG2_WK_EN0,
+                                MAX77620_ONOFFCNFG2_WK_EN0);
+       if (ret < 0) {
+               dev_err(dev, "Failed to configure WK_EN0 n resume: %d\n", ret);
+               return ret;
+       }
+
+out:
+       enable_irq(client->irq);
+
+       return 0;
+}
+#endif
+
+static const struct i2c_device_id max77620_id[] = {
+       {"max77620", MAX77620},
+       {"max20024", MAX20024},
+       {},
+};
+MODULE_DEVICE_TABLE(i2c, max77620_id);
+
+static const struct dev_pm_ops max77620_pm_ops = {
+       SET_SYSTEM_SLEEP_PM_OPS(max77620_i2c_suspend, max77620_i2c_resume)
+};
+
+static struct i2c_driver max77620_driver = {
+       .driver = {
+               .name = "max77620",
+               .pm = &max77620_pm_ops,
+       },
+       .probe = max77620_probe,
+       .id_table = max77620_id,
+};
+
+module_i2c_driver(max77620_driver);
+
+MODULE_DESCRIPTION("MAX77620/MAX20024 Multi Function Device Core Driver");
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
+MODULE_AUTHOR("Chaitanya Bandi <bandik@nvidia.com>");
+MODULE_AUTHOR("Mallikarjun Kasoju <mkasoju@nvidia.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/max77686.c b/drivers/mfd/max77686.c

index c1aff46e89d9e1cf18d14edb31738181e53c4048..7b68ed72e9cbfcc541d4d1f50a42b7d63d3bdbd6 100644 (file)
--- a/drivers/mfd/max77686.c
+++ b/drivers/mfd/max77686.c
@@ -2,7 +2,7 @@
   * max77686.c - mfd core driver for the Maxim 77686/802
   *
   * Copyright (C) 2012 Samsung Electronics
- * Chiwoong Byun <woong.byun@smasung.com>
+ * Chiwoong Byun <woong.byun@samsung.com>
   * Jonghwa Lee <jonghwa3.lee@samsung.com>
   *
   * This program is free software; you can redistribute it and/or modify
@@ -230,37 +230,23 @@ static int max77686_i2c_probe(struct i2c_client *i2c,
                 return -ENODEV;
         }
  
-       ret = regmap_add_irq_chip(max77686->regmap, max77686->irq,
-                                 IRQF_TRIGGER_FALLING | IRQF_ONESHOT |
-                                 IRQF_SHARED, 0, irq_chip,
-                                 &max77686->irq_data);
+       ret = devm_regmap_add_irq_chip(&i2c->dev, max77686->regmap,
+                                      max77686->irq,
+                                      IRQF_TRIGGER_FALLING | IRQF_ONESHOT |
+                                      IRQF_SHARED, 0, irq_chip,
+                                      &max77686->irq_data);
         if (ret < 0) {
                 dev_err(&i2c->dev, "failed to add PMIC irq chip: %d\n", ret);
                 return ret;
         }
  
-       ret = mfd_add_devices(max77686->dev, -1, cells, n_devs, NULL, 0, NULL);
+       ret = devm_mfd_add_devices(max77686->dev, -1, cells, n_devs, NULL,
+                                  0, NULL);
         if (ret < 0) {
                 dev_err(&i2c->dev, "failed to add MFD devices: %d\n", ret);
-               goto err_del_irqc;
+               return ret;
         }
  
-       return 0;
-
-err_del_irqc:
-       regmap_del_irq_chip(max77686->irq, max77686->irq_data);
-
-       return ret;
-}
-
-static int max77686_i2c_remove(struct i2c_client *i2c)
-{
-       struct max77686_dev *max77686 = i2c_get_clientdata(i2c);
-
-       mfd_remove_devices(max77686->dev);
-
-       regmap_del_irq_chip(max77686->irq, max77686->irq_data);
-
         return 0;
  }
  
@@ -317,22 +303,10 @@ static struct i2c_driver max77686_i2c_driver = {
                    .of_match_table = of_match_ptr(max77686_pmic_dt_match),
         },
         .probe = max77686_i2c_probe,
-       .remove = max77686_i2c_remove,
         .id_table = max77686_i2c_id,
  };
  
-static int __init max77686_i2c_init(void)
-{
-       return i2c_add_driver(&max77686_i2c_driver);
-}
-/* init early so consumer devices can complete system boot */
-subsys_initcall(max77686_i2c_init);
-
-static void __exit max77686_i2c_exit(void)
-{
-       i2c_del_driver(&max77686_i2c_driver);
-}
-module_exit(max77686_i2c_exit);
+module_i2c_driver(max77686_i2c_driver);
  
  MODULE_DESCRIPTION("MAXIM 77686/802 multi-function core driver");
  MODULE_AUTHOR("Chiwoong Byun <woong.byun@samsung.com>");
diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c

index b83b7a7da1ae0021acbcffccedcf5bf169c2a02e..662ae0d9e33497e2eac861de437386009315a335 100644 (file)
--- a/drivers/mfd/max77693.c
+++ b/drivers/mfd/max77693.c
@@ -2,7 +2,7 @@
   * max77693.c - mfd core driver for the MAX 77693
   *
   * Copyright (C) 2012 Samsung Electronics
- * SangYoung Son <hello.son@smasung.com>
+ * SangYoung Son <hello.son@samsung.com>
   *
   * This program is not provided / owned by Maxim Integrated Products.
   *
@@ -368,6 +368,7 @@ static const struct of_device_id max77693_dt_match[] = {
         { .compatible = "maxim,max77693" },
         {},
  };
+MODULE_DEVICE_TABLE(of, max77693_dt_match);
  #endif
  
  static struct i2c_driver max77693_i2c_driver = {
@@ -381,18 +382,7 @@ static struct i2c_driver max77693_i2c_driver = {
         .id_table = max77693_i2c_id,
  };
  
-static int __init max77693_i2c_init(void)
-{
-       return i2c_add_driver(&max77693_i2c_driver);
-}
-/* init early so consumer devices can complete system boot */
-subsys_initcall(max77693_i2c_init);
-
-static void __exit max77693_i2c_exit(void)
-{
-       i2c_del_driver(&max77693_i2c_driver);
-}
-module_exit(max77693_i2c_exit);
+module_i2c_driver(max77693_i2c_driver);
  
  MODULE_DESCRIPTION("MAXIM 77693 multi-function core driver");
  MODULE_AUTHOR("SangYoung, Son <hello.son@samsung.com>");
diff --git a/drivers/mfd/menf21bmc.c b/drivers/mfd/menf21bmc.c

index 1c274345820ccb47057f00bec5fb6d32b1f9d321..3ad2def947d8b04ff2871bc209d6562cd7011c2b 100644 (file)
--- a/drivers/mfd/menf21bmc.c
+++ b/drivers/mfd/menf21bmc.c
@@ -96,8 +96,8 @@ menf21bmc_probe(struct i2c_client *client, const struct i2c_device_id *ids)
                 return ret;
         }
  
-       ret = mfd_add_devices(&client->dev, 0, menf21bmc_cell,
-                             ARRAY_SIZE(menf21bmc_cell), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(&client->dev, 0, menf21bmc_cell,
+                                  ARRAY_SIZE(menf21bmc_cell), NULL, 0, NULL);
         if (ret < 0) {
                 dev_err(&client->dev, "failed to add BMC sub-devices\n");
                 return ret;
@@ -106,12 +106,6 @@ menf21bmc_probe(struct i2c_client *client, const struct i2c_device_id *ids)
         return 0;
  }
  
-static int menf21bmc_remove(struct i2c_client *client)
-{
-       mfd_remove_devices(&client->dev);
-       return 0;
-}
-
  static const struct i2c_device_id menf21bmc_id_table[] = {
         { "menf21bmc" },
         { }
@@ -122,7 +116,6 @@ static struct i2c_driver menf21bmc_driver = {
         .driver.name    = "menf21bmc",
         .id_table       = menf21bmc_id_table,
         .probe          = menf21bmc_probe,
-       .remove         = menf21bmc_remove,
  };
  
  module_i2c_driver(menf21bmc_driver);
diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c

index fc1c1fc138133d86053ee32a2a7a7b96cd255145..3ac486a597f3c31e8e362f1f9954098cdf081086 100644 (file)
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -107,7 +107,7 @@ static void mfd_acpi_add_device(const struct mfd_cell *cell,
  
                         strlcpy(ids[0].id, match->pnpid, sizeof(ids[0].id));
                         list_for_each_entry(child, &parent->children, node) {
-                               if (acpi_match_device_ids(child, ids)) {
+                               if (!acpi_match_device_ids(child, ids)) {
                                         adev = child;
                                         break;
                                 }
@@ -334,6 +334,44 @@ void mfd_remove_devices(struct device *parent)
  }
  EXPORT_SYMBOL(mfd_remove_devices);
  
+static void devm_mfd_dev_release(struct device *dev, void *res)
+{
+       mfd_remove_devices(dev);
+}
+
+/**
+ * devm_mfd_add_devices - Resource managed version of mfd_add_devices()
+ *
+ * Returns 0 on success or an appropriate negative error number on failure.
+ * All child-devices of the MFD will automatically be removed when it gets
+ * unbound.
+ */
+int devm_mfd_add_devices(struct device *dev, int id,
+                        const struct mfd_cell *cells, int n_devs,
+                        struct resource *mem_base,
+                        int irq_base, struct irq_domain *domain)
+{
+       struct device **ptr;
+       int ret;
+
+       ptr = devres_alloc(devm_mfd_dev_release, sizeof(*ptr), GFP_KERNEL);
+       if (!ptr)
+               return -ENOMEM;
+
+       ret = mfd_add_devices(dev, id, cells, n_devs, mem_base,
+                             irq_base, domain);
+       if (ret < 0) {
+               devres_free(ptr);
+               return ret;
+       }
+
+       *ptr = dev;
+       devres_add(dev, ptr);
+
+       return ret;
+}
+EXPORT_SYMBOL(devm_mfd_add_devices);
+
  int mfd_clone_cell(const char *cell, const char **clones, size_t n_clones)
  {
         struct mfd_cell cell_entry;
diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c

index 8e8d93249c09794829f148bfa0428581082dd06b..e14d8b058f0c24b448af028e7911ab99c1732888 100644 (file)
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -267,17 +267,26 @@ static int mt6397_probe(struct platform_device *pdev)
         ret = regmap_read(pmic->regmap, MT6397_CID, &id);
         if (ret) {
                 dev_err(pmic->dev, "Failed to read chip id: %d\n", ret);
-               goto fail_irq;
+               return ret;
         }
  
+       pmic->irq = platform_get_irq(pdev, 0);
+       if (pmic->irq <= 0)
+               return pmic->irq;
+
         switch (id & 0xff) {
         case MT6323_CID_CODE:
                 pmic->int_con[0] = MT6323_INT_CON0;
                 pmic->int_con[1] = MT6323_INT_CON1;
                 pmic->int_status[0] = MT6323_INT_STATUS0;
                 pmic->int_status[1] = MT6323_INT_STATUS1;
-               ret = mfd_add_devices(&pdev->dev, -1, mt6323_devs,
-                               ARRAY_SIZE(mt6323_devs), NULL, 0, NULL);
+               ret = mt6397_irq_init(pmic);
+               if (ret)
+                       return ret;
+
+               ret = devm_mfd_add_devices(&pdev->dev, -1, mt6323_devs,
+                                          ARRAY_SIZE(mt6323_devs), NULL,
+                                          0, NULL);
                 break;
  
         case MT6397_CID_CODE:
@@ -286,8 +295,13 @@ static int mt6397_probe(struct platform_device *pdev)
                 pmic->int_con[1] = MT6397_INT_CON1;
                 pmic->int_status[0] = MT6397_INT_STATUS0;
                 pmic->int_status[1] = MT6397_INT_STATUS1;
-               ret = mfd_add_devices(&pdev->dev, -1, mt6397_devs,
-                               ARRAY_SIZE(mt6397_devs), NULL, 0, NULL);
+               ret = mt6397_irq_init(pmic);
+               if (ret)
+                       return ret;
+
+               ret = devm_mfd_add_devices(&pdev->dev, -1, mt6397_devs,
+                                          ARRAY_SIZE(mt6397_devs), NULL,
+                                          0, NULL);
                 break;
  
         default:
@@ -296,14 +310,6 @@ static int mt6397_probe(struct platform_device *pdev)
                 break;
         }
  
-       pmic->irq = platform_get_irq(pdev, 0);
-       if (pmic->irq > 0) {
-               ret = mt6397_irq_init(pmic);
-               if (ret)
-                       return ret;
-       }
-
-fail_irq:
         if (ret) {
                 irq_domain_remove(pmic->irq_domain);
                 dev_err(&pdev->dev, "failed to add child devices: %d\n", ret);
@@ -312,13 +318,6 @@ fail_irq:
         return ret;
  }
  
-static int mt6397_remove(struct platform_device *pdev)
-{
-       mfd_remove_devices(&pdev->dev);
-
-       return 0;
-}
-
  static const struct of_device_id mt6397_of_match[] = {
         { .compatible = "mediatek,mt6397" },
         { .compatible = "mediatek,mt6323" },
@@ -334,7 +333,6 @@ MODULE_DEVICE_TABLE(platform, mt6397_id);
  
  static struct platform_driver mt6397_driver = {
         .probe = mt6397_probe,
-       .remove = mt6397_remove,
         .driver = {
                 .name = "mt6397",
                 .of_match_table = of_match_ptr(mt6397_of_match),
diff --git a/drivers/mfd/omap-usb-tll.c b/drivers/mfd/omap-usb-tll.c

index b7b3e8ee64f26a70cf9a494b929aaab9e3218852..c30290f334306d1a4ff5c8cf6170b03d6c1b6c6b 100644 (file)
--- a/drivers/mfd/omap-usb-tll.c
+++ b/drivers/mfd/omap-usb-tll.c
@@ -269,6 +269,8 @@ static int usbtll_omap_probe(struct platform_device *pdev)
  
                 if (IS_ERR(tll->ch_clk[i]))
                         dev_dbg(dev, "can't get clock : %s\n", clkname);
+               else
+                       clk_prepare(tll->ch_clk[i]);
         }
  
         pm_runtime_put_sync(dev);
@@ -301,9 +303,12 @@ static int usbtll_omap_remove(struct platform_device *pdev)
         tll_dev = NULL;
         spin_unlock(&tll_lock);
  
-       for (i = 0; i < tll->nch; i++)
-               if (!IS_ERR(tll->ch_clk[i]))
+       for (i = 0; i < tll->nch; i++) {
+               if (!IS_ERR(tll->ch_clk[i])) {
+                       clk_unprepare(tll->ch_clk[i]);
                         clk_put(tll->ch_clk[i]);
+               }
+       }
  
         pm_runtime_disable(&pdev->dev);
         return 0;
@@ -420,7 +425,7 @@ int omap_tll_enable(struct usbhs_omap_platform_data *pdata)
                         if (IS_ERR(tll->ch_clk[i]))
                                 continue;
  
-                       r = clk_prepare_enable(tll->ch_clk[i]);
+                       r = clk_enable(tll->ch_clk[i]);
                         if (r) {
                                 dev_err(tll_dev,
                                  "Error enabling ch %d clock: %d\n", i, r);
@@ -448,7 +453,7 @@ int omap_tll_disable(struct usbhs_omap_platform_data *pdata)
         for (i = 0; i < tll->nch; i++) {
                 if (omap_usb_mode_needs_tll(pdata->port_mode[i])) {
                         if (!IS_ERR(tll->ch_clk[i]))
-                               clk_disable_unprepare(tll->ch_clk[i]);
+                               clk_disable(tll->ch_clk[i]);
                 }
         }
  
diff --git a/drivers/mfd/rc5t583-irq.c b/drivers/mfd/rc5t583-irq.c

index 3f8812daa3045e89e67df777a17b38817800315b..f8dde59ea6af4d0aecb16cd0d6269df549693e57 100644 (file)
--- a/drivers/mfd/rc5t583-irq.c
+++ b/drivers/mfd/rc5t583-irq.c
@@ -389,17 +389,10 @@ int rc5t583_irq_init(struct rc5t583 *rc5t583, int irq, int irq_base)
                 irq_clear_status_flags(__irq, IRQ_NOREQUEST);
         }
  
-       ret = request_threaded_irq(irq, NULL, rc5t583_irq, IRQF_ONESHOT,
-                               "rc5t583", rc5t583);
+       ret = devm_request_threaded_irq(rc5t583->dev, irq, NULL, rc5t583_irq,
+                                       IRQF_ONESHOT, "rc5t583", rc5t583);
         if (ret < 0)
                 dev_err(rc5t583->dev,
                         "Error in registering interrupt error: %d\n", ret);
         return ret;
  }
-
-int rc5t583_irq_exit(struct rc5t583 *rc5t583)
-{
-       if (rc5t583->chip_irq)
-               free_irq(rc5t583->chip_irq, rc5t583);
-       return 0;
-}
diff --git a/drivers/mfd/rc5t583.c b/drivers/mfd/rc5t583.c

index fc2b2d93f354c269134191f2629cf8e646696c12..d12243d5ecb809b064be189d9db9b22e281336e0 100644 (file)
--- a/drivers/mfd/rc5t583.c
+++ b/drivers/mfd/rc5t583.c
@@ -252,7 +252,6 @@ static int rc5t583_i2c_probe(struct i2c_client *i2c,
         struct rc5t583 *rc5t583;
         struct rc5t583_platform_data *pdata = dev_get_platdata(&i2c->dev);
         int ret;
-       bool irq_init_success = false;
  
         if (!pdata) {
                 dev_err(&i2c->dev, "Err: Platform data not found\n");
@@ -284,32 +283,16 @@ static int rc5t583_i2c_probe(struct i2c_client *i2c,
                 /* Still continue with warning, if irq init fails */
                 if (ret)
                         dev_warn(&i2c->dev, "IRQ init failed: %d\n", ret);
-               else
-                       irq_init_success = true;
         }
  
-       ret = mfd_add_devices(rc5t583->dev, -1, rc5t583_subdevs,
-                             ARRAY_SIZE(rc5t583_subdevs), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(rc5t583->dev, -1, rc5t583_subdevs,
+                                  ARRAY_SIZE(rc5t583_subdevs), NULL, 0, NULL);
         if (ret) {
                 dev_err(&i2c->dev, "add mfd devices failed: %d\n", ret);
-               goto err_add_devs;
+               return ret;
         }
  
         return 0;
-
-err_add_devs:
-       if (irq_init_success)
-               rc5t583_irq_exit(rc5t583);
-       return ret;
-}
-
-static int  rc5t583_i2c_remove(struct i2c_client *i2c)
-{
-       struct rc5t583 *rc5t583 = i2c_get_clientdata(i2c);
-
-       mfd_remove_devices(rc5t583->dev);
-       rc5t583_irq_exit(rc5t583);
-       return 0;
  }
  
  static const struct i2c_device_id rc5t583_i2c_id[] = {
@@ -324,7 +307,6 @@ static struct i2c_driver rc5t583_i2c_driver = {
                    .name = "rc5t583",
                    },
         .probe = rc5t583_i2c_probe,
-       .remove = rc5t583_i2c_remove,
         .id_table = rc5t583_i2c_id,
  };
  
diff --git a/drivers/mfd/rdc321x-southbridge.c b/drivers/mfd/rdc321x-southbridge.c

index 6575585f1d1f0eed3933383d1432cbaa40ba8dbe..2bd8c5b6d600a7824b8c9f437fb75b21f0f01b2e 100644 (file)
--- a/drivers/mfd/rdc321x-southbridge.c
+++ b/drivers/mfd/rdc321x-southbridge.c
@@ -85,14 +85,10 @@ static int rdc321x_sb_probe(struct pci_dev *pdev,
         rdc321x_gpio_pdata.sb_pdev = pdev;
         rdc321x_wdt_pdata.sb_pdev = pdev;
  
-       return mfd_add_devices(&pdev->dev, -1,
-                              rdc321x_sb_cells, ARRAY_SIZE(rdc321x_sb_cells),
-                              NULL, 0, NULL);
-}
-
-static void rdc321x_sb_remove(struct pci_dev *pdev)
-{
-       mfd_remove_devices(&pdev->dev);
+       return devm_mfd_add_devices(&pdev->dev, -1,
+                                   rdc321x_sb_cells,
+                                   ARRAY_SIZE(rdc321x_sb_cells),
+                                   NULL, 0, NULL);
  }
  
  static const struct pci_device_id rdc321x_sb_table[] = {
@@ -105,7 +101,6 @@ static struct pci_driver rdc321x_sb_driver = {
         .name           = "RDC321x Southbridge",
         .id_table       = rdc321x_sb_table,
         .probe          = rdc321x_sb_probe,
-       .remove         = rdc321x_sb_remove,
  };
  
  module_pci_driver(rdc321x_sb_driver);
diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c

index 4b1e4399754bf2950c5662d763e3763653137b9d..49d7f624fc94618b160425d5df4f78e246130176 100644 (file)
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -213,9 +213,9 @@ static int rk808_probe(struct i2c_client *client,
         rk808->i2c = client;
         i2c_set_clientdata(client, rk808);
  
-       ret = mfd_add_devices(&client->dev, -1,
-                             rk808s, ARRAY_SIZE(rk808s),
-                             NULL, 0, regmap_irq_get_domain(rk808->irq_data));
+       ret = devm_mfd_add_devices(&client->dev, -1,
+                                  rk808s, ARRAY_SIZE(rk808s), NULL, 0,
+                                  regmap_irq_get_domain(rk808->irq_data));
         if (ret) {
                 dev_err(&client->dev, "failed to add MFD devices %d\n", ret);
                 goto err_irq;
@@ -240,7 +240,6 @@ static int rk808_remove(struct i2c_client *client)
         struct rk808 *rk808 = i2c_get_clientdata(client);
  
         regmap_del_irq_chip(client->irq, rk808->irq_data);
-       mfd_remove_devices(&client->dev);
         pm_power_off = NULL;
  
         return 0;
diff --git a/drivers/mfd/rn5t618.c b/drivers/mfd/rn5t618.c

index 666857192dbeb6db52795b2d462172856aed2e23..0ad51d792feb5460d47eeb4b93065a12d7a9b94c 100644 (file)
--- a/drivers/mfd/rn5t618.c
+++ b/drivers/mfd/rn5t618.c
@@ -78,8 +78,8 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
                 return ret;
         }
  
-       ret = mfd_add_devices(&i2c->dev, -1, rn5t618_cells,
-                             ARRAY_SIZE(rn5t618_cells), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(&i2c->dev, -1, rn5t618_cells,
+                                  ARRAY_SIZE(rn5t618_cells), NULL, 0, NULL);
         if (ret) {
                 dev_err(&i2c->dev, "failed to add sub-devices: %d\n", ret);
                 return ret;
@@ -102,7 +102,6 @@ static int rn5t618_i2c_remove(struct i2c_client *i2c)
                 pm_power_off = NULL;
         }
  
-       mfd_remove_devices(&i2c->dev);
         return 0;
  }
  
diff --git a/drivers/mfd/rt5033.c b/drivers/mfd/rt5033.c

index 2b95485f00574704f4a4eea300b4ada92987df53..9bd089c563753b6cb85a7de43ebbdcedcf86738f 100644 (file)
--- a/drivers/mfd/rt5033.c
+++ b/drivers/mfd/rt5033.c
@@ -97,9 +97,9 @@ static int rt5033_i2c_probe(struct i2c_client *i2c,
                 return ret;
         }
  
-       ret = mfd_add_devices(rt5033->dev, -1, rt5033_devs,
-                       ARRAY_SIZE(rt5033_devs), NULL, 0,
-                       regmap_irq_get_domain(rt5033->irq_data));
+       ret = devm_mfd_add_devices(rt5033->dev, -1, rt5033_devs,
+                                  ARRAY_SIZE(rt5033_devs), NULL, 0,
+                                  regmap_irq_get_domain(rt5033->irq_data));
         if (ret < 0) {
                 dev_err(&i2c->dev, "Failed to add RT5033 child devices.\n");
                 return ret;
@@ -110,13 +110,6 @@ static int rt5033_i2c_probe(struct i2c_client *i2c,
         return 0;
  }
  
-static int rt5033_i2c_remove(struct i2c_client *i2c)
-{
-       mfd_remove_devices(&i2c->dev);
-
-       return 0;
-}
-
  static const struct i2c_device_id rt5033_i2c_id[] = {
         { "rt5033", },
         { }
@@ -135,7 +128,6 @@ static struct i2c_driver rt5033_driver = {
                 .of_match_table = of_match_ptr(rt5033_dt_match),
         },
         .probe = rt5033_i2c_probe,
-       .remove = rt5033_i2c_remove,
         .id_table = rt5033_i2c_id,
  };
  module_i2c_driver(rt5033_driver);
diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c

index 400e1d7d8d08fb94086de003eb1346c1bbba4139..ca6b80d08ffccbdc0736f4901ef09bc833939f19 100644 (file)
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -481,29 +481,16 @@ static int sec_pmic_probe(struct i2c_client *i2c,
                 /* If this happens the probe function is problem */
                 BUG();
         }
-       ret = mfd_add_devices(sec_pmic->dev, -1, sec_devs, num_sec_devs, NULL,
-                             0, NULL);
+       ret = devm_mfd_add_devices(sec_pmic->dev, -1, sec_devs, num_sec_devs,
+                                  NULL, 0, NULL);
         if (ret)
-               goto err_mfd;
+               return ret;
  
         device_init_wakeup(sec_pmic->dev, sec_pmic->wakeup);
         sec_pmic_configure(sec_pmic);
         sec_pmic_dump_rev(sec_pmic);
  
         return ret;
-
-err_mfd:
-       sec_irq_exit(sec_pmic);
-       return ret;
-}
-
-static int sec_pmic_remove(struct i2c_client *i2c)
-{
-       struct sec_pmic_dev *sec_pmic = i2c_get_clientdata(i2c);
-
-       mfd_remove_devices(sec_pmic->dev);
-       sec_irq_exit(sec_pmic);
-       return 0;
  }
  
  static void sec_pmic_shutdown(struct i2c_client *i2c)
@@ -583,7 +570,6 @@ static struct i2c_driver sec_pmic_driver = {
                    .of_match_table = of_match_ptr(sec_dt_match),
         },
         .probe = sec_pmic_probe,
-       .remove = sec_pmic_remove,
         .shutdown = sec_pmic_shutdown,
         .id_table = sec_pmic_id,
  };
diff --git a/drivers/mfd/sec-irq.c b/drivers/mfd/sec-irq.c

index d77de431cc506c02c4bcc9e937a7dfd1361c5e4c..5eb59c233d520faed01b218f262129e643bcb0fd 100644 (file)
--- a/drivers/mfd/sec-irq.c
+++ b/drivers/mfd/sec-irq.c
@@ -483,10 +483,11 @@ int sec_irq_init(struct sec_pmic_dev *sec_pmic)
                 return -EINVAL;
         }
  
-       ret = regmap_add_irq_chip(sec_pmic->regmap_pmic, sec_pmic->irq,
-                         IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
-                         sec_pmic->irq_base, sec_irq_chip,
-                         &sec_pmic->irq_data);
+       ret = devm_regmap_add_irq_chip(sec_pmic->dev, sec_pmic->regmap_pmic,
+                                      sec_pmic->irq,
+                                      IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+                                      sec_pmic->irq_base, sec_irq_chip,
+                                      &sec_pmic->irq_data);
         if (ret != 0) {
                 dev_err(sec_pmic->dev, "Failed to register IRQ chip: %d\n", ret);
                 return ret;
@@ -500,8 +501,3 @@ int sec_irq_init(struct sec_pmic_dev *sec_pmic)
  
         return 0;
  }
-
-void sec_irq_exit(struct sec_pmic_dev *sec_pmic)
-{
-       regmap_del_irq_chip(sec_pmic->irq, sec_pmic->irq_data);
-}
diff --git a/drivers/mfd/sky81452.c b/drivers/mfd/sky81452.c

index b0c9b04156508164902d3dfc6c4f19ae1c594fe0..30a2a677100fbbe3ecf1677c23265237463b9420 100644 (file)
--- a/drivers/mfd/sky81452.c
+++ b/drivers/mfd/sky81452.c
@@ -64,19 +64,14 @@ static int sky81452_probe(struct i2c_client *client,
         cells[1].platform_data = pdata->regulator_init_data;
         cells[1].pdata_size = sizeof(*pdata->regulator_init_data);
  
-       ret = mfd_add_devices(dev, -1, cells, ARRAY_SIZE(cells), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(dev, -1, cells, ARRAY_SIZE(cells),
+                                  NULL, 0, NULL);
         if (ret)
                 dev_err(dev, "failed to add child devices. err=%d\n", ret);
  
         return ret;
  }
  
-static int sky81452_remove(struct i2c_client *client)
-{
-       mfd_remove_devices(&client->dev);
-       return 0;
-}
-
  static const struct i2c_device_id sky81452_ids[] = {
         { "sky81452" },
         { }
@@ -97,7 +92,6 @@ static struct i2c_driver sky81452_driver = {
                 .of_match_table = of_match_ptr(sky81452_of_match),
         },
         .probe = sky81452_probe,
-       .remove = sky81452_remove,
         .id_table = sky81452_ids,
  };
  
diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c

index c646784c5a7d0a5de96165f584a51a9c65f1e6ac..65cd0d2a822a6565ed7aec4bf0a3930bf60812b4 100644 (file)
--- a/drivers/mfd/sm501.c
+++ b/drivers/mfd/sm501.c
@@ -879,11 +879,6 @@ static int sm501_register_display(struct sm501_devdata *sm,
  
  #ifdef CONFIG_MFD_SM501_GPIO
  
-static inline struct sm501_gpio_chip *to_sm501_gpio(struct gpio_chip *gc)
-{
-       return container_of(gc, struct sm501_gpio_chip, gpio);
-}
-
  static inline struct sm501_devdata *sm501_gpio_to_dev(struct sm501_gpio *gpio)
  {
         return container_of(gpio, struct sm501_devdata, gpio);
@@ -892,7 +887,7 @@ static inline struct sm501_devdata *sm501_gpio_to_dev(struct sm501_gpio *gpio)
  static int sm501_gpio_get(struct gpio_chip *chip, unsigned offset)
  
  {
-       struct sm501_gpio_chip *smgpio = to_sm501_gpio(chip);
+       struct sm501_gpio_chip *smgpio = gpiochip_get_data(chip);
         unsigned long result;
  
         result = smc501_readl(smgpio->regbase + SM501_GPIO_DATA_LOW);
@@ -923,7 +918,7 @@ static void sm501_gpio_ensure_gpio(struct sm501_gpio_chip *smchip,
  static void sm501_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
  
  {
-       struct sm501_gpio_chip *smchip = to_sm501_gpio(chip);
+       struct sm501_gpio_chip *smchip = gpiochip_get_data(chip);
         struct sm501_gpio *smgpio = smchip->ourgpio;
         unsigned long bit = 1 << offset;
         void __iomem *regs = smchip->regbase;
@@ -948,7 +943,7 @@ static void sm501_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
  
  static int sm501_gpio_input(struct gpio_chip *chip, unsigned offset)
  {
-       struct sm501_gpio_chip *smchip = to_sm501_gpio(chip);
+       struct sm501_gpio_chip *smchip = gpiochip_get_data(chip);
         struct sm501_gpio *smgpio = smchip->ourgpio;
         void __iomem *regs = smchip->regbase;
         unsigned long bit = 1 << offset;
@@ -974,7 +969,7 @@ static int sm501_gpio_input(struct gpio_chip *chip, unsigned offset)
  static int sm501_gpio_output(struct gpio_chip *chip,
                              unsigned offset, int value)
  {
-       struct sm501_gpio_chip *smchip = to_sm501_gpio(chip);
+       struct sm501_gpio_chip *smchip = gpiochip_get_data(chip);
         struct sm501_gpio *smgpio = smchip->ourgpio;
         unsigned long bit = 1 << offset;
         void __iomem *regs = smchip->regbase;
@@ -1039,7 +1034,7 @@ static int sm501_gpio_register_chip(struct sm501_devdata *sm,
         gchip->base   = base;
         chip->ourgpio = gpio;
  
-       return gpiochip_add(gchip);
+       return gpiochip_add_data(gchip, chip);
  }
  
  static int sm501_register_gpio(struct sm501_devdata *sm)
diff --git a/drivers/mfd/smsc-ece1099.c b/drivers/mfd/smsc-ece1099.c

index a4c0df71c8b30a40b1dd97877ab1a2cd8d536cc0..7f89e89b8a5ee1f2b03e8e7e0fc528b731bea53a 100644 (file)
--- a/drivers/mfd/smsc-ece1099.c
+++ b/drivers/mfd/smsc-ece1099.c
@@ -80,15 +80,6 @@ err:
         return ret;
  }
  
-static int smsc_i2c_remove(struct i2c_client *i2c)
-{
-       struct smsc *smsc = i2c_get_clientdata(i2c);
-
-       mfd_remove_devices(smsc->dev);
-
-       return 0;
-}
-
  static const struct i2c_device_id smsc_i2c_id[] = {
         { "smscece1099", 0},
         {},
@@ -100,7 +91,6 @@ static struct i2c_driver smsc_i2c_driver = {
                    .name = "smsc",
         },
         .probe = smsc_i2c_probe,
-       .remove = smsc_i2c_remove,
         .id_table = smsc_i2c_id,
  };
  
diff --git a/drivers/mfd/stw481x.c b/drivers/mfd/stw481x.c

index ca613df36143888f2c0848d27e9dd31404087e11..ab949eaca6ad82edee77c443efb220c6a81b8a23 100644 (file)
--- a/drivers/mfd/stw481x.c
+++ b/drivers/mfd/stw481x.c
@@ -206,8 +206,8 @@ static int stw481x_probe(struct i2c_client *client,
                 stw481x_cells[i].pdata_size = sizeof(*stw481x);
         }
  
-       ret = mfd_add_devices(&client->dev, 0, stw481x_cells,
-                       ARRAY_SIZE(stw481x_cells), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(&client->dev, 0, stw481x_cells,
+                                  ARRAY_SIZE(stw481x_cells), NULL, 0, NULL);
         if (ret)
                 return ret;
  
@@ -216,12 +216,6 @@ static int stw481x_probe(struct i2c_client *client,
         return ret;
  }
  
-static int stw481x_remove(struct i2c_client *client)
-{
-       mfd_remove_devices(&client->dev);
-       return 0;
-}
-
  /*
   * This ID table is completely unused, as this is a pure
   * device-tree probed driver, but it has to be here due to
@@ -246,7 +240,6 @@ static struct i2c_driver stw481x_driver = {
                 .of_match_table = stw481x_match,
         },
         .probe          = stw481x_probe,
-       .remove         = stw481x_remove,
         .id_table       = stw481x_id,
  };
  
diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c

index 1ecbfa40d1b32b28ba7f2672f858b9b38e512ca4..d42d322ac7ca7ebc99c626ce09b9bb158ebb50a0 100644 (file)
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -24,7 +24,7 @@
  #include <linux/mfd/core.h>
  #include <linux/mfd/tmio.h>
  #include <linux/mfd/tc6393xb.h>
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
  #include <linux/slab.h>
  
  #define SCR_REVID      0x08            /* b Revision ID        */
@@ -434,7 +434,7 @@ static struct mfd_cell tc6393xb_cells[] = {
  static int tc6393xb_gpio_get(struct gpio_chip *chip,
                 unsigned offset)
  {
-       struct tc6393xb *tc6393xb = container_of(chip, struct tc6393xb, gpio);
+       struct tc6393xb *tc6393xb = gpiochip_get_data(chip);
  
         /* XXX: does dsr also represent inputs? */
         return !!(tmio_ioread8(tc6393xb->scr + SCR_GPO_DSR(offset / 8))
@@ -444,7 +444,7 @@ static int tc6393xb_gpio_get(struct gpio_chip *chip,
  static void __tc6393xb_gpio_set(struct gpio_chip *chip,
                 unsigned offset, int value)
  {
-       struct tc6393xb *tc6393xb = container_of(chip, struct tc6393xb, gpio);
+       struct tc6393xb *tc6393xb = gpiochip_get_data(chip);
         u8  dsr;
  
         dsr = tmio_ioread8(tc6393xb->scr + SCR_GPO_DSR(offset / 8));
@@ -459,7 +459,7 @@ static void __tc6393xb_gpio_set(struct gpio_chip *chip,
  static void tc6393xb_gpio_set(struct gpio_chip *chip,
                 unsigned offset, int value)
  {
-       struct tc6393xb *tc6393xb = container_of(chip, struct tc6393xb, gpio);
+       struct tc6393xb *tc6393xb = gpiochip_get_data(chip);
         unsigned long flags;
  
         spin_lock_irqsave(&tc6393xb->lock, flags);
@@ -472,7 +472,7 @@ static void tc6393xb_gpio_set(struct gpio_chip *chip,
  static int tc6393xb_gpio_direction_input(struct gpio_chip *chip,
                         unsigned offset)
  {
-       struct tc6393xb *tc6393xb = container_of(chip, struct tc6393xb, gpio);
+       struct tc6393xb *tc6393xb = gpiochip_get_data(chip);
         unsigned long flags;
         u8 doecr;
  
@@ -490,7 +490,7 @@ static int tc6393xb_gpio_direction_input(struct gpio_chip *chip,
  static int tc6393xb_gpio_direction_output(struct gpio_chip *chip,
                         unsigned offset, int value)
  {
-       struct tc6393xb *tc6393xb = container_of(chip, struct tc6393xb, gpio);
+       struct tc6393xb *tc6393xb = gpiochip_get_data(chip);
         unsigned long flags;
         u8 doecr;
  
@@ -517,7 +517,7 @@ static int tc6393xb_register_gpio(struct tc6393xb *tc6393xb, int gpio_base)
         tc6393xb->gpio.direction_input = tc6393xb_gpio_direction_input;
         tc6393xb->gpio.direction_output = tc6393xb_gpio_direction_output;
  
-       return gpiochip_add(&tc6393xb->gpio);
+       return gpiochip_add_data(&tc6393xb->gpio, tc6393xb);
  }
  
  /*--------------------------------------------------------------------------*/
diff --git a/drivers/mfd/tps6105x.c b/drivers/mfd/tps6105x.c

index 51c54951c2206b98cd74daf638e3ddee7f978649..baa12ea666fb2c950b425dafb801b531f62d9b03 100644 (file)
--- a/drivers/mfd/tps6105x.c
+++ b/drivers/mfd/tps6105x.c
@@ -21,7 +21,6 @@
  #include <linux/spinlock.h>
  #include <linux/slab.h>
  #include <linux/err.h>
-#include <linux/regulator/driver.h>
  #include <linux/mfd/core.h>
  #include <linux/mfd/tps6105x.h>
  
diff --git a/drivers/mfd/tps65010.c b/drivers/mfd/tps65010.c

index 495e4518fc2992ff1a757858611f7cf1f3c27b55..d829a6131f09e5de61ecc02b8727fd225cea5232 100644 (file)
--- a/drivers/mfd/tps65010.c
+++ b/drivers/mfd/tps65010.c
@@ -34,7 +34,7 @@
  
  #include <linux/i2c/tps65010.h>
  
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
  
  
  /*-------------------------------------------------------------------------*/
@@ -477,7 +477,7 @@ tps65010_output(struct gpio_chip *chip, unsigned offset, int value)
         if (offset < 4) {
                 struct tps65010         *tps;
  
-               tps = container_of(chip, struct tps65010, chip);
+               tps = gpiochip_get_data(chip);
                 if (!(tps->outmask & (1 << offset)))
                         return -EINVAL;
                 tps65010_set_gpio_out_value(offset + 1, value);
@@ -494,7 +494,7 @@ static int tps65010_gpio_get(struct gpio_chip *chip, unsigned offset)
         int                     value;
         struct tps65010         *tps;
  
-       tps = container_of(chip, struct tps65010, chip);
+       tps = gpiochip_get_data(chip);
  
         if (offset < 4) {
                 value = i2c_smbus_read_byte_data(tps->client, TPS_DEFGPIO);
@@ -651,7 +651,7 @@ static int tps65010_probe(struct i2c_client *client,
                 tps->chip.ngpio = 7;
                 tps->chip.can_sleep = 1;
  
-               status = gpiochip_add(&tps->chip);
+               status = gpiochip_add_data(&tps->chip, tps);
                 if (status < 0)
                         dev_err(&client->dev, "can't add gpiochip, err %d\n",
                                         status);
diff --git a/drivers/mfd/tps6507x.c b/drivers/mfd/tps6507x.c

index 1ab3dd6c8adf97e721ab469a9249d168ef1c9893..40beb2f4350c596decfe06ec477805cdc004554c 100644 (file)
--- a/drivers/mfd/tps6507x.c
+++ b/drivers/mfd/tps6507x.c
@@ -100,16 +100,8 @@ static int tps6507x_i2c_probe(struct i2c_client *i2c,
         tps6507x->read_dev = tps6507x_i2c_read_device;
         tps6507x->write_dev = tps6507x_i2c_write_device;
  
-       return mfd_add_devices(tps6507x->dev, -1, tps6507x_devs,
-                              ARRAY_SIZE(tps6507x_devs), NULL, 0, NULL);
-}
-
-static int tps6507x_i2c_remove(struct i2c_client *i2c)
-{
-       struct tps6507x_dev *tps6507x = i2c_get_clientdata(i2c);
-
-       mfd_remove_devices(tps6507x->dev);
-       return 0;
+       return devm_mfd_add_devices(tps6507x->dev, -1, tps6507x_devs,
+                                   ARRAY_SIZE(tps6507x_devs), NULL, 0, NULL);
  }
  
  static const struct i2c_device_id tps6507x_i2c_id[] = {
@@ -132,7 +124,6 @@ static struct i2c_driver tps6507x_i2c_driver = {
                    .of_match_table = of_match_ptr(tps6507x_of_match),
         },
         .probe = tps6507x_i2c_probe,
-       .remove = tps6507x_i2c_remove,
         .id_table = tps6507x_i2c_id,
  };
  
diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c

index d32b54426b70a3d8baf47cc9a5e47f609b5c19d1..049a6fcac6511fc579581367a8fa25e4792fa3a5 100644 (file)
--- a/drivers/mfd/tps65217.c
+++ b/drivers/mfd/tps65217.c
@@ -205,8 +205,8 @@ static int tps65217_probe(struct i2c_client *client,
                 return ret;
         }
  
-       ret = mfd_add_devices(tps->dev, -1, tps65217s,
-                             ARRAY_SIZE(tps65217s), NULL, 0, NULL);
+       ret = devm_mfd_add_devices(tps->dev, -1, tps65217s,
+                                  ARRAY_SIZE(tps65217s), NULL, 0, NULL);
         if (ret < 0) {
                 dev_err(tps->dev, "mfd_add_devices failed: %d\n", ret);
                 return ret;
@@ -235,15 +235,6 @@ static int tps65217_probe(struct i2c_client *client,
         return 0;
  }
  
-static int tps65217_remove(struct i2c_client *client)
-{
-       struct tps65217 *tps = i2c_get_clientdata(client);
-
-       mfd_remove_devices(tps->dev);
-
-       return 0;
-}
-
  static const struct i2c_device_id tps65217_id_table[] = {
         {"tps65217", TPS65217},
         { /* sentinel */ }
@@ -257,7 +248,6 @@ static struct i2c_driver tps65217_driver = {
         },
         .id_table       = tps65217_id_table,
         .probe          = tps65217_probe,
-       .remove         = tps65217_remove,
  };
  
  static int __init tps65217_init(void)
diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c

index f7ab115483a9d3357122a70ee562a345885c57ed..11cab1582f2f223316ec4ec5337dce863908b609 100644 (file)
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -252,9 +252,10 @@ static int tps65910_irq_init(struct tps65910 *tps65910, int irq,
         }
  
         tps65910->chip_irq = irq;
-       ret = regmap_add_irq_chip(tps65910->regmap, tps65910->chip_irq,
-               IRQF_ONESHOT, pdata->irq_base,
-               tps6591x_irqs_chip, &tps65910->irq_data);
+       ret = devm_regmap_add_irq_chip(tps65910->dev, tps65910->regmap,
+                                      tps65910->chip_irq,
+                                      IRQF_ONESHOT, pdata->irq_base,
+                                      tps6591x_irqs_chip, &tps65910->irq_data);
         if (ret < 0) {
                 dev_warn(tps65910->dev, "Failed to add irq_chip %d\n", ret);
                 tps65910->chip_irq = 0;
@@ -262,13 +263,6 @@ static int tps65910_irq_init(struct tps65910 *tps65910, int irq,
         return ret;
  }
  
-static int tps65910_irq_exit(struct tps65910 *tps65910)
-{
-       if (tps65910->chip_irq > 0)
-               regmap_del_irq_chip(tps65910->chip_irq, tps65910->irq_data);
-       return 0;
-}
-
  static bool is_volatile_reg(struct device *dev, unsigned int reg)
  {
         struct tps65910 *tps65910 = dev_get_drvdata(dev);
@@ -510,29 +504,18 @@ static int tps65910_i2c_probe(struct i2c_client *i2c,
                 pm_power_off = tps65910_power_off;
         }
  
-       ret = mfd_add_devices(tps65910->dev, -1,
-                             tps65910s, ARRAY_SIZE(tps65910s),
-                             NULL, 0,
-                             regmap_irq_get_domain(tps65910->irq_data));
+       ret = devm_mfd_add_devices(tps65910->dev, -1,
+                                  tps65910s, ARRAY_SIZE(tps65910s),
+                                  NULL, 0,
+                                  regmap_irq_get_domain(tps65910->irq_data));
         if (ret < 0) {
                 dev_err(&i2c->dev, "mfd_add_devices failed: %d\n", ret);
-               tps65910_irq_exit(tps65910);
                 return ret;
         }
  
         return ret;
  }
  
-static int tps65910_i2c_remove(struct i2c_client *i2c)
-{
-       struct tps65910 *tps65910 = i2c_get_clientdata(i2c);
-
-       tps65910_irq_exit(tps65910);
-       mfd_remove_devices(tps65910->dev);
-
-       return 0;
-}
-
  static const struct i2c_device_id tps65910_i2c_id[] = {
         { "tps65910", TPS65910 },
         { "tps65911", TPS65911 },
@@ -547,7 +530,6 @@ static struct i2c_driver tps65910_i2c_driver = {
                    .of_match_table = of_match_ptr(tps65910_of_match),
         },
         .probe = tps65910_i2c_probe,
-       .remove = tps65910_i2c_remove,
         .id_table = tps65910_i2c_id,
  };
  
diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c

index 04b539850e72a00ef3468be973759e3eef09a6af..1beb722f608080c8fd232c8fc6b21429fa2d8f6d 100644 (file)
--- a/drivers/mfd/twl4030-power.c
+++ b/drivers/mfd/twl4030-power.c
@@ -1,5 +1,4 @@
  /*
- * linux/drivers/i2c/chips/twl4030-power.c
   *
   * Handle TWL4030 Power initialization
   *
diff --git a/drivers/mfd/twl6040.c b/drivers/mfd/twl6040.c

index 08a693cd38cc4e2b17a7538be0e55ed743b6c002..852d5874aabb764ae7948d9cbfd4abf5123d413c 100644 (file)
--- a/drivers/mfd/twl6040.c
+++ b/drivers/mfd/twl6040.c
@@ -291,7 +291,11 @@ int twl6040_power(struct twl6040 *twl6040, int on)
                 if (twl6040->power_count++)
                         goto out;
  
-               clk_prepare_enable(twl6040->clk32k);
+               ret = clk_prepare_enable(twl6040->clk32k);
+               if (ret) {
+                       twl6040->power_count = 0;
+                       goto out;
+               }
  
                 /* Allow writes to the chip */
                 regcache_cache_only(twl6040->regmap, false);
@@ -300,6 +304,7 @@ int twl6040_power(struct twl6040 *twl6040, int on)
                         /* use automatic power-up sequence */
                         ret = twl6040_power_up_automatic(twl6040);
                         if (ret) {
+                               clk_disable_unprepare(twl6040->clk32k);
                                 twl6040->power_count = 0;
                                 goto out;
                         }
@@ -307,6 +312,7 @@ int twl6040_power(struct twl6040 *twl6040, int on)
                         /* use manual power-up sequence */
                         ret = twl6040_power_up_manual(twl6040);
                         if (ret) {
+                               clk_disable_unprepare(twl6040->clk32k);
                                 twl6040->power_count = 0;
                                 goto out;
                         }
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c

index bcafe1ecd71cd4c4fc9492a2cd9db64068f28ff3..9ab9ec47ea755d63a2bcd10a8113c4d464b27b59 100644 (file)
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -28,7 +28,7 @@
  #include <linux/mutex.h>
  #include <linux/mfd/ucb1x00.h>
  #include <linux/pm.h>
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
  
  static DEFINE_MUTEX(ucb1x00_mutex);
  static LIST_HEAD(ucb1x00_drivers);
@@ -109,7 +109,7 @@ unsigned int ucb1x00_io_read(struct ucb1x00 *ucb)
  
  static void ucb1x00_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
  {
-       struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+       struct ucb1x00 *ucb = gpiochip_get_data(chip);
         unsigned long flags;
  
         spin_lock_irqsave(&ucb->io_lock, flags);
@@ -126,7 +126,7 @@ static void ucb1x00_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
  
  static int ucb1x00_gpio_get(struct gpio_chip *chip, unsigned offset)
  {
-       struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+       struct ucb1x00 *ucb = gpiochip_get_data(chip);
         unsigned val;
  
         ucb1x00_enable(ucb);
@@ -138,7 +138,7 @@ static int ucb1x00_gpio_get(struct gpio_chip *chip, unsigned offset)
  
  static int ucb1x00_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
  {
-       struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+       struct ucb1x00 *ucb = gpiochip_get_data(chip);
         unsigned long flags;
  
         spin_lock_irqsave(&ucb->io_lock, flags);
@@ -154,7 +154,7 @@ static int ucb1x00_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
  static int ucb1x00_gpio_direction_output(struct gpio_chip *chip, unsigned offset
                 , int value)
  {
-       struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+       struct ucb1x00 *ucb = gpiochip_get_data(chip);
         unsigned long flags;
         unsigned old, mask = 1 << offset;
  
@@ -181,7 +181,7 @@ static int ucb1x00_gpio_direction_output(struct gpio_chip *chip, unsigned offset
  
  static int ucb1x00_to_irq(struct gpio_chip *chip, unsigned offset)
  {
-       struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+       struct ucb1x00 *ucb = gpiochip_get_data(chip);
  
         return ucb->irq_base > 0 ? ucb->irq_base + offset : -ENXIO;
  }
@@ -579,7 +579,7 @@ static int ucb1x00_probe(struct mcp *mcp)
                 ucb->gpio.direction_input = ucb1x00_gpio_direction_input;
                 ucb->gpio.direction_output = ucb1x00_gpio_direction_output;
                 ucb->gpio.to_irq = ucb1x00_to_irq;
-               ret = gpiochip_add(&ucb->gpio);
+               ret = gpiochip_add_data(&ucb->gpio, ucb);
                 if (ret)
                         goto err_gpio_add;
         } else
diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c

index 855c0204f09ae50b4986c13fb54bbbb601b6bc0f..201a3ea2a9d3045eaabf961c9db15d44aa340c8e 100644 (file)
--- a/drivers/mfd/vexpress-sysreg.c
+++ b/drivers/mfd/vexpress-sysreg.c
@@ -202,7 +202,7 @@ static int vexpress_sysreg_probe(struct platform_device *pdev)
         bgpio_init(mmc_gpio_chip, &pdev->dev, 0x4, base + SYS_MCI,
                         NULL, NULL, NULL, NULL, 0);
         mmc_gpio_chip->ngpio = 2;
-       gpiochip_add(mmc_gpio_chip);
+       gpiochip_add_data(mmc_gpio_chip, NULL);
  
         return mfd_add_devices(&pdev->dev, PLATFORM_DEVID_AUTO,
                         vexpress_sysreg_cells,
diff --git a/drivers/mfd/wl1273-core.c b/drivers/mfd/wl1273-core.c

index f7c52d901040cc5b14f00ebb86a8678ee7918e6d..708046592b3313ff557491551e84192acdf829ab 100644 (file)
--- a/drivers/mfd/wl1273-core.c
+++ b/drivers/mfd/wl1273-core.c
@@ -170,15 +170,6 @@ static int wl1273_fm_set_volume(struct wl1273_core *core, unsigned int volume)
         return 0;
  }
  
-static int wl1273_core_remove(struct i2c_client *client)
-{
-       dev_dbg(&client->dev, "%s\n", __func__);
-
-       mfd_remove_devices(&client->dev);
-
-       return 0;
-}
-
  static int wl1273_core_probe(struct i2c_client *client,
                                        const struct i2c_device_id *id)
  {
@@ -237,8 +228,8 @@ static int wl1273_core_probe(struct i2c_client *client,
         dev_dbg(&client->dev, "%s: number of children: %d.\n",
                 __func__, children);
  
-       r = mfd_add_devices(&client->dev, -1, core->cells,
-                           children, NULL, 0, NULL);
+       r = devm_mfd_add_devices(&client->dev, -1, core->cells,
+                                children, NULL, 0, NULL);
         if (r)
                 goto err;
  
@@ -258,7 +249,6 @@ static struct i2c_driver wl1273_core_driver = {
         },
         .probe = wl1273_core_probe,
         .id_table = wl1273_driver_id_table,
-       .remove = wl1273_core_remove,
  };
  
  static int __init wl1273_core_init(void)
diff --git a/drivers/mfd/wm5110-tables.c b/drivers/mfd/wm5110-tables.c

index 8e74e71507e728df5a91144969578dcec8d24f3b..1ee68bd440fbc279874ceaf7ccefc37c692e9f5c 100644 (file)
--- a/drivers/mfd/wm5110-tables.c
+++ b/drivers/mfd/wm5110-tables.c
@@ -3066,6 +3066,7 @@ static bool wm5110_volatile_register(struct device *dev, unsigned int reg)
         case ARIZONA_AOD_IRQ_RAW_STATUS:
         case ARIZONA_FX_CTRL2:
         case ARIZONA_ASRC_STATUS:
+       case ARIZONA_CLOCK_CONTROL:
         case ARIZONA_DSP_STATUS:
         case ARIZONA_DSP1_STATUS_1:
         case ARIZONA_DSP1_STATUS_2:
diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c

index 3bd44a45c3789036cb202e89f5c661ccf0923896..8a98a2fc74e11fa1204677ffffa9b57ca206b92e 100644 (file)
--- a/drivers/mfd/wm8400-core.c
+++ b/drivers/mfd/wm8400-core.c
@@ -35,27 +35,6 @@ static bool wm8400_volatile(struct device *dev, unsigned int reg)
         }
  }
  
-/**
- * wm8400_reg_read - Single register read
- *
- * @wm8400: Pointer to wm8400 control structure
- * @reg:    Register to read
- *
- * @return  Read value
- */
-u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg)
-{
-       unsigned int val;
-       int ret;
-
-       ret = regmap_read(wm8400->regmap, reg, &val);
-       if (ret < 0)
-               return ret;
-
-       return val;
-}
-EXPORT_SYMBOL_GPL(wm8400_reg_read);
-
  int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data)
  {
         return regmap_bulk_read(wm8400->regmap, reg, data, count);
@@ -70,7 +49,7 @@ static int wm8400_register_codec(struct wm8400 *wm8400)
                 .pdata_size = sizeof(*wm8400),
         };
  
-       return mfd_add_devices(wm8400->dev, -1, &cell, 1, NULL, 0, NULL);
+       return devm_mfd_add_devices(wm8400->dev, -1, &cell, 1, NULL, 0, NULL);
  }
  
  /*
@@ -111,7 +90,7 @@ static int wm8400_init(struct wm8400 *wm8400,
         ret = wm8400_register_codec(wm8400);
         if (ret != 0) {
                 dev_err(wm8400->dev, "Failed to register codec\n");
-               goto err_children;
+               return ret;
         }
  
         if (pdata && pdata->platform_init) {
@@ -119,21 +98,12 @@ static int wm8400_init(struct wm8400 *wm8400,
                 if (ret != 0) {
                         dev_err(wm8400->dev, "Platform init failed: %d\n",
                                 ret);
-                       goto err_children;
+                       return ret;
                 }
         } else
                 dev_warn(wm8400->dev, "No platform initialisation supplied\n");
  
         return 0;
-
-err_children:
-       mfd_remove_devices(wm8400->dev);
-       return ret;
-}
-
-static void wm8400_release(struct wm8400 *wm8400)
-{
-       mfd_remove_devices(wm8400->dev);
  }
  
  static const struct regmap_config wm8400_regmap_config = {
@@ -156,7 +126,7 @@ void wm8400_reset_codec_reg_cache(struct wm8400 *wm8400)
  }
  EXPORT_SYMBOL_GPL(wm8400_reset_codec_reg_cache);
  
-#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
+#if IS_ENABLED(CONFIG_I2C)
  static int wm8400_i2c_probe(struct i2c_client *i2c,
                             const struct i2c_device_id *id)
  {
@@ -176,15 +146,6 @@ static int wm8400_i2c_probe(struct i2c_client *i2c,
         return wm8400_init(wm8400, dev_get_platdata(&i2c->dev));
  }
  
-static int wm8400_i2c_remove(struct i2c_client *i2c)
-{
-       struct wm8400 *wm8400 = i2c_get_clientdata(i2c);
-
-       wm8400_release(wm8400);
-
-       return 0;
-}
-
  static const struct i2c_device_id wm8400_i2c_id[] = {
         { "wm8400", 0 },
         { }
@@ -196,7 +157,6 @@ static struct i2c_driver wm8400_i2c_driver = {
                 .name = "WM8400",
         },
         .probe    = wm8400_i2c_probe,
-       .remove   = wm8400_i2c_remove,
         .id_table = wm8400_i2c_id,
  };
  #endif
@@ -205,7 +165,7 @@ static int __init wm8400_module_init(void)
  {
         int ret = -ENODEV;
  
-#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
+#if IS_ENABLED(CONFIG_I2C)
         ret = i2c_add_driver(&wm8400_i2c_driver);
         if (ret != 0)
                 pr_err("Failed to register I2C driver: %d\n", ret);
@@ -217,7 +177,7 @@ subsys_initcall(wm8400_module_init);
  
  static void __exit wm8400_module_exit(void)
  {
-#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
+#if IS_ENABLED(CONFIG_I2C)
         i2c_del_driver(&wm8400_i2c_driver);
  #endif
  }
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c

index 2107c948406d359e6a305ebcc4ba495d0f8f51d8..6d228ccd884d14d1eebb46ba5a02c3830ea787bd 100644 (file)
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -68,15 +68,6 @@ struct cxl_context *cxl_get_context(struct pci_dev *dev)
  }
  EXPORT_SYMBOL_GPL(cxl_get_context);
  
-struct device *cxl_get_phys_dev(struct pci_dev *dev)
-{
-       struct cxl_afu *afu;
-
-       afu = cxl_pci_to_afu(dev);
-
-       return afu->adapter->dev.parent;
-}
-
  int cxl_release_context(struct cxl_context *ctx)
  {
         if (ctx->status >= STARTED)
@@ -192,6 +183,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
                 ctx->pid = get_task_pid(task, PIDTYPE_PID);
                 ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
                 kernel = false;
+               ctx->real_mode = false;
         }
  
         cxl_ctx_get();
@@ -228,6 +220,24 @@ void cxl_set_master(struct cxl_context *ctx)
  }
  EXPORT_SYMBOL_GPL(cxl_set_master);
  
+int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode)
+{
+       if (ctx->status == STARTED) {
+               /*
+                * We could potentially update the PE and issue an update LLCMD
+                * to support this, but it doesn't seem to have a good use case
+                * since it's trivial to just create a second kernel context
+                * with different translation modes, so until someone convinces
+                * me otherwise:
+                */
+               return -EBUSY;
+       }
+
+       ctx->real_mode = real_mode;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_set_translation_mode);
+
  /* wrappers around afu_* file ops which are EXPORTED */
  int cxl_fd_open(struct inode *inode, struct file *file)
  {
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c

index 7edea9c19199a8a4714b2ea67eb8c277019b78a4..26d206b1d08c8f62fec387db108fa19ce4926c71 100644 (file)
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -297,8 +297,7 @@ static void reclaim_ctx(struct rcu_head *rcu)
         if (ctx->kernelapi)
                 kfree(ctx->mapping);
  
-       if (ctx->irq_bitmap)
-               kfree(ctx->irq_bitmap);
+       kfree(ctx->irq_bitmap);
  
         /* Drop ref to the afu device taken during cxl_context_init */
         cxl_afu_put(ctx->afu);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h

index 73dc2a33da7434d6552563407476a9d83f326942..4fe50788ff45a1f12e28f26ae405fbabc99f9886 100644 (file)
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -178,15 +178,6 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
  #define CXL_PSL_SR_An_MP  (1ull << (63-62)) /* Master Process */
  #define CXL_PSL_SR_An_LE  (1ull << (63-63)) /* Little Endian */
  
-/****** CXL_PSL_LLCMD_An ****************************************************/
-#define CXL_LLCMD_TERMINATE   0x0001000000000000ULL
-#define CXL_LLCMD_REMOVE      0x0002000000000000ULL
-#define CXL_LLCMD_SUSPEND     0x0003000000000000ULL
-#define CXL_LLCMD_RESUME      0x0004000000000000ULL
-#define CXL_LLCMD_ADD         0x0005000000000000ULL
-#define CXL_LLCMD_UPDATE      0x0006000000000000ULL
-#define CXL_LLCMD_HANDLE_MASK 0x000000000000ffffULL
-
  /****** CXL_PSL_ID_An ****************************************************/
  #define CXL_PSL_ID_An_F        (1ull << (63-31))
  #define CXL_PSL_ID_An_L        (1ull << (63-30))
@@ -376,11 +367,13 @@ struct cxl_afu_native {
  };
  
  struct cxl_afu_guest {
+       struct cxl_afu *parent;
         u64 handle;
         phys_addr_t p2n_phys;
         u64 p2n_size;
         int max_ints;
-       struct mutex recovery_lock;
+       bool handle_err;
+       struct delayed_work work_err;
         int previous_state;
  };
  
@@ -524,6 +517,7 @@ struct cxl_context {
         bool pe_inserted;
         bool master;
         bool kernel;
+       bool real_mode;
         bool pending_irq;
         bool pending_fault;
         bool pending_afu_err;
@@ -580,6 +574,7 @@ struct cxl {
         bool perst_loads_image;
         bool perst_select_user;
         bool perst_same_image;
+       bool psl_timebase_synced;
  };
  
  int cxl_pci_alloc_one_irq(struct cxl *adapter);
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c

index 9a8650bcb042d4827eedc46d11edf7e9ddb4684e..377e650a2a1dc3464fa0eb3f6fdf99493a15f32c 100644 (file)
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -149,11 +149,13 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
          * update_mmu_cache() will not have loaded the hash since current->trap
          * is not a 0x400 or 0x300, so just call hash_page_mm() here.
          */
-       access = _PAGE_PRESENT;
+       access = _PAGE_PRESENT | _PAGE_READ;
         if (dsisr & CXL_PSL_DSISR_An_S)
-               access |= _PAGE_RW;
-       if ((!ctx->kernel) || ~(dar & (1ULL << 63)))
-               access |= _PAGE_USER;
+               access |= _PAGE_WRITE;
+
+       access |= _PAGE_PRIVILEGED;
+       if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
+               access &= ~_PAGE_PRIVILEGED;
  
         if (dsisr & DSISR_NOHPTE)
                 inv_flags |= HPTE_NOHPTE_UPDATE;
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c

index 8213372de2b7217e73d1b884bfa1f07a78d7e83a..bc8d0b9870eb42b5bf18ec8577c32769fbee8c2d 100644 (file)
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -178,6 +178,9 @@ static int afu_read_error_state(struct cxl_afu *afu, int *state_out)
         u64 state;
         int rc = 0;
  
+       if (!afu)
+               return -EIO;
+
         rc = cxl_h_read_error_state(afu->guest->handle, &state);
         if (!rc) {
                 WARN_ON(state != H_STATE_NORMAL &&
@@ -552,6 +555,17 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
  
         elem->common.sstp0  = cpu_to_be64(ctx->sstp0);
         elem->common.sstp1  = cpu_to_be64(ctx->sstp1);
+
+       /*
+        * Ensure we have at least one interrupt allocated to take faults for
+        * kernel contexts that may not have allocated any AFU IRQs at all:
+        */
+       if (ctx->irqs.range[0] == 0) {
+               rc = afu_register_irqs(ctx, 0);
+               if (rc)
+                       goto out_free;
+       }
+
         for (r = 0; r < CXL_IRQ_RANGES; r++) {
                 for (i = 0; i < ctx->irqs.range[r]; i++) {
                         if (r == 0 && i == 0) {
@@ -597,6 +611,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
                 enable_afu_irqs(ctx);
         }
  
+out_free:
         free_page((u64)elem);
         return rc;
  }
@@ -605,6 +620,9 @@ static int guest_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u
  {
         pr_devel("in %s\n", __func__);
  
+       if (ctx->real_mode)
+               return -EPERM;
+
         ctx->kernel = kernel;
         if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
                 return attach_afu_directed(ctx, wed, amr);
@@ -818,7 +836,6 @@ static int afu_update_state(struct cxl_afu *afu)
         switch (cur_state) {
         case H_STATE_NORMAL:
                 afu->guest->previous_state = cur_state;
-               rc = 1;
                 break;
  
         case H_STATE_DISABLE:
@@ -834,7 +851,6 @@ static int afu_update_state(struct cxl_afu *afu)
                         pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
                                         pci_channel_io_normal);
                         pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
-                       rc = 1;
                 }
                 afu->guest->previous_state = 0;
                 break;
@@ -859,39 +875,30 @@ static int afu_update_state(struct cxl_afu *afu)
         return rc;
  }
  
-static int afu_do_recovery(struct cxl_afu *afu)
+static void afu_handle_errstate(struct work_struct *work)
  {
-       int rc;
+       struct cxl_afu_guest *afu_guest =
+               container_of(to_delayed_work(work), struct cxl_afu_guest, work_err);
  
-       /* many threads can arrive here, in case of detach_all for example.
-        * Only one needs to drive the recovery
-        */
-       if (mutex_trylock(&afu->guest->recovery_lock)) {
-               rc = afu_update_state(afu);
-               mutex_unlock(&afu->guest->recovery_lock);
-               return rc;
-       }
-       return 0;
+       if (!afu_update_state(afu_guest->parent) &&
+           afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE)
+               return;
+
+       if (afu_guest->handle_err == true)
+               schedule_delayed_work(&afu_guest->work_err,
+                                     msecs_to_jiffies(3000));
  }
  
  static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu)
  {
         int state;
  
-       if (afu) {
-               if (afu_read_error_state(afu, &state) ||
-                       state != H_STATE_NORMAL) {
-                       if (afu_do_recovery(afu) > 0) {
-                               /* check again in case we've just fixed it */
-                               if (!afu_read_error_state(afu, &state) &&
-                                       state == H_STATE_NORMAL)
-                                       return true;
-                       }
-                       return false;
-               }
+       if (afu && (!afu_read_error_state(afu, &state))) {
+               if (state == H_STATE_NORMAL)
+                       return true;
         }
  
-       return true;
+       return false;
  }
  
  static int afu_properties_look_ok(struct cxl_afu *afu)
@@ -929,8 +936,6 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n
                 return -ENOMEM;
         }
  
-       mutex_init(&afu->guest->recovery_lock);
-
         if ((rc = dev_set_name(&afu->dev, "afu%i.%i",
                                           adapter->adapter_num,
                                           slice)))
@@ -986,6 +991,15 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n
  
         afu->enabled = true;
  
+       /*
+        * wake up the cpu periodically to check the state
+        * of the AFU using "afu" stored in the guest structure.
+        */
+       afu->guest->parent = afu;
+       afu->guest->handle_err = true;
+       INIT_DELAYED_WORK(&afu->guest->work_err, afu_handle_errstate);
+       schedule_delayed_work(&afu->guest->work_err, msecs_to_jiffies(1000));
+
         if ((rc = cxl_pci_vphb_add(afu)))
                 dev_info(&afu->dev, "Can't register vPHB\n");
  
@@ -1014,6 +1028,10 @@ void cxl_guest_remove_afu(struct cxl_afu *afu)
         if (!afu)
                 return;
  
+       /* flush and stop pending job */
+       afu->guest->handle_err = false;
+       flush_delayed_work(&afu->guest->work_err);
+
         cxl_pci_vphb_remove(afu);
         cxl_sysfs_afu_remove(afu);
  
@@ -1101,6 +1119,12 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic
         adapter->dev.release = release_adapter;
         dev_set_drvdata(&pdev->dev, adapter);
  
+       /*
+        * Hypervisor controls PSL timebase initialization (p1 register).
+        * On FW840, PSL is initialized.
+        */
+       adapter->psl_timebase_synced = true;
+
         if ((rc = cxl_of_read_adapter_handle(adapter, np)))
                 goto err1;
  
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c

index ecf7557cd657f66caa932fce83c785bc241bda9e..55d8a1459f28c69a953844b1d4d698d069ba8f80 100644 (file)
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -186,16 +186,25 @@ static int spa_max_procs(int spa_size)
  
  int cxl_alloc_spa(struct cxl_afu *afu)
  {
+       unsigned spa_size;
+
         /* Work out how many pages to allocate */
         afu->native->spa_order = 0;
         do {
                 afu->native->spa_order++;
-               afu->native->spa_size = (1 << afu->native->spa_order) * PAGE_SIZE;
+               spa_size = (1 << afu->native->spa_order) * PAGE_SIZE;
+
+               if (spa_size > 0x100000) {
+                       dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n",
+                                       afu->native->spa_max_procs, afu->native->spa_size);
+                       afu->num_procs = afu->native->spa_max_procs;
+                       break;
+               }
+
+               afu->native->spa_size = spa_size;
                 afu->native->spa_max_procs = spa_max_procs(afu->native->spa_size);
         } while (afu->native->spa_max_procs < afu->num_procs);
  
-       WARN_ON(afu->native->spa_size > 0x100000); /* Max size supported by the hardware */
-
         if (!(afu->native->spa = (struct cxl_process_element *)
               __get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->native->spa_order))) {
                 pr_err("cxl_alloc_spa: Unable to allocate scheduled process area\n");
@@ -486,8 +495,9 @@ static u64 calculate_sr(struct cxl_context *ctx)
         if (mfspr(SPRN_LPCR) & LPCR_TC)
                 sr |= CXL_PSL_SR_An_TC;
         if (ctx->kernel) {
-               sr |= CXL_PSL_SR_An_R | (mfmsr() & MSR_SF);
-               sr |= CXL_PSL_SR_An_HV;
+               if (!ctx->real_mode)
+                       sr |= CXL_PSL_SR_An_R;
+               sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV;
         } else {
                 sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
                 sr &= ~(CXL_PSL_SR_An_HV);
@@ -526,6 +536,15 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
         ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
         ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
  
+       /*
+        * Ensure we have the multiplexed PSL interrupt set up to take faults
+        * for kernel contexts that may not have allocated any AFU IRQs at all:
+        */
+       if (ctx->irqs.range[0] == 0) {
+               ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
+               ctx->irqs.range[0] = 1;
+       }
+
         for (r = 0; r < CXL_IRQ_RANGES; r++) {
                 ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]);
                 ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]);
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c

index 2844e975bf79b1a5ccb9455bd52a23c1176046f5..a08fcc888a71df37d4fcf6fd37d8716d71786d71 100644 (file)
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -21,6 +21,7 @@
  #include <asm/msi_bitmap.h>
  #include <asm/pnv-pci.h>
  #include <asm/io.h>
+#include <asm/reg.h>
  
  #include "cxl.h"
  #include <misc/cxl.h>
@@ -321,12 +322,43 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
  #undef show_reg
  }
  
+#define CAPP_UNIT0_ID 0xBA
+#define CAPP_UNIT1_ID 0XBE
+
+static u64 get_capp_unit_id(struct device_node *np)
+{
+       u32 phb_index;
+
+       /*
+        * For chips other than POWER8NVL, we only have CAPP 0,
+        * irrespective of which PHB is used.
+        */
+       if (!pvr_version_is(PVR_POWER8NVL))
+               return CAPP_UNIT0_ID;
+
+       /*
+        * For POWER8NVL, assume CAPP 0 is attached to PHB0 and
+        * CAPP 1 is attached to PHB1.
+        */
+       if (of_property_read_u32(np, "ibm,phb-index", &phb_index))
+               return 0;
+
+       if (phb_index == 0)
+               return CAPP_UNIT0_ID;
+
+       if (phb_index == 1)
+               return CAPP_UNIT1_ID;
+
+       return 0;
+}
+
  static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev *dev)
  {
         struct device_node *np;
         const __be32 *prop;
         u64 psl_dsnctl;
         u64 chipid;
+       u64 capp_unit_id;
  
         if (!(np = pnv_pci_get_phb_node(dev)))
                 return -ENODEV;
@@ -336,10 +368,19 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev
         if (!np)
                 return -ENODEV;
         chipid = be32_to_cpup(prop);
+       capp_unit_id = get_capp_unit_id(np);
         of_node_put(np);
+       if (!capp_unit_id) {
+               pr_err("cxl: invalid capp unit id\n");
+               return -ENODEV;
+       }
  
+       psl_dsnctl = 0x0000900000000000ULL; /* pteupd ttype, scdone */
+       psl_dsnctl |= (0x2ULL << (63-38)); /* MMIO hang pulse: 256 us */
         /* Tell PSL where to route data to */
-       psl_dsnctl = 0x02E8900002000000ULL | (chipid << (63-5));
+       psl_dsnctl |= (chipid << (63-5));
+       psl_dsnctl |= (capp_unit_id << (63-13));
+
         cxl_p1_write(adapter, CXL_PSL_DSNDCTL, psl_dsnctl);
         cxl_p1_write(adapter, CXL_PSL_RESLCKTO, 0x20000000200ULL);
         /* snoop write mask */
@@ -355,22 +396,24 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev
  #define TBSYNC_CNT(n) (((u64)n & 0x7) << (63-6))
  #define _2048_250MHZ_CYCLES 1
  
-static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
+static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
  {
         u64 psl_tb;
         int delta;
         unsigned int retry = 0;
         struct device_node *np;
  
+       adapter->psl_timebase_synced = false;
+
         if (!(np = pnv_pci_get_phb_node(dev)))
-               return -ENODEV;
+               return;
  
         /* Do not fail when CAPP timebase sync is not supported by OPAL */
         of_node_get(np);
         if (! of_get_property(np, "ibm,capp-timebase-sync", NULL)) {
                 of_node_put(np);
-               pr_err("PSL: Timebase sync: OPAL support missing\n");
-               return 0;
+               dev_info(&dev->dev, "PSL timebase inactive: OPAL support missing\n");
+               return;
         }
         of_node_put(np);
  
@@ -389,8 +432,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
         do {
                 msleep(1);
                 if (retry++ > 5) {
-                       pr_err("PSL: Timebase sync: giving up!\n");
-                       return -EIO;
+                       dev_info(&dev->dev, "PSL timebase can't synchronize\n");
+                       return;
                 }
                 psl_tb = cxl_p1_read(adapter, CXL_PSL_Timebase);
                 delta = mftb() - psl_tb;
@@ -398,7 +441,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
                         delta = -delta;
         } while (tb_to_ns(delta) > 16000);
  
-       return 0;
+       adapter->psl_timebase_synced = true;
+       return;
  }
  
  static int init_implementation_afu_regs(struct cxl_afu *afu)
@@ -1144,8 +1188,8 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
         if ((rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON)))
                 goto err;
  
-       if ((rc = cxl_setup_psl_timebase(adapter, dev)))
-               goto err;
+       /* Ignore error, adapter init is not dependant on timebase sync */
+       cxl_setup_psl_timebase(adapter, dev);
  
         if ((rc = cxl_native_register_psl_err_irq(adapter)))
                 goto err;
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c

index 25913c08794cc29e24c19102e737f94426522e06..b043c20f158f122d11bbf5dc765ae957ebf9155d 100644 (file)
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -57,6 +57,15 @@ static ssize_t image_loaded_show(struct device *device,
         return scnprintf(buf, PAGE_SIZE, "factory\n");
  }
  
+static ssize_t psl_timebase_synced_show(struct device *device,
+                                       struct device_attribute *attr,
+                                       char *buf)
+{
+       struct cxl *adapter = to_cxl_adapter(device);
+
+       return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->psl_timebase_synced);
+}
+
  static ssize_t reset_adapter_store(struct device *device,
                                    struct device_attribute *attr,
                                    const char *buf, size_t count)
@@ -142,6 +151,7 @@ static struct device_attribute adapter_attrs[] = {
         __ATTR_RO(psl_revision),
         __ATTR_RO(base_image),
         __ATTR_RO(image_loaded),
+       __ATTR_RO(psl_timebase_synced),
         __ATTR_RW(load_image_on_perst),
         __ATTR_RW(perst_reloads_same_image),
         __ATTR(reset, S_IWUSR, NULL, reset_adapter_store),
diff --git a/drivers/mtd/maps/pxa2xx-flash.c b/drivers/mtd/maps/pxa2xx-flash.c

index 7497090e990029fdf6b2161f4d2eb378abf2c540..2cde28ed95c99edd44fe324d5dac093271a04110 100644 (file)
--- a/drivers/mtd/maps/pxa2xx-flash.c
+++ b/drivers/mtd/maps/pxa2xx-flash.c
@@ -71,8 +71,8 @@ static int pxa2xx_flash_probe(struct platform_device *pdev)
                        info->map.name);
                 return -ENOMEM;
         }
-       info->map.cached = memremap(info->map.phys, info->map.size,
-                       MEMREMAP_WB);
+       info->map.cached =
+               ioremap_cached(info->map.phys, info->map.size);
         if (!info->map.cached)
                 printk(KERN_WARNING "Failed to ioremap cached %s\n",
                        info->map.name);
@@ -111,7 +111,7 @@ static int pxa2xx_flash_remove(struct platform_device *dev)
         map_destroy(info->mtd);
         iounmap(info->map.virt);
         if (info->map.cached)
-               memunmap(info->map.cached);
+               iounmap(info->map.cached);
         kfree(info);
         return 0;
  }
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c

index 06b819db51b18d9e3aaa8e1ada8ed8e929597143..0ff8e60deccb78a941f73163212a25b23301f871 100644 (file)
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -23,7 +23,7 @@ static void nicvf_get_page(struct nicvf *nic)
         if (!nic->rb_pageref || !nic->rb_page)
                 return;
  
-       atomic_add(nic->rb_pageref, &nic->rb_page->_count);
+       page_ref_add(nic->rb_page, nic->rb_pageref);
         nic->rb_pageref = 0;
  }
  
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h

index 80417fc564d43fc6305a737e90a819080bbcdd8b..4705e2dea42342d870fff93ab3d070e08a5ac715 100644 (file)
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -1392,6 +1392,10 @@ struct ulp_mem_io {
  #define T5_ULP_MEMIO_ORDER_V(x) ((x) << T5_ULP_MEMIO_ORDER_S)
  #define T5_ULP_MEMIO_ORDER_F    T5_ULP_MEMIO_ORDER_V(1U)
  
+#define T5_ULP_MEMIO_FID_S     4
+#define T5_ULP_MEMIO_FID_M     0x7ff
+#define T5_ULP_MEMIO_FID_V(x)  ((x) << T5_ULP_MEMIO_FID_S)
+
  /* ulp_mem_io.lock_addr fields */
  #define ULP_MEMIO_ADDR_S    0
  #define ULP_MEMIO_ADDR_V(x) ((x) << ULP_MEMIO_ADDR_S)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c

index b51e42d6fbecaaed913d592f3bedc490a4dc3ecb..873a631ad1552c8a6dfdb72ebda8d0cec1d5c28e 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -39,6 +39,53 @@
  #include <linux/mlx5/cq.h>
  #include "mlx5_core.h"
  
+#define TASKLET_MAX_TIME 2
+#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME)
+
+void mlx5_cq_tasklet_cb(unsigned long data)
+{
+       unsigned long flags;
+       unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES;
+       struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data;
+       struct mlx5_core_cq *mcq;
+       struct mlx5_core_cq *temp;
+
+       spin_lock_irqsave(&ctx->lock, flags);
+       list_splice_tail_init(&ctx->list, &ctx->process_list);
+       spin_unlock_irqrestore(&ctx->lock, flags);
+
+       list_for_each_entry_safe(mcq, temp, &ctx->process_list,
+                                tasklet_ctx.list) {
+               list_del_init(&mcq->tasklet_ctx.list);
+               mcq->tasklet_ctx.comp(mcq);
+               if (atomic_dec_and_test(&mcq->refcount))
+                       complete(&mcq->free);
+               if (time_after(jiffies, end))
+                       break;
+       }
+
+       if (!list_empty(&ctx->process_list))
+               tasklet_schedule(&ctx->task);
+}
+
+static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
+{
+       unsigned long flags;
+       struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
+
+       spin_lock_irqsave(&tasklet_ctx->lock, flags);
+       /* When migrating CQs between EQs will be implemented, please note
+        * that you need to sync this point. It is possible that
+        * while migrating a CQ, completions on the old EQs could
+        * still arrive.
+        */
+       if (list_empty_careful(&cq->tasklet_ctx.list)) {
+               atomic_inc(&cq->refcount);
+               list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list);
+       }
+       spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
+}
+
  void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
  {
         struct mlx5_core_cq *cq;
@@ -96,6 +143,13 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
         struct mlx5_create_cq_mbox_out out;
         struct mlx5_destroy_cq_mbox_in din;
         struct mlx5_destroy_cq_mbox_out dout;
+       int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context),
+                          c_eqn);
+       struct mlx5_eq *eq;
+
+       eq = mlx5_eqn2eq(dev, eqn);
+       if (IS_ERR(eq))
+               return PTR_ERR(eq);
  
         in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ);
         memset(&out, 0, sizeof(out));
@@ -111,6 +165,11 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
         cq->arm_sn     = 0;
         atomic_set(&cq->refcount, 1);
         init_completion(&cq->free);
+       if (!cq->comp)
+               cq->comp = mlx5_add_cq_to_tasklet;
+       /* assuming CQ will be deleted before the EQ */
+       cq->tasklet_ctx.priv = &eq->tasklet_ctx;
+       INIT_LIST_HEAD(&cq->tasklet_ctx.list);
  
         spin_lock_irq(&table->lock);
         err = radix_tree_insert(&table->tree, cq->cqn, cq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

index f3456798c5960fda619a2a57074aa9aaf902e1ef..bd947704b59c41747a71d75725a5d230bde5dfc0 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
         for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
                 if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
                         goto err_unmap;
-               atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-                          &wi->umr.dma_info[i].page->_count);
+               page_ref_add(wi->umr.dma_info[i].page,
+                            mlx5e_mpwqe_strides_per_page(rq));
                 wi->skbs_frags[i] = 0;
         }
  
@@ -452,8 +452,8 @@ err_unmap:
         while (--i >= 0) {
                 dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
                                PCI_DMA_FROMDEVICE);
-               atomic_sub(mlx5e_mpwqe_strides_per_page(rq),
-                          &wi->umr.dma_info[i].page->_count);
+               page_ref_sub(wi->umr.dma_info[i].page,
+                            mlx5e_mpwqe_strides_per_page(rq));
                 put_page(wi->umr.dma_info[i].page);
         }
         dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
         for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
                 dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
                                PCI_DMA_FROMDEVICE);
-               atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-                          &wi->umr.dma_info[i].page->_count);
+               page_ref_sub(wi->umr.dma_info[i].page,
+                       mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
                 put_page(wi->umr.dma_info[i].page);
         }
         dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
          */
         split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
         for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-               atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-                          &wi->dma_info.page[i]._count);
+               page_ref_add(&wi->dma_info.page[i],
+                            mlx5e_mpwqe_strides_per_page(rq));
                 wi->skbs_frags[i] = 0;
         }
  
@@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
         dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
                        PCI_DMA_FROMDEVICE);
         for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-               atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-                          &wi->dma_info.page[i]._count);
+               page_ref_sub(&wi->dma_info.page[i],
+                       mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
                 put_page(&wi->dma_info.page[i]);
         }
  }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c

index 18fccec72c5da210ec04be452837c1a5fddf7762..0e30602ef76dc7a7ce97e598bde9adca8aae50ff 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -202,7 +202,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
         struct mlx5_eqe *eqe;
         int eqes_found = 0;
         int set_ci = 0;
-       u32 cqn;
+       u32 cqn = -1;
         u32 rsn;
         u8 port;
  
@@ -320,6 +320,9 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
  
         eq_update_ci(eq, 1);
  
+       if (cqn != -1)
+               tasklet_schedule(&eq->tasklet_ctx.task);
+
         return eqes_found;
  }
  
@@ -403,6 +406,12 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
         if (err)
                 goto err_irq;
  
+       INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+       INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+       spin_lock_init(&eq->tasklet_ctx.lock);
+       tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
+                    (unsigned long)&eq->tasklet_ctx);
+
         /* EQs are created in ARMED state
          */
         eq_update_ci(eq, 1);
@@ -436,6 +445,7 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
                 mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
                                eq->eqn);
         synchronize_irq(eq->irqn);
+       tasklet_disable(&eq->tasklet_ctx.task);
         mlx5_buf_free(dev, &eq->buf);
  
         return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c

index 6feef7fb9d6a9537580b604a60ab6d420d810c02..a19b59348dd685816c88736a4d9e47f78b3f847c 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -663,6 +663,23 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
  }
  EXPORT_SYMBOL(mlx5_vector2eqn);
  
+struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn)
+{
+       struct mlx5_eq_table *table = &dev->priv.eq_table;
+       struct mlx5_eq *eq;
+
+       spin_lock(&table->lock);
+       list_for_each_entry(eq, &table->comp_eqs_list, list)
+               if (eq->eqn == eqn) {
+                       spin_unlock(&table->lock);
+                       return eq;
+               }
+
+       spin_unlock(&table->lock);
+
+       return ERR_PTR(-ENOENT);
+}
+
  static void free_comp_eqs(struct mlx5_core_dev *dev)
  {
         struct mlx5_eq_table *table = &dev->priv.eq_table;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

index 482604bd051cd95ecb4762b85dc0c9f816614a39..2f86ec6fcf25c27b06e3008392627df27b1e1260 100644 (file)
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -102,6 +102,8 @@ int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id);
  int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev);
  cycle_t mlx5_read_internal_timer(struct mlx5_core_dev *dev);
  u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx);
+struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn);
+void mlx5_cq_tasklet_cb(unsigned long data);
  
  void mlx5e_init(void);
  void mlx5e_cleanup(void);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c

index 8114541f327cfe5ecd7e9b7629c593f886db835f..73dd525fbf08571e391c1fcacecceeeb0dd9dc42 100644 (file)
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -920,7 +920,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
                  * network stack to take the ownership of the page
                  * which can be recycled multiple times by the driver.
                  */
-               atomic_inc(&curr_cons->data->_count);
+               page_ref_inc(curr_cons->data);
                 qede_reuse_page(edev, rxq, curr_cons);
         }
  
@@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
                 /* Incr page ref count to reuse on allocation failure
                  * so that it doesn't get freed while freeing SKB.
                  */
-               atomic_inc(&current_bd->data->_count);
+               page_ref_inc(current_bd->data);
                 goto out;
         }
  
@@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
                                  * freeing SKB.
                                  */
  
-                               atomic_inc(&sw_rx_data->data->_count);
+                               page_ref_inc(sw_rx_data->data);
                                 rxq->rx_alloc_errors++;
                                 qede_recycle_rx_bd_ring(rxq, edev,
                                                         fp_cqe->bd_num);
diff --git a/drivers/of/base.c b/drivers/of/base.c

index b299de2b3afad1eb8a9832cf0fe74f2918797013..ebf84e3b56d5a96b1ea4cb9879380e4daac29487 100644 (file)
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -394,7 +394,8 @@ bool __weak arch_find_n_match_cpu_physical_id(struct device_node *cpun,
   * before booting secondary cores. This function uses arch_match_cpu_phys_id
   * which can be overridden by architecture specific implementation.
   *
- * Returns a node pointer for the logical cpu if found, else NULL.
+ * Returns a node pointer for the logical cpu with refcount incremented, use
+ * of_node_put() on it when done. Returns NULL if not found.
   */
  struct device_node *of_get_cpu_node(int cpu, unsigned int *thread)
  {
@@ -1440,106 +1441,155 @@ void of_print_phandle_args(const char *msg, const struct of_phandle_args *args)
         printk("\n");
  }
  
-static int __of_parse_phandle_with_args(const struct device_node *np,
-                                       const char *list_name,
-                                       const char *cells_name,
-                                       int cell_count, int index,
-                                       struct of_phandle_args *out_args)
+int of_phandle_iterator_init(struct of_phandle_iterator *it,
+               const struct device_node *np,
+               const char *list_name,
+               const char *cells_name,
+               int cell_count)
  {
-       const __be32 *list, *list_end;
-       int rc = 0, size, cur_index = 0;
-       uint32_t count = 0;
-       struct device_node *node = NULL;
-       phandle phandle;
+       const __be32 *list;
+       int size;
+
+       memset(it, 0, sizeof(*it));
  
-       /* Retrieve the phandle list property */
         list = of_get_property(np, list_name, &size);
         if (!list)
                 return -ENOENT;
-       list_end = list + size / sizeof(*list);
  
-       /* Loop over the phandles until all the requested entry is found */
-       while (list < list_end) {
-               rc = -EINVAL;
-               count = 0;
+       it->cells_name = cells_name;
+       it->cell_count = cell_count;
+       it->parent = np;
+       it->list_end = list + size / sizeof(*list);
+       it->phandle_end = list;
+       it->cur = list;
+
+       return 0;
+}
+
+int of_phandle_iterator_next(struct of_phandle_iterator *it)
+{
+       uint32_t count = 0;
+
+       if (it->node) {
+               of_node_put(it->node);
+               it->node = NULL;
+       }
+
+       if (!it->cur || it->phandle_end >= it->list_end)
+               return -ENOENT;
+
+       it->cur = it->phandle_end;
+
+       /* If phandle is 0, then it is an empty entry with no arguments. */
+       it->phandle = be32_to_cpup(it->cur++);
+
+       if (it->phandle) {
  
                 /*
-                * If phandle is 0, then it is an empty entry with no
-                * arguments.  Skip forward to the next entry.
+                * Find the provider node and parse the #*-cells property to
+                * determine the argument length.
                  */
-               phandle = be32_to_cpup(list++);
-               if (phandle) {
-                       /*
-                        * Find the provider node and parse the #*-cells
-                        * property to determine the argument length.
-                        *
-                        * This is not needed if the cell count is hard-coded
-                        * (i.e. cells_name not set, but cell_count is set),
-                        * except when we're going to return the found node
-                        * below.
-                        */
-                       if (cells_name || cur_index == index) {
-                               node = of_find_node_by_phandle(phandle);
-                               if (!node) {
-                                       pr_err("%s: could not find phandle\n",
-                                               np->full_name);
-                                       goto err;
-                               }
-                       }
+               it->node = of_find_node_by_phandle(it->phandle);
  
-                       if (cells_name) {
-                               if (of_property_read_u32(node, cells_name,
-                                                        &count)) {
-                                       pr_err("%s: could not get %s for %s\n",
-                                               np->full_name, cells_name,
-                                               node->full_name);
-                                       goto err;
-                               }
-                       } else {
-                               count = cell_count;
+               if (it->cells_name) {
+                       if (!it->node) {
+                               pr_err("%s: could not find phandle\n",
+                                      it->parent->full_name);
+                               goto err;
                         }
  
-                       /*
-                        * Make sure that the arguments actually fit in the
-                        * remaining property data length
-                        */
-                       if (list + count > list_end) {
-                               pr_err("%s: arguments longer than property\n",
-                                        np->full_name);
+                       if (of_property_read_u32(it->node, it->cells_name,
+                                                &count)) {
+                               pr_err("%s: could not get %s for %s\n",
+                                      it->parent->full_name,
+                                      it->cells_name,
+                                      it->node->full_name);
                                 goto err;
                         }
+               } else {
+                       count = it->cell_count;
+               }
+
+               /*
+                * Make sure that the arguments actually fit in the remaining
+                * property data length
+                */
+               if (it->cur + count > it->list_end) {
+                       pr_err("%s: arguments longer than property\n",
+                              it->parent->full_name);
+                       goto err;
                 }
+       }
+
+       it->phandle_end = it->cur + count;
+       it->cur_count = count;
+
+       return 0;
+
+err:
+       if (it->node) {
+               of_node_put(it->node);
+               it->node = NULL;
+       }
+
+       return -EINVAL;
+}
+
+int of_phandle_iterator_args(struct of_phandle_iterator *it,
+                            uint32_t *args,
+                            int size)
+{
+       int i, count;
+
+       count = it->cur_count;
+
+       if (WARN_ON(size < count))
+               count = size;
+
+       for (i = 0; i < count; i++)
+               args[i] = be32_to_cpup(it->cur++);
+
+       return count;
+}
+
+static int __of_parse_phandle_with_args(const struct device_node *np,
+                                       const char *list_name,
+                                       const char *cells_name,
+                                       int cell_count, int index,
+                                       struct of_phandle_args *out_args)
+{
+       struct of_phandle_iterator it;
+       int rc, cur_index = 0;
  
+       /* Loop over the phandles until the requested entry is found */
+       of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) {
                 /*
-                * All of the error cases above bail out of the loop, so at
+                * All of the error cases bail out of the loop, so at
                  * this point, the parsing is successful. If the requested
                  * index matches, then fill the out_args structure and return,
                  * or return -ENOENT for an empty entry.
                  */
                 rc = -ENOENT;
                 if (cur_index == index) {
-                       if (!phandle)
+                       if (!it.phandle)
                                 goto err;
  
                         if (out_args) {
-                               int i;
-                               if (WARN_ON(count > MAX_PHANDLE_ARGS))
-                                       count = MAX_PHANDLE_ARGS;
-                               out_args->np = node;
-                               out_args->args_count = count;
-                               for (i = 0; i < count; i++)
-                                       out_args->args[i] = be32_to_cpup(list++);
+                               int c;
+
+                               c = of_phandle_iterator_args(&it,
+                                                            out_args->args,
+                                                            MAX_PHANDLE_ARGS);
+                               out_args->np = it.node;
+                               out_args->args_count = c;
                         } else {
-                               of_node_put(node);
+                               of_node_put(it.node);
                         }
  
                         /* Found it! return success */
                         return 0;
                 }
  
-               of_node_put(node);
-               node = NULL;
-               list += count;
                 cur_index++;
         }
  
@@ -1547,12 +1597,11 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
          * Unlock node before returning result; will be one of:
          * -ENOENT : index is for empty phandle
          * -EINVAL : parsing error on data
-        * [1..n]  : Number of phandle (count mode; when index = -1)
          */
-       rc = index < 0 ? cur_index : -ENOENT;
+
   err:
-       if (node)
-               of_node_put(node);
+       if (it.node)
+               of_node_put(it.node);
         return rc;
  }
  
@@ -1684,8 +1733,20 @@ EXPORT_SYMBOL(of_parse_phandle_with_fixed_args);
  int of_count_phandle_with_args(const struct device_node *np, const char *list_name,
                                 const char *cells_name)
  {
-       return __of_parse_phandle_with_args(np, list_name, cells_name, 0, -1,
-                                           NULL);
+       struct of_phandle_iterator it;
+       int rc, cur_index = 0;
+
+       rc = of_phandle_iterator_init(&it, np, list_name, cells_name, 0);
+       if (rc)
+               return rc;
+
+       while ((rc = of_phandle_iterator_next(&it)) == 0)
+               cur_index += 1;
+
+       if (rc != -ENOENT)
+               return rc;
+
+       return cur_index;
  }
  EXPORT_SYMBOL(of_count_phandle_with_args);
  
@@ -1777,6 +1838,9 @@ int of_remove_property(struct device_node *np, struct property *prop)
         unsigned long flags;
         int rc;
  
+       if (!prop)
+               return -ENODEV;
+
         mutex_lock(&of_mutex);
  
         raw_spin_lock_irqsave(&devtree_lock, flags);
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c

index c647bd1b69033234339ab1475bc43f974542e2ac..3033fa3250dc486c85f422fbd49000218b3c8c47 100644 (file)
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -311,6 +311,7 @@ int of_detach_node(struct device_node *np)
  
         return rc;
  }
+EXPORT_SYMBOL_GPL(of_detach_node);
  
  /**
   * of_node_release() - release a dynamically allocated node
@@ -497,6 +498,11 @@ static void __of_changeset_entry_invert(struct of_changeset_entry *ce,
         case OF_RECONFIG_UPDATE_PROPERTY:
                 rce->old_prop = ce->prop;
                 rce->prop = ce->old_prop;
+               /* update was used but original property did not exist */
+               if (!rce->prop) {
+                       rce->action = OF_RECONFIG_REMOVE_PROPERTY;
+                       rce->prop = ce->prop;
+               }
                 break;
         }
  }
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c

index 3349d2aa66346335fa3c84e485668bdbf77f6e52..14f2f8c7c2607e603337b30a37117195f95f0880 100644 (file)
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -161,39 +161,127 @@ static void *unflatten_dt_alloc(void **mem, unsigned long size,
         return res;
  }
  
-/**
- * unflatten_dt_node - Alloc and populate a device_node from the flat tree
- * @blob: The parent device tree blob
- * @mem: Memory chunk to use for allocating device nodes and properties
- * @poffset: pointer to node in flat tree
- * @dad: Parent struct device_node
- * @nodepp: The device_node tree created by the call
- * @fpsize: Size of the node path up at the current depth.
- * @dryrun: If true, do not allocate device nodes but still calculate needed
- * memory size
- */
-static void * unflatten_dt_node(const void *blob,
-                               void *mem,
-                               int *poffset,
-                               struct device_node *dad,
-                               struct device_node **nodepp,
-                               unsigned long fpsize,
+static void populate_properties(const void *blob,
+                               int offset,
+                               void **mem,
+                               struct device_node *np,
+                               const char *nodename,
                                 bool dryrun)
  {
-       const __be32 *p;
+       struct property *pp, **pprev = NULL;
+       int cur;
+       bool has_name = false;
+
+       pprev = &np->properties;
+       for (cur = fdt_first_property_offset(blob, offset);
+            cur >= 0;
+            cur = fdt_next_property_offset(blob, cur)) {
+               const __be32 *val;
+               const char *pname;
+               u32 sz;
+
+               val = fdt_getprop_by_offset(blob, cur, &pname, &sz);
+               if (!val) {
+                       pr_warn("%s: Cannot locate property at 0x%x\n",
+                               __func__, cur);
+                       continue;
+               }
+
+               if (!pname) {
+                       pr_warn("%s: Cannot find property name at 0x%x\n",
+                               __func__, cur);
+                       continue;
+               }
+
+               if (!strcmp(pname, "name"))
+                       has_name = true;
+
+               pp = unflatten_dt_alloc(mem, sizeof(struct property),
+                                       __alignof__(struct property));
+               if (dryrun)
+                       continue;
+
+               /* We accept flattened tree phandles either in
+                * ePAPR-style "phandle" properties, or the
+                * legacy "linux,phandle" properties.  If both
+                * appear and have different values, things
+                * will get weird. Don't do that.
+                */
+               if (!strcmp(pname, "phandle") ||
+                   !strcmp(pname, "linux,phandle")) {
+                       if (!np->phandle)
+                               np->phandle = be32_to_cpup(val);
+               }
+
+               /* And we process the "ibm,phandle" property
+                * used in pSeries dynamic device tree
+                * stuff
+                */
+               if (!strcmp(pname, "ibm,phandle"))
+                       np->phandle = be32_to_cpup(val);
+
+               pp->name   = (char *)pname;
+               pp->length = sz;
+               pp->value  = (__be32 *)val;
+               *pprev     = pp;
+               pprev      = &pp->next;
+       }
+
+       /* With version 0x10 we may not have the name property,
+        * recreate it here from the unit name if absent
+        */
+       if (!has_name) {
+               const char *p = nodename, *ps = p, *pa = NULL;
+               int len;
+
+               while (*p) {
+                       if ((*p) == '@')
+                               pa = p;
+                       else if ((*p) == '/')
+                               ps = p + 1;
+                       p++;
+               }
+
+               if (pa < ps)
+                       pa = p;
+               len = (pa - ps) + 1;
+               pp = unflatten_dt_alloc(mem, sizeof(struct property) + len,
+                                       __alignof__(struct property));
+               if (!dryrun) {
+                       pp->name   = "name";
+                       pp->length = len;
+                       pp->value  = pp + 1;
+                       *pprev     = pp;
+                       pprev      = &pp->next;
+                       memcpy(pp->value, ps, len - 1);
+                       ((char *)pp->value)[len - 1] = 0;
+                       pr_debug("fixed up name for %s -> %s\n",
+                                nodename, (char *)pp->value);
+               }
+       }
+
+       if (!dryrun)
+               *pprev = NULL;
+}
+
+static unsigned int populate_node(const void *blob,
+                                 int offset,
+                                 void **mem,
+                                 struct device_node *dad,
+                                 unsigned int fpsize,
+                                 struct device_node **pnp,
+                                 bool dryrun)
+{
         struct device_node *np;
-       struct property *pp, **prev_pp = NULL;
         const char *pathp;
         unsigned int l, allocl;
-       static int depth;
-       int old_depth;
-       int offset;
-       int has_name = 0;
         int new_format = 0;
  
-       pathp = fdt_get_name(blob, *poffset, &l);
-       if (!pathp)
-               return mem;
+       pathp = fdt_get_name(blob, offset, &l);
+       if (!pathp) {
+               *pnp = NULL;
+               return 0;
+       }
  
         allocl = ++l;
  
@@ -223,7 +311,7 @@ static void * unflatten_dt_node(const void *blob,
                 }
         }
  
-       np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl,
+       np = unflatten_dt_alloc(mem, sizeof(struct device_node) + allocl,
                                 __alignof__(struct device_node));
         if (!dryrun) {
                 char *fn;
@@ -246,89 +334,15 @@ static void * unflatten_dt_node(const void *blob,
                 }
                 memcpy(fn, pathp, l);
  
-               prev_pp = &np->properties;
                 if (dad != NULL) {
                         np->parent = dad;
                         np->sibling = dad->child;
                         dad->child = np;
                 }
         }
-       /* process properties */
-       for (offset = fdt_first_property_offset(blob, *poffset);
-            (offset >= 0);
-            (offset = fdt_next_property_offset(blob, offset))) {
-               const char *pname;
-               u32 sz;
-
-               if (!(p = fdt_getprop_by_offset(blob, offset, &pname, &sz))) {
-                       offset = -FDT_ERR_INTERNAL;
-                       break;
-               }
  
-               if (pname == NULL) {
-                       pr_info("Can't find property name in list !\n");
-                       break;
-               }
-               if (strcmp(pname, "name") == 0)
-                       has_name = 1;
-               pp = unflatten_dt_alloc(&mem, sizeof(struct property),
-                                       __alignof__(struct property));
-               if (!dryrun) {
-                       /* We accept flattened tree phandles either in
-                        * ePAPR-style "phandle" properties, or the
-                        * legacy "linux,phandle" properties.  If both
-                        * appear and have different values, things
-                        * will get weird.  Don't do that. */
-                       if ((strcmp(pname, "phandle") == 0) ||
-                           (strcmp(pname, "linux,phandle") == 0)) {
-                               if (np->phandle == 0)
-                                       np->phandle = be32_to_cpup(p);
-                       }
-                       /* And we process the "ibm,phandle" property
-                        * used in pSeries dynamic device tree
-                        * stuff */
-                       if (strcmp(pname, "ibm,phandle") == 0)
-                               np->phandle = be32_to_cpup(p);
-                       pp->name = (char *)pname;
-                       pp->length = sz;
-                       pp->value = (__be32 *)p;
-                       *prev_pp = pp;
-                       prev_pp = &pp->next;
-               }
-       }
-       /* with version 0x10 we may not have the name property, recreate
-        * it here from the unit name if absent
-        */
-       if (!has_name) {
-               const char *p1 = pathp, *ps = pathp, *pa = NULL;
-               int sz;
-
-               while (*p1) {
-                       if ((*p1) == '@')
-                               pa = p1;
-                       if ((*p1) == '/')
-                               ps = p1 + 1;
-                       p1++;
-               }
-               if (pa < ps)
-                       pa = p1;
-               sz = (pa - ps) + 1;
-               pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz,
-                                       __alignof__(struct property));
-               if (!dryrun) {
-                       pp->name = "name";
-                       pp->length = sz;
-                       pp->value = pp + 1;
-                       *prev_pp = pp;
-                       prev_pp = &pp->next;
-                       memcpy(pp->value, ps, sz - 1);
-                       ((char *)pp->value)[sz - 1] = 0;
-                       pr_debug("fixed up name for %s -> %s\n", pathp,
-                               (char *)pp->value);
-               }
-       }
+       populate_properties(blob, offset, mem, np, pathp, dryrun);
         if (!dryrun) {
-               *prev_pp = NULL;
                 np->name = of_get_property(np, "name", NULL);
                 np->type = of_get_property(np, "device_type", NULL);
  
@@ -338,36 +352,94 @@ static void * unflatten_dt_node(const void *blob,
                         np->type = "<NULL>";
         }
  
-       old_depth = depth;
-       *poffset = fdt_next_node(blob, *poffset, &depth);
-       if (depth < 0)
-               depth = 0;
-       while (*poffset > 0 && depth > old_depth)
-               mem = unflatten_dt_node(blob, mem, poffset, np, NULL,
-                                       fpsize, dryrun);
+       *pnp = np;
+       return fpsize;
+}
+
+static void reverse_nodes(struct device_node *parent)
+{
+       struct device_node *child, *next;
+
+       /* Depth-first: reverse each child's subtree before this level */
+       child = parent->child;
+       while (child) {
+               reverse_nodes(child);
+
+               child = child->sibling;
+       }
+
+       /* Reverse the nodes in the child list */
+       child = parent->child;
+       parent->child = NULL;
+       while (child) {
+               next = child->sibling;
+
+               child->sibling = parent->child;
+               parent->child = child;
+               child = next;
+       }
+}
+
+/**
+ * unflatten_dt_nodes - Alloc and populate a device_node from the flat tree
+ * @blob: The parent device tree blob
+ * @mem: Memory chunk to use for allocating device nodes and properties
+ * @dad: Parent struct device_node
+ * @nodepp: The device_node tree created by the call
+ *
+ * It returns the size of unflattened device tree or error code
+ */
+static int unflatten_dt_nodes(const void *blob,
+                             void *mem,
+                             struct device_node *dad,
+                             struct device_node **nodepp)
+{
+       struct device_node *root;
+       int offset = 0, depth = 0;
+#define FDT_MAX_DEPTH  64
+       unsigned int fpsizes[FDT_MAX_DEPTH];
+       struct device_node *nps[FDT_MAX_DEPTH];
+       void *base = mem;
+       bool dryrun = !base;
  
-       if (*poffset < 0 && *poffset != -FDT_ERR_NOTFOUND)
-               pr_err("unflatten: error %d processing FDT\n", *poffset);
+       if (nodepp)
+               *nodepp = NULL;
+
+       root = dad;
+       fpsizes[depth] = dad ? strlen(of_node_full_name(dad)) : 0;
+       nps[depth] = dad;
+       for (offset = 0;
+            offset >= 0 && depth >= 0;
+            offset = fdt_next_node(blob, offset, &depth)) {
+               if (WARN_ON_ONCE(depth >= FDT_MAX_DEPTH))
+                       continue;
+
+               fpsizes[depth+1] = populate_node(blob, offset, &mem,
+                                                nps[depth],
+                                                fpsizes[depth],
+                                                &nps[depth+1], dryrun);
+               if (!fpsizes[depth+1])
+                       return mem - base;
+
+               if (!dryrun && nodepp && !*nodepp)
+                       *nodepp = nps[depth+1];
+               if (!dryrun && !root)
+                       root = nps[depth+1];
+       }
+
+       if (offset < 0 && offset != -FDT_ERR_NOTFOUND) {
+               pr_err("%s: Error %d processing FDT\n", __func__, offset);
+               return -EINVAL;
+       }
  
         /*
          * Reverse the child list. Some drivers assumes node order matches .dts
          * node order
          */
-       if (!dryrun && np->child) {
-               struct device_node *child = np->child;
-               np->child = NULL;
-               while (child) {
-                       struct device_node *next = child->sibling;
-                       child->sibling = np->child;
-                       np->child = child;
-                       child = next;
-               }
-       }
-
-       if (nodepp)
-               *nodepp = np;
+       if (!dryrun)
+               reverse_nodes(root);
  
-       return mem;
+       return mem - base;
  }
  
  /**
@@ -378,23 +450,27 @@ static void * unflatten_dt_node(const void *blob,
   * pointers of the nodes so the normal device-tree walking functions
   * can be used.
   * @blob: The blob to expand
+ * @dad: Parent device node
   * @mynodes: The device_node tree created by the call
   * @dt_alloc: An allocator that provides a virtual address to memory
   * for the resulting tree
+ *
+ * Returns NULL on failure or the memory chunk containing the unflattened
+ * device tree on success.
   */
-static void __unflatten_device_tree(const void *blob,
-                            struct device_node **mynodes,
-                            void * (*dt_alloc)(u64 size, u64 align))
+static void *__unflatten_device_tree(const void *blob,
+                                    struct device_node *dad,
+                                    struct device_node **mynodes,
+                                    void *(*dt_alloc)(u64 size, u64 align))
  {
-       unsigned long size;
-       int start;
+       int size;
         void *mem;
  
         pr_debug(" -> unflatten_device_tree()\n");
  
         if (!blob) {
                 pr_debug("No device tree pointer\n");
-               return;
+               return NULL;
         }
  
         pr_debug("Unflattening device tree:\n");
@@ -404,15 +480,16 @@ static void __unflatten_device_tree(const void *blob,
  
         if (fdt_check_header(blob)) {
                 pr_err("Invalid device tree blob header\n");
-               return;
+               return NULL;
         }
  
         /* First pass, scan for size */
-       start = 0;
-       size = (unsigned long)unflatten_dt_node(blob, NULL, &start, NULL, NULL, 0, true);
-       size = ALIGN(size, 4);
+       size = unflatten_dt_nodes(blob, NULL, dad, NULL);
+       if (size < 0)
+               return NULL;
  
-       pr_debug("  size is %lx, allocating...\n", size);
+       size = ALIGN(size, 4);
+       pr_debug("  size is %d, allocating...\n", size);
  
         /* Allocate memory for the expanded device tree */
         mem = dt_alloc(size + 4, __alignof__(struct device_node));
@@ -423,13 +500,13 @@ static void __unflatten_device_tree(const void *blob,
         pr_debug("  unflattening %p...\n", mem);
  
         /* Second pass, do actual unflattening */
-       start = 0;
-       unflatten_dt_node(blob, mem, &start, NULL, mynodes, 0, false);
+       unflatten_dt_nodes(blob, mem, dad, mynodes);
         if (be32_to_cpup(mem + size) != 0xdeadbeef)
                 pr_warning("End of tree marker overwritten: %08x\n",
                            be32_to_cpup(mem + size));
  
         pr_debug(" <- unflatten_device_tree()\n");
+       return mem;
  }
  
  static void *kernel_tree_alloc(u64 size, u64 align)
@@ -441,18 +518,29 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex);
  
  /**
   * of_fdt_unflatten_tree - create tree of device_nodes from flat blob
+ * @blob: Flat device tree blob
+ * @dad: Parent device node
+ * @mynodes: The device tree created by the call
   *
   * unflattens the device-tree passed by the firmware, creating the
   * tree of struct device_node. It also fills the "name" and "type"
   * pointers of the nodes so the normal device-tree walking functions
   * can be used.
+ *
+ * Returns NULL on failure or the memory chunk containing the unflattened
+ * device tree on success.
   */
-void of_fdt_unflatten_tree(const unsigned long *blob,
-                       struct device_node **mynodes)
+void *of_fdt_unflatten_tree(const unsigned long *blob,
+                           struct device_node *dad,
+                           struct device_node **mynodes)
  {
+       void *mem;
+
         mutex_lock(&of_fdt_unflatten_mutex);
-       __unflatten_device_tree(blob, mynodes, &kernel_tree_alloc);
+       mem = __unflatten_device_tree(blob, dad, mynodes, &kernel_tree_alloc);
         mutex_unlock(&of_fdt_unflatten_mutex);
+
+       return mem;
  }
  EXPORT_SYMBOL_GPL(of_fdt_unflatten_tree);
  
@@ -969,10 +1057,16 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
          * is set in which case we override whatever was found earlier.
          */
  #ifdef CONFIG_CMDLINE
-#ifndef CONFIG_CMDLINE_FORCE
+#if defined(CONFIG_CMDLINE_EXTEND)
+       strlcat(data, " ", COMMAND_LINE_SIZE);
+       strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+#elif defined(CONFIG_CMDLINE_FORCE)
+       strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+#else
+       /* No arguments from boot loader, use kernel's cmdline */
         if (!((char *)data)[0])
-#endif
                 strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+#endif
  #endif /* CONFIG_CMDLINE */
  
         pr_debug("Command line is: %s\n", (char*)data);
@@ -1118,7 +1212,7 @@ bool __init early_init_dt_scan(void *params)
   */
  void __init unflatten_device_tree(void)
  {
-       __unflatten_device_tree(initial_boot_params, &of_root,
+       __unflatten_device_tree(initial_boot_params, NULL, &of_root,
                                 early_init_dt_alloc_memory_arch);
  
         /* Get pointer to "/chosen" and "/aliases" nodes for use everywhere */
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c

index c1ebbfb794537b6bf553abeeff876bdbfe9ac670..f34ed9310323cdb32e91e37bcf0404835edfd004 100644 (file)
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -8,7 +8,6 @@
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/hashtable.h>
-#include <linux/module.h>
  #include <linux/of.h>
  #include <linux/of_fdt.h>
  #include <linux/of_irq.h>
@@ -921,7 +920,7 @@ static int __init unittest_data_add(void)
                         "not running tests\n", __func__);
                 return -ENOMEM;
         }
-       of_fdt_unflatten_tree(unittest_data, &unittest_data_node);
+       of_fdt_unflatten_tree(unittest_data, NULL, &unittest_data_node);
         if (!unittest_data_node) {
                 pr_warn("%s: No tree to attach; not running tests\n", __func__);
                 return -ENODATA;
diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c

index b46b57d870fc9e44c20564300248ca6fcd46cd32..dc67f39779ecd3cbfbd7d0f60eaafc627e85e2d2 100644 (file)
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -175,7 +175,7 @@ static int dlpar_add_pci_slot(char *drc_name, struct device_node *dn)
         struct pci_dev *dev;
         struct pci_controller *phb;
  
-       if (pcibios_find_pci_bus(dn))
+       if (pci_find_bus_by_node(dn))
                 return -EINVAL;
  
         /* Add pci bus */
@@ -212,7 +212,7 @@ static int dlpar_remove_phb(char *drc_name, struct device_node *dn)
         struct pci_dn *pdn;
         int rc = 0;
  
-       if (!pcibios_find_pci_bus(dn))
+       if (!pci_find_bus_by_node(dn))
                 return -EINVAL;
  
         /* If pci slot is hotpluggable, use hotplug to remove it */
@@ -356,7 +356,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn)
  
         pci_lock_rescan_remove();
  
-       bus = pcibios_find_pci_bus(dn);
+       bus = pci_find_bus_by_node(dn);
         if (!bus) {
                 ret = -EINVAL;
                 goto out;
@@ -380,7 +380,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn)
         }
  
         /* Remove all devices below slot */
-       pcibios_remove_pci_devices(bus);
+       pci_hp_remove_devices(bus);
  
         /* Unmap PCI IO space */
         if (pcibios_unmap_io_space(bus)) {
diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c

index 611f6056221a94fbe1e06c289e2a36b33f0e1c52..8d132024f06ed579a4ea8b3aefef856b337ed80e 100644 (file)
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -404,7 +404,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot)
  
         if (state == PRESENT) {
                 pci_lock_rescan_remove();
-               pcibios_add_pci_devices(slot->bus);
+               pci_hp_add_devices(slot->bus);
                 pci_unlock_rescan_remove();
                 slot->state = CONFIGURED;
         } else if (state == EMPTY) {
@@ -426,7 +426,7 @@ static int disable_slot(struct hotplug_slot *hotplug_slot)
                 return -EINVAL;
  
         pci_lock_rescan_remove();
-       pcibios_remove_pci_devices(slot->bus);
+       pci_hp_remove_devices(slot->bus);
         pci_unlock_rescan_remove();
         vm_unmap_aliases();
  
diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c

index 7836d6913e67df7053e4b31a1c1067b2e2f1c17a..ea41ea1d3c0052790b5f18aacae1dd9e33016c0d 100644 (file)
--- a/drivers/pci/hotplug/rpaphp_pci.c
+++ b/drivers/pci/hotplug/rpaphp_pci.c
@@ -93,7 +93,7 @@ int rpaphp_enable_slot(struct slot *slot)
         if (rc)
                 return rc;
  
-       bus = pcibios_find_pci_bus(slot->dn);
+       bus = pci_find_bus_by_node(slot->dn);
         if (!bus) {
                 err("%s: no pci_bus for dn %s\n", __func__, slot->dn->full_name);
                 return -EINVAL;
@@ -116,7 +116,7 @@ int rpaphp_enable_slot(struct slot *slot)
                 }
  
                 if (list_empty(&bus->devices))
-                       pcibios_add_pci_devices(bus);
+                       pci_hp_add_devices(bus);
  
                 if (!list_empty(&bus->devices)) {
                         info->adapter_status = CONFIGURED;
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c

index 61cf61ac621ebc8d253dd59afc36eda535378a67..4d7bc3f4124a58110c4573337f37499efa63f188 100644 (file)
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -228,7 +228,7 @@ static int electra_cf_probe(struct platform_device *ofdev)
  
         if (!cf->mem_base || !cf->io_virt || !cf->gpio_base ||
             (__ioremap_at(io.start, cf->io_virt, cf->io_size,
-               _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)) {
+                 pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)) {
                 dev_err(device, "can't ioremap ranges\n");
                 status = -ENOMEM;
                 goto fail1;
diff --git a/drivers/power/ipaq_micro_battery.c b/drivers/power/ipaq_micro_battery.c

index 3f314b1a30d72af732acd7f9fd679bdfe34ba407..35b01c7d775b45c00f5ed98549c216f744f155a0 100644 (file)
--- a/drivers/power/ipaq_micro_battery.c
+++ b/drivers/power/ipaq_micro_battery.c
@@ -261,7 +261,7 @@ static int micro_batt_probe(struct platform_device *pdev)
         return 0;
  
  ac_err:
-       power_supply_unregister(micro_ac_power);
+       power_supply_unregister(micro_batt_power);
  batt_err:
         cancel_delayed_work_sync(&mb->update);
         destroy_workqueue(mb->wq);
diff --git a/drivers/power/max8925_power.c b/drivers/power/max8925_power.c

index 57eb5c2bfc212b5ff643f9cda13d95eb06ece526..3b94620ce5c1948ec95f8422e0007dbb983b2a5f 100644 (file)
--- a/drivers/power/max8925_power.c
+++ b/drivers/power/max8925_power.c
@@ -540,14 +540,14 @@ static int max8925_power_probe(struct platform_device *pdev)
         info->usb = power_supply_register(&pdev->dev, &usb_desc, &psy_cfg);
         if (IS_ERR(info->usb)) {
                 ret = PTR_ERR(info->usb);
-               goto out_usb;
+               goto out_unregister_ac;
         }
         info->usb->dev.parent = &pdev->dev;
  
         info->battery = power_supply_register(&pdev->dev, &battery_desc, NULL);
         if (IS_ERR(info->battery)) {
                 ret = PTR_ERR(info->battery);
-               goto out_battery;
+               goto out_unregister_usb;
         }
         info->battery->dev.parent = &pdev->dev;
  
@@ -560,9 +560,9 @@ static int max8925_power_probe(struct platform_device *pdev)
  
         max8925_init_charger(chip, info);
         return 0;
-out_battery:
-       power_supply_unregister(info->battery);
-out_usb:
+out_unregister_usb:
+       power_supply_unregister(info->usb);
+out_unregister_ac:
         power_supply_unregister(info->ac);
  out:
         return ret;
diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig

index 0a6408a39c6614b0da113535f2995c281b658b54..9bb2622c23bfff0195cdd93dd113b5cee68e0fda 100644 (file)
--- a/drivers/power/reset/Kconfig
+++ b/drivers/power/reset/Kconfig
@@ -30,6 +30,14 @@ config POWER_RESET_AT91_RESET
           This driver supports restart for Atmel AT91SAM9 and SAMA5
           SoCs
  
+config POWER_RESET_AT91_SAMA5D2_SHDWC
+       tristate "Atmel AT91 SAMA5D2-Compatible shutdown controller driver"
+       depends on ARCH_AT91 || COMPILE_TEST
+       default SOC_SAMA5
+       help
+         This driver supports the alternate shutdown controller for some Atmel
+         SAMA5 SoCs. It is present for example on SAMA5D2 SoC.
+
  config POWER_RESET_AXXIA
         bool "LSI Axxia reset driver"
         depends on ARCH_AXXIA
diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile

index 096fa67047f6ed62ac74397cfea552cb571a0ec3..ab7aa8614d1f3c3d48b5f7ff85a4aa404321091f 100644 (file)
--- a/drivers/power/reset/Makefile
+++ b/drivers/power/reset/Makefile
@@ -1,6 +1,7 @@
  obj-$(CONFIG_POWER_RESET_AS3722) += as3722-poweroff.o
  obj-$(CONFIG_POWER_RESET_AT91_POWEROFF) += at91-poweroff.o
  obj-$(CONFIG_POWER_RESET_AT91_RESET) += at91-reset.o
+obj-$(CONFIG_POWER_RESET_AT91_SAMA5D2_SHDWC) += at91-sama5d2_shdwc.o
  obj-$(CONFIG_POWER_RESET_AXXIA) += axxia-reset.o
  obj-$(CONFIG_POWER_RESET_BRCMSTB) += brcmstb-reboot.o
  obj-$(CONFIG_POWER_RESET_GPIO) += gpio-poweroff.o
diff --git a/drivers/power/reset/at91-sama5d2_shdwc.c b/drivers/power/reset/at91-sama5d2_shdwc.c

new file mode 100644 (file)

index 0000000..8a5ac97
--- /dev/null
+++ b/drivers/power/reset/at91-sama5d2_shdwc.c
@@ -0,0 +1,282 @@
+/*
+ * Atmel SAMA5D2-Compatible Shutdown Controller (SHDWC) driver.
+ * Found on some SoCs such as the sama5d2 (obviously).
+ *
+ * Copyright (C) 2015 Atmel Corporation,
+ *                    Nicolas Ferre <nicolas.ferre@atmel.com>
+ *
+ * Evolved from driver at91-poweroff.c.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ * TODO:
+ * - addition to status of other wake-up inputs [1 - 15]
+ * - Analog Comparator wake-up alarm
+ * - Serial RX wake-up alarm
+ * - low power debouncer
+ */
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+
+#define SLOW_CLOCK_FREQ        32768
+
+#define AT91_SHDW_CR   0x00            /* Shut Down Control Register */
+#define AT91_SHDW_SHDW         BIT(0)                  /* Shut Down command */
+#define AT91_SHDW_KEY          (0xa5UL << 24)          /* KEY Password */
+
+#define AT91_SHDW_MR   0x04            /* Shut Down Mode Register */
+/*
+ * Wake-up debouncer selector.  The field starts at bit 24; keep the mask
+ * limited to the field itself so it cannot overlap other bits of the mode
+ * register (the RTC wake-up enable lives at bit 17 on sama5d2).
+ */
+#define AT91_SHDW_WKUPDBC_SHIFT        24
+#define AT91_SHDW_WKUPDBC_MASK GENMASK(26, 24)
+#define AT91_SHDW_WKUPDBC(x)   (((x) << AT91_SHDW_WKUPDBC_SHIFT) \
+                                               & AT91_SHDW_WKUPDBC_MASK)
+
+#define AT91_SHDW_SR   0x08            /* Shut Down Status Register */
+#define AT91_SHDW_WKUPIS_SHIFT 16
+#define AT91_SHDW_WKUPIS_MASK  GENMASK(31, 16)
+#define AT91_SHDW_WKUPIS(x)    ((1 << (x)) << AT91_SHDW_WKUPIS_SHIFT \
+                                               & AT91_SHDW_WKUPIS_MASK)
+
+#define AT91_SHDW_WUIR 0x0c            /* Shutdown Wake-up Inputs Register */
+#define AT91_SHDW_WKUPEN_MASK  GENMASK(15, 0)
+#define AT91_SHDW_WKUPEN(x)    ((1 << (x)) & AT91_SHDW_WKUPEN_MASK)
+#define AT91_SHDW_WKUPT_SHIFT  16
+#define AT91_SHDW_WKUPT_MASK   GENMASK(31, 16)
+#define AT91_SHDW_WKUPT(x)     ((1 << (x)) << AT91_SHDW_WKUPT_SHIFT \
+                                               & AT91_SHDW_WKUPT_MASK)
+
+/* Non-zero when the status bit of the configured WKUP pin input is set in reg */
+#define SHDW_WK_PIN(reg, cfg)  ((reg) & AT91_SHDW_WKUPIS((cfg)->wkup_pin_input))
+/* 1 when the bit at position sr_rtcwk_shift is set in the status value reg */
+#define SHDW_RTCWK(reg, cfg)   (((reg) >> ((cfg)->sr_rtcwk_shift)) & 0x1)
+/* Single-bit mask at position mr_rtcwk_shift, OR'ed into the mode register value */
+#define SHDW_RTCWKEN(cfg)      (1 << ((cfg)->mr_rtcwk_shift))
+
+/* Duration in microseconds of x cycles at SLOW_CLOCK_FREQ Hz, rounded up */
+#define DBC_PERIOD_US(x)       DIV_ROUND_UP_ULL((1000000 * (x)), \
+                                                       SLOW_CLOCK_FREQ)
+
+/*
+ * Per-SoC register layout details, selected through the OF match table.
+ */
+struct shdwc_config {
+       /* wake-up input index tested by SHDW_WK_PIN() in the status register */
+       u8 wkup_pin_input;
+       /* bit position used by SHDW_RTCWKEN() when building the mode register */
+       u8 mr_rtcwk_shift;
+       /* bit position read by SHDW_RTCWK() from the status register */
+       u8 sr_rtcwk_shift;
+};
+
+/*
+ * Driver state: the per-SoC configuration and the mapped register window.
+ */
+struct shdwc {
+       /* points at a read-only configuration object; never written through */
+       const struct shdwc_config *cfg;
+       /* base of the ioremapped shutdown controller registers */
+       void __iomem *at91_shdwc_base;
+};
+
+/*
+ * Hold configuration here, cannot be more than one instance of the driver
+ * since pm_power_off itself is global.
+ */
+static struct shdwc *at91_shdwc;
+static struct clk *sclk;
+
+/*
+ * Selectable debounce periods, expressed in slow-clock cycles (converted to
+ * microseconds with DBC_PERIOD_US()).  The array index is the value that
+ * at91_shdwc_debouncer_value() returns for the WKUPDBC field.
+ */
+static const unsigned long long sdwc_dbc_period[] = {
+       0, 3, 32, 512, 4096, 32768,
+};
+
+/*
+ * Read the shutdown status register and log which source woke the system
+ * up.  A status of zero is treated as a simple power-on and nothing is
+ * logged in that case.
+ */
+static void __init at91_wakeup_status(struct platform_device *pdev)
+{
+       struct shdwc *shdw = platform_get_drvdata(pdev);
+       u32 status = readl(shdw->at91_shdwc_base + AT91_SHDW_SR);
+       char *source;
+
+       dev_dbg(&pdev->dev, "%s: status = %#x\n", __func__, status);
+
+       /* Simple power-on: no wake-up source to report */
+       if (status == 0)
+               return;
+
+       /* The WKUP pin is checked first, then the RTC */
+       if (SHDW_WK_PIN(status, shdw->cfg))
+               source = "WKUP pin";
+       else if (SHDW_RTCWK(status, shdw->cfg))
+               source = "RTC";
+       else
+               source = "unknown";
+
+       pr_info("AT91: Wake-Up source: %s\n", source);
+}
+
+/*
+ * Power-off hook: write the shut down command, together with the key, to
+ * the control register of the single driver instance.
+ */
+static void at91_poweroff(void)
+{
+       void __iomem *cr = at91_shdwc->at91_shdwc_base + AT91_SHDW_CR;
+
+       writel(AT91_SHDW_KEY | AT91_SHDW_SHDW, cr);
+}
+
+/*
+ * Translate a debounce period given in microseconds into an index of
+ * sdwc_dbc_period[].
+ *
+ * A period larger than the last table entry is clamped to the last index
+ * (with a warning).  Otherwise the table is scanned downwards starting at
+ * the second-to-last entry; the scan stops at the first entry that is
+ * strictly shorter than the request and the index right above it is
+ * returned.  The smallest value returned is 1.
+ */
+static u32 at91_shdwc_debouncer_value(struct platform_device *pdev,
+                                     u32 in_period_us)
+{
+       int last = ARRAY_SIZE(sdwc_dbc_period) - 1;
+       unsigned long long longest_us = DBC_PERIOD_US(sdwc_dbc_period[last]);
+       unsigned long long ref_us;
+       int idx = last;
+
+       if (in_period_us > longest_us) {
+               dev_warn(&pdev->dev,
+                        "debouncer period %u too big, reduced to %llu us\n",
+                        in_period_us, longest_us);
+               return last;
+       }
+
+       while (--idx > 0) {
+               ref_us = DBC_PERIOD_US(sdwc_dbc_period[idx]);
+               dev_dbg(&pdev->dev, "%s: ref[%d] = %llu\n",
+                                               __func__, idx, ref_us);
+               if (in_period_us > ref_us)
+                       break;
+       }
+
+       return idx + 1;
+}
+
+/*
+ * Build the Wake-up Inputs Register value from the child nodes of @np.
+ *
+ * Each child selects one wake-up input through its "reg" property.  The
+ * matching bit in the low half of the result enables the input; the
+ * matching bit in the high half is set when the child carries the
+ * "atmel,wakeup-active-high" property.  Children without "reg", or whose
+ * input number does not fit the enable mask, are skipped with a warning.
+ *
+ * Returns the assembled register value (0 if no valid child was found).
+ */
+static u32 at91_shdwc_get_wakeup_input(struct platform_device *pdev,
+                                      struct device_node *np)
+{
+       struct device_node *cnp;
+       u32 wuir = 0;
+       u32 wk_input;
+
+       for_each_child_of_node(np, cnp) {
+               if (of_property_read_u32(cnp, "reg", &wk_input)) {
+                       dev_warn(&pdev->dev, "reg property is missing for %s\n",
+                                cnp->full_name);
+                       continue;
+               }
+
+               /*
+                * Range-check before shifting: "reg" is an arbitrary 32-bit
+                * value coming from the device tree and a shift count of 32
+                * or more is undefined.
+                */
+               if (wk_input >= 32 ||
+                   !(BIT(wk_input) & AT91_SHDW_WKUPEN_MASK)) {
+                       dev_warn(&pdev->dev,
+                                "wake-up input %u out of bounds ignore\n",
+                                wk_input);
+                       continue;
+               }
+               wuir |= BIT(wk_input);
+
+               if (of_property_read_bool(cnp, "atmel,wakeup-active-high"))
+                       wuir |= AT91_SHDW_WKUPT(wk_input);
+
+               dev_dbg(&pdev->dev, "%s: (child %u) wuir = %#x\n",
+                                               __func__, wk_input, wuir);
+       }
+
+       return wuir;
+}
+
+/*
+ * Program the mode register and the wake-up inputs register from the device
+ * tree node of @pdev: the optional "debounce-delay-us" and
+ * "atmel,wakeup-rtc-timer" properties feed the mode register, the child
+ * nodes feed the wake-up inputs register.  Does nothing but log an error
+ * when the device has no node.
+ */
+static void at91_shdwc_dt_configure(struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct shdwc *shdw = platform_get_drvdata(pdev);
+       u32 debounce_us;
+       u32 mr = 0;
+       u32 wuir;
+
+       if (!np) {
+               dev_err(&pdev->dev, "device node not found\n");
+               return;
+       }
+
+       /* of_property_read_u32() returns 0 when the property could be read */
+       if (of_property_read_u32(np, "debounce-delay-us", &debounce_us) == 0) {
+               u32 dbc = at91_shdwc_debouncer_value(pdev, debounce_us);
+
+               mr |= AT91_SHDW_WKUPDBC(dbc);
+       }
+
+       if (of_property_read_bool(np, "atmel,wakeup-rtc-timer"))
+               mr |= SHDW_RTCWKEN(shdw->cfg);
+
+       dev_dbg(&pdev->dev, "%s: mode = %#x\n", __func__, mr);
+       writel(mr, shdw->at91_shdwc_base + AT91_SHDW_MR);
+
+       wuir = at91_shdwc_get_wakeup_input(pdev, np);
+       writel(wuir, shdw->at91_shdwc_base + AT91_SHDW_WUIR);
+}
+
+/*
+ * Configuration attached to the "atmel,sama5d2-shdwc" compatible: WKUP pin
+ * on input 0, RTC wake-up enable at bit 17 of the mode register value and
+ * RTC wake-up flag at bit 5 of the status register value.
+ */
+static const struct shdwc_config sama5d2_shdwc_config = {
+       .wkup_pin_input = 0,
+       .mr_rtcwk_shift = 17,
+       .sr_rtcwk_shift = 5,
+};
+
+/*
+ * Device tree match table; .data carries the struct shdwc_config that the
+ * probe routine stores in the driver state.
+ */
+static const struct of_device_id at91_shdwc_of_match[] = {
+       {
+               .compatible = "atmel,sama5d2-shdwc",
+               .data = &sama5d2_shdwc_config,
+       }, {
+               /*sentinel*/
+       }
+};
+MODULE_DEVICE_TABLE(of, at91_shdwc_of_match);
+
+/*
+ * Probe the shutdown controller: map its registers, pick the per-SoC
+ * configuration from the OF match table, enable the slow clock, report the
+ * last wake-up source, apply the device tree settings and finally install
+ * at91_poweroff() as pm_power_off.
+ *
+ * Returns 0 on success or a negative errno: -ENODEV when the device has no
+ * device tree node or does not match the table, -ENOMEM on allocation
+ * failure, or the error reported by the ioremap / clock helpers.
+ */
+static int __init at91_shdwc_probe(struct platform_device *pdev)
+{
+       struct resource *res;
+       const struct of_device_id *match;
+       int ret;
+
+       if (!pdev->dev.of_node)
+               return -ENODEV;
+
+       at91_shdwc = devm_kzalloc(&pdev->dev, sizeof(*at91_shdwc), GFP_KERNEL);
+       if (!at91_shdwc)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, at91_shdwc);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       at91_shdwc->at91_shdwc_base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(at91_shdwc->at91_shdwc_base)) {
+               dev_err(&pdev->dev, "Could not map reset controller address\n");
+               return PTR_ERR(at91_shdwc->at91_shdwc_base);
+       }
+
+       /*
+        * of_match_node() returns NULL when the node matches no entry; bail
+        * out instead of dereferencing it.
+        */
+       match = of_match_node(at91_shdwc_of_match, pdev->dev.of_node);
+       if (!match)
+               return -ENODEV;
+       at91_shdwc->cfg = (struct shdwc_config *)(match->data);
+
+       sclk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(sclk))
+               return PTR_ERR(sclk);
+
+       ret = clk_prepare_enable(sclk);
+       if (ret) {
+               dev_err(&pdev->dev, "Could not enable slow clock\n");
+               return ret;
+       }
+
+       at91_wakeup_status(pdev);
+
+       at91_shdwc_dt_configure(pdev);
+
+       /* Install the hook last, once the controller is fully configured */
+       pm_power_off = at91_poweroff;
+
+       return 0;
+}
+
+/*
+ * Undo the probe: release the power-off hook if it is still ours, clear the
+ * mode and wake-up inputs registers and stop the clock.  Always returns 0.
+ */
+static int __exit at91_shdwc_remove(struct platform_device *pdev)
+{
+       struct shdwc *shdw = platform_get_drvdata(pdev);
+       void __iomem *base = shdw->at91_shdwc_base;
+
+       /* Leave pm_power_off alone if something else took it over */
+       if (pm_power_off == at91_poweroff)
+               pm_power_off = NULL;
+
+       /* Reset values to disable wake-up features */
+       writel(0, base + AT91_SHDW_MR);
+       writel(0, base + AT91_SHDW_WUIR);
+
+       clk_disable_unprepare(sclk);
+
+       return 0;
+}
+
+/*
+ * Platform driver glue.  There is no .probe member here: the __init probe
+ * routine is handed to module_platform_driver_probe() below instead.
+ */
+static struct platform_driver at91_shdwc_driver = {
+       .remove = __exit_p(at91_shdwc_remove),
+       .driver = {
+               .name = "at91-shdwc",
+               .of_match_table = at91_shdwc_of_match,
+       },
+};
+module_platform_driver_probe(at91_shdwc_driver, at91_shdwc_probe);
+
+MODULE_AUTHOR("Nicolas Ferre <nicolas.ferre@atmel.com>");
+MODULE_DESCRIPTION("Atmel shutdown controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/power/sbs-battery.c b/drivers/power/sbs-battery.c

index d6226d68b5746d67cfb8561afa0906684280b76d..768b9fcb58eacdc533e8094f8f68327bfae518e2 100644 (file)
--- a/drivers/power/sbs-battery.c
+++ b/drivers/power/sbs-battery.c
@@ -382,8 +382,6 @@ static int sbs_get_battery_property(struct i2c_client *client,
  
                 if (ret & BATTERY_FULL_CHARGED)
                         val->intval = POWER_SUPPLY_STATUS_FULL;
-               else if (ret & BATTERY_FULL_DISCHARGED)
-                       val->intval = POWER_SUPPLY_STATUS_NOT_CHARGING;
                 else if (ret & BATTERY_DISCHARGING)
                         val->intval = POWER_SUPPLY_STATUS_DISCHARGING;
                 else
@@ -702,8 +700,6 @@ static void sbs_delayed_work(struct work_struct *work)
  
         if (ret & BATTERY_FULL_CHARGED)
                 ret = POWER_SUPPLY_STATUS_FULL;
-       else if (ret & BATTERY_FULL_DISCHARGED)
-               ret = POWER_SUPPLY_STATUS_NOT_CHARGING;
         else if (ret & BATTERY_DISCHARGING)
                 ret = POWER_SUPPLY_STATUS_DISCHARGING;
         else
diff --git a/drivers/scsi/bfa/bfi.h b/drivers/scsi/bfa/bfi.h

index 97600dcec6491865f23fcc8df3e0365a6d692e92..5f698d038b21f2c7d54c26da10949bfcbce0655a 100644 (file)
--- a/drivers/scsi/bfa/bfi.h
+++ b/drivers/scsi/bfa/bfi.h
@@ -356,7 +356,7 @@ struct bfi_ioc_image_hdr_s {
         u8      port0_mode;     /* device mode for port 0       */
         u8      port1_mode;     /* device mode for port 1       */
         u32     exec;           /* exec vector                  */
-       u32     bootenv;        /* fimware boot env             */
+       u32     bootenv;        /* firmware boot env            */
         u32     rsvd_b[2];
         struct bfi_ioc_fwver_s  fwver;
         u32     md5sum[BFI_IOC_MD5SUM_SZ];
diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c

index 57ab6680e3aed4c1375820b652ca0938470adcc5..a536a15c1d30763dd4040ce8d1b41d1158f5da68 100644 (file)
--- a/drivers/staging/comedi/drivers/daqboard2000.c
+++ b/drivers/staging/comedi/drivers/daqboard2000.c
@@ -26,7 +26,7 @@
   * Much of the functionality of this driver was determined from reading
   * the source code for the Windows driver.
   *
- * The FPGA on the board requires fimware, which is available from
+ * The FPGA on the board requires firmware, which is available from
   * http://www.comedi.org in the comedi_nonfree_firmware tarball.
   *
   * Configuration options: not applicable, uses PCI auto config
diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c

index 2cb8ca77f876e7eeec1c553445f25587e080224c..6e7050ab9e16049039406b8cbb207cdfced18109 100644 (file)
--- a/drivers/staging/rdma/hfi1/affinity.c
+++ b/drivers/staging/rdma/hfi1/affinity.c
@@ -53,20 +53,6 @@
  #include "sdma.h"
  #include "trace.h"
  
-struct cpu_mask_set {
-       struct cpumask mask;
-       struct cpumask used;
-       uint gen;
-};
-
-struct hfi1_affinity {
-       struct cpu_mask_set def_intr;
-       struct cpu_mask_set rcv_intr;
-       struct cpu_mask_set proc;
-       /* spin lock to protect affinity struct */
-       spinlock_t lock;
-};
-
  /* Name of IRQ types, indexed by enum irq_type */
  static const char * const irq_type_names[] = {
         "SDMA",
@@ -82,6 +68,48 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
         set->gen = 0;
  }
  
+/*
+ * Initialize non-HT cpu cores mask.
+ *
+ * Allocates the hfi1_affinity structure, seeds real_cpu_mask from
+ * cpu_online_mask and then clears every CPU that follows the first
+ * (online count / sibling count of the first online CPU) entries of that
+ * mask.  On success the structure is stored in dd->affinity.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails.
+ */
+int init_real_cpu_mask(struct hfi1_devdata *dd)
+{
+       struct hfi1_affinity *info;
+       int possible, curr_cpu, i, ht;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       cpumask_clear(&info->real_cpu_mask);
+
+       /* Start with cpu online mask as the real cpu mask */
+       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+
+       /*
+        * Remove HT cores from the real cpu mask.  Do this in two steps below.
+        */
+       /* possible: CPUs currently in the mask; ht: siblings of the first one */
+       possible = cpumask_weight(&info->real_cpu_mask);
+       ht = cpumask_weight(topology_sibling_cpumask(
+                                       cpumask_first(&info->real_cpu_mask)));
+       /*
+        * Step 1.  Skip over the first N HT siblings and use them as the
+        * "real" cores.  Assumes that HT cores are not enumerated in
+        * succession (except in the single core case).
+        */
+       curr_cpu = cpumask_first(&info->real_cpu_mask);
+       for (i = 0; i < possible / ht; i++)
+               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+       /*
+        * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
+        * skip any gaps.
+        */
+       /* i deliberately continues from where step 1 stopped */
+       for (; i < possible; i++) {
+               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+       }
+
+       dd->affinity = info;
+       return 0;
+}
+
  /*
   * Interrupt affinity.
   *
@@ -93,20 +121,17 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
   * to the node relative 1 as necessary.
   *
   */
-int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
  {
         int node = pcibus_to_node(dd->pcidev->bus);
-       struct hfi1_affinity *info;
+       struct hfi1_affinity *info = dd->affinity;
         const struct cpumask *local_mask;
-       int curr_cpu, possible, i, ht;
+       int curr_cpu, possible, i;
  
         if (node < 0)
                 node = numa_node_id();
         dd->node = node;
  
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
         spin_lock_init(&info->lock);
  
         init_cpu_mask_set(&info->def_intr);
@@ -116,30 +141,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
         local_mask = cpumask_of_node(dd->node);
         if (cpumask_first(local_mask) >= nr_cpu_ids)
                 local_mask = topology_core_cpumask(0);
-       /* use local mask as default */
-       cpumask_copy(&info->def_intr.mask, local_mask);
-       /*
-        * Remove HT cores from the default mask.  Do this in two steps below.
-        */
-       possible = cpumask_weight(&info->def_intr.mask);
-       ht = cpumask_weight(topology_sibling_cpumask(
-                                       cpumask_first(&info->def_intr.mask)));
-       /*
-        * Step 1.  Skip over the first N HT siblings and use them as the
-        * "real" cores.  Assumes that HT cores are not enumerated in
-        * succession (except in the single core case).
-        */
-       curr_cpu = cpumask_first(&info->def_intr.mask);
-       for (i = 0; i < possible / ht; i++)
-               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-       /*
-        * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
-        * skip any gaps.
-        */
-       for (; i < possible; i++) {
-               cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-       }
+       /* Use the "real" cpu mask of this node as the default */
+       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
  
         /*  fill in the receive list */
         possible = cpumask_weight(&info->def_intr.mask);
@@ -167,8 +170,6 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
         }
  
         cpumask_copy(&info->proc.mask, cpu_online_mask);
-       dd->affinity = info;
-       return 0;
  }
  
  void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h

index b287e4963024d4760a3db6fac88860d54f3482de..20f52fe7409161cb772b90aae96f8e4c7a012e05 100644 (file)
--- a/drivers/staging/rdma/hfi1/affinity.h
+++ b/drivers/staging/rdma/hfi1/affinity.h
@@ -64,10 +64,27 @@ enum affinity_flags {
         AFF_IRQ_LOCAL
  };
  
+struct cpu_mask_set {
+       struct cpumask mask;
+       struct cpumask used;
+       uint gen;
+};
+
+struct hfi1_affinity {
+       struct cpu_mask_set def_intr;
+       struct cpu_mask_set rcv_intr;
+       struct cpu_mask_set proc;
+       struct cpumask real_cpu_mask;
+       /* spin lock to protect affinity struct */
+       spinlock_t lock;
+};
+
  struct hfi1_msix_entry;
  
+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *);
  /* Initialize driver affinity data */
-int hfi1_dev_affinity_init(struct hfi1_devdata *);
+void hfi1_dev_affinity_init(struct hfi1_devdata *);
  /* Free driver affinity data */
  void hfi1_dev_affinity_free(struct hfi1_devdata *);
  /*
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c

index 16eb653903e0b873909f3cf3175cf597fae88ea8..dcae8e723f98921dac1a7f7f672d06f7bb0f87e5 100644 (file)
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -123,6 +123,8 @@ struct flag_table {
  
  #define MIN_KERNEL_KCTXTS         2
  #define FIRST_KERNEL_KCTXT        1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES                256
  #define NUM_MAP_REGS             32
  
  /* Bit offset into the GUID which carries HFI id information */
@@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd);
  static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
                                   int msecs);
  static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
  static void handle_temp_err(struct hfi1_devdata *);
  static void dc_shutdown(struct hfi1_devdata *);
  static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np);
  
  /*
   * Error interrupt table entry.  This is used as input to the interrupt
@@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
         sci = &dd->send_contexts[sw_index];
  
         /* there is no information for user (PSM) and ack contexts */
-       if (sci->type != SC_KERNEL)
+       if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
                 return -1;
  
         sc = sci->sc;
@@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
  
  /*
   * Handle host requests from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
   */
-void handle_8051_request(struct work_struct *work)
+static void handle_8051_request(struct hfi1_pportdata *ppd)
  {
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                       dc_host_req_work);
         struct hfi1_devdata *dd = ppd->dd;
         u64 reg;
         u16 data = 0;
-       u8 type, i, lanes, *cache = ppd->qsfp_info.cache;
-       u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+       u8 type;
  
         reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
         if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
@@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work)
         case HREQ_READ_CONFIG:
         case HREQ_SET_TX_EQ_ABS:
         case HREQ_SET_TX_EQ_REL:
+       case HREQ_ENABLE:
                 dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
                             type);
                 hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
                 break;
-
-       case HREQ_ENABLE:
-               lanes = data & 0xF;
-               for (i = 0; lanes; lanes >>= 1, i++) {
-                       if (!(lanes & 1))
-                               continue;
-                       if (data & 0x200) {
-                               /* enable TX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x80)
-                                       cdr_ctrl_byte |= (1 << (i + 4));
-                       } else {
-                               /* disable TX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x80)
-                                       cdr_ctrl_byte &= ~(1 << (i + 4));
-                       }
-
-                       if (data & 0x800) {
-                               /* enable RX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x40)
-                                       cdr_ctrl_byte |= (1 << i);
-                       } else {
-                               /* disable RX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x40)
-                                       cdr_ctrl_byte &= ~(1 << i);
-                       }
-               }
-               one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
-                              &cdr_ctrl_byte, 1);
-               hreq_response(dd, HREQ_SUCCESS, data);
-               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-               break;
-
         case HREQ_CONFIG_DONE:
                 hreq_response(dd, HREQ_SUCCESS, 0);
                 break;
@@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work)
         case HREQ_INTERFACE_TEST:
                 hreq_response(dd, HREQ_SUCCESS, data);
                 break;
-
         default:
                 dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
                 hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
@@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
         ppd->neighbor_fm_security = 0;
  }
  
+static const char * const link_down_reason_strs[] = {
+       [OPA_LINKDOWN_REASON_NONE] = "None",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0",
+       [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+       [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+       [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+       [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+       [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+       [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+       [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+       [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+       [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+       [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+       [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+       [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+       [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+       [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+       [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+       [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+       [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+       [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+       [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+       [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+                                       "Excessive buffer overrun",
+       [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+       [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+       [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+       [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+       [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+       [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+       [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+       [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+                                       "Local media not installed",
+       [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+       [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+       [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+                                       "End to end not installed",
+       [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+       [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+       [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+       [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+       [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+       [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+       const char *str = NULL;
+
+       if (reason < ARRAY_SIZE(link_down_reason_strs))
+               str = link_down_reason_strs[reason];
+       if (!str)
+               str = "(invalid)";
+
+       return str;
+}
+
  /*
   * Handle a link down interrupt from the 8051.
   *
@@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
  void handle_link_down(struct work_struct *work)
  {
         u8 lcl_reason, neigh_reason = 0;
+       u8 link_down_reason;
         struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               link_down_work);
+                                                 link_down_work);
+       int was_up;
+       static const char ldr_str[] = "Link down reason: ";
  
         if ((ppd->host_link_state &
              (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
@@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work)
                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
  
         /* Go offline first, then deal with reading/writing through 8051 */
+       was_up = !!(ppd->host_link_state & HLS_UP);
         set_link_state(ppd, HLS_DN_OFFLINE);
  
-       lcl_reason = 0;
-       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+       if (was_up) {
+               lcl_reason = 0;
+               /* link down reason is only valid if the link was up */
+               read_link_down_reason(ppd->dd, &link_down_reason);
+               switch (link_down_reason) {
+               case LDR_LINK_TRANSFER_ACTIVE_LOW:
+                       /* the link went down, no idle message reason */
+                       dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+                                   ldr_str);
+                       break;
+               case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+                       /*
+                        * The neighbor reason is only valid if an idle message
+                        * was received for it.
+                        */
+                       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+                       dd_dev_info(ppd->dd,
+                                   "%sNeighbor link down message %d, %s\n",
+                                   ldr_str, neigh_reason,
+                                   link_down_reason_str(neigh_reason));
+                       break;
+               case LDR_RECEIVED_HOST_OFFLINE_REQ:
+                       dd_dev_info(ppd->dd,
+                                   "%sHost requested link to go offline\n",
+                                   ldr_str);
+                       break;
+               default:
+                       dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+                                   ldr_str, link_down_reason);
+                       break;
+               }
  
-       /*
-        * If no reason, assume peer-initiated but missed
-        * LinkGoingDown idle flits.
-        */
-       if (neigh_reason == 0)
-               lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+               /*
+                * If no reason, assume peer-initiated but missed
+                * LinkGoingDown idle flits.
+                */
+               if (neigh_reason == 0)
+                       lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+       } else {
+               /* went down while polling or going up */
+               lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+       }
  
         set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
  
+       /* inform the SMA when the link transitions from up to down */
+       if (was_up && ppd->local_link_down_reason.sma == 0 &&
+           ppd->neigh_link_down_reason.sma == 0) {
+               ppd->local_link_down_reason.sma =
+                                       ppd->local_link_down_reason.latest;
+               ppd->neigh_link_down_reason.sma =
+                                       ppd->neigh_link_down_reason.latest;
+       }
+
         reset_neighbor_info(ppd);
  
         /* disable the port */
@@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work)
          * If there is no cable attached, turn the DC off. Otherwise,
          * start the link bring up.
          */
-       if (!qsfp_mod_present(ppd)) {
+       if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
                 dc_shutdown(ppd->dd);
         } else {
                 tune_serdes(ppd);
@@ -7373,7 +7452,11 @@ retry:
                 ppd->link_width_downgrade_rx_active = rx;
         }
  
-       if (lwde == 0) {
+       if (ppd->link_width_downgrade_tx_active == 0 ||
+           ppd->link_width_downgrade_rx_active == 0) {
+               /* the 8051 reported a dead link as a downgrade */
+               dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+       } else if (lwde == 0) {
                 /* downgrade is disabled */
  
                 /* bounce if not at starting active width */
@@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
                         host_msg &= ~(u64)LINKUP_ACHIEVED;
                 }
                 if (host_msg & EXT_DEVICE_CFG_REQ) {
-                       queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
+                       handle_8051_request(ppd);
                         host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
                 }
                 if (host_msg & VERIFY_CAP_FRAME) {
@@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
         *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
  }
  
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+       u32 frame;
+
+       read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+       *ldr = (frame & 0xff);
+}
+
  static int read_tx_settings(struct hfi1_devdata *dd,
                             u8 *enable_lane_tx,
                             u8 *tx_polarity_inversion,
@@ -9049,9 +9140,9 @@ set_local_link_attributes_fail:
  }
  
  /*
- * Call this to start the link.  Schedule a retry if the cable is not
- * present or if unable to start polling.  Do not do anything if the
- * link is disabled.  Returns 0 if link is disabled or moved to polling
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
   */
  int start_link(struct hfi1_pportdata *ppd)
  {
@@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd)
                 return 0;
         }
  
-       if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
-           loopback == LOOPBACK_LCB ||
-           ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return set_link_state(ppd, HLS_DN_POLL);
-
-       dd_dev_info(ppd->dd,
-                   "%s: stopping link start because no cable is present\n",
-                   __func__);
-       return -EAGAIN;
+       return set_link_state(ppd, HLS_DN_POLL);
  }
  
  static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
@@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
         return 0;
  }
  
-/* This routine will only be scheduled if the QSFP module is present */
+/* This routine will only be scheduled if the QSFP module present is asserted */
  void qsfp_event(struct work_struct *work)
  {
         struct qsfp_data *qd;
@@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd)
                               & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
                 SEND_LEN_CHECK1_LEN_VL15_SHIFT;
         int i;
+       u32 thres;
  
         for (i = 0; i < ppd->vls_supported; i++) {
                 if (dd->vld[i].mtu > maxvlmtu)
@@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd)
         /* adjust kernel credit return thresholds based on new MTUs */
         /* all kernel receive contexts have the same hdrqentsize */
         for (i = 0; i < ppd->vls_supported; i++) {
-               sc_set_cr_threshold(dd->vld[i].sc,
-                                   sc_mtu_to_threshold(dd->vld[i].sc,
-                                                       dd->vld[i].mtu,
-                                                       dd->rcd[0]->
-                                                       rcvhdrqentsize));
-       }
-       sc_set_cr_threshold(dd->vld[15].sc,
-                           sc_mtu_to_threshold(dd->vld[15].sc,
-                                               dd->vld[15].mtu,
+               thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+                           sc_mtu_to_threshold(dd->vld[i].sc,
+                                               dd->vld[i].mtu,
                                                 dd->rcd[0]->rcvhdrqentsize));
+               sc_set_cr_threshold(dd->vld[i].sc, thres);
+       }
+       thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+                   sc_mtu_to_threshold(dd->vld[15].sc,
+                                       dd->vld[15].mtu,
+                                       dd->rcd[0]->rcvhdrqentsize));
+       sc_set_cr_threshold(dd->vld[15].sc, thres);
  
         /* Adjust maximum MTU for the port in DC */
         dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
@@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
         struct hfi1_devdata *dd = ppd->dd;
         struct ib_event event = {.device = NULL};
         int ret1, ret = 0;
-       int was_up, is_down;
         int orig_new_state, poll_bounce;
  
         mutex_lock(&ppd->hls_lock);
@@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
                     poll_bounce ? "(bounce) " : "",
                     link_state_reason_name(ppd, state));
  
-       was_up = !!(ppd->host_link_state & HLS_UP);
-
         /*
          * If we're going to a (HLS_*) link state that implies the logical
          * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
@@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
                 break;
         }
  
-       is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
-                       HLS_DN_DISABLE | HLS_DN_OFFLINE));
-
-       if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
-           ppd->neigh_link_down_reason.sma == 0) {
-               ppd->local_link_down_reason.sma =
-                 ppd->local_link_down_reason.latest;
-               ppd->neigh_link_down_reason.sma =
-                 ppd->neigh_link_down_reason.latest;
-       }
-
         goto done;
  
  unexpected:
@@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
         int total_contexts;
         int ret;
         unsigned ngroups;
+       int qos_rmt_count;
+       int user_rmt_reduced;
  
         /*
-        * Kernel contexts: (to be fixed later):
-        * - min or 2 or 1 context/numa
+        * Kernel receive contexts:
+        * - min of 2 or 1 context/numa (excluding control context)
          * - Context 0 - control context (VL15/multicast/error)
-        * - Context 1 - default context
+        * - Context 1 - first kernel context
+        * - Context 2 - second kernel context
+        * ...
          */
         if (n_krcvqs)
                 /*
-                * Don't count context 0 in n_krcvqs since
-                * is isn't used for normal verbs traffic.
-                *
-                * krcvqs will reflect number of kernel
-                * receive contexts above 0.
+                * n_krcvqs is the sum of module parameter kernel receive
+                * contexts, krcvqs[].  It does not include the control
+                * context, so add that.
                  */
-               num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1;
+               num_kernel_contexts = n_krcvqs + 1;
         else
                 num_kernel_contexts = num_online_nodes() + 1;
         num_kernel_contexts =
@@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                 num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
         }
         /*
-        * User contexts: (to be fixed later)
-        *      - default to 1 user context per CPU if num_user_contexts is
-        *        negative
+        * User contexts:
+        *      - default to 1 user context per real (non-HT) CPU core if
+        *        num_user_contexts is negative
          */
         if (num_user_contexts < 0)
-               num_user_contexts = num_online_cpus();
+               num_user_contexts =
+                       cpumask_weight(&dd->affinity->real_cpu_mask);
  
         total_contexts = num_kernel_contexts + num_user_contexts;
  
@@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                 total_contexts = num_kernel_contexts + num_user_contexts;
         }
  
+       /* each user context requires an entry in the RMT */
+       qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+       if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+               user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+               dd_dev_err(dd,
+                          "RMT size is reducing the number of user receive contexts from %d to %d\n",
+                          (int)num_user_contexts,
+                          user_rmt_reduced);
+               /* recalculate */
+               num_user_contexts = user_rmt_reduced;
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
         /* the first N are kernel contexts, the rest are user contexts */
         dd->num_rcv_contexts = total_contexts;
         dd->n_krcv_queues = num_kernel_contexts;
@@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                 dd->num_send_contexts = ret;
                 dd_dev_info(
                         dd,
-                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
                         dd->chip_send_contexts,
                         dd->num_send_contexts,
                         dd->sc_sizes[SC_KERNEL].count,
                         dd->sc_sizes[SC_ACK].count,
-                       dd->sc_sizes[SC_USER].count);
+                       dd->sc_sizes[SC_USER].count,
+                       dd->sc_sizes[SC_VL15].count);
                 ret = 0;        /* success */
         }
  
@@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd,
         int i;
         u64 ctxt = first_ctxt;
  
-       for (i = 0; i < 256;) {
+       for (i = 0; i < 256; i++) {
                 reg |= ctxt << (8 * (i % 8));
-               i++;
                 ctxt++;
                 if (ctxt > last_ctxt)
                         ctxt = first_ctxt;
-               if (i % 8 == 0) {
+               if (i % 8 == 7) {
                         write_csr(dd, regno, reg);
                         reg = 0;
                         regno += 8;
                 }
         }
-       if (i % 8)
-               write_csr(dd, regno, reg);
  
         add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
                         | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
  }
  
-/**
- * init_qos - init RX qos
- * @dd - device data
- * @first_context
- *
- * This routine initializes Rule 0 and the
- * RSM map table to implement qos.
- *
- * If all of the limit tests succeed,
- * qos is applied based on the array
- * interpretation of krcvqs where
- * entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn
- * bits (m) are computed to feed both the RSM map table
- * and the single rule.
- *
+struct rsm_map_table {
+       u64 map[NUM_MAP_REGS];
+       unsigned int used;
+};
+
+struct rsm_rule_data {
+       u8 offset;
+       u8 pkt_type;
+       u32 field1_off;
+       u32 field2_off;
+       u32 index1_off;
+       u32 index1_width;
+       u32 index2_off;
+       u32 index2_width;
+       u32 mask1;
+       u32 value1;
+       u32 mask2;
+       u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in.  OK if it
+ * returns NULL, indicating no table.
   */
-static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
  {
+       struct rsm_map_table *rmt;
+       u8 rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+       rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+       if (rmt) {
+               memset(rmt->map, rxcontext, sizeof(rmt->map));
+               rmt->used = 0;
+       }
+
+       return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and enable RSM.  The caller
+ * frees the table.  OK if table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+                                  struct rsm_map_table *rmt)
+{
+       int i;
+
+       if (rmt) {
+               /* write table to chip */
+               for (i = 0; i < NUM_MAP_REGS; i++)
+                       write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+               /* enable RSM */
+               add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+       }
+}
+
+/*
+ * Add a receive side mapping rule.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+                        struct rsm_rule_data *rrd)
+{
+       write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+                 (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+                 1ull << rule_index | /* enable bit */
+                 (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+       write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+                 (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+                 (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+                 (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+                 (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+                 (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+                 (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+       write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+                 (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+                 (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+                 (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+                 (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np)
+{
+       int i;
+       unsigned int m, n;
         u8 max_by_vl = 0;
-       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
-       u64 *rsmmap;
-       u64 reg;
-       u8  rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
  
-       /* validate */
+       /* is QOS active at all? */
         if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
             num_vls == 1 ||
             krcvqsset <= 1)
-               goto bail;
-       for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+               goto no_qos;
+
+       /* determine bits for qpn */
+       for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
                 if (krcvqs[i] > max_by_vl)
                         max_by_vl = krcvqs[i];
         if (max_by_vl > 32)
-               goto bail;
-       qpns_per_vl = __roundup_pow_of_two(max_by_vl);
-       /* determine bits vl */
-       n = ilog2(num_vls);
-       /* determine bits for qpn */
-       m = ilog2(qpns_per_vl);
+               goto no_qos;
+       m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+       /* determine bits for vl */
+       n = ilog2(__roundup_pow_of_two(num_vls));
+
+       /* reject if too much is used */
         if ((m + n) > 7)
+               goto no_qos;
+
+       if (mp)
+               *mp = m;
+       if (np)
+               *np = n;
+
+       return 1 << (m + n);
+
+no_qos:
+       if (mp)
+               *mp = 0;
+       if (np)
+               *np = 0;
+       return 0;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+       unsigned int rmt_entries;
+       u64 reg;
+
+       if (!rmt)
                 goto bail;
-       if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+       rmt_entries = qos_rmt_entries(dd, &m, &n);
+       if (rmt_entries == 0)
                 goto bail;
-       rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
-       if (!rsmmap)
+       qpns_per_vl = 1 << m;
+
+       /* enough room in the map table? */
+       rmt_entries = 1 << (m + n);
+       if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
                 goto bail;
-       memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
-       /* init the local copy of the table */
-       for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+
+       /* add qos entries to the RSM map table */
+       for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
                 unsigned tctxt;
  
                 for (qpn = 0, tctxt = ctxt;
                      krcvqs[i] && qpn < qpns_per_vl; qpn++) {
                         unsigned idx, regoff, regidx;
  
-                       /* generate index <= 128 */
-                       idx = (qpn << n) ^ i;
+                       /* generate the index the hardware will produce */
+                       idx = rmt->used + ((qpn << n) ^ i);
                         regoff = (idx % 8) * 8;
                         regidx = idx / 8;
-                       reg = rsmmap[regidx];
-                       /* replace 0xff with context number */
+                       /* replace default with context number */
+                       reg = rmt->map[regidx];
                         reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
                                 << regoff);
                         reg |= (u64)(tctxt++) << regoff;
-                       rsmmap[regidx] = reg;
+                       rmt->map[regidx] = reg;
                         if (tctxt == ctxt + krcvqs[i])
                                 tctxt = ctxt;
                 }
                 ctxt += krcvqs[i];
         }
-       /* flush cached copies to chip */
-       for (i = 0; i < NUM_MAP_REGS; i++)
-               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
-       /* add rule0 */
-       write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
-                 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
-                 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
-                 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
-       write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
-                 LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
-                 LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
-                 LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
-                 ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
-                 QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
-                 ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
-       write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
-                 LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
-                 LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
-                 LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
-                 LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
-       /* Enable RSM */
-       add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
-       kfree(rsmmap);
-       /* map everything else to first context */
-       init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1);
+
+       rrd.offset = rmt->used;
+       rrd.pkt_type = 2;
+       rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+       rrd.field2_off = LRH_SC_MATCH_OFFSET;
+       rrd.index1_off = LRH_SC_SELECT_OFFSET;
+       rrd.index1_width = n;
+       rrd.index2_off = QPN_SELECT_OFFSET;
+       rrd.index2_width = m + n;
+       rrd.mask1 = LRH_BTH_MASK;
+       rrd.value1 = LRH_BTH_VALUE;
+       rrd.mask2 = LRH_SC_MASK;
+       rrd.value2 = LRH_SC_VALUE;
+
+       /* add rule 0 */
+       add_rsm_rule(dd, 0, &rrd);
+
+       /* mark RSM map entries as used */
+       rmt->used += rmt_entries;
+       /* map everything else to the mcast/err/vl15 context */
+       init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
         dd->qos_shift = n + 1;
         return;
  bail:
@@ -13574,13 +13764,86 @@ bail:
         init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
  }
  
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+                                   struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       u64 reg;
+       int i, idx, regoff, regidx;
+       u8 offset;
+
+       /* there needs to be enough room in the map table */
+       if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+               return;
+       }
+
+       /*
+        * RSM will extract the destination context as an index into the
+        * map table.  The destination contexts are a sequential block
+        * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+        * Map entries are accessed as offset + extracted value.  Adjust
+        * the added offset so this sequence can be placed anywhere in
+        * the table - as long as the entries themselves do not wrap.
+        * There are only enough bits in offset for the table size, so
+        * start with that to allow for a "negative" offset.
+        */
+       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+                                               (int)dd->first_user_ctxt);
+
+       for (i = dd->first_user_ctxt, idx = rmt->used;
+                               i < dd->num_rcv_contexts; i++, idx++) {
+               /* replace with identity mapping */
+               regoff = (idx % 8) * 8;
+               regidx = idx / 8;
+               reg = rmt->map[regidx];
+               reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+               reg |= (u64)i << regoff;
+               rmt->map[regidx] = reg;
+       }
+
+       /*
+        * For RSM intercept of Expected FECN packets:
+        * o packet type 0 - expected
+        * o match on F (bit 95), using select/match 1, and
+        * o match on SH (bit 133), using select/match 2.
+        *
+        * Use index 1 to extract the 8-bit receive context from DestQP
+        * (start at bit 64).  Use that as the RSM map table index.
+        */
+       rrd.offset = offset;
+       rrd.pkt_type = 0;
+       rrd.field1_off = 95;
+       rrd.field2_off = 133;
+       rrd.index1_off = 64;
+       rrd.index1_width = 8;
+       rrd.index2_off = 0;
+       rrd.index2_width = 0;
+       rrd.mask1 = 1;
+       rrd.value1 = 1;
+       rrd.mask2 = 1;
+       rrd.value2 = 1;
+
+       /* add rule 1 */
+       add_rsm_rule(dd, 1, &rrd);
+
+       rmt->used += dd->num_user_contexts;
+}
+
  static void init_rxe(struct hfi1_devdata *dd)
  {
+       struct rsm_map_table *rmt;
+
         /* enable all receive errors */
         write_csr(dd, RCV_ERR_MASK, ~0ull);
-       /* setup QPN map table - start where VL15 context leaves off */
-       init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ?
-                MIN_KERNEL_KCTXTS : 0);
+
+       rmt = alloc_rsm_map_table(dd);
+       /* set up QOS, including the QPN map table */
+       init_qos(dd, rmt);
+       init_user_fecn_handling(dd, rmt);
+       complete_rsm_map_table(dd, rmt);
+       kfree(rmt);
+
         /*
          * make sure RcvCtrl.RcvWcb <= PCIe Device Control
          * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
@@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
         write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
         reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
         reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
         write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
  done:
         return ret;
@@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
                  (dd->revision >> CCE_REVISION_SW_SHIFT)
                     & CCE_REVISION_SW_MASK);
  
+       /*
+        * The real cpu mask is part of the affinity struct but has to be
+        * initialized earlier than the rest of the affinity struct because it
+        * is needed to calculate the number of user contexts in
+        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+        * which initializes the rest of the affinity struct members,
+        * depends on set_up_context_variables() for the number of kernel
+        * contexts, so it cannot be called before set_up_context_variables().
+        */
+       ret = init_real_cpu_mask(dd);
+       if (ret)
+               goto bail_cleanup;
+
         ret = set_up_context_variables(dd);
         if (ret)
                 goto bail_cleanup;
@@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
         /* set up KDETH QP prefix in both RX and TX CSRs */
         init_kdeth_qp(dd);
  
-       ret = hfi1_dev_affinity_init(dd);
-       if (ret)
-               goto bail_cleanup;
+       hfi1_dev_affinity_init(dd);
  
         /* send contexts must be set up before receive contexts */
         ret = init_send_contexts(dd);
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h

index 4f3b878e43ebc0ebc8526eded454073509658176..1948706fff1a956f8101e1e4fe6540bbcc7227d7 100644 (file)
--- a/drivers/staging/rdma/hfi1/chip.h
+++ b/drivers/staging/rdma/hfi1/chip.h
@@ -389,6 +389,7 @@
  #define LAST_REMOTE_STATE_COMPLETE   0x13
  #define LINK_QUALITY_INFO            0x14
  #define REMOTE_DEVICE_ID            0x15
+#define LINK_DOWN_REASON            0x16
  
  /* 8051 lane specific register field IDs */
  #define TX_EQ_SETTINGS         0x00
@@ -497,6 +498,11 @@
  #define PWRM_BER_CONTROL       0x1
  #define PWRM_BANDWIDTH_CONTROL 0x2
  
+/* 8051 link down reasons */
+#define LDR_LINK_TRANSFER_ACTIVE_LOW   0xa
+#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
+#define LDR_RECEIVED_HOST_OFFLINE_REQ  0xc
+
  /* verify capability fabric CRC size bits */
  enum {
         CAP_CRC_14B = (1 << 0), /* 14b CRC */
@@ -691,7 +697,6 @@ void handle_verify_cap(struct work_struct *work);
  void handle_freeze(struct work_struct *work);
  void handle_link_up(struct work_struct *work);
  void handle_link_down(struct work_struct *work);
-void handle_8051_request(struct work_struct *work);
  void handle_link_downgrade(struct work_struct *work);
  void handle_link_bounce(struct work_struct *work);
  void handle_sma_message(struct work_struct *work);
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h

index 770f05c9b8de6010e582347b6d9e46596852004d..8744de6667c25fbe22c37f84c247540744886c4b 100644 (file)
--- a/drivers/staging/rdma/hfi1/chip_registers.h
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@@ -771,6 +771,7 @@
  #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
  #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
  #define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_CFG_OFFSET_SHIFT 32
  #define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
  #define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
  #define RCV_RSM_MATCH (RXE + 0x000000000800)
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c

index c5b520bf610e29149baf6aff3b2f3f5a66ecafa2..bb2409ad891a7559d47130bc9d9899f5a568d14e 100644 (file)
--- a/drivers/staging/rdma/hfi1/diag.c
+++ b/drivers/staging/rdma/hfi1/diag.c
@@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp)
                 goto bail;
         }
         /* can only use kernel contexts */
-       if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+       if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
+           dd->send_contexts[dp->sw_index].type != SC_VL15) {
                 ret = -EINVAL;
                 goto bail;
         }
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c

index 34511e5df1d56e7765c5d11d4e07d68514cf3894..700c6fa3a6330cc10e2e640d1e1b89ebd9678c41 100644 (file)
--- a/drivers/staging/rdma/hfi1/driver.c
+++ b/drivers/staging/rdma/hfi1/driver.c
@@ -75,7 +75,8 @@ DEFINE_MUTEX(hfi1_mutex);     /* general driver use */
  
  unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
  module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+                HFI1_DEFAULT_MAX_MTU));
  
  unsigned int hfi1_cu = 1;
  module_param_named(cu, hfi1_cu, uint, S_IRUGO);
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c

index 3040162cb3260828d3f0f9451232494489ab744b..ed680fda611dfdcee292e2613ee6e2d13fdfd891 100644 (file)
--- a/drivers/staging/rdma/hfi1/firmware.c
+++ b/drivers/staging/rdma/hfi1/firmware.c
@@ -1413,8 +1413,15 @@ static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
  
         if (resource & CR_DYN_MASK) {
                 /* a dynamic resource is in use if either HFI has set the bit */
-               all_bits = resource_mask(0, resource) |
+               if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
+                   (resource & (CR_I2C1 | CR_I2C2))) {
+                       /* discrete devices must serialize across both chains */
+                       all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
+                                       resource_mask(1, CR_I2C1 | CR_I2C2);
+               } else {
+                       all_bits = resource_mask(0, resource) |
                                                 resource_mask(1, resource);
+               }
                 my_bit = resource_mask(dd->hfi1_id, resource);
         } else {
                 /* non-dynamic resources are not split between HFIs */
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h

index 16cbdc4073e0d16fe245e756410fb11af8609057..7b78d56de7f56bdf262582aee1f3f572167b0d93 100644 (file)
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -455,9 +455,9 @@ struct rvt_sge_state;
  #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
  
  /* use this MTU size if none other is given */
-#define HFI1_DEFAULT_ACTIVE_MTU 8192
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
  /* use this MTU size as the default maximum */
-#define HFI1_DEFAULT_MAX_MTU 8192
+#define HFI1_DEFAULT_MAX_MTU 10240
  /* default partition key */
  #define DEFAULT_PKEY 0xffff
  
@@ -606,7 +606,6 @@ struct hfi1_pportdata {
         struct work_struct link_vc_work;
         struct work_struct link_up_work;
         struct work_struct link_down_work;
-       struct work_struct dc_host_req_work;
         struct work_struct sma_message_work;
         struct work_struct freeze_work;
         struct work_struct link_downgrade_work;
@@ -1258,7 +1257,7 @@ void receive_interrupt_work(struct work_struct *work);
  static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
  {
         return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
-              ((!!(rhf & RHF_DC_INFO_MASK)) << 4);
+              ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
  }
  
  static inline u16 generate_jkey(kuid_t uid)
@@ -1333,6 +1332,9 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
  void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
                 u32 pkey, u32 slid, u32 dlid, u8 sc5,
                 const struct ib_grh *old_grh);
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+                     u8 sc5, int8_t s_pkey_index);
  
  #define PACKET_EGRESS_TIMEOUT 350
  static inline void pause_for_credit_return(struct hfi1_devdata *dd)
@@ -1776,6 +1778,7 @@ extern struct mutex hfi1_mutex;
  
  #define HFI1_PKT_USER_SC_INTEGRITY                                         \
         (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK            \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK           \
         | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK              \
         | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
  
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c

index cfcdc16b41c371a18a4e1cec24b51c4c421b7e80..502b7cf4647de3afa23bf15f2636d267d35a48da 100644 (file)
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -422,9 +422,10 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
         struct cca_timer *cca_timer;
         struct hfi1_pportdata *ppd;
         int sl;
-       u16 ccti, ccti_timer, ccti_min;
+       u16 ccti_timer, ccti_min;
         struct cc_state *cc_state;
         unsigned long flags;
+       enum hrtimer_restart ret = HRTIMER_NORESTART;
  
         cca_timer = container_of(t, struct cca_timer, hrtimer);
         ppd = cca_timer->ppd;
@@ -450,24 +451,21 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
  
         spin_lock_irqsave(&ppd->cca_timer_lock, flags);
  
-       ccti = cca_timer->ccti;
-
-       if (ccti > ccti_min) {
+       if (cca_timer->ccti > ccti_min) {
                 cca_timer->ccti--;
                 set_link_ipg(ppd);
         }
  
-       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
-       rcu_read_unlock();
-
-       if (ccti > ccti_min) {
+       if (cca_timer->ccti > ccti_min) {
                 unsigned long nsec = 1024 * ccti_timer;
                 /* ccti_timer is in units of 1.024 usec */
                 hrtimer_forward_now(t, ns_to_ktime(nsec));
-               return HRTIMER_RESTART;
+               ret = HRTIMER_RESTART;
         }
-       return HRTIMER_NORESTART;
+
+       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+       rcu_read_unlock();
+       return ret;
  }
  
  /*
@@ -496,7 +494,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
         INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
         INIT_WORK(&ppd->link_up_work, handle_link_up);
         INIT_WORK(&ppd->link_down_work, handle_link_down);
-       INIT_WORK(&ppd->dc_host_req_work, handle_8051_request);
         INIT_WORK(&ppd->freeze_work, handle_freeze);
         INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
         INIT_WORK(&ppd->sma_message_work, handle_sma_message);
@@ -1007,7 +1004,7 @@ void hfi1_free_devdata(struct hfi1_devdata *dd)
         free_percpu(dd->rcv_limit);
         hfi1_dev_affinity_free(dd);
         free_percpu(dd->send_schedule);
-       ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
  }
  
  /*
@@ -1110,7 +1107,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
  bail:
         if (!list_empty(&dd->list))
                 list_del_init(&dd->list);
-       ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
         return ERR_PTR(ret);
  }
  
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c

index d1e7f4d7cf6fdf3fb32d699af45348201511b38d..ed58cf21e790e24048a550983758e0f1aa351a36 100644 (file)
--- a/drivers/staging/rdma/hfi1/mad.c
+++ b/drivers/staging/rdma/hfi1/mad.c
@@ -999,7 +999,21 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
                         break;
                 }
  
-               set_link_state(ppd, link_state);
+               if ((link_state == HLS_DN_POLL ||
+                    link_state == HLS_DN_DOWNDEF)) {
+                       /*
+                        * Going to poll.  No matter what the current state,
+                        * always move offline first, then tune and start the
+                        * link.  This correctly handles an FM link bounce and
+                        * a link enable.  Going offline is a no-op if already
+                        * offline.
+                        */
+                       set_link_state(ppd, HLS_DN_OFFLINE);
+                       tune_serdes(ppd);
+                       start_link(ppd);
+               } else {
+                       set_link_state(ppd, link_state);
+               }
                 if (link_state == HLS_DN_DISABLE &&
                     (ppd->offline_disabled_reason >
                      HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c

index b3f0682a36c95ffdd15f1b4d5d2f8cfb2e2afac3..2b0e91d3093dfe284aeffbe12a48df4002082073 100644 (file)
--- a/drivers/staging/rdma/hfi1/mmu_rb.c
+++ b/drivers/staging/rdma/hfi1/mmu_rb.c
@@ -91,7 +91,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node)
  
  static unsigned long mmu_node_last(struct mmu_rb_node *node)
  {
-       return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1;
+       return PAGE_ALIGN(node->addr + node->len) - 1;
  }
  
  int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
@@ -126,10 +126,15 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
         if (!handler)
                 return;
  
+       /* Unregister first so we don't get any more notifications. */
+       if (current->mm)
+               mmu_notifier_unregister(&handler->mn, current->mm);
+
         spin_lock_irqsave(&mmu_rb_lock, flags);
         list_del(&handler->list);
         spin_unlock_irqrestore(&mmu_rb_lock, flags);
  
+       spin_lock_irqsave(&handler->lock, flags);
         if (!RB_EMPTY_ROOT(root)) {
                 struct rb_node *node;
                 struct mmu_rb_node *rbnode;
@@ -141,9 +146,8 @@ void hfi1_mmu_rb_unregister(struct rb_root *root)
                                 handler->ops->remove(root, rbnode, NULL);
                 }
         }
+       spin_unlock_irqrestore(&handler->lock, flags);
  
-       if (current->mm)
-               mmu_notifier_unregister(&handler->mn, current->mm);
         kfree(handler);
  }
  
@@ -235,6 +239,25 @@ struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
         return node;
  }
  
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
+                                       unsigned long addr, unsigned long len)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *node;
+       unsigned long flags;
+
+       if (!handler)
+               return ERR_PTR(-EINVAL);
+
+       spin_lock_irqsave(&handler->lock, flags);
+       node = __mmu_rb_search(handler, addr, len);
+       if (node)
+               __mmu_int_rb_remove(node, handler->root);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       return node;
+}
+
  void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
  {
         struct mmu_rb_handler *handler = find_mmu_handler(root);
@@ -293,9 +316,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
                 hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
                           node->addr, node->len);
                 if (handler->ops->invalidate(root, node)) {
-                       spin_unlock_irqrestore(&handler->lock, flags);
-                       __mmu_rb_remove(handler, node, mm);
-                       spin_lock_irqsave(&handler->lock, flags);
+                       __mmu_int_rb_remove(node, root);
+                       if (handler->ops->remove)
+                               handler->ops->remove(root, node, mm);
                 }
         }
         spin_unlock_irqrestore(&handler->lock, flags);
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h

index 19a306e83c7df54be68a8503537f247142bffbf8..7a57b9c49d271fdce65f5fa46a7f03c7f96fd89d 100644 (file)
--- a/drivers/staging/rdma/hfi1/mmu_rb.h
+++ b/drivers/staging/rdma/hfi1/mmu_rb.h
@@ -70,5 +70,7 @@ int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
  void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
  struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
                                        unsigned long);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
+                                       unsigned long);
  
  #endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c

index c6849ce9e5ebc9c9731a65d2ba92e4595ed064a9..c67b9ad3fcf4c1c18dd83b50761c27c68176b4fb 100644 (file)
--- a/drivers/staging/rdma/hfi1/pio.c
+++ b/drivers/staging/rdma/hfi1/pio.c
@@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op)
  /* Send Context Size (SCS) wildcards */
  #define SCS_POOL_0 -1
  #define SCS_POOL_1 -2
+
  /* Send Context Count (SCC) wildcards */
  #define SCC_PER_VL -1
  #define SCC_PER_CPU  -2
-
  #define SCC_PER_KRCVQ  -3
-#define SCC_ACK_CREDITS  32
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS  32
+#define SCS_VL15_CREDITS 102   /* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096
  
  #define PIO_WAIT_BATCH_SIZE 5
  
  /* default send context sizes */
  static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
         [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
-                       .count = SCC_PER_VL },/* one per NUMA */
-       [SC_ACK]    = { .size  = SCC_ACK_CREDITS,
+                       .count = SCC_PER_VL },  /* one per VL */
+       [SC_ACK]    = { .size  = SCS_ACK_CREDITS,
                         .count = SCC_PER_KRCVQ },
         [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
                         .count = SCC_PER_CPU }, /* one per CPU */
+       [SC_VL15]   = { .size  = SCS_VL15_CREDITS,
+                       .count = 1 },
  
  };
  
@@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc)
  static const char *sc_type_names[SC_MAX] = {
         "kernel",
         "ack",
-       "user"
+       "user",
+       "vl15"
  };
  
  static const char *sc_type_name(int index)
@@ -230,6 +238,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
         int extra;
         int i;
  
+       /*
+        * When SDMA is enabled, kernel context pio packet size is capped by
+        * "piothreshold". Reduce pio buffer allocation for kernel context by
+        * setting it to a fixed size. The allocation allows 3-deep buffering
+        * of the largest pio packets plus up to 128 bytes header, sufficient
+        * to maintain verbs performance.
+        *
+        * When SDMA is disabled, keep the default pooling allocation.
+        */
+       if (HFI1_CAP_IS_KSET(SDMA)) {
+               u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+                                        piothreshold : PIO_THRESHOLD_CEILING;
+               sc_config_sizes[SC_KERNEL].size =
+                       3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+       }
+
         /*
          * Step 0:
          *      - copy the centipercents/absolute sizes from the pool config
@@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
                 if (i == SC_ACK) {
                         count = dd->n_krcv_queues;
                 } else if (i == SC_KERNEL) {
-                       count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */;
+                       count = INIT_SC_PER_VL * num_vls;
                 } else if (count == SCC_PER_CPU) {
                         count = dd->num_rcv_contexts - dd->n_krcv_queues;
                 } else if (count < 0) {
@@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
   * Return value is what to write into the CSR: trigger return when
   * unreturned credits pass this count.
   */
-static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
  {
         return (sc->credits * percent) / 100;
  }
@@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
          * For Ack contexts, set a threshold for half the credits.
          * For User contexts use the given percentage.  This has been
          * sanitized on driver start-up.
-        * For Kernel contexts, use the default MTU plus a header.
+        * For Kernel contexts, use the default MTU plus a header
+        * or half the credits, whichever is smaller. This should
+        * work for both the 3-deep buffering allocation and the
+        * pooling allocation.
          */
         if (type == SC_ACK) {
                 thresh = sc_percent_to_threshold(sc, 50);
@@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
                 thresh = sc_percent_to_threshold(sc,
                                                  user_credit_return_threshold);
         } else { /* kernel */
-               thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+               thresh = min(sc_percent_to_threshold(sc, 50),
+                            sc_mtu_to_threshold(sc, hfi1_max_mtu,
+                                                hdrqentsize));
         }
         reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
         /* add in early return */
@@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc)
         unsigned long flags;
         unsigned i, n = 0;
  
-       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+           dd->send_contexts[sc->sw_index].type != SC_VL15)
                 return;
         list = &sc->piowait;
         /*
@@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd)
         u32 ctxt;
         struct hfi1_pportdata *ppd = dd->pport;
  
-       dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+       dd->vld[15].sc = sc_alloc(dd, SC_VL15,
                                   dd->rcd[0]->rcvhdrqentsize, dd->node);
         if (!dd->vld[15].sc)
                 goto nomem;
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h

index 0026976ce4f6e8c9853e02b1b1f6f8bbb0fed37a..53a08edb7f642d454f9fbf2be88f01cd2ea3e357 100644 (file)
--- a/drivers/staging/rdma/hfi1/pio.h
+++ b/drivers/staging/rdma/hfi1/pio.h
@@ -51,7 +51,8 @@
  #define SC_KERNEL 0
  #define SC_ACK    1
  #define SC_USER   2
-#define SC_MAX    3
+#define SC_VL15   3
+#define SC_MAX    4
  
  /* invalid send context index */
  #define INVALID_SCI 0xff
@@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
  void sc_add_credit_return_intr(struct send_context *sc);
  void sc_del_credit_return_intr(struct send_context *sc);
  void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
  u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
  void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
  void sc_wait(struct hfi1_devdata *dd);
diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c

index 0a1d074583e483717dfb955d5fa70fc8888f8766..8fe8a205b5bbb968bc7516f524ec129208a5283d 100644 (file)
--- a/drivers/staging/rdma/hfi1/platform.c
+++ b/drivers/staging/rdma/hfi1/platform.c
@@ -114,21 +114,11 @@ static int qual_power(struct hfi1_pportdata *ppd)
         if (ret)
                 return ret;
  
-       if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4)
-               cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]);
-       else
-               cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]);
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
  
-       if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1))
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
-       else if (cable_power_class > 4 && cable_power_class > (power_class_max))
+       if (cable_power_class > power_class_max)
                 ppd->offline_disabled_reason =
                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
-       /*
-        * cable_power_class will never have value 4 as this simply
-        * means the high power settings are unused
-        */
  
         if (ppd->offline_disabled_reason ==
                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
@@ -173,12 +163,9 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
         u8 *cache = ppd->qsfp_info.cache;
         int ret;
  
-       if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4)
-               cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]);
-       else
-               cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]);
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
  
-       if (cable_power_class) {
+       if (cable_power_class > QSFP_POWER_CLASS_1) {
                 power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
  
                 power_ctrl_byte |= 1;
@@ -190,8 +177,7 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
                 if (ret != 1)
                         return -EIO;
  
-               if (cable_power_class > 3) {
-                       /* > power class 4*/
+               if (cable_power_class > QSFP_POWER_CLASS_4) {
                         power_ctrl_byte |= (1 << 2);
                         ret = qsfp_write(ppd, ppd->dd->hfi1_id,
                                          QSFP_PWR_CTRL_BYTE_OFFS,
@@ -212,12 +198,21 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,
  {
         u32 rx_preset;
         u8 *cache = ppd->qsfp_info.cache;
+       int cable_power_class;
  
         if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
               (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
                 return;
  
-       /* rx_preset preset to zero to catch error */
+       /* RX CDR present, bypass supported */
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class <= QSFP_POWER_CLASS_3) {
+               /* Power class <= 3, ignore config & turn RX CDR on */
+               *cdr_ctrl_byte |= 0xF;
+               return;
+       }
+
         get_platform_config_field(
                 ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
                 rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
@@ -250,15 +245,25 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,
  
  static void apply_tx_cdr(struct hfi1_pportdata *ppd,
                          u32 tx_preset_index,
-                        u8 *ctr_ctrl_byte)
+                        u8 *cdr_ctrl_byte)
  {
         u32 tx_preset;
         u8 *cache = ppd->qsfp_info.cache;
+       int cable_power_class;
  
         if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
               (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
                 return;
  
+       /* TX CDR present, bypass supported */
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class <= QSFP_POWER_CLASS_3) {
+               /* Power class <= 3, ignore config & turn TX CDR on */
+               *cdr_ctrl_byte |= 0xF0;
+               return;
+       }
+
         get_platform_config_field(
                 ppd->dd,
                 PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
@@ -282,10 +287,10 @@ static void apply_tx_cdr(struct hfi1_pportdata *ppd,
                         (tx_preset << 2) | (tx_preset << 3));
  
         if (tx_preset)
-               *ctr_ctrl_byte |= (tx_preset << 4);
+               *cdr_ctrl_byte |= (tx_preset << 4);
         else
                 /* Preserve current/determined RX CDR status */
-               *ctr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+               *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
  }
  
  static void apply_cdr_settings(
@@ -598,6 +603,7 @@ static void apply_tunings(
                        "Applying TX settings");
  }
  
+/* Must be holding the QSFP i2c resource */
  static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
                             u32 *ptr_rx_preset, u32 *ptr_total_atten)
  {
@@ -605,26 +611,19 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
         u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
         u8 *cache = ppd->qsfp_info.cache;
  
-       ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
-       if (ret) {
-               dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
-                          __func__, (int)ppd->dd->hfi1_id);
-               return ret;
-       }
-
         ppd->qsfp_info.limiting_active = 1;
  
         ret = set_qsfp_tx(ppd, 0);
         if (ret)
-               goto bail_unlock;
+               return ret;
  
         ret = qual_power(ppd);
         if (ret)
-               goto bail_unlock;
+               return ret;
  
         ret = qual_bitrate(ppd);
         if (ret)
-               goto bail_unlock;
+               return ret;
  
         if (ppd->qsfp_info.reset_needed) {
                 reset_qsfp(ppd);
@@ -636,7 +635,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
  
         ret = set_qsfp_high_power(ppd);
         if (ret)
-               goto bail_unlock;
+               return ret;
  
         if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
                 ret = get_platform_config_field(
@@ -646,7 +645,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
                         ptr_tx_preset, 4);
                 if (ret) {
                         *ptr_tx_preset = OPA_INVALID_INDEX;
-                       goto bail_unlock;
+                       return ret;
                 }
         } else {
                 ret = get_platform_config_field(
@@ -656,7 +655,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
                         ptr_tx_preset, 4);
                 if (ret) {
                         *ptr_tx_preset = OPA_INVALID_INDEX;
-                       goto bail_unlock;
+                       return ret;
                 }
         }
  
@@ -665,7 +664,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
                 PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
         if (ret) {
                 *ptr_rx_preset = OPA_INVALID_INDEX;
-               goto bail_unlock;
+               return ret;
         }
  
         if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
@@ -685,8 +684,6 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
  
         ret = set_qsfp_tx(ppd, 1);
  
-bail_unlock:
-       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
         return ret;
  }
  
@@ -833,12 +830,22 @@ void tune_serdes(struct hfi1_pportdata *ppd)
                         total_atten = platform_atten + remote_atten;
  
                         tuning_method = OPA_PASSIVE_TUNING;
-               } else
+               } else {
                         ppd->offline_disabled_reason =
                              HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+                       goto bail;
+               }
                 break;
         case PORT_TYPE_QSFP:
                 if (qsfp_mod_present(ppd)) {
+                       ret = acquire_chip_resource(ppd->dd,
+                                                   qsfp_resource(ppd->dd),
+                                                   QSFP_WAIT);
+                       if (ret) {
+                               dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
+                                          __func__, (int)ppd->dd->hfi1_id);
+                               goto bail;
+                       }
                         refresh_qsfp_cache(ppd, &ppd->qsfp_info);
  
                         if (ppd->qsfp_info.cache_valid) {
@@ -853,21 +860,23 @@ void tune_serdes(struct hfi1_pportdata *ppd)
                                  * update the cache to reflect the changes
                                  */
                                 refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-                               if (ret)
-                                       goto bail;
-
                                 limiting_active =
                                                 ppd->qsfp_info.limiting_active;
                         } else {
                                 dd_dev_err(dd,
                                            "%s: Reading QSFP memory failed\n",
                                            __func__);
-                               goto bail;
+                               ret = -EINVAL; /* a fail indication */
                         }
-               } else
+                       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
+                       if (ret)
+                               goto bail;
+               } else {
                         ppd->offline_disabled_reason =
                            HFI1_ODR_MASK(
                                 OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+                       goto bail;
+               }
                 break;
         default:
                 dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c

index dc9119e1b458ee8117de5520218b216343185ce4..91eb42316df9222d03dcfde3058ebeaf430bf937 100644 (file)
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -167,8 +167,12 @@ static inline int opa_mtu_enum_to_int(int mtu)
   */
  static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
  {
-       int val = opa_mtu_enum_to_int((int)mtu);
+       int val;
  
+       /* Constraining 10KB packets to 8KB packets */
+       if (mtu == (enum ib_mtu)OPA_MTU_10240)
+               mtu = OPA_MTU_8192;
+       val = opa_mtu_enum_to_int((int)mtu);
         if (val > 0)
                 return val;
         return ib_mtu_enum_to_int(mtu);
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c

index 9ed1963010feae31f110122f603bc5a01a002d4d..2441669f0817dbec51c7fbcdc847c0bff9f48bfb 100644 (file)
--- a/drivers/staging/rdma/hfi1/qsfp.c
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -96,7 +96,7 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
  {
         int ret;
  
-       if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
         /* make sure the TWSI bus is in a sane state */
@@ -162,7 +162,7 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
  {
         int ret;
  
-       if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
         /* make sure the TWSI bus is in a sane state */
@@ -192,7 +192,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
         int ret;
         u8 page;
  
-       if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
         /* make sure the TWSI bus is in a sane state */
@@ -276,7 +276,7 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
         int ret;
         u8 page;
  
-       if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                 return -EACCES;
  
         /* make sure the TWSI bus is in a sane state */
@@ -355,6 +355,8 @@ int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
   * The calls to qsfp_{read,write} in this function correctly handle the
   * address map difference between this mapping and the mapping implemented
   * by those functions
+ *
+ * The caller must be holding the QSFP i2c chain resource.
   */
  int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
  {
@@ -371,13 +373,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
  
         if (!qsfp_mod_present(ppd)) {
                 ret = -ENODEV;
-               goto bail_no_release;
+               goto bail;
         }
  
-       ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
-       if (ret)
-               goto bail_no_release;
-
         ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
         if (ret != QSFP_PAGESIZE) {
                 dd_dev_info(ppd->dd,
@@ -440,8 +438,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
                 }
         }
  
-       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-
         spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
         ppd->qsfp_info.cache_valid = 1;
         ppd->qsfp_info.cache_refresh_required = 0;
@@ -450,8 +446,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
         return 0;
  
  bail:
-       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-bail_no_release:
         memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
         return ret;
  }
@@ -466,7 +460,28 @@ const char * const hfi1_qsfp_devtech[16] = {
  #define QSFP_DUMP_CHUNK 16 /* Holds longest string */
  #define QSFP_DEFAULT_HDR_CNT 224
  
-static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED   0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Takes power class byte [Page 00 Byte 129] in SFF 8636
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+       if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
+               /* power classes count from 1, their bit encodings from 0 */
+               return (QSFP_PWR(power_byte) + 1);
+       /*
+        * 00 in the high power classes stands for unused, bringing
+        * balance to the off-by-1 offset above, we add 4 here to
+        * account for the difference between the low and high power
+        * groups
+        */
+       return (QSFP_HIGH_PWR(power_byte) + 4);
+}
  
  int qsfp_mod_present(struct hfi1_pportdata *ppd)
  {
@@ -537,6 +552,16 @@ set_zeroes:
         return ret;
  }
  
+static const char *pwr_codes[8] = {"N/AW",
+                                 "1.5W",
+                                 "2.0W",
+                                 "2.5W",
+                                 "3.5W",
+                                 "4.0W",
+                                 "4.5W",
+                                 "5.0W"
+                                };
+
  int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
  {
         u8 *cache = &ppd->qsfp_info.cache[0];
@@ -546,6 +571,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
         int bidx = 0;
         u8 *atten = &cache[QSFP_ATTEN_OFFS];
         u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+       u8 power_byte = 0;
  
         sofar = 0;
         lenstr[0] = ' ';
@@ -555,9 +581,9 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
                 if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
                         sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
  
+               power_byte = cache[QSFP_MOD_PWR_OFFS];
                 sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
-                               pwr_codes +
-                               (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+                               pwr_codes[get_qsfp_power_class(power_byte)]);
  
                 sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
                                 lenstr,
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h

index 831fe4cf1345ccc743d08418ba3a6885a7f314af..dadc66c442b982130da735bcb7bcf1e81307590f 100644 (file)
--- a/drivers/staging/rdma/hfi1/qsfp.h
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@@ -82,8 +82,9 @@
  /* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
  #define QSFP_MOD_ID_OFFS 128
  /*
- * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
- *  0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
   */
  #define QSFP_MOD_PWR_OFFS 129
  /* Byte 130 is Connector type. Not Intel req'd */
@@ -190,6 +191,9 @@ extern const char *const hfi1_qsfp_devtech[16];
  #define QSFP_HIGH_BIAS_WARNING         0x22
  #define QSFP_LOW_BIAS_WARNING          0x11
  
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
  /*
   * struct qsfp_data encapsulates state of QSFP device for one port.
   * it will be part of port-specific data if a board supports QSFP.
@@ -201,12 +205,6 @@ extern const char *const hfi1_qsfp_devtech[16];
   * and let the qsfp_lock arbitrate access to common resources.
   *
   */
-
-#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
-#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4)
-#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
-#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
-
  struct qsfp_data {
         /* Helps to find our way */
         struct hfi1_pportdata *ppd;
@@ -223,6 +221,7 @@ struct qsfp_data {
  
  int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
                        struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
  int qsfp_mod_present(struct hfi1_pportdata *ppd);
  int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
                    u32 len, u8 *data);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c

index 0d7e1017f3cbbf8cf5b75bb27406f678d303b3aa..792f15eb8efeceeff89cd40e05e4180dace441b2 100644 (file)
--- a/drivers/staging/rdma/hfi1/rc.c
+++ b/drivers/staging/rdma/hfi1/rc.c
@@ -1497,7 +1497,7 @@ reserved:
                 /* Ignore reserved NAK codes. */
                 goto bail_stop;
         }
-       return ret;
+       /* cannot be reached  */
  bail_stop:
         hfi1_stop_rc_timers(qp);
         return ret;
@@ -2021,8 +2021,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
         if (sl >= OPA_MAX_SLS)
                 return;
  
-       cca_timer = &ppd->cca_timer[sl];
-
         cc_state = get_cc_state(ppd);
  
         if (!cc_state)
@@ -2041,6 +2039,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
  
         spin_lock_irqsave(&ppd->cca_timer_lock, flags);
  
+       cca_timer = &ppd->cca_timer[sl];
         if (cca_timer->ccti < ccti_limit) {
                 if (cca_timer->ccti + ccti_incr <= ccti_limit)
                         cca_timer->ccti += ccti_incr;
@@ -2049,8 +2048,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
                 set_link_ipg(ppd);
         }
  
-       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
         ccti = cca_timer->ccti;
  
         if (!hrtimer_active(&cca_timer->hrtimer)) {
@@ -2061,6 +2058,8 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
                               HRTIMER_MODE_REL);
         }
  
+       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
         if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
                 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
  }
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c

index 08813cdbd475773b73bd9ee38ee9ff9cd40f0100..a659aec3c3c6b95823650c7b6bb3ac800649967a 100644 (file)
--- a/drivers/staging/rdma/hfi1/ruc.c
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -831,7 +831,6 @@ void hfi1_do_send(struct rvt_qp *qp)
         struct hfi1_pkt_state ps;
         struct hfi1_qp_priv *priv = qp->priv;
         int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-       unsigned long flags;
         unsigned long timeout;
         unsigned long timeout_int;
         int cpu;
@@ -866,11 +865,11 @@ void hfi1_do_send(struct rvt_qp *qp)
                 timeout_int = SEND_RESCHED_TIMEOUT;
         }
  
-       spin_lock_irqsave(&qp->s_lock, flags);
+       spin_lock_irqsave(&qp->s_lock, ps.flags);
  
         /* Return if we are already busy processing a work request. */
         if (!hfi1_send_ok(qp)) {
-               spin_unlock_irqrestore(&qp->s_lock, flags);
+               spin_unlock_irqrestore(&qp->s_lock, ps.flags);
                 return;
         }
  
@@ -884,7 +883,7 @@ void hfi1_do_send(struct rvt_qp *qp)
         do {
                 /* Check for a constructed packet to be sent. */
                 if (qp->s_hdrwords != 0) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
                         /*
                          * If the packet cannot be sent now, return and
                          * the send tasklet will be woken up later.
@@ -897,11 +896,14 @@ void hfi1_do_send(struct rvt_qp *qp)
                         if (unlikely(time_after(jiffies, timeout))) {
                                 if (workqueue_congested(cpu,
                                                         ps.ppd->hfi1_wq)) {
-                                       spin_lock_irqsave(&qp->s_lock, flags);
+                                       spin_lock_irqsave(
+                                               &qp->s_lock,
+                                               ps.flags);
                                         qp->s_flags &= ~RVT_S_BUSY;
                                         hfi1_schedule_send(qp);
-                                       spin_unlock_irqrestore(&qp->s_lock,
-                                                              flags);
+                                       spin_unlock_irqrestore(
+                                               &qp->s_lock,
+                                               ps.flags);
                                         this_cpu_inc(
                                                 *ps.ppd->dd->send_schedule);
                                         return;
@@ -913,11 +915,11 @@ void hfi1_do_send(struct rvt_qp *qp)
                                 }
                                 timeout = jiffies + (timeout_int) / 8;
                         }
-                       spin_lock_irqsave(&qp->s_lock, flags);
+                       spin_lock_irqsave(&qp->s_lock, ps.flags);
                 }
         } while (make_req(qp, &ps));
  
-       spin_unlock_irqrestore(&qp->s_lock, flags);
+       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
  }
  
  /*
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c

index c7f1271190af1c4636e68678c59c94b13c4c3a5c..8cd6df8634ad2c056ba56542a2ec99180e05717b 100644 (file)
--- a/drivers/staging/rdma/hfi1/sysfs.c
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -84,7 +84,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
                 rcu_read_unlock();
                 return -EINVAL;
         }
-       memcpy(buf, &cc_state->cct, count);
+       memcpy(buf, (void *)&cc_state->cct + pos, count);
         rcu_read_unlock();
  
         return count;
@@ -131,7 +131,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
                 rcu_read_unlock();
                 return -EINVAL;
         }
-       memcpy(buf, &cc_state->cong_setting, count);
+       memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
         rcu_read_unlock();
  
         return count;
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c

index ae8a70f703ebcbb877abf630f501158688c6fb4a..1e503ad0bebb764ee1e05462604538e6324002f8 100644 (file)
--- a/drivers/staging/rdma/hfi1/ud.c
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -322,7 +322,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                              (lid == ppd->lid ||
                               (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
                               qp->ibqp.qp_type == IB_QPT_GSI)))) {
-                       unsigned long flags;
+                       unsigned long tflags = ps->flags;
                         /*
                          * If DMAs are in progress, we can't generate
                          * a completion for the loopback packet since
@@ -335,10 +335,10 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                                 goto bail;
                         }
                         qp->s_cur = next_cur;
-                       local_irq_save(flags);
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       spin_unlock_irqrestore(&qp->s_lock, tflags);
                         ud_loopback(qp, wqe);
-                       spin_lock_irqsave(&qp->s_lock, flags);
+                       spin_lock_irqsave(&qp->s_lock, tflags);
+                       ps->flags = tflags;
                         hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
                         goto done_free_tx;
                 }
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c

index 8bd56d5c783dede5e59772eb88daf192e0dbb457..1b640a35b3fe82f6f85022599477f6760744f034 100644 (file)
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -399,8 +399,11 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
          * pages, accept the amount pinned so far and program only that.
          * User space knows how to deal with partially programmed buffers.
          */
-       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages))
-               return -ENOMEM;
+       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
         pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
         if (pinned <= 0) {
                 ret = pinned;
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c

index d53a659548e0a78b2cc8b8777e2f0b45a64961c1..0014c9c0e967a502c3623fbb1c242ec9f6d34102 100644 (file)
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -180,6 +180,8 @@ struct user_sdma_iovec {
         u64 offset;
  };
  
+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
  struct sdma_mmu_node {
         struct mmu_rb_node rb;
         struct list_head list;
@@ -187,6 +189,7 @@ struct sdma_mmu_node {
         atomic_t refcount;
         struct page **pages;
         unsigned npages;
+       unsigned long flags;
  };
  
  struct user_sdma_request {
@@ -597,6 +600,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                 goto free_req;
         }
  
+       /* Checking P_KEY for requests from user-space */
+       if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+                             PKEY_CHECK_INVALID)) {
+               ret = -EINVAL;
+               goto free_req;
+       }
+
         /*
          * Also should check the BTH.lnh. If it says the next header is GRH then
          * the RXE parsing will be off and will land in the middle of the KDETH
@@ -1030,27 +1040,29 @@ static inline int num_user_pages(const struct iovec *iov)
         return 1 + ((epage - spage) >> PAGE_SHIFT);
  }
  
-/* Caller must hold pq->evict_lock */
  static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
  {
         u32 cleared = 0;
         struct sdma_mmu_node *node, *ptr;
+       struct list_head to_evict = LIST_HEAD_INIT(to_evict);
  
+       spin_lock(&pq->evict_lock);
         list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
                 /* Make sure that no one is still using the node. */
                 if (!atomic_read(&node->refcount)) {
-                       /*
-                        * Need to use the page count now as the remove callback
-                        * will free the node.
-                        */
+                       set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
+                       list_del_init(&node->list);
+                       list_add(&node->list, &to_evict);
                         cleared += node->npages;
-                       spin_unlock(&pq->evict_lock);
-                       hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
-                       spin_lock(&pq->evict_lock);
                         if (cleared >= npages)
                                 break;
                 }
         }
+       spin_unlock(&pq->evict_lock);
+
+       list_for_each_entry_safe(node, ptr, &to_evict, list)
+               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+
         return cleared;
  }
  
@@ -1062,9 +1074,9 @@ static int pin_vector_pages(struct user_sdma_request *req,
         struct sdma_mmu_node *node = NULL;
         struct mmu_rb_node *rb_node;
  
-       rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root,
-                                    (unsigned long)iovec->iov.iov_base,
-                                    iovec->iov.iov_len);
+       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+                                     (unsigned long)iovec->iov.iov_base,
+                                     iovec->iov.iov_len);
         if (rb_node && !IS_ERR(rb_node))
                 node = container_of(rb_node, struct sdma_mmu_node, rb);
         else
@@ -1076,7 +1088,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
                         return -ENOMEM;
  
                 node->rb.addr = (unsigned long)iovec->iov.iov_base;
-               node->rb.len = iovec->iov.iov_len;
                 node->pq = pq;
                 atomic_set(&node->refcount, 0);
                 INIT_LIST_HEAD(&node->list);
@@ -1093,11 +1104,25 @@ static int pin_vector_pages(struct user_sdma_request *req,
                 memcpy(pages, node->pages, node->npages * sizeof(*pages));
  
                 npages -= node->npages;
+
+               /*
+                * If rb_node is NULL, it means that this is brand new node
+                * and, therefore not on the eviction list.
+                * If, however, the rb_node is non-NULL, it means that the
+                * node is already in RB tree and, therefore on the eviction
+                * list (nodes are unconditionally inserted in the eviction
+                * list). In that case, we have to remove the node prior to
+                * calling the eviction function in order to prevent it from
+                * freeing this node.
+                */
+               if (rb_node) {
+                       spin_lock(&pq->evict_lock);
+                       list_del_init(&node->list);
+                       spin_unlock(&pq->evict_lock);
+               }
  retry:
                 if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-                       spin_lock(&pq->evict_lock);
                         cleared = sdma_cache_evict(pq, npages);
-                       spin_unlock(&pq->evict_lock);
                         if (cleared >= npages)
                                 goto retry;
                 }
@@ -1117,37 +1142,32 @@ retry:
                         goto bail;
                 }
                 kfree(node->pages);
+               node->rb.len = iovec->iov.iov_len;
                 node->pages = pages;
                 node->npages += pinned;
                 npages = node->npages;
                 spin_lock(&pq->evict_lock);
-               if (!rb_node)
-                       list_add(&node->list, &pq->evict);
-               else
-                       list_move(&node->list, &pq->evict);
+               list_add(&node->list, &pq->evict);
                 pq->n_locked += pinned;
                 spin_unlock(&pq->evict_lock);
         }
         iovec->pages = node->pages;
         iovec->npages = npages;
  
-       if (!rb_node) {
-               ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
-               if (ret) {
-                       spin_lock(&pq->evict_lock);
+       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+       if (ret) {
+               spin_lock(&pq->evict_lock);
+               if (!list_empty(&node->list))
                         list_del(&node->list);
-                       pq->n_locked -= node->npages;
-                       spin_unlock(&pq->evict_lock);
-                       ret = 0;
-                       goto bail;
-               }
-       } else {
-               atomic_inc(&node->refcount);
+               pq->n_locked -= node->npages;
+               spin_unlock(&pq->evict_lock);
+               goto bail;
         }
         return 0;
  bail:
-       if (!rb_node)
-               kfree(node);
+       if (rb_node)
+               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+       kfree(node);
         return ret;
  }
  
@@ -1558,7 +1578,20 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
                 container_of(mnode, struct sdma_mmu_node, rb);
  
         spin_lock(&node->pq->evict_lock);
-       list_del(&node->list);
+       /*
+        * We've been called by the MMU notifier but this node has been
+        * scheduled for eviction. The eviction function will take care
+        * of freeing this node.
+        * We have to take the above lock first because we are racing
+        * against the setting of the bit in the eviction function.
+        */
+       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+               spin_unlock(&node->pq->evict_lock);
+               return;
+       }
+
+       if (!list_empty(&node->list))
+               list_del(&node->list);
         node->pq->n_locked -= node->npages;
         spin_unlock(&node->pq->evict_lock);
  
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c

index 89f2aad45c1b8cfdb27d81aac73c2cd2ebd0c52f..9cdc85fa366f19e6b0c13ad43c582c7f73faf24d 100644 (file)
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -545,7 +545,7 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet)
  
         if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
                 goto dropit;
-       if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+       if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
             (opcode == IB_OPCODE_CNP))
                 return 1;
  dropit:
@@ -1089,16 +1089,16 @@ bail:
  
  /*
   * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
+ * being an entry from the partition key table), return 0
   * otherwise. Use the matching criteria for egress partition keys
   * specified in the OPAv1 spec., section 9.1l.7.
   */
  static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
  {
         u16 mkey = pkey & PKEY_LOW_15_MASK;
-       u16 ment = ent & PKEY_LOW_15_MASK;
+       u16 mentry = ent & PKEY_LOW_15_MASK;
  
-       if (mkey == ment) {
+       if (mkey == mentry) {
                 /*
                  * If pkey[15] is set (full partition member),
                  * is bit 15 in the corresponding table element
@@ -1111,32 +1111,32 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
         return 0;
  }
  
-/*
- * egress_pkey_check - return 0 if hdr's pkey matches according to the
- * criteria in the OPAv1 spec., section 9.11.7.
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd:    Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5:    SC for packet
+ * @s_pkey_index: It will be used for look up optimization for kernel contexts
+ * only. If it is negative value, then it means user contexts is calling this
+ * function.
+ *
+ * It checks if hdr's pkey is valid.
+ *
+ * Return: 0 on success, otherwise, 1
   */
-static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
-                                   struct hfi1_ib_header *hdr,
-                                   struct rvt_qp *qp)
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+                     u8 sc5, int8_t s_pkey_index)
  {
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_other_headers *ohdr;
         struct hfi1_devdata *dd;
-       int i = 0;
+       int i;
         u16 pkey;
-       u8 lnh, sc5 = priv->s_sc;
+       int is_user_ctxt_mechanism = (s_pkey_index < 0);
  
         if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
                 return 0;
  
-       /* locate the pkey within the headers */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       if (lnh == HFI1_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else
-               ohdr = &hdr->u.oth;
-
-       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+       pkey = (u16)be32_to_cpu(bth[0]);
  
         /* If SC15, pkey[0:14] must be 0x7fff */
         if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
@@ -1146,28 +1146,37 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
         if ((pkey & PKEY_LOW_15_MASK) == 0)
                 goto bad;
  
-       /* The most likely matching pkey has index qp->s_pkey_index */
-       if (unlikely(!egress_pkey_matches_entry(pkey,
-                                               ppd->pkeys
-                                               [qp->s_pkey_index]))) {
-               /* no match - try the entire table */
-               for (; i < MAX_PKEY_VALUES; i++) {
-                       if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
-                               break;
-               }
+       /*
+        * For the kernel contexts only, if a qp is passed into the function,
+        * the most likely matching pkey has index qp->s_pkey_index
+        */
+       if (!is_user_ctxt_mechanism &&
+           egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+               return 0;
         }
  
-       if (i < MAX_PKEY_VALUES)
-               return 0;
+       for (i = 0; i < MAX_PKEY_VALUES; i++) {
+               if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                       return 0;
+       }
  bad:
-       incr_cntr64(&ppd->port_xmit_constraint_errors);
-       dd = ppd->dd;
-       if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-
-               dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
-               dd->err_info_xmit_constraint.slid = slid;
-               dd->err_info_xmit_constraint.pkey = pkey;
+       /*
+        * For the user-context mechanism, the P_KEY check would only happen
+        * once per SDMA request, not once per packet.  Therefore, there's no
+        * need to increment the counter for the user-context mechanism.
+        */
+       if (!is_user_ctxt_mechanism) {
+               incr_cntr64(&ppd->port_xmit_constraint_errors);
+               dd = ppd->dd;
+               if (!(dd->err_info_xmit_constraint.status &
+                     OPA_EI_STATUS_SMASK)) {
+                       u16 slid = be16_to_cpu(lrh[3]);
+
+                       dd->err_info_xmit_constraint.status |=
+                               OPA_EI_STATUS_SMASK;
+                       dd->err_info_xmit_constraint.slid = slid;
+                       dd->err_info_xmit_constraint.pkey = pkey;
+               }
         }
         return 1;
  }
@@ -1227,11 +1236,26 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
  {
         struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
         struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_ib_header *hdr;
         send_routine sr;
         int ret;
+       u8 lnh;
+
+       hdr = &ps->s_txreq->phdr.hdr;
+       /* locate the pkey within the headers */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else
+               ohdr = &hdr->u.oth;
  
         sr = get_send_routine(qp, ps->s_txreq);
-       ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
+       ret = egress_pkey_check(dd->pport,
+                               hdr->lrh,
+                               ohdr->bth,
+                               priv->s_sc,
+                               qp->s_pkey_index);
         if (unlikely(ret)) {
                 /*
                  * The value we are returning here does not get propagated to
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h

index 6c4670fffdbb8bf852a1ecaa6243339807bffd50..3ee223983b20cff52a13d29a6d3101ebbd60c2f9 100644 (file)
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -215,6 +215,7 @@ struct hfi1_pkt_state {
         struct hfi1_ibport *ibp;
         struct hfi1_pportdata *ppd;
         struct verbs_txreq *s_txreq;
+       unsigned long flags;
  };
  
  #define HFI1_PSN_CREDIT  16
@@ -334,9 +335,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
  #endif
  #define PSN_MODIFY_MASK 0xFFFFFF
  
-/* Number of bits to pay attention to in the opcode for checking qp type */
-#define OPCODE_QP_MASK 0xE0
-
  /*
   * Compare the lower 24 bits of the msn values.
   * Returns an integer <, ==, or > than zero.
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c

index ab2bf12975e1386f51fbd4c94b7d0a79002b7af7..590384a2bf8ba2f6aa99aecdcdfd3ab5ef39534e 100644 (file)
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2195,7 +2195,7 @@ queue_full:
         transport_handle_queue_full(cmd, cmd->se_dev);
  }
  
-static inline void transport_free_sgl(struct scatterlist *sgl, int nents)
+void target_free_sgl(struct scatterlist *sgl, int nents)
  {
         struct scatterlist *sg;
         int count;
@@ -2205,6 +2205,7 @@ static inline void transport_free_sgl(struct scatterlist *sgl, int nents)
  
         kfree(sgl);
  }
+EXPORT_SYMBOL(target_free_sgl);
  
  static inline void transport_reset_sgl_orig(struct se_cmd *cmd)
  {
@@ -2225,7 +2226,7 @@ static inline void transport_reset_sgl_orig(struct se_cmd *cmd)
  static inline void transport_free_pages(struct se_cmd *cmd)
  {
         if (!(cmd->se_cmd_flags & SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC)) {
-               transport_free_sgl(cmd->t_prot_sg, cmd->t_prot_nents);
+               target_free_sgl(cmd->t_prot_sg, cmd->t_prot_nents);
                 cmd->t_prot_sg = NULL;
                 cmd->t_prot_nents = 0;
         }
@@ -2236,7 +2237,7 @@ static inline void transport_free_pages(struct se_cmd *cmd)
                  * SG_TO_MEM_NOALLOC to function with COMPARE_AND_WRITE
                  */
                 if (cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) {
-                       transport_free_sgl(cmd->t_bidi_data_sg,
+                       target_free_sgl(cmd->t_bidi_data_sg,
                                            cmd->t_bidi_data_nents);
                         cmd->t_bidi_data_sg = NULL;
                         cmd->t_bidi_data_nents = 0;
@@ -2246,11 +2247,11 @@ static inline void transport_free_pages(struct se_cmd *cmd)
         }
         transport_reset_sgl_orig(cmd);
  
-       transport_free_sgl(cmd->t_data_sg, cmd->t_data_nents);
+       target_free_sgl(cmd->t_data_sg, cmd->t_data_nents);
         cmd->t_data_sg = NULL;
         cmd->t_data_nents = 0;
  
-       transport_free_sgl(cmd->t_bidi_data_sg, cmd->t_bidi_data_nents);
+       target_free_sgl(cmd->t_bidi_data_sg, cmd->t_bidi_data_nents);
         cmd->t_bidi_data_sg = NULL;
         cmd->t_bidi_data_nents = 0;
  }
@@ -2324,20 +2325,22 @@ EXPORT_SYMBOL(transport_kunmap_data_sg);
  
  int
  target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length,
-                bool zero_page)
+                bool zero_page, bool chainable)
  {
         struct scatterlist *sg;
         struct page *page;
         gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0;
-       unsigned int nent;
+       unsigned int nalloc, nent;
         int i = 0;
  
-       nent = DIV_ROUND_UP(length, PAGE_SIZE);
-       sg = kmalloc(sizeof(struct scatterlist) * nent, GFP_KERNEL);
+       nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE);
+       if (chainable)
+               nalloc++;
+       sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL);
         if (!sg)
                 return -ENOMEM;
  
-       sg_init_table(sg, nent);
+       sg_init_table(sg, nalloc);
  
         while (length) {
                 u32 page_len = min_t(u32, length, PAGE_SIZE);
@@ -2361,6 +2364,7 @@ out:
         kfree(sg);
         return -ENOMEM;
  }
+EXPORT_SYMBOL(target_alloc_sgl);
  
  /*
   * Allocate any required resources to execute the command.  For writes we
@@ -2376,7 +2380,7 @@ transport_generic_new_cmd(struct se_cmd *cmd)
         if (cmd->prot_op != TARGET_PROT_NORMAL &&
             !(cmd->se_cmd_flags & SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC)) {
                 ret = target_alloc_sgl(&cmd->t_prot_sg, &cmd->t_prot_nents,
-                                      cmd->prot_length, true);
+                                      cmd->prot_length, true, false);
                 if (ret < 0)
                         return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
         }
@@ -2401,13 +2405,13 @@ transport_generic_new_cmd(struct se_cmd *cmd)
  
                         ret = target_alloc_sgl(&cmd->t_bidi_data_sg,
                                                &cmd->t_bidi_data_nents,
-                                              bidi_length, zero_flag);
+                                              bidi_length, zero_flag, false);
                         if (ret < 0)
                                 return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
                 }
  
                 ret = target_alloc_sgl(&cmd->t_data_sg, &cmd->t_data_nents,
-                                      cmd->data_length, zero_flag);
+                                      cmd->data_length, zero_flag, false);
                 if (ret < 0)
                         return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
         } else if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) &&
@@ -2421,7 +2425,7 @@ transport_generic_new_cmd(struct se_cmd *cmd)
  
                 ret = target_alloc_sgl(&cmd->t_bidi_data_sg,
                                        &cmd->t_bidi_data_nents,
-                                      caw_length, zero_flag);
+                                      caw_length, zero_flag, false);
                 if (ret < 0)
                         return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
         }
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c

index 47fe94ee10b82d876fedef726308738dc62f5805..75cd85426ae3a27f276947f667794acb7c9454d0 100644 (file)
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -563,7 +563,7 @@ static int target_xcopy_setup_pt_cmd(
  
         if (alloc_mem) {
                 rc = target_alloc_sgl(&cmd->t_data_sg, &cmd->t_data_nents,
-                                     cmd->data_length, false);
+                                     cmd->data_length, false, false);
                 if (rc < 0) {
                         ret = rc;
                         goto out;
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c

index 0582b72ef3772cf1257cc330dcf5e32246fe72bb..3054e3fa63ac0f30d52b9507c405d436df1bc750 100644 (file)
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -1188,7 +1188,8 @@ static int tce_iommu_attach_group(void *iommu_data,
                         goto unlock_exit;
                 }
                 table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
-               if (table_group_tmp->ops != table_group->ops) {
+               if (table_group_tmp->ops->create_table !=
+                               table_group->ops->create_table) {
                         pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
                                         iommu_group_id(iommu_group),
                                         iommu_group_id(tcegrp->grp));
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig

index e0606c01e8ac7166d88d2ad20028c166ebef625e..3c20af999893577c9a1203756a4f2b98f0fe839f 100644 (file)
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -8,10 +8,6 @@ menu "Graphics support"
  config HAVE_FB_ATMEL
         bool
  
-config SH_MIPI_DSI
-       tristate
-       depends on (SUPERH || ARCH_SHMOBILE) && HAVE_CLK
-
  config SH_LCD_MIPI_DSI
         bool
  
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig

index e5a391aecde1b4c94a71181dacf665f99579bab1..88b008fb8a4ec9f4f4458b8b65a5e7d4ccef57e5 100644 (file)
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -1993,7 +1993,6 @@ config FB_SH_MOBILE_LCDC
         select FB_SYS_FOPS
         select FB_DEFERRED_IO
         select FB_BACKLIGHT
-       select SH_MIPI_DSI if SH_LCD_MIPI_DSI
         ---help---
           Frame buffer driver for the on-chip SH-Mobile LCD controller.
  
diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile

index 65fb15075c8f7f150febb09966766287c3aa0e07..f6731867dd26dfc4318799e5204cd583a5d27670 100644 (file)
--- a/drivers/video/fbdev/Makefile
+++ b/drivers/video/fbdev/Makefile
@@ -117,7 +117,6 @@ obj-$(CONFIG_FB_SM501)            += sm501fb.o
  obj-$(CONFIG_FB_UDL)             += udlfb.o
  obj-$(CONFIG_FB_SMSCUFX)         += smscufx.o
  obj-$(CONFIG_FB_XILINX)           += xilinxfb.o
-obj-$(CONFIG_SH_MIPI_DSI)        += sh_mipi_dsi.o
  obj-$(CONFIG_FB_SH_MOBILE_MERAM)  += sh_mobile_meram.o
  obj-$(CONFIG_FB_SH_MOBILE_LCDC)          += sh_mobile_lcdcfb.o
  obj-$(CONFIG_FB_OMAP)             += omap/
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c

index 93e66a9148b90de9a3308d3b4b49a359414c7363..9b158869cb89158acc0178f034708a276f2f839c 100644 (file)
--- a/drivers/video/fbdev/amba-clcd.c
+++ b/drivers/video/fbdev/amba-clcd.c
@@ -34,8 +34,6 @@
  #include <video/of_display_timing.h>
  #include <video/videomode.h>
  
-#include <asm/sizes.h>
-
  #define to_clcd(info)  container_of(info, struct clcd_fb, fb)
  
  /* This is limited to 16 characters when displayed by X startup */
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c

index 4e73b6f6b1c0547877af572dfca953748d9eabc3..76c1ad96fb37d4f07bbde6462f54825f566acbac 100644 (file)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1854,17 +1854,31 @@ EXPORT_SYMBOL(fb_set_suspend);
  static int __init
  fbmem_init(void)
  {
-       proc_create("fb", 0, NULL, &fb_proc_fops);
+       int ret;
+
+       if (!proc_create("fb", 0, NULL, &fb_proc_fops))
+               return -ENOMEM;
  
-       if (register_chrdev(FB_MAJOR,"fb",&fb_fops))
+       ret = register_chrdev(FB_MAJOR, "fb", &fb_fops);
+       if (ret) {
                 printk("unable to get major %d for fb devs\n", FB_MAJOR);
+               goto err_chrdev;
+       }
  
         fb_class = class_create(THIS_MODULE, "graphics");
         if (IS_ERR(fb_class)) {
-               printk(KERN_WARNING "Unable to create fb class; errno = %ld\n", PTR_ERR(fb_class));
+               ret = PTR_ERR(fb_class);
+               pr_warn("Unable to create fb class; errno = %d\n", ret);
                 fb_class = NULL;
+               goto err_class;
         }
         return 0;
+
+err_class:
+       unregister_chrdev(FB_MAJOR, "fb");
+err_chrdev:
+       remove_proc_entry("fb", NULL);
+       return ret;
  }
  
  #ifdef MODULE
diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c

index f4c045c0051cc65fbcdb1df6207f4e3292b8c327..924bad45c17642c778932a9a00b4bb8f19d4f268 100644 (file)
--- a/drivers/video/fbdev/efifb.c
+++ b/drivers/video/fbdev/efifb.c
@@ -237,10 +237,8 @@ static int efifb_probe(struct platform_device *dev)
                 goto err_release_fb;
         }
  
-       printk(KERN_INFO "efifb: framebuffer at 0x%lx, mapped to 0x%p, "
-              "using %dk, total %dk\n",
-              efifb_fix.smem_start, info->screen_base,
-              size_remap/1024, size_total/1024);
+       printk(KERN_INFO "efifb: framebuffer at 0x%lx, using %dk, total %dk\n",
+              efifb_fix.smem_start, size_remap/1024, size_total/1024);
         printk(KERN_INFO "efifb: mode is %dx%dx%d, linelength=%d, pages=%d\n",
                efifb_defined.xres, efifb_defined.yres,
                efifb_defined.bits_per_pixel, efifb_fix.line_length,
diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c

index 76b6a7784b06c7c752ba79862c26b2edcf51f932..fe0c4eeff2e4f1f11442272d03e3c784db06d49f 100644 (file)
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -473,11 +473,12 @@ static int imxfb_set_par(struct fb_info *info)
         return 0;
  }
  
-static void imxfb_enable_controller(struct imxfb_info *fbi)
+static int imxfb_enable_controller(struct imxfb_info *fbi)
  {
+       int ret;
  
         if (fbi->enabled)
-               return;
+               return 0;
  
         pr_debug("Enabling LCD controller\n");
  
@@ -496,10 +497,29 @@ static void imxfb_enable_controller(struct imxfb_info *fbi)
          */
         writel(RMCR_LCDC_EN_MX1, fbi->regs + LCDC_RMCR);
  
-       clk_prepare_enable(fbi->clk_ipg);
-       clk_prepare_enable(fbi->clk_ahb);
-       clk_prepare_enable(fbi->clk_per);
+       ret = clk_prepare_enable(fbi->clk_ipg);
+       if (ret)
+               goto err_enable_ipg;
+
+       ret = clk_prepare_enable(fbi->clk_ahb);
+       if (ret)
+               goto err_enable_ahb;
+
+       ret = clk_prepare_enable(fbi->clk_per);
+       if (ret)
+               goto err_enable_per;
+
         fbi->enabled = true;
+       return 0;
+
+err_enable_per:
+       clk_disable_unprepare(fbi->clk_ahb);
+err_enable_ahb:
+       clk_disable_unprepare(fbi->clk_ipg);
+err_enable_ipg:
+       writel(0, fbi->regs + LCDC_RMCR);
+
+       return ret;
  }
  
  static void imxfb_disable_controller(struct imxfb_info *fbi)
@@ -510,8 +530,8 @@ static void imxfb_disable_controller(struct imxfb_info *fbi)
         pr_debug("Disabling LCD controller\n");
  
         clk_disable_unprepare(fbi->clk_per);
-       clk_disable_unprepare(fbi->clk_ipg);
         clk_disable_unprepare(fbi->clk_ahb);
+       clk_disable_unprepare(fbi->clk_ipg);
         fbi->enabled = false;
  
         writel(0, fbi->regs + LCDC_RMCR);
@@ -532,8 +552,7 @@ static int imxfb_blank(int blank, struct fb_info *info)
                 break;
  
         case FB_BLANK_UNBLANK:
-               imxfb_enable_controller(fbi);
-               break;
+               return imxfb_enable_controller(fbi);
         }
         return 0;
  }
@@ -758,10 +777,11 @@ static int imxfb_lcd_get_power(struct lcd_device *lcddev)
  {
         struct imxfb_info *fbi = dev_get_drvdata(&lcddev->dev);
  
-       if (!IS_ERR(fbi->lcd_pwr))
-               return regulator_is_enabled(fbi->lcd_pwr);
+       if (!IS_ERR(fbi->lcd_pwr) &&
+           !regulator_is_enabled(fbi->lcd_pwr))
+               return FB_BLANK_POWERDOWN;
  
-       return 1;
+       return FB_BLANK_UNBLANK;
  }
  
  static int imxfb_lcd_set_power(struct lcd_device *lcddev, int power)
@@ -769,7 +789,7 @@ static int imxfb_lcd_set_power(struct lcd_device *lcddev, int power)
         struct imxfb_info *fbi = dev_get_drvdata(&lcddev->dev);
  
         if (!IS_ERR(fbi->lcd_pwr)) {
-               if (power)
+               if (power == FB_BLANK_UNBLANK)
                         return regulator_enable(fbi->lcd_pwr);
                 else
                         return regulator_disable(fbi->lcd_pwr);
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c

index 0eec073b3919e4b2b6fc6cbe3c87d669cce45e80..d63e59807707966bc5ce43346f4c76d42fd140df 100644 (file)
--- a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
@@ -1180,13 +1180,11 @@ static int dsi_regulator_init(struct platform_device *dsidev)
                 return PTR_ERR(vdds_dsi);
         }
  
-       if (regulator_can_change_voltage(vdds_dsi)) {
-               r = regulator_set_voltage(vdds_dsi, 1800000, 1800000);
-               if (r) {
-                       devm_regulator_put(vdds_dsi);
-                       DSSERR("can't set the DSI regulator voltage\n");
-                       return r;
-               }
+       r = regulator_set_voltage(vdds_dsi, 1800000, 1800000);
+       if (r) {
+               devm_regulator_put(vdds_dsi);
+               DSSERR("can't set the DSI regulator voltage\n");
+               return r;
         }
  
         dsi->vdds_dsi_reg = vdds_dsi;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c

index 7103c659a53488f48495dee43091200edb95f53b..2e71aec838b1b5276d31a47cf1b5da77767b41a3 100644 (file)
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
@@ -114,13 +114,11 @@ static int hdmi_init_regulator(void)
                 return PTR_ERR(reg);
         }
  
-       if (regulator_can_change_voltage(reg)) {
-               r = regulator_set_voltage(reg, 1800000, 1800000);
-               if (r) {
-                       devm_regulator_put(reg);
-                       DSSWARN("can't set the regulator voltage\n");
-                       return r;
-               }
+       r = regulator_set_voltage(reg, 1800000, 1800000);
+       if (r) {
+               devm_regulator_put(reg);
+               DSSWARN("can't set the regulator voltage\n");
+               return r;
         }
  
         hdmi.vdda_reg = reg;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c

index a955a2c4c061f4cd1c890f61f2ec749076eb66b9..aade6d99662ab800fa03b5ab25e2a844bce822b1 100644 (file)
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
@@ -131,13 +131,11 @@ static int hdmi_init_regulator(void)
                 return PTR_ERR(reg);
         }
  
-       if (regulator_can_change_voltage(reg)) {
-               r = regulator_set_voltage(reg, 1800000, 1800000);
-               if (r) {
-                       devm_regulator_put(reg);
-                       DSSWARN("can't set the regulator voltage\n");
-                       return r;
-               }
+       r = regulator_set_voltage(reg, 1800000, 1800000);
+       if (r) {
+               devm_regulator_put(reg);
+               DSSWARN("can't set the regulator voltage\n");
+               return r;
         }
  
         hdmi.vdda_reg = reg;
diff --git a/drivers/video/fbdev/sh_mipi_dsi.c b/drivers/video/fbdev/sh_mipi_dsi.c

deleted file mode 100644 (file)

index 8f6e8ff..0000000
--- a/drivers/video/fbdev/sh_mipi_dsi.c
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
- * Renesas SH-mobile MIPI DSI support
- *
- * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
- *
- * This is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- */
-
-#include <linux/bitmap.h>
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/pm_runtime.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/module.h>
-
-#include <video/mipi_display.h>
-#include <video/sh_mipi_dsi.h>
-#include <video/sh_mobile_lcdc.h>
-
-#include "sh_mobile_lcdcfb.h"
-
-#define SYSCTRL                0x0000
-#define SYSCONF                0x0004
-#define TIMSET         0x0008
-#define RESREQSET0     0x0018
-#define RESREQSET1     0x001c
-#define HSTTOVSET      0x0020
-#define LPRTOVSET      0x0024
-#define TATOVSET       0x0028
-#define PRTOVSET       0x002c
-#define DSICTRL                0x0030
-#define DSIINTE                0x0060
-#define PHYCTRL                0x0070
-
-/* relative to linkbase */
-#define DTCTR          0x0000
-#define VMCTR1         0x0020
-#define VMCTR2         0x0024
-#define VMLEN1         0x0028
-#define VMLEN2         0x002c
-#define CMTSRTREQ      0x0070
-#define CMTSRTCTR      0x00d0
-
-/* E.g., sh7372 has 2 MIPI-DSIs - one for each LCDC */
-#define MAX_SH_MIPI_DSI 2
-
-struct sh_mipi {
-       struct sh_mobile_lcdc_entity entity;
-
-       void __iomem    *base;
-       void __iomem    *linkbase;
-       struct clk      *dsit_clk;
-       struct platform_device *pdev;
-};
-
-#define to_sh_mipi(e)  container_of(e, struct sh_mipi, entity)
-
-static struct sh_mipi *mipi_dsi[MAX_SH_MIPI_DSI];
-
-/* Protect the above array */
-static DEFINE_MUTEX(array_lock);
-
-static struct sh_mipi *sh_mipi_by_handle(int handle)
-{
-       if (handle >= ARRAY_SIZE(mipi_dsi) || handle < 0)
-               return NULL;
-
-       return mipi_dsi[handle];
-}
-
-static int sh_mipi_send_short(struct sh_mipi *mipi, u8 dsi_cmd,
-                             u8 cmd, u8 param)
-{
-       u32 data = (dsi_cmd << 24) | (cmd << 16) | (param << 8);
-       int cnt = 100;
-
-       /* transmit a short packet to LCD panel */
-       iowrite32(1 | data, mipi->linkbase + CMTSRTCTR);
-       iowrite32(1, mipi->linkbase + CMTSRTREQ);
-
-       while ((ioread32(mipi->linkbase + CMTSRTREQ) & 1) && --cnt)
-               udelay(1);
-
-       return cnt ? 0 : -ETIMEDOUT;
-}
-
-#define LCD_CHAN2MIPI(c) ((c) < LCDC_CHAN_MAINLCD || (c) > LCDC_CHAN_SUBLCD ? \
-                               -EINVAL : (c) - 1)
-
-static int sh_mipi_dcs(int handle, u8 cmd)
-{
-       struct sh_mipi *mipi = sh_mipi_by_handle(LCD_CHAN2MIPI(handle));
-       if (!mipi)
-               return -ENODEV;
-       return sh_mipi_send_short(mipi, MIPI_DSI_DCS_SHORT_WRITE, cmd, 0);
-}
-
-static int sh_mipi_dcs_param(int handle, u8 cmd, u8 param)
-{
-       struct sh_mipi *mipi = sh_mipi_by_handle(LCD_CHAN2MIPI(handle));
-       if (!mipi)
-               return -ENODEV;
-       return sh_mipi_send_short(mipi, MIPI_DSI_DCS_SHORT_WRITE_PARAM, cmd,
-                                 param);
-}
-
-static void sh_mipi_dsi_enable(struct sh_mipi *mipi, bool enable)
-{
-       /*
-        * enable LCDC data tx, transition to LPS after completion of each HS
-        * packet
-        */
-       iowrite32(0x00000002 | enable, mipi->linkbase + DTCTR);
-}
-
-static void sh_mipi_shutdown(struct platform_device *pdev)
-{
-       struct sh_mipi *mipi = to_sh_mipi(platform_get_drvdata(pdev));
-
-       sh_mipi_dsi_enable(mipi, false);
-}
-
-static int sh_mipi_setup(struct sh_mipi *mipi, const struct fb_videomode *mode)
-{
-       void __iomem *base = mipi->base;
-       struct sh_mipi_dsi_info *pdata = mipi->pdev->dev.platform_data;
-       u32 pctype, datatype, pixfmt, linelength, vmctr2;
-       u32 tmp, top, bottom, delay, div;
-       int bpp;
-
-       /*
-        * Select data format. MIPI DSI is not hot-pluggable, so, we just use
-        * the default videomode. If this ever becomes a problem, We'll have to
-        * move this to mipi_display_on() above and use info->var.xres
-        */
-       switch (pdata->data_format) {
-       case MIPI_RGB888:
-               pctype = 0;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_24;
-               pixfmt = MIPI_DCS_PIXEL_FMT_24BIT;
-               linelength = mode->xres * 3;
-               break;
-       case MIPI_RGB565:
-               pctype = 1;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_16;
-               pixfmt = MIPI_DCS_PIXEL_FMT_16BIT;
-               linelength = mode->xres * 2;
-               break;
-       case MIPI_RGB666_LP:
-               pctype = 2;
-               datatype = MIPI_DSI_PIXEL_STREAM_3BYTE_18;
-               pixfmt = MIPI_DCS_PIXEL_FMT_24BIT;
-               linelength = mode->xres * 3;
-               break;
-       case MIPI_RGB666:
-               pctype = 3;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_18;
-               pixfmt = MIPI_DCS_PIXEL_FMT_18BIT;
-               linelength = (mode->xres * 18 + 7) / 8;
-               break;
-       case MIPI_BGR888:
-               pctype = 8;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_24;
-               pixfmt = MIPI_DCS_PIXEL_FMT_24BIT;
-               linelength = mode->xres * 3;
-               break;
-       case MIPI_BGR565:
-               pctype = 9;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_16;
-               pixfmt = MIPI_DCS_PIXEL_FMT_16BIT;
-               linelength = mode->xres * 2;
-               break;
-       case MIPI_BGR666_LP:
-               pctype = 0xa;
-               datatype = MIPI_DSI_PIXEL_STREAM_3BYTE_18;
-               pixfmt = MIPI_DCS_PIXEL_FMT_24BIT;
-               linelength = mode->xres * 3;
-               break;
-       case MIPI_BGR666:
-               pctype = 0xb;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_18;
-               pixfmt = MIPI_DCS_PIXEL_FMT_18BIT;
-               linelength = (mode->xres * 18 + 7) / 8;
-               break;
-       case MIPI_YUYV:
-               pctype = 4;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_YCBCR16;
-               pixfmt = MIPI_DCS_PIXEL_FMT_16BIT;
-               linelength = mode->xres * 2;
-               break;
-       case MIPI_UYVY:
-               pctype = 5;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_YCBCR16;
-               pixfmt = MIPI_DCS_PIXEL_FMT_16BIT;
-               linelength = mode->xres * 2;
-               break;
-       case MIPI_YUV420_L:
-               pctype = 6;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_YCBCR12;
-               pixfmt = MIPI_DCS_PIXEL_FMT_12BIT;
-               linelength = (mode->xres * 12 + 7) / 8;
-               break;
-       case MIPI_YUV420:
-               pctype = 7;
-               datatype = MIPI_DSI_PACKED_PIXEL_STREAM_YCBCR12;
-               pixfmt = MIPI_DCS_PIXEL_FMT_12BIT;
-               /* Length of U/V line */
-               linelength = (mode->xres + 1) / 2;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (!pdata->lane)
-               return -EINVAL;
-
-       /* reset DSI link */
-       iowrite32(0x00000001, base + SYSCTRL);
-       /* Hold reset for 100 cycles of the slowest of bus, HS byte and LP clock */
-       udelay(50);
-       iowrite32(0x00000000, base + SYSCTRL);
-
-       /* setup DSI link */
-
-       /*
-        * T_wakeup = 0x7000
-        * T_hs-trail = 3
-        * T_hs-prepare = 3
-        * T_clk-trail = 3
-        * T_clk-prepare = 2
-        */
-       iowrite32(0x70003332, base + TIMSET);
-       /* no responses requested */
-       iowrite32(0x00000000, base + RESREQSET0);
-       /* request response to packets of type 0x28 */
-       iowrite32(0x00000100, base + RESREQSET1);
-       /* High-speed transmission timeout, default 0xffffffff */
-       iowrite32(0x0fffffff, base + HSTTOVSET);
-       /* LP reception timeout, default 0xffffffff */
-       iowrite32(0x0fffffff, base + LPRTOVSET);
-       /* Turn-around timeout, default 0xffffffff */
-       iowrite32(0x0fffffff, base + TATOVSET);
-       /* Peripheral reset timeout, default 0xffffffff */
-       iowrite32(0x0fffffff, base + PRTOVSET);
-       /* Interrupts not used, disable all */
-       iowrite32(0, base + DSIINTE);
-       /* DSI-Tx bias on */
-       iowrite32(0x00000001, base + PHYCTRL);
-       udelay(200);
-       /* Deassert resets, power on */
-       iowrite32(0x03070001 | pdata->phyctrl, base + PHYCTRL);
-
-       /*
-        * Default = ULPS enable |
-        *      Contention detection enabled |
-        *      EoT packet transmission enable |
-        *      CRC check enable |
-        *      ECC check enable
-        */
-       bitmap_fill((unsigned long *)&tmp, pdata->lane);
-       tmp |= 0x00003700;
-       iowrite32(tmp, base + SYSCONF);
-
-       /* setup l-bridge */
-
-       /*
-        * Enable transmission of all packets,
-        * transmit LPS after each HS packet completion
-        */
-       iowrite32(0x00000006, mipi->linkbase + DTCTR);
-       /* VSYNC width = 2 (<< 17) */
-       iowrite32((mode->vsync_len << pdata->vsynw_offset) |
-                 (pdata->clksrc << 16) | (pctype << 12) | datatype,
-                 mipi->linkbase + VMCTR1);
-
-       /*
-        * Non-burst mode with sync pulses: VSE and HSE are output,
-        * HSA period allowed, no commands in LP
-        */
-       vmctr2 = 0;
-       if (pdata->flags & SH_MIPI_DSI_VSEE)
-               vmctr2 |= 1 << 23;
-       if (pdata->flags & SH_MIPI_DSI_HSEE)
-               vmctr2 |= 1 << 22;
-       if (pdata->flags & SH_MIPI_DSI_HSAE)
-               vmctr2 |= 1 << 21;
-       if (pdata->flags & SH_MIPI_DSI_BL2E)
-               vmctr2 |= 1 << 17;
-       if (pdata->flags & SH_MIPI_DSI_HSABM)
-               vmctr2 |= 1 << 5;
-       if (pdata->flags & SH_MIPI_DSI_HBPBM)
-               vmctr2 |= 1 << 4;
-       if (pdata->flags & SH_MIPI_DSI_HFPBM)
-               vmctr2 |= 1 << 3;
-       iowrite32(vmctr2, mipi->linkbase + VMCTR2);
-
-       /*
-        * VMLEN1 = RGBLEN | HSALEN
-        *
-        * see
-        *  Video mode - Blanking Packet setting
-        */
-       top = linelength << 16; /* RGBLEN */
-       bottom = 0x00000001;
-       if (pdata->flags & SH_MIPI_DSI_HSABM) /* HSALEN */
-               bottom = (pdata->lane * mode->hsync_len) - 10;
-       iowrite32(top | bottom , mipi->linkbase + VMLEN1);
-
-       /*
-        * VMLEN2 = HBPLEN | HFPLEN
-        *
-        * see
-        *  Video mode - Blanking Packet setting
-        */
-       top     = 0x00010000;
-       bottom  = 0x00000001;
-       delay   = 0;
-
-       div = 1;        /* HSbyteCLK is calculation base
-                        * HS4divCLK = HSbyteCLK/2
-                        * HS6divCLK is not supported for now */
-       if (pdata->flags & SH_MIPI_DSI_HS4divCLK)
-               div = 2;
-
-       if (pdata->flags & SH_MIPI_DSI_HFPBM) { /* HBPLEN */
-               top = mode->hsync_len + mode->left_margin;
-               top = ((pdata->lane * top / div) - 10) << 16;
-       }
-       if (pdata->flags & SH_MIPI_DSI_HBPBM) { /* HFPLEN */
-               bottom = mode->right_margin;
-               bottom = (pdata->lane * bottom / div) - 12;
-       }
-
-       bpp = linelength / mode->xres; /* byte / pixel */
-       if ((pdata->lane / div) > bpp) {
-               tmp = mode->xres / bpp; /* output cycle */
-               tmp = mode->xres - tmp; /* (input - output) cycle */
-               delay = (pdata->lane * tmp);
-       }
-
-       iowrite32(top | (bottom + delay) , mipi->linkbase + VMLEN2);
-
-       msleep(5);
-
-       /* setup LCD panel */
-
-       /* cf. drivers/video/omap/lcd_mipid.c */
-       sh_mipi_dcs(pdata->channel, MIPI_DCS_EXIT_SLEEP_MODE);
-       msleep(120);
-       /*
-        * [7] - Page Address Mode
-        * [6] - Column Address Mode
-        * [5] - Page / Column Address Mode
-        * [4] - Display Device Line Refresh Order
-        * [3] - RGB/BGR Order
-        * [2] - Display Data Latch Data Order
-        * [1] - Flip Horizontal
-        * [0] - Flip Vertical
-        */
-       sh_mipi_dcs_param(pdata->channel, MIPI_DCS_SET_ADDRESS_MODE, 0x00);
-       /* cf. set_data_lines() */
-       sh_mipi_dcs_param(pdata->channel, MIPI_DCS_SET_PIXEL_FORMAT,
-                         pixfmt << 4);
-       sh_mipi_dcs(pdata->channel, MIPI_DCS_SET_DISPLAY_ON);
-
-       /* Enable timeout counters */
-       iowrite32(0x00000f00, base + DSICTRL);
-
-       return 0;
-}
-
-static int mipi_display_on(struct sh_mobile_lcdc_entity *entity)
-{
-       struct sh_mipi *mipi = to_sh_mipi(entity);
-       struct sh_mipi_dsi_info *pdata = mipi->pdev->dev.platform_data;
-       int ret;
-
-       pm_runtime_get_sync(&mipi->pdev->dev);
-
-       ret = pdata->set_dot_clock(mipi->pdev, mipi->base, 1);
-       if (ret < 0)
-               goto mipi_display_on_fail1;
-
-       ret = sh_mipi_setup(mipi, &entity->def_mode);
-       if (ret < 0)
-               goto mipi_display_on_fail2;
-
-       sh_mipi_dsi_enable(mipi, true);
-
-       return SH_MOBILE_LCDC_DISPLAY_CONNECTED;
-
-mipi_display_on_fail1:
-       pm_runtime_put_sync(&mipi->pdev->dev);
-mipi_display_on_fail2:
-       pdata->set_dot_clock(mipi->pdev, mipi->base, 0);
-
-       return ret;
-}
-
-static void mipi_display_off(struct sh_mobile_lcdc_entity *entity)
-{
-       struct sh_mipi *mipi = to_sh_mipi(entity);
-       struct sh_mipi_dsi_info *pdata = mipi->pdev->dev.platform_data;
-
-       sh_mipi_dsi_enable(mipi, false);
-
-       pdata->set_dot_clock(mipi->pdev, mipi->base, 0);
-
-       pm_runtime_put_sync(&mipi->pdev->dev);
-}
-
-static const struct sh_mobile_lcdc_entity_ops mipi_ops = {
-       .display_on = mipi_display_on,
-       .display_off = mipi_display_off,
-};
-
-static int __init sh_mipi_probe(struct platform_device *pdev)
-{
-       struct sh_mipi *mipi;
-       struct sh_mipi_dsi_info *pdata = pdev->dev.platform_data;
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       struct resource *res2 = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-       unsigned long rate, f_current;
-       int idx = pdev->id, ret;
-
-       if (!res || !res2 || idx >= ARRAY_SIZE(mipi_dsi) || !pdata)
-               return -ENODEV;
-
-       if (!pdata->set_dot_clock)
-               return -EINVAL;
-
-       mutex_lock(&array_lock);
-       if (idx < 0)
-               for (idx = 0; idx < ARRAY_SIZE(mipi_dsi) && mipi_dsi[idx]; idx++)
-                       ;
-
-       if (idx == ARRAY_SIZE(mipi_dsi)) {
-               ret = -EBUSY;
-               goto efindslot;
-       }
-
-       mipi = kzalloc(sizeof(*mipi), GFP_KERNEL);
-       if (!mipi) {
-               ret = -ENOMEM;
-               goto ealloc;
-       }
-
-       mipi->entity.owner = THIS_MODULE;
-       mipi->entity.ops = &mipi_ops;
-
-       if (!request_mem_region(res->start, resource_size(res), pdev->name)) {
-               dev_err(&pdev->dev, "MIPI register region already claimed\n");
-               ret = -EBUSY;
-               goto ereqreg;
-       }
-
-       mipi->base = ioremap(res->start, resource_size(res));
-       if (!mipi->base) {
-               ret = -ENOMEM;
-               goto emap;
-       }
-
-       if (!request_mem_region(res2->start, resource_size(res2), pdev->name)) {
-               dev_err(&pdev->dev, "MIPI register region 2 already claimed\n");
-               ret = -EBUSY;
-               goto ereqreg2;
-       }
-
-       mipi->linkbase = ioremap(res2->start, resource_size(res2));
-       if (!mipi->linkbase) {
-               ret = -ENOMEM;
-               goto emap2;
-       }
-
-       mipi->pdev = pdev;
-
-       mipi->dsit_clk = clk_get(&pdev->dev, "dsit_clk");
-       if (IS_ERR(mipi->dsit_clk)) {
-               ret = PTR_ERR(mipi->dsit_clk);
-               goto eclktget;
-       }
-
-       f_current = clk_get_rate(mipi->dsit_clk);
-       /* 80MHz required by the datasheet */
-       rate = clk_round_rate(mipi->dsit_clk, 80000000);
-       if (rate > 0 && rate != f_current)
-               ret = clk_set_rate(mipi->dsit_clk, rate);
-       else
-               ret = rate;
-       if (ret < 0)
-               goto esettrate;
-
-       dev_dbg(&pdev->dev, "DSI-T clk %lu -> %lu\n", f_current, rate);
-
-       ret = clk_enable(mipi->dsit_clk);
-       if (ret < 0)
-               goto eclkton;
-
-       mipi_dsi[idx] = mipi;
-
-       pm_runtime_enable(&pdev->dev);
-       pm_runtime_resume(&pdev->dev);
-
-       mutex_unlock(&array_lock);
-       platform_set_drvdata(pdev, &mipi->entity);
-
-       return 0;
-
-eclkton:
-esettrate:
-       clk_put(mipi->dsit_clk);
-eclktget:
-       iounmap(mipi->linkbase);
-emap2:
-       release_mem_region(res2->start, resource_size(res2));
-ereqreg2:
-       iounmap(mipi->base);
-emap:
-       release_mem_region(res->start, resource_size(res));
-ereqreg:
-       kfree(mipi);
-ealloc:
-efindslot:
-       mutex_unlock(&array_lock);
-
-       return ret;
-}
-
-static int sh_mipi_remove(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       struct resource *res2 = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-       struct sh_mipi *mipi = to_sh_mipi(platform_get_drvdata(pdev));
-       int i, ret;
-
-       mutex_lock(&array_lock);
-
-       for (i = 0; i < ARRAY_SIZE(mipi_dsi) && mipi_dsi[i] != mipi; i++)
-               ;
-
-       if (i == ARRAY_SIZE(mipi_dsi)) {
-               ret = -EINVAL;
-       } else {
-               ret = 0;
-               mipi_dsi[i] = NULL;
-       }
-
-       mutex_unlock(&array_lock);
-
-       if (ret < 0)
-               return ret;
-
-       pm_runtime_disable(&pdev->dev);
-       clk_disable(mipi->dsit_clk);
-       clk_put(mipi->dsit_clk);
-
-       iounmap(mipi->linkbase);
-       if (res2)
-               release_mem_region(res2->start, resource_size(res2));
-       iounmap(mipi->base);
-       if (res)
-               release_mem_region(res->start, resource_size(res));
-       kfree(mipi);
-
-       return 0;
-}
-
-static struct platform_driver sh_mipi_driver = {
-       .remove         = sh_mipi_remove,
-       .shutdown       = sh_mipi_shutdown,
-       .driver = {
-               .name   = "sh-mipi-dsi",
-       },
-};
-
-module_platform_driver_probe(sh_mipi_driver, sh_mipi_probe);
-
-MODULE_AUTHOR("Guennadi Liakhovetski <g.liakhovetski@gmx.de>");
-MODULE_DESCRIPTION("SuperH / ARM-shmobile MIPI DSI driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/video/fbdev/ssd1307fb.c b/drivers/video/fbdev/ssd1307fb.c

index fa3480815cdb61dadec19f15772523c6eaf35cbd..21dafe53ca492421fd6fe747e32ab3f7aa643dbd 100644 (file)
--- a/drivers/video/fbdev/ssd1307fb.c
+++ b/drivers/video/fbdev/ssd1307fb.c
@@ -389,7 +389,7 @@ static int ssd1307fb_init(struct ssd1307fb_par *par)
                 return ret;
  
         ret = ssd1307fb_write_cmd(par->client,
-               (par->device_info->need_chargepump & 0x1 << 2) & 0x14);
+               BIT(4) | (par->device_info->need_chargepump ? BIT(2) : 0));
         if (ret < 0)
                 return ret;
  
diff --git a/drivers/video/fbdev/via/accel.c b/drivers/video/fbdev/via/accel.c

index 4b67b8e6030a1d25338a0a4159b7a7c5986751af..eb3615c69987e2db6f4f709b015cee975bdabddd 100644 (file)
--- a/drivers/video/fbdev/via/accel.c
+++ b/drivers/video/fbdev/via/accel.c
@@ -358,7 +358,7 @@ int viafb_setup_engine(struct fb_info *info)
         viapar->shared->vq_vram_addr = viapar->fbmem_free;
         viapar->fbmem_used += VQ_SIZE;
  
-#if defined(CONFIG_VIDEO_VIA_CAMERA) || defined(CONFIG_VIDEO_VIA_CAMERA_MODULE)
+#if IS_ENABLED(CONFIG_VIDEO_VIA_CAMERA)
         /*
          * Set aside a chunk of framebuffer memory for the camera
          * driver.  Someday this driver probably needs a proper allocator
diff --git a/drivers/video/fbdev/via/via-core.c b/drivers/video/fbdev/via/via-core.c

index 6e274825fb312349d95d5bdddb43b4c9e45e8775..1d28e16888e9c00685ac69cb57cc7a7224bfcdc5 100644 (file)
--- a/drivers/video/fbdev/via/via-core.c
+++ b/drivers/video/fbdev/via/via-core.c
@@ -116,7 +116,7 @@ EXPORT_SYMBOL_GPL(viafb_irq_disable);
   * most viafb systems will not need to have this extra code for a while.
   * As soon as another user comes long, the ifdef can be removed.
   */
-#if defined(CONFIG_VIDEO_VIA_CAMERA) || defined(CONFIG_VIDEO_VIA_CAMERA_MODULE)
+#if IS_ENABLED(CONFIG_VIDEO_VIA_CAMERA)
  /*
   * Access to the DMA engine.  This currently provides what the camera
   * driver needs (i.e. outgoing only) but is easily expandable if need
@@ -542,7 +542,7 @@ static struct viafb_subdev_info {
         {
                 .name = "viafb-i2c",
         },
-#if defined(CONFIG_VIDEO_VIA_CAMERA) || defined(CONFIG_VIDEO_VIA_CAMERA_MODULE)
+#if IS_ENABLED(CONFIG_VIDEO_VIA_CAMERA)
         {
                 .name = "viafb-camera",
         },
diff --git a/fs/buffer.c b/fs/buffer.c

index af0d9a82a8edff4dd279657a56757cb76588e2f0..754813a6962bc324c476b5d5035f10e761c1e28c 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -255,17 +255,17 @@ out:
   */
  static void free_more_memory(void)
  {
-       struct zone *zone;
+       struct zoneref *z;
         int nid;
  
         wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
         yield();
  
         for_each_online_node(nid) {
-               (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-                                               gfp_zone(GFP_NOFS), NULL,
-                                               &zone);
-               if (zone)
+
+               z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+                                               gfp_zone(GFP_NOFS), NULL);
+               if (z->zone)
                         try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
                                                 GFP_NOFS, NULL);
         }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c

index 8a74a2a52e0fa2607801c4c5e7e707379ac77fbf..10db912189338096a3d37176b043145bca923da8 100644 (file)
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep,
         return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
  }
  
-static inline struct timespec ep_set_mstimeout(long ms)
+static inline struct timespec64 ep_set_mstimeout(long ms)
  {
-       struct timespec now, ts = {
+       struct timespec64 now, ts = {
                 .tv_sec = ms / MSEC_PER_SEC,
                 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
         };
  
-       ktime_get_ts(&now);
-       return timespec_add_safe(now, ts);
+       ktime_get_ts64(&now);
+       return timespec64_add_safe(now, ts);
  }
  
  /**
@@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
         ktime_t expires, *to = NULL;
  
         if (timeout > 0) {
-               struct timespec end_time = ep_set_mstimeout(timeout);
+               struct timespec64 end_time = ep_set_mstimeout(timeout);
  
                 slack = select_estimate_accuracy(&end_time);
                 to = &expires;
-               *to = timespec_to_ktime(end_time);
+               *to = timespec64_to_ktime(end_time);
         } else if (timeout == 0) {
                 /*
                  * Avoid the unnecessary trip to the wait queue loop, if the
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c

index 8524c0e322fcbad28e7fd6df3b9aa7aefd00a9f1..37b7bc14c8da578a89282bc4fab2cadd93ed441c 100644 (file)
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -977,7 +977,7 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
                 if (!list_empty(&bd->bd_list) && !buffer_pinned(bh))
                         list_del_init(&bd->bd_list);
                 else
-                       gfs2_remove_from_journal(bh, current->journal_info, 0);
+                       gfs2_remove_from_journal(bh, REMOVE_JDATA);
         }
         bh->b_bdev = NULL;
         clear_buffer_mapped(bh);
@@ -1063,7 +1063,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
         gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
         rv = gfs2_glock_nq(&gh);
         if (rv)
-               return rv;
+               goto out_uninit;
         rv = gfs2_ok_for_dio(ip, offset);
         if (rv != 1)
                 goto out; /* dio not valid, fall back to buffered i/o */
@@ -1102,6 +1102,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                                   gfs2_get_block_direct, NULL, NULL, 0);
  out:
         gfs2_glock_dq(&gh);
+out_uninit:
         gfs2_holder_uninit(&gh);
         return rv;
  }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c

index e53b723abd3b828f991f5138515742d869627908..e0f98e483aec1a1aa20ff3fe7c416cd017adf6c7 100644 (file)
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -160,7 +160,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
         gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
         error = gfs2_glock_nq(&gh);
         if (error)
-               return error;
+               goto out_uninit;
  
         fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
         if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
@@ -169,6 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
                 error = -EFAULT;
  
         gfs2_glock_dq(&gh);
+out_uninit:
         gfs2_holder_uninit(&gh);
         return error;
  }
@@ -953,6 +954,30 @@ out_uninit:
         return ret;
  }
  
+static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos,
+                                    struct pipe_inode_info *pipe, size_t len,
+                                    unsigned int flags)
+{
+       struct inode *inode = in->f_mapping->host;
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_holder gh;
+       int ret;
+
+       inode_lock(inode);
+
+       ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+       if (ret) {
+               inode_unlock(inode);
+               return ret;
+       }
+
+       gfs2_glock_dq_uninit(&gh);
+       inode_unlock(inode);
+
+       return generic_file_splice_read(in, ppos, pipe, len, flags);
+}
+
+
  static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
                                       struct file *out, loff_t *ppos,
                                       size_t len, unsigned int flags)
@@ -1115,7 +1140,7 @@ const struct file_operations gfs2_file_fops = {
         .fsync          = gfs2_fsync,
         .lock           = gfs2_lock,
         .flock          = gfs2_flock,
-       .splice_read    = generic_file_splice_read,
+       .splice_read    = gfs2_file_splice_read,
         .splice_write   = gfs2_file_splice_write,
         .setlease       = simple_nosetlease,
         .fallocate      = gfs2_fallocate,
@@ -1143,7 +1168,7 @@ const struct file_operations gfs2_file_fops_nolock = {
         .open           = gfs2_open,
         .release        = gfs2_release,
         .fsync          = gfs2_fsync,
-       .splice_read    = generic_file_splice_read,
+       .splice_read    = gfs2_file_splice_read,
         .splice_write   = gfs2_file_splice_write,
         .setlease       = generic_setlease,
         .fallocate      = gfs2_fallocate,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c

index 4b73bd101bdcd0b4dd4650acb8a42a420d2a74a2..706fd9352f368818391ad79f4481feffe0809b60 100644 (file)
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -218,7 +218,7 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
   *
   */
  
-static inline void do_error(struct gfs2_glock *gl, const int ret)
+static void do_error(struct gfs2_glock *gl, const int ret)
  {
         struct gfs2_holder *gh, *tmp;
  
@@ -475,7 +475,14 @@ __acquires(&gl->gl_lockref.lock)
         if (sdp->sd_lockstruct.ls_ops->lm_lock) {
                 /* lock_dlm */
                 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
-               if (ret) {
+               if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED &&
+                   target == LM_ST_UNLOCKED &&
+                   test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) {
+                       finish_xmote(gl, target);
+                       if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                               gfs2_glock_put(gl);
+               }
+               else if (ret) {
                         pr_err("lm_lock ret %d\n", ret);
                         GLOCK_BUG_ON(gl, 1);
                 }
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c

index 437fd73e381e224063742284fd8f825d35b0e9db..5db59d44483809fa027196910a90d10536843c37 100644 (file)
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -286,17 +286,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
  static int inode_go_demote_ok(const struct gfs2_glock *gl)
  {
         struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-       struct gfs2_holder *gh;
  
         if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
                 return 0;
  
-       if (!list_empty(&gl->gl_holders)) {
-               gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
-               if (gh->gh_list.next != &gl->gl_holders)
-                       return 0;
-       }
-
         return 1;
  }
  
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c

index 72e9c64ae37166d45361bfd1252b3fa85ac2159d..21dc784f66c2268d2e857314104847b600cc09de 100644 (file)
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -93,12 +93,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
         int error;
  
         inode = iget_locked(sb, (unsigned long)no_addr);
-       ip = GFS2_I(inode);
-       ip->i_no_addr = no_addr;
-
         if (!inode)
                 return ERR_PTR(-ENOMEM);
  
+       ip = GFS2_I(inode);
+       ip->i_no_addr = no_addr;
+
         if (inode->i_state & I_NEW) {
                 struct gfs2_sbd *sdp = GFS2_SB(inode);
                 ip->i_no_formal_ino = no_formal_ino;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c

index 0448524c11bcfca005a874861235066de2a7242e..8eaadabbc77100bea1906de90b70e201cb4c57cd 100644 (file)
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -325,18 +325,19 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
         return 0;
  }
  
-void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
+void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
  {
         struct address_space *mapping = bh->b_page->mapping;
         struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
         struct gfs2_bufdata *bd = bh->b_private;
+       struct gfs2_trans *tr = current->journal_info;
         int was_pinned = 0;
  
         if (test_clear_buffer_pinned(bh)) {
                 trace_gfs2_pin(bd, 0);
                 atomic_dec(&sdp->sd_log_pinned);
                 list_del_init(&bd->bd_list);
-               if (meta)
+               if (meta == REMOVE_META)
                         tr->tr_num_buf_rm++;
                 else
                         tr->tr_num_databuf_rm++;
@@ -376,7 +377,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
                 if (bh) {
                         lock_buffer(bh);
                         gfs2_log_lock(sdp);
-                       gfs2_remove_from_journal(bh, current->journal_info, 1);
+                       gfs2_remove_from_journal(bh, REMOVE_META);
                         gfs2_log_unlock(sdp);
                         unlock_buffer(bh);
                         brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h

index c5086c8af5ed41101aae337bfc2c02a5ffe686e8..ffdf6aa3509d59f289361da3f6005c20282c8594 100644 (file)
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -57,8 +57,12 @@ extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
  extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
  extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
                                        int create);
-extern void gfs2_remove_from_journal(struct buffer_head *bh,
-                                    struct gfs2_trans *tr, int meta);
+enum {
+       REMOVE_JDATA = 0,
+       REMOVE_META = 1,
+};
+
+extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
  extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
  extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
                                      struct buffer_head **bhp);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c

index 99a0bdac8796b11046f29c5981f32261338d8297..5bd216901e89334186d74751f62ff36ef876ff71 100644 (file)
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -73,8 +73,7 @@ static const char valid_change[16] = {
  };
  
  static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
-                        const struct gfs2_inode *ip, bool nowrap,
-                        const struct gfs2_alloc_parms *ap);
+                        const struct gfs2_inode *ip, bool nowrap);
  
  
  /**
@@ -1511,7 +1510,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
         if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
                 return;
  
-       ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
+       ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
         if (ret == 0) {
                 rs->rs_rbm = rbm;
                 rs->rs_free = extlen;
@@ -1638,7 +1637,6 @@ fail:
   * @ip: If set, check for reservations
   * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
   *          around until we've reached the starting point.
- * @ap: the allocation parameters
   *
   * Side effects:
   * - If looking for free blocks, we set GBF_FULL on each bitmap which
@@ -1650,8 +1648,7 @@ fail:
   */
  
  static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
-                        const struct gfs2_inode *ip, bool nowrap,
-                        const struct gfs2_alloc_parms *ap)
+                        const struct gfs2_inode *ip, bool nowrap)
  {
         struct buffer_head *bh;
         int initial_bii;
@@ -1772,7 +1769,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
         while (1) {
                 down_write(&sdp->sd_log_flush_lock);
                 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
-                                     true, NULL);
+                                     true);
                 up_write(&sdp->sd_log_flush_lock);
                 if (error == -ENOSPC)
                         break;
@@ -2329,12 +2326,11 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
         int error;
  
         gfs2_set_alloc_start(&rbm, ip, dinode);
-       error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
+       error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
  
         if (error == -ENOSPC) {
                 gfs2_set_alloc_start(&rbm, ip, dinode);
-               error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
-                                     NULL);
+               error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
         }
  
         /* Since all blocks are reserved in advance, this shouldn't happen */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c

index cf645835710f8e2e9ea916939418f66adc9655cc..aee4485ad8a9b4f75de1bb9fa356f582ff4a5e4c 100644 (file)
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -68,6 +68,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
                         fs_err(sdp, "telling LM to unmount\n");
                         lm->lm_unmount(sdp);
                 }
+               set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
                 fs_err(sdp, "withdrawn\n");
                 dump_stack();
         }
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h

index b44c68a857e7760743fa74aa0383258b0b6f8e4a..0a3bc2cf192cfdf94f51e3a814c175f26a586e74 100644 (file)
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
         fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
                                &mnt->mnt_root->d_lock);
  }
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
  /*
   * update the dentry->d_flags of all of inode's children to indicate if inode cares
   * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c

index d16b62cb28544a147183c3780e3a61ffa246c8bb..3e2dd85be5dd375a51af031566a381f20e46b21f 100644 (file)
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
   */
  void fsnotify_destroy_group(struct fsnotify_group *group)
  {
-       /* clear all inode marks for this group */
-       fsnotify_clear_marks_by_group(group);
+       /* clear all inode marks for this group, attach them to destroy_list */
+       fsnotify_detach_group_marks(group);
  
-       synchronize_srcu(&fsnotify_mark_srcu);
+       /*
+        * Wait for fsnotify_mark_srcu period to end and free all marks in
+        * destroy_list
+        */
+       fsnotify_mark_destroy_list();
  
-       /* clear the notification queue of all events */
+       /*
+        * Since we have waited for fsnotify_mark_srcu in
+        * fsnotify_mark_destroy_list() there can be no outstanding event
+        * notification against this group. So clearing the notification queue
+        * of all events is reliable now.
+        */
         fsnotify_flush_notify(group);
  
         /*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c

index 7115c5d7d373c63df1512eba5ea90351ea5c7c45..d3fea0bd89e2cbedcea630ba3e966963720f725a 100644 (file)
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
  static DEFINE_SPINLOCK(destroy_lock);
  static LIST_HEAD(destroy_list);
  
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
  
  void fsnotify_get_mark(struct fsnotify_mark *mark)
  {
@@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
  }
  
  /*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
   */
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
  {
         struct fsnotify_group *group = mark->group;
  
@@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
         /* something else already called this function on this mark */
         if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
                 spin_unlock(&mark->lock);
-               return;
+               return false;
         }
         mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
         spin_unlock(&mark->lock);
  
-       spin_lock(&destroy_lock);
-       list_add(&mark->g_list, &destroy_list);
-       spin_unlock(&destroy_lock);
-       queue_delayed_work(system_unbound_wq, &reaper_work,
-                               FSNOTIFY_REAPER_DELAY);
-
         /*
          * Some groups like to know that marks are being freed.  This is a
          * callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
          */
         if (group->ops->freeing_mark)
                 group->ops->freeing_mark(mark, group);
+
+       spin_lock(&destroy_lock);
+       list_add(&mark->g_list, &destroy_list);
+       spin_unlock(&destroy_lock);
+
+       return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+       if (__fsnotify_free_mark(mark)) {
+               queue_delayed_work(system_unbound_wq, &reaper_work,
+                                  FSNOTIFY_REAPER_DELAY);
+       }
  }
  
  void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
  }
  
  /*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
   */
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
  {
-       fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+       struct fsnotify_mark *mark;
+
+       while (1) {
+               mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+               if (list_empty(&group->marks_list)) {
+                       mutex_unlock(&group->mark_mutex);
+                       break;
+               }
+               mark = list_first_entry(&group->marks_list,
+                                       struct fsnotify_mark, g_list);
+               fsnotify_get_mark(mark);
+               fsnotify_detach_mark(mark);
+               mutex_unlock(&group->mark_mutex);
+               __fsnotify_free_mark(mark);
+               fsnotify_put_mark(mark);
+       }
  }
  
  void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
         mark->free_mark = free_mark;
  }
  
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
  {
         struct fsnotify_mark *mark, *next;
         struct list_head private_destroy_list;
@@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
                 fsnotify_put_mark(mark);
         }
  }
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+       fsnotify_mark_destroy_list();
+}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c

index e361d1a0ca09fe411d6d951e9a4ed417940229fb..460c0cedab3a435d5203c753c51b2cedbb709a67 100644 (file)
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
  {
         int ret;
         u32 left_cpos, rec_range, trunc_range;
-       int wants_rotate = 0, is_rightmost_tree_rec = 0;
+       int is_rightmost_tree_rec = 0;
         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
         struct ocfs2_path *left_path = NULL;
         struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle,
  
                 memset(rec, 0, sizeof(*rec));
                 ocfs2_cleanup_merge(el, index);
-               wants_rotate = 1;
  
                 next_free = le16_to_cpu(el->l_next_free_rec);
                 if (is_rightmost_tree_rec && next_free > 1) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c

index 1934abb6b68081a91309a1ee503b4653b3cf23f6..a8d15beee5cb542d02ff29eab921c2afd2b8569b 100644 (file)
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1456,7 +1456,6 @@ static void o2hb_region_release(struct config_item *item)
  
  static int o2hb_read_block_input(struct o2hb_region *reg,
                                  const char *page,
-                                size_t count,
                                  unsigned long *ret_bytes,
                                  unsigned int *ret_bits)
  {
@@ -1499,8 +1498,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
         if (reg->hr_bdev)
                 return -EINVAL;
  
-       status = o2hb_read_block_input(reg, page, count,
-                                      &block_bytes, &block_bits);
+       status = o2hb_read_block_input(reg, page, &block_bytes,
+                                      &block_bits);
         if (status)
                 return status;
  
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h

index 540ab5b75dbb057b99f5d879edb703c4b18afe9c..44d178b8d1aa90c2876478ae04720b44d33d828d 100644 (file)
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -580,7 +580,7 @@ struct ocfs2_extended_slot {
  /*00*/ __u8    es_valid;
         __u8    es_reserved1[3];
         __le32  es_node_num;
-/*10*/
+/*08*/
  };
  
  /*
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c

index 1e09592148ad2823c8b073ebe9c03b66c99e3366..d7407994f308076fbb8690f00419f5e663aa9bea 100644 (file)
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
         spin_unlock(&osb->osb_lock);
  
         status = ocfs2_update_disk_slot(osb, si, slot_num);
-       if (status < 0) {
+       if (status < 0)
                 mlog_errno(status);
-               goto bail;
-       }
  
-bail:
         ocfs2_free_slot_info(osb);
  }
-
diff --git a/fs/proc/page.c b/fs/proc/page.c

index 712f1b9992ccba12deaeae38d3b4e7bcfb50297d..3ecd445e830dc6138916c5db984e05da9e91807c 100644 (file)
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
  
  
         /*
-        * Caveats on high order pages: page->_count will only be set
+        * Caveats on high order pages: page->_refcount will only be set
          * -1 on the head page; SLUB/SLQB do the same for PG_slab;
          * SLOB won't set PG_slab at all on compound pages.
          */
diff --git a/fs/select.c b/fs/select.c

index 869293988c2a068c8a37c58882323797abab5cab..8ed9da50896a156a463ae4d0c657e566809b09a3 100644 (file)
--- a/fs/select.c
+++ b/fs/select.c
@@ -47,7 +47,7 @@
  
  #define MAX_SLACK      (100 * NSEC_PER_MSEC)
  
-static long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec64 *tv)
  {
         long slack;
         int divfactor = 1000;
@@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv)
         return slack;
  }
  
-u64 select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec64 *tv)
  {
         u64 ret;
-       struct timespec now;
+       struct timespec64 now;
  
         /*
          * Realtime tasks get a slack of 0 for obvious reasons.
@@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv)
         if (rt_task(current))
                 return 0;
  
-       ktime_get_ts(&now);
-       now = timespec_sub(*tv, now);
+       ktime_get_ts64(&now);
+       now = timespec64_sub(*tv, now);
         ret = __estimate_accuracy(&now);
         if (ret < current->timer_slack_ns)
                 return current->timer_slack_ns;
@@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout);
  
  /**
   * poll_select_set_timeout - helper function to setup the timeout value
- * @to:                pointer to timespec variable for the final timeout
+ * @to:                pointer to timespec64 variable for the final timeout
   * @sec:       seconds (from user space)
   * @nsec:      nanoseconds (from user space)
   *
@@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout);
   *
   * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
   */
-int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
  {
-       struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+       struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
  
-       if (!timespec_valid(&ts))
+       if (!timespec64_valid(&ts))
                 return -EINVAL;
  
         /* Optimize for the zero timeout value here */
         if (!sec && !nsec) {
                 to->tv_sec = to->tv_nsec = 0;
         } else {
-               ktime_get_ts(to);
-               *to = timespec_add_safe(*to, ts);
+               ktime_get_ts64(to);
+               *to = timespec64_add_safe(*to, ts);
         }
         return 0;
  }
  
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+static int poll_select_copy_remaining(struct timespec64 *end_time,
+                                     void __user *p,
                                       int timeval, int ret)
  {
+       struct timespec64 rts64;
         struct timespec rts;
         struct timeval rtv;
  
@@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
         if (!end_time->tv_sec && !end_time->tv_nsec)
                 return ret;
  
-       ktime_get_ts(&rts);
-       rts = timespec_sub(*end_time, rts);
-       if (rts.tv_sec < 0)
-               rts.tv_sec = rts.tv_nsec = 0;
+       ktime_get_ts64(&rts64);
+       rts64 = timespec64_sub(*end_time, rts64);
+       if (rts64.tv_sec < 0)
+               rts64.tv_sec = rts64.tv_nsec = 0;
+
+       rts = timespec64_to_timespec(rts64);
  
         if (timeval) {
                 if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
                         memset(&rtv, 0, sizeof(rtv));
-               rtv.tv_sec = rts.tv_sec;
-               rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+               rtv.tv_sec = rts64.tv_sec;
+               rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
  
                 if (!copy_to_user(p, &rtv, sizeof(rtv)))
                         return ret;
@@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
                 wait->_key |= POLLOUT_SET;
  }
  
-int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
+int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
  {
         ktime_t expire, *to = NULL;
         struct poll_wqueues table;
@@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
                  * pointer to the expiry value.
                  */
                 if (end_time && !to) {
-                       expire = timespec_to_ktime(*end_time);
+                       expire = timespec64_to_ktime(*end_time);
                         to = &expire;
                 }
  
@@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
   * I'm trying ERESTARTNOHAND which restart only when you want to.
   */
  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-                          fd_set __user *exp, struct timespec *end_time)
+                          fd_set __user *exp, struct timespec64 *end_time)
  {
         fd_set_bits fds;
         void *bits;
@@ -622,7 +626,7 @@ out_nofds:
  SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                 fd_set __user *, exp, struct timeval __user *, tvp)
  {
-       struct timespec end_time, *to = NULL;
+       struct timespec64 end_time, *to = NULL;
         struct timeval tv;
         int ret;
  
@@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
                        const sigset_t __user *sigmask, size_t sigsetsize)
  {
         sigset_t ksigmask, sigsaved;
-       struct timespec ts, end_time, *to = NULL;
+       struct timespec ts;
+       struct timespec64 ts64, end_time, *to = NULL;
         int ret;
  
         if (tsp) {
                 if (copy_from_user(&ts, tsp, sizeof(ts)))
                         return -EFAULT;
+               ts64 = timespec_to_timespec64(ts);
  
                 to = &end_time;
-               if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+               if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
                         return -EINVAL;
         }
  
@@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
  }
  
  static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
-                  struct timespec *end_time)
+                  struct timespec64 *end_time)
  {
         poll_table* pt = &wait->pt;
         ktime_t expire, *to = NULL;
@@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                  * pointer to the expiry value.
                  */
                 if (end_time && !to) {
-                       expire = timespec_to_ktime(*end_time);
+                       expire = timespec64_to_ktime(*end_time);
                         to = &expire;
                 }
  
@@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                         sizeof(struct pollfd))
  
  int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
-               struct timespec *end_time)
+               struct timespec64 *end_time)
  {
         struct poll_wqueues table;
         int err = -EFAULT, fdcount, len, size;
@@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block)
  {
         struct pollfd __user *ufds = restart_block->poll.ufds;
         int nfds = restart_block->poll.nfds;
-       struct timespec *to = NULL, end_time;
+       struct timespec64 *to = NULL, end_time;
         int ret;
  
         if (restart_block->poll.has_timeout) {
@@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block)
  SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                 int, timeout_msecs)
  {
-       struct timespec end_time, *to = NULL;
+       struct timespec64 end_time, *to = NULL;
         int ret;
  
         if (timeout_msecs >= 0) {
@@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
                 size_t, sigsetsize)
  {
         sigset_t ksigmask, sigsaved;
-       struct timespec ts, end_time, *to = NULL;
+       struct timespec ts;
+       struct timespec64 end_time, *to = NULL;
         int ret;
  
         if (tsp) {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h

index 9401f4819891177e64ebfa488fb9b9d9b96e00be..d4458b6dbfb48831c833b203edc058f6f4a9e847 100644 (file)
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd)
  #define io_remap_pfn_range remap_pfn_range
  #endif
  
+#ifndef has_transparent_hugepage
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage() 1
+#else
+#define has_transparent_hugepage() 0
+#endif
+#endif
+
  #endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/include/dt-bindings/mfd/arizona.h b/include/dt-bindings/mfd/arizona.h

index c40f665e27127f41f572dc39fd78b07dc0e4e0ff..dedf46ffdb539d63f79f1207c2efaa180f259d99 100644 (file)
--- a/include/dt-bindings/mfd/arizona.h
+++ b/include/dt-bindings/mfd/arizona.h
@@ -110,4 +110,9 @@
  #define ARIZONA_ACCDET_MODE_HPM 4
  #define ARIZONA_ACCDET_MODE_ADC 7
  
+#define ARIZONA_GPSW_OPEN           0
+#define ARIZONA_GPSW_CLOSED         1
+#define ARIZONA_GPSW_CLAMP_ENABLED  2
+#define ARIZONA_GPSW_CLAMP_DISABLED 3
+
  #endif
diff --git a/include/dt-bindings/mfd/max77620.h b/include/dt-bindings/mfd/max77620.h

new file mode 100644 (file)

index 0000000..b911a07
--- /dev/null
+++ b/include/dt-bindings/mfd/max77620.h
@@ -0,0 +1,39 @@
+/*
+ * This header provides macros for MAXIM MAX77620 device bindings.
+ *
+ * Copyright (c) 2016, NVIDIA Corporation.
+ * Author: Laxman Dewangan <ldewangan@nvidia.com>
+ */
+
+#ifndef _DT_BINDINGS_MFD_MAX77620_H
+#define _DT_BINDINGS_MFD_MAX77620_H
+
+/* MAX77620 interrupts */
+#define MAX77620_IRQ_TOP_GLBL          0 /* Low-Battery */
+#define MAX77620_IRQ_TOP_SD            1 /* SD power fail */
+#define MAX77620_IRQ_TOP_LDO           2 /* LDO power fail */
+#define MAX77620_IRQ_TOP_GPIO          3 /* GPIO internal int to MAX77620 */
+#define MAX77620_IRQ_TOP_RTC           4 /* RTC */
+#define MAX77620_IRQ_TOP_32K           5 /* 32kHz oscillator */
+#define MAX77620_IRQ_TOP_ONOFF         6 /* ON/OFF controller */
+#define MAX77620_IRQ_LBT_MBATLOW       7 /* Main battery low */
+#define MAX77620_IRQ_LBT_TJALRM1       8 /* Thermal alarm status, > 120C */
+#define MAX77620_IRQ_LBT_TJALRM2       9 /* Thermal alarm status, > 140C */
+
+/* FPS event source */
+#define MAX77620_FPS_EVENT_SRC_EN0             0
+#define MAX77620_FPS_EVENT_SRC_EN1             1
+#define MAX77620_FPS_EVENT_SRC_SW              2
+
+/* Device state when FPS event LOW  */
+#define MAX77620_FPS_INACTIVE_STATE_SLEEP      0
+#define MAX77620_FPS_INACTIVE_STATE_LOW_POWER  1
+
+/* FPS source */
+#define MAX77620_FPS_SRC_0                     0
+#define MAX77620_FPS_SRC_1                     1
+#define MAX77620_FPS_SRC_2                     2
+#define MAX77620_FPS_SRC_NONE                  3
+#define MAX77620_FPS_SRC_DEF                   4
+
+#endif
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h

index 35b22f94d2d27d4977e30099abd3093352f2e85c..f9be3269171801524f20e56fe5ea8e7879149132 100644 (file)
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size,
                              unsigned long goal);
  extern void *__alloc_bootmem_nopanic(unsigned long size,
                                      unsigned long align,
-                                    unsigned long goal);
+                                    unsigned long goal) __malloc;
  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
                                   unsigned long size,
                                   unsigned long align,
-                                 unsigned long goal);
+                                 unsigned long goal) __malloc;
  void *__alloc_bootmem_node_high(pg_data_t *pgdat,
                                   unsigned long size,
                                   unsigned long align,
-                                 unsigned long goal);
+                                 unsigned long goal) __malloc;
  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                   unsigned long size,
                                   unsigned long align,
-                                 unsigned long goal);
+                                 unsigned long goal) __malloc;
  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                   unsigned long size,
                                   unsigned long align,
                                   unsigned long goal,
-                                 unsigned long limit);
+                                 unsigned long limit) __malloc;
  extern void *__alloc_bootmem_low(unsigned long size,
                                  unsigned long align,
-                                unsigned long goal);
+                                unsigned long goal) __malloc;
  void *__alloc_bootmem_low_nopanic(unsigned long size,
                                  unsigned long align,
-                                unsigned long goal);
+                                unsigned long goal) __malloc;
  extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
                                       unsigned long size,
                                       unsigned long align,
-                                     unsigned long goal);
+                                     unsigned long goal) __malloc;
  
  #ifdef CONFIG_NO_BOOTMEM
  /* We are using top down, so it is safe to use 0 here */
diff --git a/include/linux/compaction.h b/include/linux/compaction.h

index d7c8de583a23d2b12d55fe184adcb60d6fc55dd5..242b660f64e62d26253289aa43328190c3b70155 100644 (file)
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed;
  
  extern int fragmentation_index(struct zone *zone, unsigned int order);
  extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-                       int alloc_flags, const struct alloc_context *ac,
-                       enum migrate_mode mode, int *contended);
+               unsigned int alloc_flags, const struct alloc_context *ac,
+               enum migrate_mode mode, int *contended);
  extern void compact_pgdat(pg_data_t *pgdat, int order);
  extern void reset_isolation_suitable(pg_data_t *pgdat);
  extern unsigned long compaction_suitable(struct zone *zone, int order,
-                                       int alloc_flags, int classzone_idx);
+               unsigned int alloc_flags, int classzone_idx);
  
  extern void defer_compaction(struct zone *zone, int order);
  extern bool compaction_deferred(struct zone *zone, int order);
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h

index 3d5202eda22f262e5c134bfb1f55b7e248dfcd79..e2949397c19b0d58bbebf84d37028ae9b969546c 100644 (file)
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -142,6 +142,7 @@
  
  #if GCC_VERSION >= 30400
  #define __must_check           __attribute__((warn_unused_result))
+#define __malloc               __attribute__((__malloc__))
  #endif
  
  #if GCC_VERSION >= 40000
diff --git a/include/linux/compiler.h b/include/linux/compiler.h

index b5ff9881bef8f9696e921fb8731e1ff0ff072a94..793c0829e3a3909dd532c972e513c7b8164a16ed 100644 (file)
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  #define __deprecated_for_modules
  #endif
  
+#ifndef __malloc
+#define __malloc
+#endif
+
  /*
   * Allow us to avoid 'defined but not used' warnings on functions and data,
   * as well as force them to be emitted to the assembly file.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 85a868ccb4931d374a1ee9fb4e4036bb84399561..bfc204e70338ab1eed9016b6183076647149cdbc 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
  
  #ifdef CONFIG_CPUSETS
  
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
  static inline bool cpusets_enabled(void)
  {
-       return static_key_false(&cpusets_enabled_key);
+       return static_branch_unlikely(&cpusets_enabled_key);
  }
  
  static inline int nr_cpusets(void)
  {
         /* jump label reference count + the top-level cpuset */
-       return static_key_count(&cpusets_enabled_key) + 1;
+       return static_key_count(&cpusets_enabled_key.key) + 1;
  }
  
  static inline void cpuset_inc(void)
  {
-       static_key_slow_inc(&cpusets_enabled_key);
+       static_branch_inc(&cpusets_enabled_key);
  }
  
  static inline void cpuset_dec(void)
  {
-       static_key_slow_dec(&cpusets_enabled_key);
+       static_branch_dec(&cpusets_enabled_key);
  }
  
  extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
  void cpuset_init_current_mems_allowed(void);
  int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
  
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
  
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
  {
-       return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+       if (cpusets_enabled())
+               return __cpuset_node_allowed(node, gfp_mask);
+       return true;
  }
  
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
  {
-       return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+       return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+       if (cpusets_enabled())
+               return __cpuset_zone_allowed(z, gfp_mask);
+       return true;
  }
  
  extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
         return 1;
  }
  
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
  {
-       return 1;
+       return true;
  }
  
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
  {
-       return 1;
+       return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+       return true;
  }
  
  static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h

index 98ffcbd4888ea0d088f2d4e8c4b41757b0e472c2..46056cb161fce0e1f3de707d887abd2ffcb1d02b 100644 (file)
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,8 +38,10 @@ struct debug_obj {
   * @name:              name of the object typee
   * @debug_hint:                function returning address, which have associated
   *                     kernel symbol, to allow identify the object
+ * @is_static_object:  return true if the obj is static, otherwise return false
   * @fixup_init:                fixup function, which is called when the init check
- *                     fails
+ *                     fails. All fixup functions must return true if fixup
+ *                     was successful, otherwise return false
   * @fixup_activate:    fixup function, which is called when the activate check
   *                     fails
   * @fixup_destroy:     fixup function, which is called when the destroy check
@@ -51,12 +53,13 @@ struct debug_obj {
   */
  struct debug_obj_descr {
         const char              *name;
-       void *(*debug_hint)     (void *addr);
-       int (*fixup_init)       (void *addr, enum debug_obj_state state);
-       int (*fixup_activate)   (void *addr, enum debug_obj_state state);
-       int (*fixup_destroy)    (void *addr, enum debug_obj_state state);
-       int (*fixup_free)       (void *addr, enum debug_obj_state state);
-       int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
+       void *(*debug_hint)(void *addr);
+       bool (*is_static_object)(void *addr);
+       bool (*fixup_init)(void *addr, enum debug_obj_state state);
+       bool (*fixup_activate)(void *addr, enum debug_obj_state state);
+       bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
+       bool (*fixup_free)(void *addr, enum debug_obj_state state);
+       bool (*fixup_assert_init)(void *addr, enum debug_obj_state state);
  };
  
  #ifdef CONFIG_DEBUG_OBJECTS
diff --git a/include/linux/device.h b/include/linux/device.h

index b130304f9b1bb6a45ee4f379a2eff2cbac204530..ca90ad8bcd619b2694d7214f284f0036a562dc7e 100644 (file)
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
  
  #ifdef CONFIG_DEBUG_DEVRES
  extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
-                                int nid, const char *name);
+                                int nid, const char *name) __malloc;
  #define devres_alloc(release, size, gfp) \
         __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
  #define devres_alloc_node(release, size, gfp, nid) \
         __devres_alloc_node(release, size, gfp, nid, #release)
  #else
  extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
-                              int nid);
+                              int nid) __malloc;
  static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
  {
         return devres_alloc_node(release, size, gfp, NUMA_NO_NODE);
@@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id);
  extern int devres_release_group(struct device *dev, void *id);
  
  /* managed devm_k.alloc/kfree for device drivers */
-extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
+extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
  extern __printf(3, 0)
  char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
-                     va_list ap);
+                     va_list ap) __malloc;
  extern __printf(3, 4)
-char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
+char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc;
  static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
  {
         return devm_kmalloc(dev, size, gfp | __GFP_ZERO);
@@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev,
         return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO);
  }
  extern void devm_kfree(struct device *dev, void *p);
-extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp);
+extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
  extern void *devm_kmemdup(struct device *dev, const void *src, size_t len,
                           gfp_t gfp);
  
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h

index 1259e53d9296edea1a152e9caf9b4810471235e6..29f917517299fd0d4a61615d17a5045ce9826135 100644 (file)
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
  extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
  /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
  extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
  extern void fsnotify_unmount_inodes(struct super_block *sb);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h

index d7b9e5346fba0390890a4e477c8952485319e887..419fb9e03447aff8aef55934e89bbd844a28d7e7 100644 (file)
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
  extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                         unsigned long addr, unsigned long end,
                         unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma,
-                        struct vm_area_struct *new_vma,
-                        unsigned long old_addr,
+extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                          unsigned long new_addr, unsigned long old_end,
                          pmd_t *old_pmd, pmd_t *new_pmd);
  extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 7d953c2542a8f296fafbcb59b2bac323c7a64407..e44c57876e8994660596ae794c52c002ca78dc75 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
  /* arch callback */
  int __init alloc_bootmem_huge_page(struct hstate *h);
  
+void __init hugetlb_bad_size(void);
  void __init hugetlb_add_hstate(unsigned order);
  struct hstate *size_to_hstate(unsigned long size);
  
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h

index 2bb681fbeb35b3f4dcdece493931e0679c89d824..a4e7ca0f3585e4ca33d65dedd3989af26ecd0bfa 100644 (file)
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -5,16 +5,16 @@
  
  #include <linux/mm.h>
  
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
  {
         return !!(vma->vm_flags & VM_HUGETLB);
  }
  
  #else
  
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
  {
-       return 0;
+       return false;
  }
  
  #endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index 2f7775e229b093d549aff01270bf3cd2371e5771..cc7398287fddf3d5531a34e5ade5e0d83d5ef14f 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -412,9 +412,9 @@ extern __printf(3, 4)
  int scnprintf(char *buf, size_t size, const char *fmt, ...);
  extern __printf(3, 0)
  int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
-extern __printf(2, 3)
+extern __printf(2, 3) __malloc
  char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern __printf(2, 0)
+extern __printf(2, 0) __malloc
  char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
  extern __printf(2, 0)
  const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 1191d79aa495e698fdd233e91c2b2b2e317cba9d..94da96738df3fcc2b770e913f0743d4121ecb065 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
         return 0;
  }
  
-static inline void
-mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-                             int increment)
-{
-}
-
  static inline unsigned long
  mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
                              int nid, unsigned int lru_mask)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h

index adbef586e696311ef2897886df2f356d54d5a52b..20d8a5d4d133e3ca557b70ea89fb602d50f96e71 100644 (file)
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {}
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
  
-extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
  extern void try_offline_node(int nid);
  extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
  extern void remove_memory(int nid, u64 start, u64 size);
  
  #else
-static inline int is_mem_section_removable(unsigned long pfn,
+static inline bool is_mem_section_removable(unsigned long pfn,
                                         unsigned long nr_pages)
  {
-       return 0;
+       return false;
  }
  
  static inline void try_offline_node(int nid) {}
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h

index 2696c1f05ed138a219aa4988acda27f44aca8852..4429d255c8ab6c7524436d2ee36fdcea57304447 100644 (file)
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol);
  extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
  
  /* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
+static inline bool vma_migratable(struct vm_area_struct *vma)
  {
         if (vma->vm_flags & (VM_IO | VM_PFNMAP))
-               return 0;
+               return false;
  
  #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
         if (vma->vm_flags & VM_HUGETLB)
-               return 0;
+               return false;
  #endif
  
         /*
@@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
         if (vma->vm_file &&
                 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
                                                                 < policy_zone)
-                       return 0;
-       return 1;
+                       return false;
+       return true;
  }
  
  extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
@@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
  {
  }
  
+static inline struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+       return NULL;
+}
+
  #define vma_policy(vma) NULL
  
  static inline int
diff --git a/include/linux/mempool.h b/include/linux/mempool.h

index 69b6951e8fd231d96d69ec953fd900aba93c2274..b1086c936507a34925020ce18131a8d5c3573a8a 100644 (file)
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,7 @@
  #define _LINUX_MEMPOOL_H
  
  #include <linux/wait.h>
+#include <linux/compiler.h>
  
  struct kmem_cache;
  
@@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
  
  extern int mempool_resize(mempool_t *pool, int new_min_nr);
  extern void mempool_destroy(mempool_t *pool);
-extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
+extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
  extern void mempool_free(void *element, mempool_t *pool);
  
  /*
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h

index d82e7d51372bed0d850d2103cda088843f6606d6..0be4982f08fe143615d728bd1d13571199e4bb10 100644 (file)
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -20,6 +20,7 @@ enum {
         AXP221_ID,
         AXP223_ID,
         AXP288_ID,
+       AXP809_ID,
         NR_AXP20X_VARIANTS,
  };
  
@@ -264,6 +265,29 @@ enum {
         AXP22X_REG_ID_MAX,
  };
  
+enum {
+       AXP809_DCDC1 = 0,
+       AXP809_DCDC2,
+       AXP809_DCDC3,
+       AXP809_DCDC4,
+       AXP809_DCDC5,
+       AXP809_DC1SW,
+       AXP809_DC5LDO,
+       AXP809_ALDO1,
+       AXP809_ALDO2,
+       AXP809_ALDO3,
+       AXP809_ELDO1,
+       AXP809_ELDO2,
+       AXP809_ELDO3,
+       AXP809_DLDO1,
+       AXP809_DLDO2,
+       AXP809_RTC_LDO,
+       AXP809_LDO_IO0,
+       AXP809_LDO_IO1,
+       AXP809_SW,
+       AXP809_REG_ID_MAX,
+};
+
  /* IRQs */
  enum {
         AXP152_IRQ_LDO0IN_CONNECT = 1,
@@ -390,6 +414,41 @@ enum axp288_irqs {
         AXP288_IRQ_BC_USB_CHNG,
  };
  
+enum axp809_irqs {
+       AXP809_IRQ_ACIN_OVER_V = 1,
+       AXP809_IRQ_ACIN_PLUGIN,
+       AXP809_IRQ_ACIN_REMOVAL,
+       AXP809_IRQ_VBUS_OVER_V,
+       AXP809_IRQ_VBUS_PLUGIN,
+       AXP809_IRQ_VBUS_REMOVAL,
+       AXP809_IRQ_VBUS_V_LOW,
+       AXP809_IRQ_BATT_PLUGIN,
+       AXP809_IRQ_BATT_REMOVAL,
+       AXP809_IRQ_BATT_ENT_ACT_MODE,
+       AXP809_IRQ_BATT_EXIT_ACT_MODE,
+       AXP809_IRQ_CHARG,
+       AXP809_IRQ_CHARG_DONE,
+       AXP809_IRQ_BATT_CHG_TEMP_HIGH,
+       AXP809_IRQ_BATT_CHG_TEMP_HIGH_END,
+       AXP809_IRQ_BATT_CHG_TEMP_LOW,
+       AXP809_IRQ_BATT_CHG_TEMP_LOW_END,
+       AXP809_IRQ_BATT_ACT_TEMP_HIGH,
+       AXP809_IRQ_BATT_ACT_TEMP_HIGH_END,
+       AXP809_IRQ_BATT_ACT_TEMP_LOW,
+       AXP809_IRQ_BATT_ACT_TEMP_LOW_END,
+       AXP809_IRQ_DIE_TEMP_HIGH,
+       AXP809_IRQ_LOW_PWR_LVL1,
+       AXP809_IRQ_LOW_PWR_LVL2,
+       AXP809_IRQ_TIMER,
+       AXP809_IRQ_PEK_RIS_EDGE,
+       AXP809_IRQ_PEK_FAL_EDGE,
+       AXP809_IRQ_PEK_SHORT,
+       AXP809_IRQ_PEK_LONG,
+       AXP809_IRQ_PEK_OVER_OFF,
+       AXP809_IRQ_GPIO1_INPUT,
+       AXP809_IRQ_GPIO0_INPUT,
+};
+
  #define AXP288_TS_ADC_H                0x58
  #define AXP288_TS_ADC_L                0x59
  #define AXP288_GP_ADC_H                0x5a
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h

index 9837f1e8c94c172fc257a73cf90db512fcb0f84e..99c0395fe1f90ae2bd0a494ca74410262df30c90 100644 (file)
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -131,4 +131,8 @@ static inline int mfd_add_hotplug_devices(struct device *parent,
  
  extern void mfd_remove_devices(struct device *parent);
  
+extern int devm_mfd_add_devices(struct device *dev, int id,
+                               const struct mfd_cell *cells, int n_devs,
+                               struct resource *mem_base,
+                               int irq_base, struct irq_domain *irq_domain);
  #endif
diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h

new file mode 100644 (file)

index 0000000..dbbe9a6
--- /dev/null
+++ b/include/linux/mfd/hi655x-pmic.h
@@ -0,0 +1,55 @@
+/*
+ * Device driver for regulators in hi655x IC
+ *
+ * Copyright (c) 2016 Hisilicon.
+ *
+ * Authors:
+ * Chen Feng <puck.chen@hisilicon.com>
+ * Fei  Wang <w.f@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __HI655X_PMIC_H
+#define __HI655X_PMIC_H
+
+/* Hi655x registers are mapped to memory bus in 4 bytes stride */
+#define HI655X_STRIDE                   4
+#define HI655X_BUS_ADDR(x)              ((x) << 2)
+
+#define HI655X_BITS                     8
+
+#define HI655X_NR_IRQ                   32
+
+#define HI655X_IRQ_STAT_BASE            (0x003 << 2)
+#define HI655X_IRQ_MASK_BASE            (0x007 << 2)
+#define HI655X_ANA_IRQM_BASE            (0x1b5 << 2)
+#define HI655X_IRQ_ARRAY                4
+#define HI655X_IRQ_MASK                 0xFF
+#define HI655X_IRQ_CLR                  0xFF
+#define HI655X_VER_REG                  0x00
+
+#define PMU_VER_START                   0x10
+#define PMU_VER_END                     0x38
+
+#define RESERVE_INT                     BIT(7)
+#define PWRON_D20R_INT                  BIT(6)
+#define PWRON_D20F_INT                  BIT(5)
+#define PWRON_D4SR_INT                  BIT(4)
+#define VSYS_6P0_D200UR_INT             BIT(3)
+#define VSYS_UV_D3R_INT                 BIT(2)
+#define VSYS_2P5_R_INT                  BIT(1)
+#define OTMP_D1R_INT                    BIT(0)
+
+struct hi655x_pmic {
+       struct resource *res;
+       struct device *dev;
+       struct regmap *regmap;
+       int gpio;
+       unsigned int ver;
+       struct regmap_irq_chip_data *irq_data;
+};
+
+#endif
diff --git a/include/linux/mfd/max77620.h b/include/linux/mfd/max77620.h

new file mode 100644 (file)

index 0000000..3ca0af0
--- /dev/null
+++ b/include/linux/mfd/max77620.h
@@ -0,0 +1,346 @@
+/*
+ * Register addresses and bit definitions for the MAX77620 and MAX20024
+ *
+ * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#ifndef _MFD_MAX77620_H_
+#define _MFD_MAX77620_H_
+
+#include <linux/types.h>
+
+/* GLOBAL, PMIC, GPIO, FPS, ONOFFC, CID Registers */
+#define MAX77620_REG_CNFGGLBL1                 0x00
+#define MAX77620_REG_CNFGGLBL2                 0x01
+#define MAX77620_REG_CNFGGLBL3                 0x02
+#define MAX77620_REG_CNFG1_32K                 0x03
+#define MAX77620_REG_CNFGBBC                   0x04
+#define MAX77620_REG_IRQTOP                    0x05
+#define MAX77620_REG_INTLBT                    0x06
+#define MAX77620_REG_IRQSD                     0x07
+#define MAX77620_REG_IRQ_LVL2_L0_7             0x08
+#define MAX77620_REG_IRQ_LVL2_L8               0x09
+#define MAX77620_REG_IRQ_LVL2_GPIO             0x0A
+#define MAX77620_REG_ONOFFIRQ                  0x0B
+#define MAX77620_REG_NVERC                     0x0C
+#define MAX77620_REG_IRQTOPM                   0x0D
+#define MAX77620_REG_INTENLBT                  0x0E
+#define MAX77620_REG_IRQMASKSD                 0x0F
+#define MAX77620_REG_IRQ_MSK_L0_7              0x10
+#define MAX77620_REG_IRQ_MSK_L8                        0x11
+#define MAX77620_REG_ONOFFIRQM                 0x12
+#define MAX77620_REG_STATLBT                   0x13
+#define MAX77620_REG_STATSD                    0x14
+#define MAX77620_REG_ONOFFSTAT                 0x15
+
+/* SD and LDO Registers */
+#define MAX77620_REG_SD0                       0x16
+#define MAX77620_REG_SD1                       0x17
+#define MAX77620_REG_SD2                       0x18
+#define MAX77620_REG_SD3                       0x19
+#define MAX77620_REG_SD4                       0x1A
+#define MAX77620_REG_DVSSD0                    0x1B
+#define MAX77620_REG_DVSSD1                    0x1C
+#define MAX77620_REG_SD0_CFG                   0x1D
+#define MAX77620_REG_SD1_CFG                   0x1E
+#define MAX77620_REG_SD2_CFG                   0x1F
+#define MAX77620_REG_SD3_CFG                   0x20
+#define MAX77620_REG_SD4_CFG                   0x21
+#define MAX77620_REG_SD_CFG2                   0x22
+#define MAX77620_REG_LDO0_CFG                  0x23
+#define MAX77620_REG_LDO0_CFG2                 0x24
+#define MAX77620_REG_LDO1_CFG                  0x25
+#define MAX77620_REG_LDO1_CFG2                 0x26
+#define MAX77620_REG_LDO2_CFG                  0x27
+#define MAX77620_REG_LDO2_CFG2                 0x28
+#define MAX77620_REG_LDO3_CFG                  0x29
+#define MAX77620_REG_LDO3_CFG2                 0x2A
+#define MAX77620_REG_LDO4_CFG                  0x2B
+#define MAX77620_REG_LDO4_CFG2                 0x2C
+#define MAX77620_REG_LDO5_CFG                  0x2D
+#define MAX77620_REG_LDO5_CFG2                 0x2E
+#define MAX77620_REG_LDO6_CFG                  0x2F
+#define MAX77620_REG_LDO6_CFG2                 0x30
+#define MAX77620_REG_LDO7_CFG                  0x31
+#define MAX77620_REG_LDO7_CFG2                 0x32
+#define MAX77620_REG_LDO8_CFG                  0x33
+#define MAX77620_REG_LDO8_CFG2                 0x34
+#define MAX77620_REG_LDO_CFG3                  0x35
+
+#define MAX77620_LDO_SLEW_RATE_MASK            0x1
+
+/* LDO Configuration 3 */
+#define MAX77620_TRACK4_MASK                   BIT(5)
+#define MAX77620_TRACK4_SHIFT                  5
+
+/* Voltage */
+#define MAX77620_SDX_VOLT_MASK                 0xFF
+#define MAX77620_SD0_VOLT_MASK                 0x3F
+#define MAX77620_SD1_VOLT_MASK                 0x7F
+#define MAX77620_LDO_VOLT_MASK                 0x3F
+
+#define MAX77620_REG_GPIO0                     0x36
+#define MAX77620_REG_GPIO1                     0x37
+#define MAX77620_REG_GPIO2                     0x38
+#define MAX77620_REG_GPIO3                     0x39
+#define MAX77620_REG_GPIO4                     0x3A
+#define MAX77620_REG_GPIO5                     0x3B
+#define MAX77620_REG_GPIO6                     0x3C
+#define MAX77620_REG_GPIO7                     0x3D
+#define MAX77620_REG_PUE_GPIO                  0x3E
+#define MAX77620_REG_PDE_GPIO                  0x3F
+#define MAX77620_REG_AME_GPIO                  0x40
+#define MAX77620_REG_ONOFFCNFG1                        0x41
+#define MAX77620_REG_ONOFFCNFG2                        0x42
+
+/* FPS Registers */
+#define MAX77620_REG_FPS_CFG0                  0x43
+#define MAX77620_REG_FPS_CFG1                  0x44
+#define MAX77620_REG_FPS_CFG2                  0x45
+#define MAX77620_REG_FPS_LDO0                  0x46
+#define MAX77620_REG_FPS_LDO1                  0x47
+#define MAX77620_REG_FPS_LDO2                  0x48
+#define MAX77620_REG_FPS_LDO3                  0x49
+#define MAX77620_REG_FPS_LDO4                  0x4A
+#define MAX77620_REG_FPS_LDO5                  0x4B
+#define MAX77620_REG_FPS_LDO6                  0x4C
+#define MAX77620_REG_FPS_LDO7                  0x4D
+#define MAX77620_REG_FPS_LDO8                  0x4E
+#define MAX77620_REG_FPS_SD0                   0x4F
+#define MAX77620_REG_FPS_SD1                   0x50
+#define MAX77620_REG_FPS_SD2                   0x51
+#define MAX77620_REG_FPS_SD3                   0x52
+#define MAX77620_REG_FPS_SD4                   0x53
+#define MAX77620_REG_FPS_NONE                  0
+
+#define MAX77620_FPS_SRC_MASK                  0xC0
+#define MAX77620_FPS_SRC_SHIFT                 6
+#define MAX77620_FPS_PU_PERIOD_MASK            0x38
+#define MAX77620_FPS_PU_PERIOD_SHIFT           3
+#define MAX77620_FPS_PD_PERIOD_MASK            0x07
+#define MAX77620_FPS_PD_PERIOD_SHIFT           0
+#define MAX77620_FPS_TIME_PERIOD_MASK          0x38
+#define MAX77620_FPS_TIME_PERIOD_SHIFT         3
+#define MAX77620_FPS_EN_SRC_MASK               0x06
+#define MAX77620_FPS_EN_SRC_SHIFT              1
+#define MAX77620_FPS_ENFPS_SW_MASK             0x01
+#define MAX77620_FPS_ENFPS_SW                  0x01
+
+/* Minimum and maximum FPS period time (in microseconds) are
+ * different for MAX77620 and MAX20024.
+ */
+#define MAX77620_FPS_PERIOD_MIN_US             40
+#define MAX20024_FPS_PERIOD_MIN_US             20
+
+#define MAX77620_FPS_PERIOD_MAX_US             2560
+#define MAX20024_FPS_PERIOD_MAX_US             5120
+
+#define MAX77620_REG_FPS_GPIO1                 0x54
+#define MAX77620_REG_FPS_GPIO2                 0x55
+#define MAX77620_REG_FPS_GPIO3                 0x56
+#define MAX77620_REG_FPS_RSO                   0x57
+#define MAX77620_REG_CID0                      0x58
+#define MAX77620_REG_CID1                      0x59
+#define MAX77620_REG_CID2                      0x5A
+#define MAX77620_REG_CID3                      0x5B
+#define MAX77620_REG_CID4                      0x5C
+#define MAX77620_REG_CID5                      0x5D
+
+#define MAX77620_REG_DVSSD4                    0x5E
+#define MAX20024_REG_MAX_ADD                   0x70
+
+#define MAX77620_CID_DIDM_MASK                 0xF0
+#define MAX77620_CID_DIDM_SHIFT                        4
+
+/* CNFG2SD */
+#define MAX77620_SD_CNF2_ROVS_EN_SD1           BIT(1)
+#define MAX77620_SD_CNF2_ROVS_EN_SD0           BIT(2)
+
+/* Device Identification Metal */
+#define MAX77620_CID5_DIDM(n)                  (((n) >> 4) & 0xF)
+/* Device Identification OTP */
+#define MAX77620_CID5_DIDO(n)                  ((n) & 0xF)
+
+/* SD CNFG1 */
+#define MAX77620_SD_SR_MASK                    0xC0
+#define MAX77620_SD_SR_SHIFT                   6
+#define MAX77620_SD_POWER_MODE_MASK            0x30
+#define MAX77620_SD_POWER_MODE_SHIFT           4
+#define MAX77620_SD_CFG1_ADE_MASK              BIT(3)
+#define MAX77620_SD_CFG1_ADE_DISABLE           0
+#define MAX77620_SD_CFG1_ADE_ENABLE            BIT(3)
+#define MAX77620_SD_FPWM_MASK                  0x04
+#define MAX77620_SD_FPWM_SHIFT                 2
+#define MAX77620_SD_FSRADE_MASK                        0x01
+#define MAX77620_SD_FSRADE_SHIFT               0
+#define MAX77620_SD_CFG1_FPWM_SD_MASK          BIT(2)
+#define MAX77620_SD_CFG1_FPWM_SD_SKIP          0
+#define MAX77620_SD_CFG1_FPWM_SD_FPWM          BIT(2)
+#define MAX77620_SD_CFG1_FSRADE_SD_MASK                BIT(0)
+#define MAX77620_SD_CFG1_FSRADE_SD_DISABLE     0
+#define MAX77620_SD_CFG1_FSRADE_SD_ENABLE      BIT(0)
+
+/* LDO_CNFG2 */
+#define MAX77620_LDO_POWER_MODE_MASK           0xC0
+#define MAX77620_LDO_POWER_MODE_SHIFT          6
+#define MAX77620_LDO_CFG2_ADE_MASK             BIT(1)
+#define MAX77620_LDO_CFG2_ADE_DISABLE          0
+#define MAX77620_LDO_CFG2_ADE_ENABLE           BIT(1)
+#define MAX77620_LDO_CFG2_SS_MASK              BIT(0)
+#define MAX77620_LDO_CFG2_SS_FAST              BIT(0)
+#define MAX77620_LDO_CFG2_SS_SLOW              0
+
+#define MAX77620_IRQ_TOP_GLBL_MASK             BIT(7)
+#define MAX77620_IRQ_TOP_SD_MASK               BIT(6)
+#define MAX77620_IRQ_TOP_LDO_MASK              BIT(5)
+#define MAX77620_IRQ_TOP_GPIO_MASK             BIT(4)
+#define MAX77620_IRQ_TOP_RTC_MASK              BIT(3)
+#define MAX77620_IRQ_TOP_32K_MASK              BIT(2)
+#define MAX77620_IRQ_TOP_ONOFF_MASK            BIT(1)
+
+#define MAX77620_IRQ_LBM_MASK                  BIT(3)
+#define MAX77620_IRQ_TJALRM1_MASK              BIT(2)
+#define MAX77620_IRQ_TJALRM2_MASK              BIT(1)
+
+#define MAX77620_PWR_I2C_ADDR                  0x3c
+#define MAX77620_RTC_I2C_ADDR                  0x68
+
+#define MAX77620_CNFG_GPIO_DRV_MASK            BIT(0)
+#define MAX77620_CNFG_GPIO_DRV_PUSHPULL                BIT(0)
+#define MAX77620_CNFG_GPIO_DRV_OPENDRAIN       0
+#define MAX77620_CNFG_GPIO_DIR_MASK            BIT(1)
+#define MAX77620_CNFG_GPIO_DIR_INPUT           BIT(1)
+#define MAX77620_CNFG_GPIO_DIR_OUTPUT          0
+#define MAX77620_CNFG_GPIO_INPUT_VAL_MASK      BIT(2)
+#define MAX77620_CNFG_GPIO_OUTPUT_VAL_MASK     BIT(3)
+#define MAX77620_CNFG_GPIO_OUTPUT_VAL_HIGH     BIT(3)
+#define MAX77620_CNFG_GPIO_OUTPUT_VAL_LOW      0
+#define MAX77620_CNFG_GPIO_INT_MASK            (0x3 << 4)
+#define MAX77620_CNFG_GPIO_INT_FALLING         BIT(4)
+#define MAX77620_CNFG_GPIO_INT_RISING          BIT(5)
+#define MAX77620_CNFG_GPIO_DBNC_MASK           (0x3 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_None           (0x0 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_8ms            (0x1 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_16ms           (0x2 << 6)
+#define MAX77620_CNFG_GPIO_DBNC_32ms           (0x3 << 6)
+
+#define MAX77620_IRQ_LVL2_GPIO_EDGE0           BIT(0)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE1           BIT(1)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE2           BIT(2)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE3           BIT(3)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE4           BIT(4)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE5           BIT(5)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE6           BIT(6)
+#define MAX77620_IRQ_LVL2_GPIO_EDGE7           BIT(7)
+
+#define MAX77620_CNFG1_32K_OUT0_EN             BIT(2)
+
+#define MAX77620_ONOFFCNFG1_SFT_RST            BIT(7)
+#define MAX77620_ONOFFCNFG1_MRT_MASK           0x38
+#define MAX77620_ONOFFCNFG1_MRT_SHIFT          0x3
+#define MAX77620_ONOFFCNFG1_SLPEN              BIT(2)
+#define MAX77620_ONOFFCNFG1_PWR_OFF            BIT(1)
+#define MAX20024_ONOFFCNFG1_CLRSE              0x18
+
+#define MAX77620_ONOFFCNFG2_SFT_RST_WK         BIT(7)
+#define MAX77620_ONOFFCNFG2_WD_RST_WK          BIT(6)
+#define MAX77620_ONOFFCNFG2_SLP_LPM_MSK                BIT(5)
+#define MAX77620_ONOFFCNFG2_WK_ALARM1          BIT(2)
+#define MAX77620_ONOFFCNFG2_WK_EN0             BIT(0)
+
+#define MAX77620_GLBLM_MASK                    BIT(0)
+
+#define MAX77620_WDTC_MASK                     0x3
+#define MAX77620_WDTOFFC                       BIT(4)
+#define MAX77620_WDTSLPC                       BIT(3)
+#define MAX77620_WDTEN                         BIT(2)
+
+#define MAX77620_TWD_MASK                      0x3
+#define MAX77620_TWD_2s                                0x0
+#define MAX77620_TWD_16s                       0x1
+#define MAX77620_TWD_64s                       0x2
+#define MAX77620_TWD_128s                      0x3
+
+#define MAX77620_CNFGGLBL1_LBDAC_EN            BIT(7)
+#define MAX77620_CNFGGLBL1_MPPLD               BIT(6)
+#define MAX77620_CNFGGLBL1_LBHYST              (BIT(5) | BIT(4))
+#define MAX77620_CNFGGLBL1_LBDAC               0x0E
+#define MAX77620_CNFGGLBL1_LBRSTEN             BIT(0)
+
+/* CNFG BBC registers */
+#define MAX77620_CNFGBBC_ENABLE                        BIT(0)
+#define MAX77620_CNFGBBC_CURRENT_MASK          0x06
+#define MAX77620_CNFGBBC_CURRENT_SHIFT         1
+#define MAX77620_CNFGBBC_VOLTAGE_MASK          0x18
+#define MAX77620_CNFGBBC_VOLTAGE_SHIFT         3
+#define MAX77620_CNFGBBC_LOW_CURRENT_DISABLE   BIT(5)
+#define MAX77620_CNFGBBC_RESISTOR_MASK         0xC0
+#define MAX77620_CNFGBBC_RESISTOR_SHIFT                6
+
+#define MAX77620_FPS_COUNT                     3
+
+/* Interrupts */
+enum {
+       MAX77620_IRQ_TOP_GLBL,          /* Low-Battery */
+       MAX77620_IRQ_TOP_SD,            /* SD power fail */
+       MAX77620_IRQ_TOP_LDO,           /* LDO power fail */
+       MAX77620_IRQ_TOP_GPIO,          /* TOP GPIO internal int to MAX77620 */
+       MAX77620_IRQ_TOP_RTC,           /* RTC */
+       MAX77620_IRQ_TOP_32K,           /* 32kHz oscillator */
+       MAX77620_IRQ_TOP_ONOFF,         /* ON/OFF controller */
+       MAX77620_IRQ_LBT_MBATLOW,       /* Main battery low */
+       MAX77620_IRQ_LBT_TJALRM1,       /* Thermal alarm status, > 120C */
+       MAX77620_IRQ_LBT_TJALRM2,       /* Thermal alarm status, > 140C */
+};
+
+/* GPIOs */
+enum {
+       MAX77620_GPIO0,
+       MAX77620_GPIO1,
+       MAX77620_GPIO2,
+       MAX77620_GPIO3,
+       MAX77620_GPIO4,
+       MAX77620_GPIO5,
+       MAX77620_GPIO6,
+       MAX77620_GPIO7,
+       MAX77620_GPIO_NR,
+};
+
+/* FPS Source */
+enum max77620_fps_src {
+       MAX77620_FPS_SRC_0,
+       MAX77620_FPS_SRC_1,
+       MAX77620_FPS_SRC_2,
+       MAX77620_FPS_SRC_NONE,
+       MAX77620_FPS_SRC_DEF,
+};
+
+enum max77620_chip_id {
+       MAX77620,
+       MAX20024,
+};
+
+struct max77620_chip {
+       struct device *dev;
+       struct regmap *rmap;
+
+       int chip_irq;
+       int irq_base;
+
+       /* chip id */
+       enum max77620_chip_id chip_id;
+
+       bool sleep_enable;
+       bool enable_global_lpm;
+       int shutdown_fps_period[MAX77620_FPS_COUNT];
+       int suspend_fps_period[MAX77620_FPS_COUNT];
+
+       struct regmap_irq_chip_data *top_irq_data;
+       struct regmap_irq_chip_data *gpio_irq_data;
+};
+
+#endif /* _MFD_MAX77620_H_ */
diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h

index 1088149be0c95e26287d9c781eb467e056b0b08a..40a76b97b7ab9d8eabf07b1fa7272f477e92b074 100644 (file)
--- a/include/linux/mfd/syscon.h
+++ b/include/linux/mfd/syscon.h
@@ -16,6 +16,7 @@
  #define __LINUX_MFD_SYSCON_H__
  
  #include <linux/err.h>
+#include <linux/errno.h>
  
  struct device_node;
  
diff --git a/include/linux/mfd/wm8400-private.h b/include/linux/mfd/wm8400-private.h

index 2de565b94d0c39e9c0e3b049bbcf0237cceddb8f..4ee908f5b8348b6c2257c4b3689a2b97a3f58c41 100644 (file)
--- a/include/linux/mfd/wm8400-private.h
+++ b/include/linux/mfd/wm8400-private.h
@@ -923,7 +923,6 @@ struct wm8400 {
  #define WM8400_LINE_CMP_VTHD_SHIFT                   0  /* LINE_CMP_VTHD - [3:0] */
  #define WM8400_LINE_CMP_VTHD_WIDTH                   4  /* LINE_CMP_VTHD - [3:0] */
  
-u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg);
  int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data);
  
  static inline int wm8400_set_bits(struct wm8400 *wm8400, u8 reg,
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h

index b2c9fada8eac36b282c5cc05ffe6b7142f456805..2be976dd49669c21829c5c798711786e2cb68f74 100644 (file)
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -53,6 +53,11 @@ struct mlx5_core_cq {
         unsigned                arm_sn;
         struct mlx5_rsc_debug   *dbg;
         int                     pid;
+       struct {
+               struct list_head list;
+               void (*comp)(struct mlx5_core_cq *);
+               void            *priv;
+       } tasklet_ctx;
  };
  
  
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h

index 07b504f7eb8479f295cb48393a0a0104c778151a..80776d0c52dc9c48b7a02842caacbf9699e73507 100644 (file)
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -42,6 +42,7 @@
  #include <linux/vmalloc.h>
  #include <linux/radix-tree.h>
  #include <linux/workqueue.h>
+#include <linux/interrupt.h>
  
  #include <linux/mlx5/device.h>
  #include <linux/mlx5/doorbell.h>
@@ -312,6 +313,14 @@ struct mlx5_buf {
         u8                      page_shift;
  };
  
+struct mlx5_eq_tasklet {
+       struct list_head list;
+       struct list_head process_list;
+       struct tasklet_struct task;
+       /* lock on completion tasklet list */
+       spinlock_t lock;
+};
+
  struct mlx5_eq {
         struct mlx5_core_dev   *dev;
         __be32 __iomem         *doorbell;
@@ -325,6 +334,7 @@ struct mlx5_eq {
         struct list_head        list;
         int                     index;
         struct mlx5_rsc_debug   *dbg;
+       struct mlx5_eq_tasklet  tasklet_ctx;
  };
  
  struct mlx5_core_psv {
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 727f799757abf411404937fcef6783356ea1c6cd..2b97be1147ecf640dcfa9f7d1707d17a3a1738ef 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr);
   * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
   * is no special casing required.
   */
-static inline int is_vmalloc_addr(const void *x)
+static inline bool is_vmalloc_addr(const void *x)
  {
  #ifdef CONFIG_MMU
         unsigned long addr = (unsigned long)x;
  
         return addr >= VMALLOC_START && addr < VMALLOC_END;
  #else
-       return 0;
+       return false;
  #endif
  }
  #ifdef CONFIG_MMU
@@ -734,7 +734,7 @@ static inline void get_page(struct page *page)
         page = compound_head(page);
         /*
          * Getting a normal page or the head of a compound page
-        * requires to already have an elevated page->_count.
+        * requires to already have an elevated page->_refcount.
          */
         VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
         page_ref_inc(page);
@@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid);
  
  static inline void page_cpupid_reset_last(struct page *page)
  {
-       int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
-
-       page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
-       page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+       page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
  }
  #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
  #else /* !CONFIG_NUMA_BALANCING */
@@ -1032,26 +1029,7 @@ static inline pgoff_t page_file_index(struct page *page)
         return page->index;
  }
  
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-static inline bool page_mapped(struct page *page)
-{
-       int i;
-       if (likely(!PageCompound(page)))
-               return atomic_read(&page->_mapcount) >= 0;
-       page = compound_head(page);
-       if (atomic_read(compound_mapcount_ptr(page)) >= 0)
-               return true;
-       if (PageHuge(page))
-               return false;
-       for (i = 0; i < hpage_nr_pages(page); i++) {
-               if (atomic_read(&page[i]._mapcount) >= 0)
-                       return true;
-       }
-       return false;
-}
+bool page_mapped(struct page *page);
  
  /*
   * Return true only if the page has been allocated with
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h

index 712e8c37a200b243b2fff14c379c8acc621a662f..5bd29ba4f174f531d1921f8ea2616b7727b069aa 100644 (file)
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page)
         return !PageSwapBacked(page);
  }
  
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
+                               enum lru_list lru, int nr_pages)
+{
+       __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+                               enum lru_list lru, int nr_pages)
+{
+#ifdef CONFIG_MEMCG
+       mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+#else
+       __update_lru_size(lruvec, lru, nr_pages);
+#endif
+}
+
  static __always_inline void add_page_to_lru_list(struct page *page,
                                 struct lruvec *lruvec, enum lru_list lru)
  {
-       int nr_pages = hpage_nr_pages(page);
-       mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+       update_lru_size(lruvec, lru, hpage_nr_pages(page));
         list_add(&page->lru, &lruvec->lists[lru]);
-       __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
  }
  
  static __always_inline void del_page_from_lru_list(struct page *page,
                                 struct lruvec *lruvec, enum lru_list lru)
  {
-       int nr_pages = hpage_nr_pages(page);
-       mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
         list_del(&page->lru);
-       __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
+       update_lru_size(lruvec, lru, -hpage_nr_pages(page));
  }
  
  /**
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index c2d75b4fa86c05012a509dd188e2a394d2cb2170..1fda9c99ef956f3b1ce25dce253f48444511511c 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -73,9 +73,9 @@ struct page {
                         unsigned long counters;
  #else
                         /*
-                        * Keep _count separate from slub cmpxchg_double data.
-                        * As the rest of the double word is protected by
-                        * slab_lock but _count is not.
+                        * Keep _refcount separate from slub cmpxchg_double
+                        * data.  As the rest of the double word is protected by
+                        * slab_lock but _refcount is not.
                          */
                         unsigned counters;
  #endif
@@ -97,7 +97,11 @@ struct page {
                                         };
                                         int units;      /* SLOB */
                                 };
-                               atomic_t _count;                /* Usage count, see below. */
+                               /*
+                                * Usage count, *USE WRAPPER FUNCTION*
+                                * when doing manual accounting. See page_ref.h
+                                */
+                               atomic_t _refcount;
                         };
                         unsigned int active;    /* SLAB */
                 };
@@ -248,7 +252,7 @@ struct page_frag_cache {
         __u32 offset;
  #endif
         /* we maintain a pagecount bias, so that we dont dirty cache line
-        * containing page->_count every time we allocate a fragment.
+        * containing page->_refcount every time we allocate a fragment.
          */
         unsigned int            pagecnt_bias;
         bool pfmemalloc;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index c60df9257cc7c751dedcdb46f7ffc06d63677703..c60db2096fd82fbaf92599df32770b6a095442d8 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled;
         get_pfnblock_flags_mask(page, page_to_pfn(page),                \
                         PB_migrate_end, MIGRATETYPE_MASK)
  
-static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
-{
-       BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
-       return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
-                                       MIGRATETYPE_MASK);
-}
-
  struct free_area {
         struct list_head        free_list[MIGRATE_TYPES];
         unsigned long           nr_free;
@@ -747,7 +740,8 @@ extern struct mutex zonelists_mutex;
  void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
  void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
  bool zone_watermark_ok(struct zone *z, unsigned int order,
-               unsigned long mark, int classzone_idx, int alloc_flags);
+               unsigned long mark, int classzone_idx,
+               unsigned int alloc_flags);
  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                 unsigned long mark, int classzone_idx);
  enum memmap_context {
@@ -828,10 +822,7 @@ static inline int is_highmem_idx(enum zone_type idx)
  static inline int is_highmem(struct zone *zone)
  {
  #ifdef CONFIG_HIGHMEM
-       int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
-       return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
-              (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
-               zone_movable_is_highmem());
+       return is_highmem_idx(zone_idx(zone));
  #else
         return 0;
  #endif
@@ -922,6 +913,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
  #endif /* CONFIG_NUMA */
  }
  
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+                                       enum zone_type highest_zoneidx,
+                                       nodemask_t *nodes);
+
  /**
   * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
   * @z - The cursor used as a starting point for the search
@@ -934,9 +929,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
   * being examined. It should be advanced by one before calling
   * next_zones_zonelist again.
   */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                         enum zone_type highest_zoneidx,
-                                       nodemask_t *nodes);
+                                       nodemask_t *nodes)
+{
+       if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
+               return z;
+       return __next_zones_zonelist(z, highest_zoneidx, nodes);
+}
  
  /**
   * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
@@ -952,13 +952,10 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
   */
  static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                         enum zone_type highest_zoneidx,
-                                       nodemask_t *nodes,
-                                       struct zone **zone)
+                                       nodemask_t *nodes)
  {
-       struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
+       return next_zones_zonelist(zonelist->_zonerefs,
                                                         highest_zoneidx, nodes);
-       *zone = zonelist_zone(z);
-       return z;
  }
  
  /**
@@ -973,10 +970,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
   * within a given nodemask
   */
  #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
-       for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
+       for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);       \
+               zone;                                                   \
+               z = next_zones_zonelist(++z, highidx, nodemask),        \
+                       zone = zonelist_zone(z))
+
+#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+       for (zone = z->zone;    \
                 zone;                                                   \
                 z = next_zones_zonelist(++z, highidx, nodemask),        \
-                       zone = zonelist_zone(z))                        \
+                       zone = zonelist_zone(z))
+
  
  /**
   * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h

index 6e85889cf9ab9466b836578b3fe62e077d51884b..f746e44d404618dc5a2d8d974188d8e0ba4a3058 100644 (file)
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
   *
   * int first_node(mask)                        Number lowest set bit, or MAX_NUMNODES
   * int next_node(node, mask)           Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask)                Next node past 'node', or wrap to first,
+ *                                     or MAX_NUMNODES
   * int first_unset_node(mask)          First node not set in mask, or 
- *                                     MAX_NUMNODES.
+ *                                     MAX_NUMNODES
   *
   * nodemask_t nodemask_of_node(node)   Return nodemask with bit 'node' set
   * NODE_MASK_ALL                       Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
         return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
  }
  
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
  static inline void init_nodemask_of_node(nodemask_t *mask, int node)
  {
         nodes_clear(*mask);
diff --git a/include/linux/of.h b/include/linux/of.h

index 77ddace575e8f702ee5e4f49480b53cd14646814..c7292e8ea080118b9d09542a875b95ebbddc31b3 100644 (file)
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -75,6 +75,23 @@ struct of_phandle_args {
         uint32_t args[MAX_PHANDLE_ARGS];
  };
  
+struct of_phandle_iterator {
+       /* Common iterator information */
+       const char *cells_name;
+       int cell_count;
+       const struct device_node *parent;
+
+       /* List size information */
+       const __be32 *list_end;
+       const __be32 *phandle_end;
+
+       /* Current position state */
+       const __be32 *cur;
+       uint32_t cur_count;
+       phandle phandle;
+       struct device_node *node;
+};
+
  struct of_reconfig_data {
         struct device_node      *dn;
         struct property         *prop;
@@ -334,6 +351,18 @@ extern int of_parse_phandle_with_fixed_args(const struct device_node *np,
  extern int of_count_phandle_with_args(const struct device_node *np,
         const char *list_name, const char *cells_name);
  
+/* phandle iterator functions */
+extern int of_phandle_iterator_init(struct of_phandle_iterator *it,
+                                   const struct device_node *np,
+                                   const char *list_name,
+                                   const char *cells_name,
+                                   int cell_count);
+
+extern int of_phandle_iterator_next(struct of_phandle_iterator *it);
+extern int of_phandle_iterator_args(struct of_phandle_iterator *it,
+                                   uint32_t *args,
+                                   int size);
+
  extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align));
  extern int of_alias_get_id(struct device_node *np, const char *stem);
  extern int of_alias_get_highest_id(const char *stem);
@@ -608,6 +637,27 @@ static inline int of_count_phandle_with_args(struct device_node *np,
         return -ENOSYS;
  }
  
+static inline int of_phandle_iterator_init(struct of_phandle_iterator *it,
+                                          const struct device_node *np,
+                                          const char *list_name,
+                                          const char *cells_name,
+                                          int cell_count)
+{
+       return -ENOSYS;
+}
+
+static inline int of_phandle_iterator_next(struct of_phandle_iterator *it)
+{
+       return -ENOSYS;
+}
+
+static inline int of_phandle_iterator_args(struct of_phandle_iterator *it,
+                                          uint32_t *args,
+                                          int size)
+{
+       return 0;
+}
+
  static inline int of_alias_get_id(struct device_node *np, const char *stem)
  {
         return -ENOSYS;
@@ -877,6 +927,12 @@ static inline int of_property_read_s32(const struct device_node *np,
         return of_property_read_u32(np, propname, (u32*) out_value);
  }
  
+#define of_for_each_phandle(it, err, np, ln, cn, cc)                   \
+       for (of_phandle_iterator_init((it), (np), (ln), (cn), (cc)),    \
+            err = of_phandle_iterator_next(it);                        \
+            err == 0;                                                  \
+            err = of_phandle_iterator_next(it))
+
  #define of_property_for_each_u32(np, propname, prop, p, u)     \
         for (prop = of_find_property(np, propname, NULL),       \
                 p = of_prop_next_u32(prop, NULL, &u);           \
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h

index 2fbe8682a66f491975ca13481572f59ef7f8ee5b..901ec01c9fba00625067ff226e925fa68802cb18 100644 (file)
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -37,8 +37,9 @@ extern bool of_fdt_is_big_endian(const void *blob,
                                  unsigned long node);
  extern int of_fdt_match(const void *blob, unsigned long node,
                         const char *const *compat);
-extern void of_fdt_unflatten_tree(const unsigned long *blob,
-                              struct device_node **mynodes);
+extern void *of_fdt_unflatten_tree(const unsigned long *blob,
+                                  struct device_node *dad,
+                                  struct device_node **mynodes);
  
  /* TBD: Temporary export of fdt globals - remove when code fully merged */
  extern int __initdata dt_root_addr_cells;
diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h

index f8bcd0e21a26649fb70ad645d79b09ee00412283..bb3a5a2cd5705062231fe7d87fac4361a5bde889 100644 (file)
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -15,6 +15,7 @@
  #define __LINUX_OF_GRAPH_H
  
  #include <linux/types.h>
+#include <linux/errno.h>
  
  /**
   * struct of_endpoint - the OF graph endpoint data structure
diff --git a/include/linux/oom.h b/include/linux/oom.h

index 628a43242a343af5a3aab7779accd8ec32ad9406..83b9c39bd8b7cce02e0c477ef2cfc8e14b80ce2c 100644 (file)
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p)
  
  extern void mark_oom_victim(struct task_struct *tsk);
  
+#ifdef CONFIG_MMU
+extern void try_oom_reaper(struct task_struct *tsk);
+#else
+static inline void try_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
  extern unsigned long oom_badness(struct task_struct *p,
                 struct mem_cgroup *memcg, const nodemask_t *nodemask,
                 unsigned long totalpages);
diff --git a/include/linux/padata.h b/include/linux/padata.h

index 438694650471cc66b62cb7890e19fde5799314f7..113ee626a4dcc775b5a1b50f7d0a59418dd01eeb 100644 (file)
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst,
  extern void padata_do_serial(struct padata_priv *padata);
  extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
                               cpumask_var_t cpumask);
-extern int padata_set_cpumasks(struct padata_instance *pinst,
-                              cpumask_var_t pcpumask,
-                              cpumask_var_t cbcpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
  extern int padata_start(struct padata_instance *pinst);
  extern void padata_stop(struct padata_instance *pinst);
  extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h

index 6b052aa7b5b79de6712ee36fe3a424f6f93bd548..a61e06e5fbce59b598e3ac2ad6f1831323143283 100644 (file)
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
  #define PAGE_MAPPING_KSM       2
  #define PAGE_MAPPING_FLAGS     (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
  
+static __always_inline int PageAnonHead(struct page *page)
+{
+       return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
  static __always_inline int PageAnon(struct page *page)
  {
         page = compound_head(page);
-       return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+       return PageAnonHead(page);
  }
  
  #ifdef CONFIG_KSM
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h

index e596d5d9540e2b534d6ad1fb09a1e1d68274fdf4..8b5e0a9f2431a7b833bdb836600fe512dfbe0651 100644 (file)
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
  
  static inline int page_ref_count(struct page *page)
  {
-       return atomic_read(&page->_count);
+       return atomic_read(&page->_refcount);
  }
  
  static inline int page_count(struct page *page)
  {
-       return atomic_read(&compound_head(page)->_count);
+       return atomic_read(&compound_head(page)->_refcount);
  }
  
  static inline void set_page_count(struct page *page, int v)
  {
-       atomic_set(&page->_count, v);
+       atomic_set(&page->_refcount, v);
         if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
                 __page_ref_set(page, v);
  }
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
  
  static inline void page_ref_add(struct page *page, int nr)
  {
-       atomic_add(nr, &page->_count);
+       atomic_add(nr, &page->_refcount);
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
                 __page_ref_mod(page, nr);
  }
  
  static inline void page_ref_sub(struct page *page, int nr)
  {
-       atomic_sub(nr, &page->_count);
+       atomic_sub(nr, &page->_refcount);
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
                 __page_ref_mod(page, -nr);
  }
  
  static inline void page_ref_inc(struct page *page)
  {
-       atomic_inc(&page->_count);
+       atomic_inc(&page->_refcount);
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
                 __page_ref_mod(page, 1);
  }
  
  static inline void page_ref_dec(struct page *page)
  {
-       atomic_dec(&page->_count);
+       atomic_dec(&page->_refcount);
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
                 __page_ref_mod(page, -1);
  }
  
  static inline int page_ref_sub_and_test(struct page *page, int nr)
  {
-       int ret = atomic_sub_and_test(nr, &page->_count);
+       int ret = atomic_sub_and_test(nr, &page->_refcount);
  
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
                 __page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
  
  static inline int page_ref_dec_and_test(struct page *page)
  {
-       int ret = atomic_dec_and_test(&page->_count);
+       int ret = atomic_dec_and_test(&page->_refcount);
  
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
                 __page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
  
  static inline int page_ref_dec_return(struct page *page)
  {
-       int ret = atomic_dec_return(&page->_count);
+       int ret = atomic_dec_return(&page->_refcount);
  
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
                 __page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
  
  static inline int page_ref_add_unless(struct page *page, int nr, int u)
  {
-       int ret = atomic_add_unless(&page->_count, nr, u);
+       int ret = atomic_add_unless(&page->_refcount, nr, u);
  
         if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
                 __page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
  
  static inline int page_ref_freeze(struct page *page, int count)
  {
-       int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+       int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
  
         if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
                 __page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
         VM_BUG_ON_PAGE(page_count(page) != 0, page);
         VM_BUG_ON(count == 0);
  
-       atomic_set(&page->_count, count);
+       atomic_set(&page->_refcount, count);
         if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
                 __page_ref_unfreeze(page, count);
  }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h

index 7e1ab155c67c78dd6e41defebd238bce0d59af7f..fe1513ffb7bf5ca21c004c0eb654f1c23433d955 100644 (file)
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
  
  /*
   * speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
   *
   * This function must be called inside the same rcu_read_lock() section as has
   * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
   *
   * Unless an RCU grace period has passed, the count of all pages coming out
   * of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
   * 2. conditionally increment refcount
   * 3. check the page is still in pagecache (if no, goto 1)
   *
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
   * following (with tree_lock held for write):
   * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
   * B. remove page from pagecache
diff --git a/include/linux/poll.h b/include/linux/poll.h

index 9fb4f40d9a26ed6c964990e896a3c733c5f00185..37b057b63b465afa0edf3e372cb561d7670a3089 100644 (file)
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq);
  extern void poll_freewait(struct poll_wqueues *pwq);
  extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
                                  ktime_t *expires, unsigned long slack);
-extern u64 select_estimate_accuracy(struct timespec *tv);
+extern u64 select_estimate_accuracy(struct timespec64 *tv);
  
  
  static inline int poll_schedule(struct poll_wqueues *pwq, int state)
@@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
  
  #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
  
-extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
+extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time);
  extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
-                      struct timespec *end_time);
+                      struct timespec64 *end_time);
  extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-                          fd_set __user *exp, struct timespec *end_time);
+                          fd_set __user *exp, struct timespec64 *end_time);
  
-extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
+                                  long nsec);
  
  #endif /* _LINUX_POLL_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h

index 508bd827e6dc9d09c2767a715bd696ef585dec6c..aeb3e6d00a66be64ef522bfa63b6d2dd1c03df68 100644 (file)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size)
  }
  #endif /* !CONFIG_SLOB */
  
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
  void kmem_cache_free(struct kmem_cache *, void *);
  
  /*
@@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p)
  }
  
  #ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
  #else
  static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
  {
@@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
  #endif
  
  #ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
  
  #ifdef CONFIG_NUMA
  extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
                                            gfp_t gfpflags,
-                                          int node, size_t size) __assume_slab_alignment;
+                                          int node, size_t size) __assume_slab_alignment __malloc;
  #else
  static __always_inline void *
  kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
  }
  #endif /* CONFIG_TRACING */
  
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
  
  #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
  #else
  static __always_inline void *
  kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h

index 9edbbf35234034e659f31bee62ffa191405d4356..8694f7a5d92b6a8a0b481f6d72819e4fb5c88dcf 100644 (file)
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -80,6 +80,10 @@ struct kmem_cache {
         struct kasan_cache kasan_info;
  #endif
  
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+       void *random_seq;
+#endif
+
         struct kmem_cache_node *node[MAX_NUMNODES];
  };
  
diff --git a/include/linux/string.h b/include/linux/string.h

index d3993a79a3254f27ae8337e45f7018dc7aaf05a6..26b6f6a66f835b5f3fcb44da978e3d75b57054ec 100644 (file)
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new);
  
  extern void kfree_const(const void *x);
  
-extern char *kstrdup(const char *s, gfp_t gfp);
+extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
  extern const char *kstrdup_const(const char *s, gfp_t gfp);
  extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
  extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
diff --git a/include/linux/time64.h b/include/linux/time64.h

index 367d5af899e811177d26fd71602043a279fe37f2..7e5d2fa9ac463ef1a9643d12b9f17ab63454cfbd 100644 (file)
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *
  # define timespec64_equal              timespec_equal
  # define timespec64_compare            timespec_compare
  # define set_normalized_timespec64     set_normalized_timespec
-# define timespec64_add_safe           timespec_add_safe
  # define timespec64_add                        timespec_add
  # define timespec64_sub                        timespec_sub
  # define timespec64_valid              timespec_valid
@@ -134,15 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct
  
  extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
  
-/*
- * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME_T_MAX if the returned value would be
- * smaller then either of the arguments.
- */
-extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
-                                        const struct timespec64 rhs);
-
-
  static inline struct timespec64 timespec64_add(struct timespec64 lhs,
                                                 struct timespec64 rhs)
  {
@@ -224,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
  
  #endif
  
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+                                        const struct timespec64 rhs);
+
  #endif /* _LINUX_TIME64_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h

index 73fae8c4a5fb50d94b72f12bed28f98d170f5787..d2da8e053210041bfcefb9e04b59d195880d2d0e 100644 (file)
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
  #ifdef CONFIG_NUMA
  
  extern unsigned long node_page_state(int node, enum zone_stat_item item);
-extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
  
  #else
  
  #define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl, _z, gfp) do { } while (0)
  
  #endif /* CONFIG_NUMA */
  
@@ -193,6 +191,10 @@ void quiet_vmstat(void);
  void cpu_vm_stats_fold(int cpu);
  void refresh_zone_stat_thresholds(void);
  
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos);
+
  void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
  
  int calculate_pressure_threshold(struct zone *zone);
diff --git a/include/misc/cxl.h b/include/misc/cxl.h

index 7d5e2613c7b803932f9a9f587d46b83690550ef8..56560c5781b466cd676ad9a8ea1a65a1a16afb7c 100644 (file)
--- a/include/misc/cxl.h
+++ b/include/misc/cxl.h
@@ -126,6 +126,14 @@ int cxl_afu_reset(struct cxl_context *ctx);
   */
  void cxl_set_master(struct cxl_context *ctx);
  
+/*
+ * Sets the context to use real mode memory accesses to operate with
+ * translation disabled. Note that this only makes sense for kernel contexts
+ * under bare metal, and will not work with virtualisation. May only be
+ * performed on stopped contexts.
+ */
+int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode);
+
  /*
   * Map and unmap the AFU Problem Space area. The amount and location mapped
   * depends on if this context is a master or slave.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h

index fb2cef4e97471bbeb67dc950f88736dd34a21b3c..fc0320c004a306a9007a93d1bc6a6633f7fc971d 100644 (file)
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -220,6 +220,7 @@ enum ib_device_cap_flags {
         IB_DEVICE_ON_DEMAND_PAGING              = (1 << 31),
         IB_DEVICE_SG_GAPS_REG                   = (1ULL << 32),
         IB_DEVICE_VIRTUAL_FUNCTION              = ((u64)1 << 33),
+       IB_DEVICE_RAW_SCATTER_FCS               = ((u64)1 << 34),
  };
  
  enum ib_signature_prot_cap {
@@ -931,6 +932,13 @@ struct ib_qp_cap {
         u32     max_send_sge;
         u32     max_recv_sge;
         u32     max_inline_data;
+
+       /*
+        * Maximum number of rdma_rw_ctx structures in flight at a time.
+        * ib_create_qp() will calculate the right amount of needed WRs
+        * and MRs based on this.
+        */
+       u32     max_rdma_ctxs;
  };
  
  enum ib_sig_type {
@@ -981,6 +989,7 @@ enum ib_qp_create_flags {
         IB_QP_CREATE_NETIF_QP                   = 1 << 5,
         IB_QP_CREATE_SIGNATURE_EN               = 1 << 6,
         IB_QP_CREATE_USE_GFP_NOIO               = 1 << 7,
+       IB_QP_CREATE_SCATTER_FCS                = 1 << 8,
         /* reserve bits 26-31 for low level drivers' internal use */
         IB_QP_CREATE_RESERVED_START             = 1 << 26,
         IB_QP_CREATE_RESERVED_END               = 1 << 31,
@@ -1002,7 +1011,11 @@ struct ib_qp_init_attr {
         enum ib_sig_type        sq_sig_type;
         enum ib_qp_type         qp_type;
         enum ib_qp_create_flags create_flags;
-       u8                      port_num; /* special QP types only */
+
+       /*
+        * Only needed for special QP types, or when using the RW API.
+        */
+       u8                      port_num;
  };
  
  struct ib_qp_open_attr {
@@ -1421,9 +1434,14 @@ struct ib_qp {
         struct ib_pd           *pd;
         struct ib_cq           *send_cq;
         struct ib_cq           *recv_cq;
+       spinlock_t              mr_lock;
+       int                     mrs_used;
+       struct list_head        rdma_mrs;
+       struct list_head        sig_mrs;
         struct ib_srq          *srq;
         struct ib_xrcd         *xrcd; /* XRC TGT QPs only */
         struct list_head        xrcd_list;
+
         /* count times opened, mcast attaches, flow attaches */
         atomic_t                usecnt;
         struct list_head        open_list;
@@ -1438,12 +1456,16 @@ struct ib_qp {
  struct ib_mr {
         struct ib_device  *device;
         struct ib_pd      *pd;
-       struct ib_uobject *uobject;
         u32                lkey;
         u32                rkey;
         u64                iova;
         u32                length;
         unsigned int       page_size;
+       bool               need_inval;
+       union {
+               struct ib_uobject       *uobject;       /* user */
+               struct list_head        qp_entry;       /* FR */
+       };
  };
  
  struct ib_mw {
@@ -1827,7 +1849,8 @@ struct ib_device {
                                                u32 max_num_sg);
         int                        (*map_mr_sg)(struct ib_mr *mr,
                                                 struct scatterlist *sg,
-                                               int sg_nents);
+                                               int sg_nents,
+                                               unsigned int *sg_offset);
         struct ib_mw *             (*alloc_mw)(struct ib_pd *pd,
                                                enum ib_mw_type type,
                                                struct ib_udata *udata);
@@ -2317,6 +2340,18 @@ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
                 device->add_gid && device->del_gid;
  }
  
+/*
+ * Check if the device supports READ W/ INVALIDATE.
+ */
+static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num)
+{
+       /*
+        * iWarp drivers must support READ W/ INVALIDATE.  No other protocol
+        * has support for it yet.
+        */
+       return rdma_protocol_iwarp(dev, port_num);
+}
+
  int ib_query_gid(struct ib_device *device,
                  u8 port_num, int index, union ib_gid *gid,
                  struct ib_gid_attr *attr);
@@ -3111,29 +3146,23 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
                                             u16 pkey, const union ib_gid *gid,
                                             const struct sockaddr *addr);
  
-int ib_map_mr_sg(struct ib_mr *mr,
-                struct scatterlist *sg,
-                int sg_nents,
-                unsigned int page_size);
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+                unsigned int *sg_offset, unsigned int page_size);
  
  static inline int
-ib_map_mr_sg_zbva(struct ib_mr *mr,
-                 struct scatterlist *sg,
-                 int sg_nents,
-                 unsigned int page_size)
+ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+                 unsigned int *sg_offset, unsigned int page_size)
  {
         int n;
  
-       n = ib_map_mr_sg(mr, sg, sg_nents, page_size);
+       n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size);
         mr->iova = 0;
  
         return n;
  }
  
-int ib_sg_to_pages(struct ib_mr *mr,
-                  struct scatterlist *sgl,
-                  int sg_nents,
-                  int (*set_page)(struct ib_mr *, u64));
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+               unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64));
  
  void ib_drain_rq(struct ib_qp *qp);
  void ib_drain_sq(struct ib_qp *qp);
diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h

new file mode 100644 (file)

index 0000000..986010b
--- /dev/null
+++ b/include/rdma/mr_pool.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#ifndef _RDMA_MR_POOL_H
+#define _RDMA_MR_POOL_H 1
+
+#include <rdma/ib_verbs.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list);
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+               enum ib_mr_type type, u32 max_num_sg);
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list);
+
+#endif /* _RDMA_MR_POOL_H */
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h

index a8696551abb1ef16da877252555e572a9b046429..d57ceee90d26fee02ba6535976d19e024239429e 100644 (file)
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -467,6 +467,7 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
  }
  
  struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
+void rvt_dealloc_device(struct rvt_dev_info *rdi);
  int rvt_register_device(struct rvt_dev_info *rvd);
  void rvt_unregister_device(struct rvt_dev_info *rvd);
  int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h

index 497e59065c2c6e31accc23896d42cd224d78887a..0e1ff2abfe927290acd42accd85598c950e89cd4 100644 (file)
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -117,8 +117,9 @@
  /*
   * Wait flags that would prevent any packet type from being sent.
   */
-#define RVT_S_ANY_WAIT_IO (RVT_S_WAIT_PIO | RVT_S_WAIT_TX | \
-       RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM)
+#define RVT_S_ANY_WAIT_IO \
+       (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \
+        RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM)
  
  /*
   * Wait flags that would prevent send work requests from making progress.
diff --git a/include/rdma/rw.h b/include/rdma/rw.h

new file mode 100644 (file)

index 0000000..377d865
--- /dev/null
+++ b/include/rdma/rw.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#ifndef _RDMA_RW_H
+#define _RDMA_RW_H
+
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/mr_pool.h>
+
+struct rdma_rw_ctx {
+       /* number of RDMA READ/WRITE WRs (not counting MR WRs) */
+       u32                     nr_ops;
+
+       /* tag for the union below: */
+       u8                      type;
+
+       union {
+               /* for mapping a single SGE: */
+               struct {
+                       struct ib_sge           sge;
+                       struct ib_rdma_wr       wr;
+               } single;
+
+               /* for mapping of multiple SGEs: */
+               struct {
+                       struct ib_sge           *sges;
+                       struct ib_rdma_wr       *wrs;
+               } map;
+
+               /* for registering multiple WRs: */
+               struct rdma_rw_reg_ctx {
+                       struct ib_sge           sge;
+                       struct ib_rdma_wr       wr;
+                       struct ib_reg_wr        reg_wr;
+                       struct ib_send_wr       inv_wr;
+                       struct ib_mr            *mr;
+               } *reg;
+
+               struct {
+                       struct rdma_rw_reg_ctx  data;
+                       struct rdma_rw_reg_ctx  prot;
+                       struct ib_send_wr       sig_inv_wr;
+                       struct ib_mr            *sig_mr;
+                       struct ib_sge           sig_sge;
+                       struct ib_sig_handover_wr sig_wr;
+               } *sig;
+       };
+};
+
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+               u64 remote_addr, u32 rkey, enum dma_data_direction dir);
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               struct scatterlist *sg, u32 sg_cnt,
+               enum dma_data_direction dir);
+
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               struct scatterlist *prot_sg, u32 prot_sg_cnt,
+               struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey,
+               enum dma_data_direction dir);
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               struct scatterlist *prot_sg, u32 prot_sg_cnt,
+               enum dma_data_direction dir);
+
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+               u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
+
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr);
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr);
+void rdma_rw_cleanup_mrs(struct ib_qp *qp);
+
+#endif /* _RDMA_RW_H */
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h

index 28ee5c2e6bcd7b08abeda34669cc2da17f4dcf9a..d8ab5101fad5533876339f788b75d855d382bc4f 100644 (file)
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -85,7 +85,6 @@ extern struct configfs_attribute *passthrough_attrib_attrs[];
  void   *transport_kmap_data_sg(struct se_cmd *);
  void   transport_kunmap_data_sg(struct se_cmd *);
  /* core helpers also used by xcopy during internal command setup */
-int    target_alloc_sgl(struct scatterlist **, unsigned int *, u32, bool);
  sense_reason_t transport_generic_map_mem_to_cmd(struct se_cmd *,
                 struct scatterlist *, u32, struct scatterlist *, u32);
  
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h

index 8ff6d40a294fd5a1fcbd8f10d0139d289c00d741..78d88f03b2960b4655d44dfeb575f61d7ba9cf07 100644 (file)
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -185,6 +185,10 @@ int        core_tpg_set_initiator_node_tag(struct se_portal_group *,
  int    core_tpg_register(struct se_wwn *, struct se_portal_group *, int);
  int    core_tpg_deregister(struct se_portal_group *);
  
+int    target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
+               u32 length, bool zero_page, bool chainable);
+void   target_free_sgl(struct scatterlist *sgl, int nents);
+
  /*
   * The LIO target core uses DMA_TO_DEVICE to mean that data is going
   * to the target (eg handling a WRITE) and DMA_FROM_DEVICE to mean
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h

index 8126c143a519f2d499d536d2d6e10b2587e9290f..b6543d73d20a7fd03a0e77a2615679bad978d74a 100644 (file)
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -226,6 +226,7 @@ struct ib_uverbs_ex_query_device_resp {
         struct ib_uverbs_odp_caps odp_caps;
         __u64 timestamp_mask;
         __u64 hca_core_clock; /* in KHZ */
+       __u64 device_cap_flags_ex;
  };
  
  struct ib_uverbs_query_port {
diff --git a/include/video/sh_mipi_dsi.h b/include/video/sh_mipi_dsi.h

deleted file mode 100644 (file)

index a01f197..0000000
--- a/include/video/sh_mipi_dsi.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Public SH-mobile MIPI DSI header
- *
- * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef VIDEO_SH_MIPI_DSI_H
-#define VIDEO_SH_MIPI_DSI_H
-
-enum sh_mipi_dsi_data_fmt {
-       MIPI_RGB888,
-       MIPI_RGB565,
-       MIPI_RGB666_LP,
-       MIPI_RGB666,
-       MIPI_BGR888,
-       MIPI_BGR565,
-       MIPI_BGR666_LP,
-       MIPI_BGR666,
-       MIPI_YUYV,
-       MIPI_UYVY,
-       MIPI_YUV420_L,
-       MIPI_YUV420,
-};
-
-#define SH_MIPI_DSI_HSABM      (1 << 0)
-#define SH_MIPI_DSI_HBPBM      (1 << 1)
-#define SH_MIPI_DSI_HFPBM      (1 << 2)
-#define SH_MIPI_DSI_BL2E       (1 << 3)
-#define SH_MIPI_DSI_VSEE       (1 << 4)
-#define SH_MIPI_DSI_HSEE       (1 << 5)
-#define SH_MIPI_DSI_HSAE       (1 << 6)
-
-#define SH_MIPI_DSI_HSbyteCLK  (1 << 24)
-#define SH_MIPI_DSI_HS6divCLK  (1 << 25)
-#define SH_MIPI_DSI_HS4divCLK  (1 << 26)
-
-#define SH_MIPI_DSI_SYNC_PULSES_MODE   (SH_MIPI_DSI_VSEE | \
-                                        SH_MIPI_DSI_HSEE | \
-                                        SH_MIPI_DSI_HSAE)
-#define SH_MIPI_DSI_SYNC_EVENTS_MODE   (0)
-#define SH_MIPI_DSI_SYNC_BURST_MODE    (SH_MIPI_DSI_BL2E)
-
-struct sh_mipi_dsi_info {
-       enum sh_mipi_dsi_data_fmt       data_format;
-       int                             channel;
-       int                             lane;
-       unsigned long                   flags;
-       u32                             clksrc;
-       u32                             phyctrl; /* for extra setting */
-       unsigned int                    vsynw_offset;
-       int     (*set_dot_clock)(struct platform_device *pdev,
-                                void __iomem *base,
-                                int enable);
-};
-
-#endif
diff --git a/init/Kconfig b/init/Kconfig

index 0dfd09d54c6519fb8a5069bf6153f4245184d01f..79a91a2c0444c72f8cb5c0435927d0ec06de5280 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1742,6 +1742,15 @@ config SLOB
  
  endchoice
  
+config SLAB_FREELIST_RANDOM
+       default n
+       depends on SLAB
+       bool "SLAB freelist randomization"
+       help
+         Randomizes the freelist order used on creating new SLABs. This
+         security feature reduces the predictability of the kernel slab
+         allocator against heap overflows.
+
  config SLUB_CPU_PARTIAL
         default y
         depends on SLUB && SMP
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 1902956baba1fa19d915c2ccad402a64209b4860..73e93e53884d1098ceec4b3c8d31049c8d9fc70b 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
  #include <linux/cgroup.h>
  #include <linux/wait.h>
  
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
  
  /* See "Frequency meter" comments, below. */
  
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
   *     GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
   *     GFP_USER     - only nodes in current tasks mems allowed ok.
   */
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
  {
         struct cpuset *cs;              /* current cpuset ancestors */
         int allowed;                    /* is allocation in zone z allowed? */
         unsigned long flags;
  
         if (in_interrupt())
-               return 1;
+               return true;
         if (node_isset(node, current->mems_allowed))
-               return 1;
+               return true;
         /*
          * Allow tasks that have access to memory reserves because they have
          * been OOM killed to get memory anywhere.
          */
         if (unlikely(test_thread_flag(TIF_MEMDIE)))
-               return 1;
+               return true;
         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
-               return 0;
+               return false;
  
         if (current->flags & PF_EXITING) /* Let dying task have memory */
-               return 1;
+               return true;
  
         /* Not hardwall and node outside mems_allowed: scan up cpusets */
         spin_lock_irqsave(&callback_lock, flags);
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
  
  static int cpuset_spread_node(int *rotor)
  {
-       int node;
-
-       node = next_node(*rotor, current->mems_allowed);
-       if (node == MAX_NUMNODES)
-               node = first_node(current->mems_allowed);
-       *rotor = node;
-       return node;
+       return *rotor = next_node_in(*rotor, current->mems_allowed);
  }
  
  int cpuset_mem_spread_node(void)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c

index 1391d3ee3b8666c07b677ffd35a5fd2f2b2882d7..1c03dfb4abfd31fa2d15fbd74c1c3e11794e3920 100644 (file)
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
         VMCOREINFO_STRUCT_SIZE(list_head);
         VMCOREINFO_SIZE(nodemask_t);
         VMCOREINFO_OFFSET(page, flags);
-       VMCOREINFO_OFFSET(page, _count);
+       VMCOREINFO_OFFSET(page, _refcount);
         VMCOREINFO_OFFSET(page, mapping);
         VMCOREINFO_OFFSET(page, lru);
         VMCOREINFO_OFFSET(page, _mapcount);
diff --git a/kernel/memremap.c b/kernel/memremap.c

index a6d382312e6f3ff2cdb0bb526f8bbd2978ebffc1..017532193fb1c009b83801b27172f8210988f074 100644 (file)
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
  }
  #endif
  
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+       return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
  static void *try_ram_remap(resource_size_t offset, size_t size)
  {
         unsigned long pfn = PHYS_PFN(offset);
@@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
         /* In the simple case just return the existing linear address */
         if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
                 return __va(offset);
-       return NULL; /* fallback to ioremap_cache */
+       return NULL; /* fallback to arch_memremap_wb */
  }
  
  /**
@@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
                 if (is_ram == REGION_INTERSECTS)
                         addr = try_ram_remap(offset, size);
                 if (!addr)
-                       addr = ioremap_cache(offset, size);
+                       addr = arch_memremap_wb(offset, size);
         }
  
         /*
diff --git a/kernel/padata.c b/kernel/padata.c

index b38bea9c466a012a9bc04423af25a5c7d04be34d..993278895ccc6d325d525d5f29af5323c273deab 100644 (file)
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -606,33 +606,6 @@ out_replace:
         return 0;
  }
  
-/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- *                       one is used by parallel workers and the second one
- *                       by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
-                       cpumask_var_t cbcpumask)
-{
-       int err;
-
-       mutex_lock(&pinst->lock);
-       get_online_cpus();
-
-       err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
-       put_online_cpus();
-       mutex_unlock(&pinst->lock);
-
-       return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
  /**
   * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
   *                     equivalent to @cpumask.
@@ -674,6 +647,43 @@ out:
  }
  EXPORT_SYMBOL(padata_set_cpumask);
  
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+       int err = 0;
+
+       mutex_lock(&pinst->lock);
+
+       if (pinst->flags & PADATA_INVALID)
+               err = -EINVAL;
+
+        __padata_start(pinst);
+
+       mutex_unlock(&pinst->lock);
+
+       return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+       mutex_lock(&pinst->lock);
+       __padata_stop(pinst);
+       mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
  static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
  {
         struct parallel_data *pd;
@@ -694,42 +704,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
         return 0;
  }
  
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- *                  padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- *        The @mask may be any combination of the following flags:
- *          PADATA_CPU_SERIAL   - serial cpumask
- *          PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
-       int err;
-
-       if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
-               return -EINVAL;
-
-       mutex_lock(&pinst->lock);
-
-       get_online_cpus();
-       if (mask & PADATA_CPU_SERIAL)
-               cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
-       if (mask & PADATA_CPU_PARALLEL)
-               cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
-       err = __padata_add_cpu(pinst, cpu);
-       put_online_cpus();
-
-       mutex_unlock(&pinst->lock);
-
-       return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
  static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
  {
         struct parallel_data *pd = NULL;
@@ -789,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
  }
  EXPORT_SYMBOL(padata_remove_cpu);
  
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
-{
-       int err = 0;
-
-       mutex_lock(&pinst->lock);
-
-       if (pinst->flags & PADATA_INVALID)
-               err =-EINVAL;
-
-        __padata_start(pinst);
-
-       mutex_unlock(&pinst->lock);
-
-       return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
-       mutex_lock(&pinst->lock);
-       __padata_stop(pinst);
-       mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
  static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
  {
         return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
@@ -1091,7 +1028,6 @@ err_free_inst:
  err:
         return NULL;
  }
-EXPORT_SYMBOL(padata_alloc);
  
  /**
   * padata_free - free a padata instance
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c

index 3ccdc8eebc5afff02707b7b20772e8b4f978486c..3e888cd5a5941c43dd05f52209f7f5e884a9bf43 100644 (file)
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head)
         debug_object_free(head, &rcuhead_debug_descr);
  }
  
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
  {
-       struct rcu_head *head = addr;
-
-       switch (state) {
-
-       case ODEBUG_STATE_NOTAVAILABLE:
-               /*
-                * This is not really a fixup. We just make sure that it is
-                * tracked in the object tracker.
-                */
-               debug_object_init(head, &rcuhead_debug_descr);
-               debug_object_activate(head, &rcuhead_debug_descr);
-               return 0;
-       default:
-               return 1;
-       }
+       return true;
  }
  
  /**
@@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
  
  struct debug_obj_descr rcuhead_debug_descr = {
         .name = "rcu_head",
-       .fixup_activate = rcuhead_fixup_activate,
+       .is_static_object = rcuhead_is_static_object,
  };
  EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
  #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index c8b318663525d02b2098238341aca72c701966fe..2effd84d83e3f5f8d8c2b0496d77cbe1a61b590e 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec_jiffies,
         },
+       {
+               .procname       = "stat_refresh",
+               .data           = NULL,
+               .maxlen         = 0,
+               .mode           = 0600,
+               .proc_handler   = vmstat_refresh,
+       },
  #endif
  #ifdef CONFIG_MMU
         {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c

index fa0b983290cf722fa7ed3f054dcf0cdc45318d5f..8c7392c4fdbd9ace42b096a2710257f1ea70e5ca 100644 (file)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr)
   * fixup_init is called when:
   * - an active object is initialized
   */
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
  {
         struct hrtimer *timer = addr;
  
@@ -342,30 +342,25 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 hrtimer_cancel(timer);
                 debug_object_init(timer, &hrtimer_debug_descr);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
  /*
   * fixup_activate is called when:
   * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
   */
-static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
  {
         switch (state) {
-
-       case ODEBUG_STATE_NOTAVAILABLE:
-               WARN_ON_ONCE(1);
-               return 0;
-
         case ODEBUG_STATE_ACTIVE:
                 WARN_ON(1);
  
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -373,7 +368,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
   * fixup_free is called when:
   * - an active object is freed
   */
-static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
  {
         struct hrtimer *timer = addr;
  
@@ -381,9 +376,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 hrtimer_cancel(timer);
                 debug_object_free(timer, &hrtimer_debug_descr);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
diff --git a/kernel/time/time.c b/kernel/time/time.c

index a4064b6120664c0cb87b553cba3dc8c454dec997..667b9335f5d6bf4b86338eadf840dc22e8fc94d5 100644 (file)
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,24 @@ struct timespec timespec_add_safe(const struct timespec lhs,
  
         return res;
  }
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0).
+ * And, each timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+                               const struct timespec64 rhs)
+{
+       struct timespec64 res;
+
+       set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+                       lhs.tv_nsec + rhs.tv_nsec);
+
+       if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+               res.tv_sec = TIME64_MAX;
+               res.tv_nsec = 0;
+       }
+
+       return res;
+}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c

index 73164c3aa56b51fbeddc7cd3cc26a91fc92d8525..3a95f9728778c587b316274fd510b79ed5574494 100644 (file)
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -489,11 +489,19 @@ static void *timer_debug_hint(void *addr)
         return ((struct timer_list *) addr)->function;
  }
  
+static bool timer_is_static_object(void *addr)
+{
+       struct timer_list *timer = addr;
+
+       return (timer->entry.pprev == NULL &&
+               timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
  /*
   * fixup_init is called when:
   * - an active object is initialized
   */
-static int timer_fixup_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_init(void *addr, enum debug_obj_state state)
  {
         struct timer_list *timer = addr;
  
@@ -501,9 +509,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 del_timer_sync(timer);
                 debug_object_init(timer, &timer_debug_descr);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -516,36 +524,22 @@ static void stub_timer(unsigned long data)
  /*
   * fixup_activate is called when:
   * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
   */
-static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
  {
         struct timer_list *timer = addr;
  
         switch (state) {
-
         case ODEBUG_STATE_NOTAVAILABLE:
-               /*
-                * This is not really a fixup. The timer was
-                * statically initialized. We just make sure that it
-                * is tracked in the object tracker.
-                */
-               if (timer->entry.pprev == NULL &&
-                   timer->entry.next == TIMER_ENTRY_STATIC) {
-                       debug_object_init(timer, &timer_debug_descr);
-                       debug_object_activate(timer, &timer_debug_descr);
-                       return 0;
-               } else {
-                       setup_timer(timer, stub_timer, 0);
-                       return 1;
-               }
-               return 0;
+               setup_timer(timer, stub_timer, 0);
+               return true;
  
         case ODEBUG_STATE_ACTIVE:
                 WARN_ON(1);
  
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -553,7 +547,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
   * fixup_free is called when:
   * - an active object is freed
   */
-static int timer_fixup_free(void *addr, enum debug_obj_state state)
+static bool timer_fixup_free(void *addr, enum debug_obj_state state)
  {
         struct timer_list *timer = addr;
  
@@ -561,9 +555,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 del_timer_sync(timer);
                 debug_object_free(timer, &timer_debug_descr);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -571,32 +565,23 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
   * fixup_assert_init is called when:
   * - an untracked/uninit-ed object is found
   */
-static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
  {
         struct timer_list *timer = addr;
  
         switch (state) {
         case ODEBUG_STATE_NOTAVAILABLE:
-               if (timer->entry.next == TIMER_ENTRY_STATIC) {
-                       /*
-                        * This is not really a fixup. The timer was
-                        * statically initialized. We just make sure that it
-                        * is tracked in the object tracker.
-                        */
-                       debug_object_init(timer, &timer_debug_descr);
-                       return 0;
-               } else {
-                       setup_timer(timer, stub_timer, 0);
-                       return 1;
-               }
+               setup_timer(timer, stub_timer, 0);
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
  static struct debug_obj_descr timer_debug_descr = {
         .name                   = "timer_list",
         .debug_hint             = timer_debug_hint,
+       .is_static_object       = timer_is_static_object,
         .fixup_init             = timer_fixup_init,
         .fixup_activate         = timer_fixup_activate,
         .fixup_free             = timer_fixup_free,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index 7e8d792da9630ee9db915e7e849de268ff999878..a6c8252d7776b3954df1db09fb1831c603093c7c 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3456,11 +3456,23 @@ struct ftrace_glob {
         int type;
  };
  
+/*
+ * If symbols in an architecture don't correspond exactly to the user-visible
+ * name of what they represent, it is possible to define this function to
+ * perform the necessary adjustments.
+ */
+char * __weak arch_ftrace_match_adjust(char *str, const char *search)
+{
+       return str;
+}
+
  static int ftrace_match(char *str, struct ftrace_glob *g)
  {
         int matched = 0;
         int slen;
  
+       str = arch_ftrace_match_adjust(str, g->search);
+
         switch (g->type) {
         case MATCH_FULL:
                 if (strcmp(str, g->search) == 0)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index 5f5068e94003d80836040d75931269aca247e853..e1c0e996b5ae63175feb24d5d6a6c8b541b63287 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,54 +433,28 @@ static void *work_debug_hint(void *addr)
         return ((struct work_struct *) addr)->func;
  }
  
-/*
- * fixup_init is called when:
- * - an active object is initialized
- */
-static int work_fixup_init(void *addr, enum debug_obj_state state)
+static bool work_is_static_object(void *addr)
  {
         struct work_struct *work = addr;
  
-       switch (state) {
-       case ODEBUG_STATE_ACTIVE:
-               cancel_work_sync(work);
-               debug_object_init(work, &work_debug_descr);
-               return 1;
-       default:
-               return 0;
-       }
+       return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
  }
  
  /*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * fixup_init is called when:
+ * - an active object is initialized
   */
-static int work_fixup_activate(void *addr, enum debug_obj_state state)
+static bool work_fixup_init(void *addr, enum debug_obj_state state)
  {
         struct work_struct *work = addr;
  
         switch (state) {
-
-       case ODEBUG_STATE_NOTAVAILABLE:
-               /*
-                * This is not really a fixup. The work struct was
-                * statically initialized. We just make sure that it
-                * is tracked in the object tracker.
-                */
-               if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
-                       debug_object_init(work, &work_debug_descr);
-                       debug_object_activate(work, &work_debug_descr);
-                       return 0;
-               }
-               WARN_ON_ONCE(1);
-               return 0;
-
         case ODEBUG_STATE_ACTIVE:
-               WARN_ON(1);
-
+               cancel_work_sync(work);
+               debug_object_init(work, &work_debug_descr);
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -488,7 +462,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
   * fixup_free is called when:
   * - an active object is freed
   */
-static int work_fixup_free(void *addr, enum debug_obj_state state)
+static bool work_fixup_free(void *addr, enum debug_obj_state state)
  {
         struct work_struct *work = addr;
  
@@ -496,17 +470,17 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 cancel_work_sync(work);
                 debug_object_free(work, &work_debug_descr);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
  static struct debug_obj_descr work_debug_descr = {
         .name           = "work_struct",
         .debug_hint     = work_debug_hint,
+       .is_static_object = work_is_static_object,
         .fixup_init     = work_fixup_init,
-       .fixup_activate = work_fixup_activate,
         .fixup_free     = work_fixup_free,
  };
  
diff --git a/lib/Makefile b/lib/Makefile

index 931396ada5eb47fbe7691d68e00cd1c24cfbeff5..42b69185f9634619fd57787543f3e5192604fcae 100644 (file)
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
          sha1.o md5.o irq_regs.o argv_split.o \
          flex_proportions.o ratelimit.o show_mem.o \
          is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-        earlycpio.o seq_buf.o nmi_backtrace.o
+        earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
  
  obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
  lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/debugobjects.c b/lib/debugobjects.c

index 519b5a10fd704dd412c7cc9c7327fcb2da3c6f0e..a8e12601eb37dca5694faa1a5bfc9cb5e45497f1 100644 (file)
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
   * Try to repair the damage, so we have a better chance to get useful
   * debug output.
   */
-static int
-debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
+static bool
+debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state),
                    void * addr, enum debug_obj_state state)
  {
-       int fixed = 0;
-
-       if (fixup)
-               fixed = fixup(addr, state);
-       debug_objects_fixups += fixed;
-       return fixed;
+       if (fixup && fixup(addr, state)) {
+               debug_objects_fixups++;
+               return true;
+       }
+       return false;
  }
  
  static void debug_object_is_on_stack(void *addr, int onstack)
@@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
                         state = obj->state;
                         raw_spin_unlock_irqrestore(&db->lock, flags);
                         ret = debug_object_fixup(descr->fixup_activate, addr, state);
-                       return ret ? -EINVAL : 0;
+                       return ret ? 0 : -EINVAL;
  
                 case ODEBUG_STATE_DESTROYED:
                         debug_print_object(obj, "activate");
@@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
  
         raw_spin_unlock_irqrestore(&db->lock, flags);
         /*
-        * This happens when a static object is activated. We
-        * let the type specific code decide whether this is
-        * true or not.
+        * We are here when a static object is activated. We
+        * let the type specific code confirm whether this is
+        * true or not. If true, we just make sure that the
+        * static object is tracked in the object tracker. If
+        * not, this must be a bug, so we try to fix it up.
          */
-       if (debug_object_fixup(descr->fixup_activate, addr,
-                          ODEBUG_STATE_NOTAVAILABLE)) {
+       if (descr->is_static_object && descr->is_static_object(addr)) {
+               /* track this static object */
+               debug_object_init(addr, descr);
+               debug_object_activate(addr, descr);
+       } else {
                 debug_print_object(&o, "activate");
-               return -EINVAL;
+               ret = debug_object_fixup(descr->fixup_activate, addr,
+                                       ODEBUG_STATE_NOTAVAILABLE);
+               return ret ? 0 : -EINVAL;
         }
         return 0;
  }
@@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
  
                 raw_spin_unlock_irqrestore(&db->lock, flags);
                 /*
-                * Maybe the object is static.  Let the type specific
-                * code decide what to do.
+                * Maybe the object is static, and we let the type specific
+                * code confirm. Track this static object if true, else invoke
+                * fixup.
                  */
-               if (debug_object_fixup(descr->fixup_assert_init, addr,
-                                      ODEBUG_STATE_NOTAVAILABLE))
+               if (descr->is_static_object && descr->is_static_object(addr)) {
+                       /* Track this static object */
+                       debug_object_init(addr, descr);
+               } else {
                         debug_print_object(&o, "assert_init");
+                       debug_object_fixup(descr->fixup_assert_init, addr,
+                                          ODEBUG_STATE_NOTAVAILABLE);
+               }
                 return;
         }
  
@@ -793,11 +805,18 @@ struct self_test {
  
  static __initdata struct debug_obj_descr descr_type_test;
  
+static bool __init is_static_object(void *addr)
+{
+       struct self_test *obj = addr;
+
+       return obj->static_init;
+}
+
  /*
   * fixup_init is called when:
   * - an active object is initialized
   */
-static int __init fixup_init(void *addr, enum debug_obj_state state)
+static bool __init fixup_init(void *addr, enum debug_obj_state state)
  {
         struct self_test *obj = addr;
  
@@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 debug_object_deactivate(obj, &descr_type_test);
                 debug_object_init(obj, &descr_type_test);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
  /*
   * fixup_activate is called when:
   * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
   */
-static int __init fixup_activate(void *addr, enum debug_obj_state state)
+static bool __init fixup_activate(void *addr, enum debug_obj_state state)
  {
         struct self_test *obj = addr;
  
         switch (state) {
         case ODEBUG_STATE_NOTAVAILABLE:
-               if (obj->static_init == 1) {
-                       debug_object_init(obj, &descr_type_test);
-                       debug_object_activate(obj, &descr_type_test);
-                       return 0;
-               }
-               return 1;
-
+               return true;
         case ODEBUG_STATE_ACTIVE:
                 debug_object_deactivate(obj, &descr_type_test);
                 debug_object_activate(obj, &descr_type_test);
-               return 1;
+               return true;
  
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
   * fixup_destroy is called when:
   * - an active object is destroyed
   */
-static int __init fixup_destroy(void *addr, enum debug_obj_state state)
+static bool __init fixup_destroy(void *addr, enum debug_obj_state state)
  {
         struct self_test *obj = addr;
  
@@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 debug_object_deactivate(obj, &descr_type_test);
                 debug_object_destroy(obj, &descr_type_test);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
   * fixup_free is called when:
   * - an active object is freed
   */
-static int __init fixup_free(void *addr, enum debug_obj_state state)
+static bool __init fixup_free(void *addr, enum debug_obj_state state)
  {
         struct self_test *obj = addr;
  
@@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 debug_object_deactivate(obj, &descr_type_test);
                 debug_object_free(obj, &descr_type_test);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
@@ -917,6 +930,7 @@ out:
  
  static __initdata struct debug_obj_descr descr_type_test = {
         .name                   = "selftest",
+       .is_static_object       = is_static_object,
         .fixup_init             = fixup_init,
         .fixup_activate         = fixup_activate,
         .fixup_destroy          = fixup_destroy,
diff --git a/lib/nodemask.c b/lib/nodemask.c

new file mode 100644 (file)

index 0000000..e42a5bf
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+       int ret = __next_node(node, srcp);
+
+       if (ret == MAX_NUMNODES)
+               ret = __first_node(srcp);
+       return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+       int w, bit = NUMA_NO_NODE;
+
+       w = nodes_weight(*maskp);
+       if (w)
+               bit = bitmap_ord_to_pos(maskp->bits,
+                       get_random_int() % w, MAX_NUMNODES);
+       return bit;
+}
+#endif
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c

index f051d69f0910a65be2dbce9799736e2ba4eee2c7..72d36113ccaa8509a7140b506cdc0330690a8b15 100644 (file)
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock);
  
  static struct debug_obj_descr percpu_counter_debug_descr;
  
-static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
+static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
  {
         struct percpu_counter *fbc = addr;
  
@@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
         case ODEBUG_STATE_ACTIVE:
                 percpu_counter_destroy(fbc);
                 debug_object_free(fbc, &percpu_counter_debug_descr);
-               return 1;
+               return true;
         default:
-               return 0;
+               return false;
         }
  }
  
diff --git a/mm/Kconfig b/mm/Kconfig

index 989f8f3d77e0d7bf552a008f5aee227a5c3ddcd0..b0432b71137dc9f10eed2ae0a7ef0dfb2270dc3d 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE
         def_bool y
         depends on SPARSEMEM && MEMORY_HOTPLUG
  
+config MEMORY_HOTPLUG_DEFAULT_ONLINE
+        bool "Online the newly added memory blocks by default"
+        default n
+        depends on MEMORY_HOTPLUG
+        help
+         This option sets the default memory hotplug onlining policy
+         (/sys/devices/system/memory/auto_online_blocks), which determines
+         what happens to newly added memory regions. The policy setting
+         can always be changed at runtime.
+         See Documentation/memory-hotplug.txt for more information.
+
+         Say Y here if you want all hot-plugged memory blocks to appear in
+         'online' state by default.
+         Say N here if you want the default policy to keep all hot-plugged
+         memory blocks in 'offline' state.
+
  config MEMORY_HOTREMOVE
         bool "Allow for memory hot remove"
         select MEMORY_ISOLATION
@@ -268,11 +284,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
  config PHYS_ADDR_T_64BIT
         def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
  
-config ZONE_DMA_FLAG
-       int
-       default "0" if !ZONE_DMA
-       default "1"
-
  config BOUNCE
         bool "Enable bounce buffers"
         default y
diff --git a/mm/compaction.c b/mm/compaction.c

index 8fa2540438015c1859724c606072a2770939d954..eda3c2244f3056bec88e09195b1a9d5e30928200 100644 (file)
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
  #define CREATE_TRACE_POINTS
  #include <trace/events/compaction.h>
  
+#define block_start_pfn(pfn, order)    round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order)      ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn)       block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn)         block_end_pfn(pfn, pageblock_order)
+
  static unsigned long release_freepages(struct list_head *freelist)
  {
         struct page *page, *next;
@@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone)
         zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
         zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
         zone->compact_cached_free_pfn =
-                       round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
+                               pageblock_start_pfn(zone_end_pfn(zone) - 1);
  }
  
  /*
@@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc,
         LIST_HEAD(freelist);
  
         pfn = start_pfn;
-       block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+       block_start_pfn = pageblock_start_pfn(pfn);
         if (block_start_pfn < cc->zone->zone_start_pfn)
                 block_start_pfn = cc->zone->zone_start_pfn;
-       block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+       block_end_pfn = pageblock_end_pfn(pfn);
  
         for (; pfn < end_pfn; pfn += isolated,
                                 block_start_pfn = block_end_pfn,
@@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc,
                  * scanning range to right one.
                  */
                 if (pfn >= block_end_pfn) {
-                       block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
-                       block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+                       block_start_pfn = pageblock_start_pfn(pfn);
+                       block_end_pfn = pageblock_end_pfn(pfn);
                         block_end_pfn = min(block_end_pfn, end_pfn);
                 }
  
@@ -633,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
  {
         struct zone *zone = cc->zone;
         unsigned long nr_scanned = 0, nr_isolated = 0;
-       struct list_head *migratelist = &cc->migratepages;
         struct lruvec *lruvec;
         unsigned long flags = 0;
         bool locked = false;
         struct page *page = NULL, *valid_page = NULL;
         unsigned long start_pfn = low_pfn;
+       bool skip_on_failure = false;
+       unsigned long next_skip_pfn = 0;
  
         /*
          * Ensure that there are not too many pages isolated from the LRU
@@ -659,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
         if (compact_should_abort(cc))
                 return 0;
  
+       if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+               skip_on_failure = true;
+               next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+       }
+
         /* Time to isolate some pages for migration */
         for (; low_pfn < end_pfn; low_pfn++) {
                 bool is_lru;
  
+               if (skip_on_failure && low_pfn >= next_skip_pfn) {
+                       /*
+                        * We have isolated all migration candidates in the
+                        * previous order-aligned block, and did not skip it due
+                        * to failure. We should migrate the pages now and
+                        * hopefully succeed compaction.
+                        */
+                       if (nr_isolated)
+                               break;
+
+                       /*
+                        * We failed to isolate in the previous order-aligned
+                        * block. Set the new boundary to the end of the
+                        * current block. Note we can't simply increase
+                        * next_skip_pfn by 1 << order, as low_pfn might have
+                        * been incremented by a higher number due to skipping
+                        * a compound or a high-order buddy page in the
+                        * previous loop iteration.
+                        */
+                       next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+               }
+
                 /*
                  * Periodically drop the lock (if held) regardless of its
                  * contention, to give chance to IRQs. Abort async compaction
@@ -674,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                         break;
  
                 if (!pfn_valid_within(low_pfn))
-                       continue;
+                       goto isolate_fail;
                 nr_scanned++;
  
                 page = pfn_to_page(low_pfn);
@@ -729,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                         if (likely(comp_order < MAX_ORDER))
                                 low_pfn += (1UL << comp_order) - 1;
  
-                       continue;
+                       goto isolate_fail;
                 }
  
                 if (!is_lru)
-                       continue;
+                       goto isolate_fail;
  
                 /*
                  * Migration will fail if an anonymous page is pinned in memory,
@@ -742,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                  */
                 if (!page_mapping(page) &&
                     page_count(page) > page_mapcount(page))
-                       continue;
+                       goto isolate_fail;
  
                 /* If we already hold the lock, we can skip some rechecking */
                 if (!locked) {
@@ -753,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
  
                         /* Recheck PageLRU and PageCompound under lock */
                         if (!PageLRU(page))
-                               continue;
+                               goto isolate_fail;
  
                         /*
                          * Page become compound since the non-locked check,
@@ -762,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                          */
                         if (unlikely(PageCompound(page))) {
                                 low_pfn += (1UL << compound_order(page)) - 1;
-                               continue;
+                               goto isolate_fail;
                         }
                 }
  
@@ -770,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
  
                 /* Try isolate the page */
                 if (__isolate_lru_page(page, isolate_mode) != 0)
-                       continue;
+                       goto isolate_fail;
  
                 VM_BUG_ON_PAGE(PageCompound(page), page);
  
@@ -778,15 +811,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                 del_page_from_lru_list(page, lruvec, page_lru(page));
  
  isolate_success:
-               list_add(&page->lru, migratelist);
+               list_add(&page->lru, &cc->migratepages);
                 cc->nr_migratepages++;
                 nr_isolated++;
  
+               /*
+                * Record where we could have freed pages by migration and not
+                * yet flushed them to buddy allocator.
+                * - this is the lowest page that was isolated and will likely
+                * then be freed by migration.
+                */
+               if (!cc->last_migrated_pfn)
+                       cc->last_migrated_pfn = low_pfn;
+
                 /* Avoid isolating too much */
                 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
                         ++low_pfn;
                         break;
                 }
+
+               continue;
+isolate_fail:
+               if (!skip_on_failure)
+                       continue;
+
+               /*
+                * We have isolated some pages, but then failed. Release them
+                * instead of migrating, as we cannot form the cc->order buddy
+                * page anyway.
+                */
+               if (nr_isolated) {
+                       if (locked) {
+                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               locked = false;
+                       }
+                       acct_isolated(zone, cc);
+                       putback_movable_pages(&cc->migratepages);
+                       cc->nr_migratepages = 0;
+                       cc->last_migrated_pfn = 0;
+                       nr_isolated = 0;
+               }
+
+               if (low_pfn < next_skip_pfn) {
+                       low_pfn = next_skip_pfn - 1;
+                       /*
+                        * The check near the loop beginning would have updated
+                        * next_skip_pfn too, but this is a bit simpler.
+                        */
+                       next_skip_pfn += 1UL << cc->order;
+               }
         }
  
         /*
@@ -834,10 +907,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
  
         /* Scan block by block. First and last block may be incomplete */
         pfn = start_pfn;
-       block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+       block_start_pfn = pageblock_start_pfn(pfn);
         if (block_start_pfn < cc->zone->zone_start_pfn)
                 block_start_pfn = cc->zone->zone_start_pfn;
-       block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+       block_end_pfn = pageblock_end_pfn(pfn);
  
         for (; pfn < end_pfn; pfn = block_end_pfn,
                                 block_start_pfn = block_end_pfn,
@@ -924,10 +997,10 @@ static void isolate_freepages(struct compact_control *cc)
          * is using.
          */
         isolate_start_pfn = cc->free_pfn;
-       block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+       block_start_pfn = pageblock_start_pfn(cc->free_pfn);
         block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
                                                 zone_end_pfn(zone));
-       low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+       low_pfn = pageblock_end_pfn(cc->migrate_pfn);
  
         /*
          * Isolate free pages until enough are available to migrate the
@@ -1070,7 +1143,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
         unsigned long block_start_pfn;
         unsigned long block_end_pfn;
         unsigned long low_pfn;
-       unsigned long isolate_start_pfn;
         struct page *page;
         const isolate_mode_t isolate_mode =
                 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1081,12 +1153,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
          * initialized by compact_zone()
          */
         low_pfn = cc->migrate_pfn;
-       block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+       block_start_pfn = pageblock_start_pfn(low_pfn);
         if (block_start_pfn < zone->zone_start_pfn)
                 block_start_pfn = zone->zone_start_pfn;
  
         /* Only scan within a pageblock boundary */
-       block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+       block_end_pfn = pageblock_end_pfn(low_pfn);
  
         /*
          * Iterate over whole pageblocks until we find the first suitable.
@@ -1125,7 +1197,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                         continue;
  
                 /* Perform the isolation */
-               isolate_start_pfn = low_pfn;
                 low_pfn = isolate_migratepages_block(cc, low_pfn,
                                                 block_end_pfn, isolate_mode);
  
@@ -1134,15 +1205,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                         return ISOLATE_ABORT;
                 }
  
-               /*
-                * Record where we could have freed pages by migration and not
-                * yet flushed them to buddy allocator.
-                * - this is the lowest page that could have been isolated and
-                * then freed by migration.
-                */
-               if (cc->nr_migratepages && !cc->last_migrated_pfn)
-                       cc->last_migrated_pfn = isolate_start_pfn;
-
                 /*
                  * Either we isolated something and proceed with migration. Or
                  * we failed and compact_zone should decide if we should
@@ -1251,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
   *   COMPACT_CONTINUE - If compaction should run now
   */
  static unsigned long __compaction_suitable(struct zone *zone, int order,
-                                       int alloc_flags, int classzone_idx)
+                                       unsigned int alloc_flags,
+                                       int classzone_idx)
  {
         int fragindex;
         unsigned long watermark;
@@ -1296,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
  }
  
  unsigned long compaction_suitable(struct zone *zone, int order,
-                                       int alloc_flags, int classzone_idx)
+                                       unsigned int alloc_flags,
+                                       int classzone_idx)
  {
         unsigned long ret;
  
@@ -1343,7 +1407,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
         cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
         cc->free_pfn = zone->compact_cached_free_pfn;
         if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
-               cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
+               cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
                 zone->compact_cached_free_pfn = cc->free_pfn;
         }
         if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
@@ -1398,6 +1462,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                                 ret = COMPACT_CONTENDED;
                                 goto out;
                         }
+                       /*
+                        * We failed to migrate at least one page in the current
+                        * order-aligned block, so skip the rest of it.
+                        */
+                       if (cc->direct_compaction &&
+                                               (cc->mode == MIGRATE_ASYNC)) {
+                               cc->migrate_pfn = block_end_pfn(
+                                               cc->migrate_pfn - 1, cc->order);
+                               /* Draining pcplists is useless in this case */
+                               cc->last_migrated_pfn = 0;
+
+                       }
                 }
  
  check_drain:
@@ -1411,7 +1487,7 @@ check_drain:
                 if (cc->order > 0 && cc->last_migrated_pfn) {
                         int cpu;
                         unsigned long current_block_start =
-                               cc->migrate_pfn & ~((1UL << cc->order) - 1);
+                               block_start_pfn(cc->migrate_pfn, cc->order);
  
                         if (cc->last_migrated_pfn < current_block_start) {
                                 cpu = get_cpu();
@@ -1436,7 +1512,7 @@ out:
                 cc->nr_freepages = 0;
                 VM_BUG_ON(free_pfn == 0);
                 /* The cached pfn is always the first in a pageblock */
-               free_pfn &= ~(pageblock_nr_pages-1);
+               free_pfn = pageblock_start_pfn(free_pfn);
                 /*
                  * Only go back, not forward. The cached pfn might have been
                  * already reset to zone end in compact_finished()
@@ -1456,7 +1532,7 @@ out:
  
  static unsigned long compact_zone_order(struct zone *zone, int order,
                 gfp_t gfp_mask, enum migrate_mode mode, int *contended,
-               int alloc_flags, int classzone_idx)
+               unsigned int alloc_flags, int classzone_idx)
  {
         unsigned long ret;
         struct compact_control cc = {
@@ -1497,8 +1573,8 @@ int sysctl_extfrag_threshold = 500;
   * This is the main entry point for direct page compaction.
   */
  unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-                       int alloc_flags, const struct alloc_context *ac,
-                       enum migrate_mode mode, int *contended)
+               unsigned int alloc_flags, const struct alloc_context *ac,
+               enum migrate_mode mode, int *contended)
  {
         int may_enter_fs = gfp_mask & __GFP_FS;
         int may_perform_io = gfp_mask & __GFP_IO;
@@ -1526,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
  
                 status = compact_zone_order(zone, order, gfp_mask, mode,
                                 &zone_contended, alloc_flags,
-                               ac->classzone_idx);
+                               ac_classzone_idx(ac));
                 rc = max(status, rc);
                 /*
                  * It takes at least one zone that wasn't lock contended
@@ -1536,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
  
                 /* If a normal allocation would succeed, stop compacting */
                 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
-                                       ac->classzone_idx, alloc_flags)) {
+                                       ac_classzone_idx(ac), alloc_flags)) {
                         /*
                          * We think the allocation will succeed in this zone,
                          * but it is not certain, hence the false. The caller
diff --git a/mm/filemap.c b/mm/filemap.c

index 182b21825255397d8691431f7f47673b52ff79c7..01690338e3d283a1de55772219ea1703e125a34d 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
                          * some other bad page check should catch it later.
                          */
                         page_mapcount_reset(page);
-                       atomic_sub(mapcount, &page->_count);
+                       page_ref_sub(page, mapcount);
                 }
         }
  
diff --git a/mm/highmem.c b/mm/highmem.c

index 123bcd3ed4f209ba3710d9bfcaf8725d0a105534..50b4ca6787f08e75037c81c853bfb5d22b1cbd36 100644 (file)
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
  
  unsigned int nr_free_highpages (void)
  {
-       pg_data_t *pgdat;
+       struct zone *zone;
         unsigned int pages = 0;
  
-       for_each_online_pgdat(pgdat) {
-               pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
-                       NR_FREE_PAGES);
-               if (zone_movable_is_highmem())
-                       pages += zone_page_state(
-                                       &pgdat->node_zones[ZONE_MOVABLE],
-                                       NR_FREE_PAGES);
+       for_each_populated_zone(zone) {
+               if (is_highmem(zone))
+                       pages += zone_page_state(zone, NR_FREE_PAGES);
         }
  
         return pages;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index b49ee126d4d1feb7a89a9c776edc89141c3351da..66675eed67be6d76ea45294faf9bfe00568a2cd2 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         return 1;
  }
  
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
-                 unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                   unsigned long new_addr, unsigned long old_end,
                   pmd_t *old_pmd, pmd_t *new_pmd)
  {
         spinlock_t *old_ptl, *new_ptl;
         pmd_t pmd;
-
         struct mm_struct *mm = vma->vm_mm;
  
         if ((old_addr & ~HPAGE_PMD_MASK) ||
             (new_addr & ~HPAGE_PMD_MASK) ||
-           old_end - old_addr < HPAGE_PMD_SIZE ||
-           (new_vma->vm_flags & VM_NOHUGEPAGE))
+           old_end - old_addr < HPAGE_PMD_SIZE)
                 return false;
  
         /*
@@ -3113,7 +3110,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
         VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
  
         /*
-        * tail_page->_count is zero and not changing from under us. But
+        * tail_page->_refcount is zero and not changing from under us. But
          * get_page_unless_zero() may be running from under us on the
          * tail_page. If we used atomic_set() below instead of atomic_inc(), we
          * would then run atomic_set() concurrently with
@@ -3340,7 +3337,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         if (mlocked)
                 lru_add_drain();
  
-       /* Prevent deferred_split_scan() touching ->_count */
+       /* Prevent deferred_split_scan() touching ->_refcount */
         spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         count = page_count(head);
         mapcount = total_mapcount(head);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 19d0d08b396fb1356bc4e834d2aaec2977ee173e..949d80609a32fb39038dac54cd0e5d1248c94ca3 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
  static struct hstate * __initdata parsed_hstate;
  static unsigned long __initdata default_hstate_max_huge_pages;
  static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
  
  /*
   * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                 }
         }
  
-       if (spool->min_hpages != -1) {          /* minimum size accounting */
+       /* minimum size accounting */
+       if (spool->min_hpages != -1 && spool->rsv_hpages) {
                 if (delta > spool->rsv_hpages) {
                         /*
                          * Asking for more reserves than those already taken on
@@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
         if (spool->max_hpages != -1)            /* maximum size accounting */
                 spool->used_hpages -= delta;
  
-       if (spool->min_hpages != -1) {          /* minimum size accounting */
+        /* minimum size accounting */
+       if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
                 if (spool->rsv_hpages + delta <= spool->min_hpages)
                         ret = 0;
                 else
@@ -937,9 +940,7 @@ err:
   */
  static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
  {
-       nid = next_node(nid, *nodes_allowed);
-       if (nid == MAX_NUMNODES)
-               nid = first_node(*nodes_allowed);
+       nid = next_node_in(nid, *nodes_allowed);
         VM_BUG_ON(nid >= MAX_NUMNODES);
  
         return nid;
@@ -1030,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
         return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
  }
  
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
-                               unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+                       unsigned long start_pfn, unsigned long nr_pages)
  {
         unsigned long i, end_pfn = start_pfn + nr_pages;
         struct page *page;
@@ -1042,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
  
                 page = pfn_to_page(i);
  
+               if (page_zone(page) != z)
+                       return false;
+
                 if (PageReserved(page))
                         return false;
  
@@ -1074,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
  
                 pfn = ALIGN(z->zone_start_pfn, nr_pages);
                 while (zone_spans_last_pfn(z, pfn, nr_pages)) {
-                       if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+                       if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
                                 /*
                                  * We release the zone lock here because
                                  * alloc_contig_range() will also lock the zone
@@ -2659,6 +2663,11 @@ static int __init hugetlb_init(void)
  subsys_initcall(hugetlb_init);
  
  /* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+       parsed_valid_hugepagesz = false;
+}
+
  void __init hugetlb_add_hstate(unsigned int order)
  {
         struct hstate *h;
@@ -2678,8 +2687,8 @@ void __init hugetlb_add_hstate(unsigned int order)
         for (i = 0; i < MAX_NUMNODES; ++i)
                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
         INIT_LIST_HEAD(&h->hugepage_activelist);
-       h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
-       h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+       h->next_nid_to_alloc = first_memory_node;
+       h->next_nid_to_free = first_memory_node;
         snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                         huge_page_size(h)/1024);
  
@@ -2691,11 +2700,17 @@ static int __init hugetlb_nrpages_setup(char *s)
         unsigned long *mhp;
         static unsigned long *last_mhp;
  
+       if (!parsed_valid_hugepagesz) {
+               pr_warn("hugepages = %s preceded by "
+                       "an unsupported hugepagesz, ignoring\n", s);
+               parsed_valid_hugepagesz = true;
+               return 1;
+       }
         /*
          * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
          * so this hugepages= parameter goes to the "default hstate".
          */
-       if (!hugetlb_max_hstate)
+       else if (!hugetlb_max_hstate)
                 mhp = &default_hstate_max_huge_pages;
         else
                 mhp = &parsed_hstate->max_huge_pages;
diff --git a/mm/internal.h b/mm/internal.h

index b79abb6721cf79ac0b6a626e94fa4856c6e515a6..3ac544f1963fd8a5c5886ab02900c5a012c25f19 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
  }
  
  /*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
   * a count of one.
   */
  static inline void set_page_refcounted(struct page *page)
@@ -102,13 +102,14 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
  struct alloc_context {
         struct zonelist *zonelist;
         nodemask_t *nodemask;
-       struct zone *preferred_zone;
-       int classzone_idx;
+       struct zoneref *preferred_zoneref;
         int migratetype;
         enum zone_type high_zoneidx;
         bool spread_dirty_pages;
  };
  
+#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
+
  /*
   * Locate the struct page for both the matching buddy in our
   * pair (buddy1) and the combined O(n+1) page they form (page).
@@ -175,7 +176,7 @@ struct compact_control {
         bool direct_compaction;         /* False from kcompactd or /proc/... */
         int order;                      /* order a direct compactor needs */
         const gfp_t gfp_mask;           /* gfp mask of a direct compactor */
-       const int alloc_flags;          /* alloc flags of a direct compactor */
+       const unsigned int alloc_flags; /* alloc flags of a direct compactor */
         const int classzone_idx;        /* zone index of a direct compactor */
         struct zone *zone;
         int contended;                  /* Signal need_sched() or lock
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index fe787f5c41bd1332eeca88e85a3f0bf483bb552b..d71d387868e63233584a334299b6488ecfa7a1cb 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1023,22 +1023,40 @@ out:
   * @lru: index of lru list the page is sitting on
   * @nr_pages: positive when adding or negative when removing
   *
- * This function must be called when a page is added to or removed from an
- * lru list.
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
   */
  void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                                 int nr_pages)
  {
         struct mem_cgroup_per_zone *mz;
         unsigned long *lru_size;
+       long size;
+       bool empty;
+
+       __update_lru_size(lruvec, lru, nr_pages);
  
         if (mem_cgroup_disabled())
                 return;
  
         mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
         lru_size = mz->lru_size + lru;
-       *lru_size += nr_pages;
-       VM_BUG_ON((long)(*lru_size) < 0);
+       empty = list_empty(lruvec->lists + lru);
+
+       if (nr_pages < 0)
+               *lru_size += nr_pages;
+
+       size = *lru_size;
+       if (WARN_ONCE(size < 0 || empty != !size,
+               "%s(%p, %d, %d): lru_size %ld but %sempty\n",
+               __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+               VM_BUG_ON(1);
+               *lru_size = 0;
+       }
+
+       if (nr_pages > 0)
+               *lru_size += nr_pages;
  }
  
  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
@@ -1257,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
          */
         if (fatal_signal_pending(current) || task_will_free_mem(current)) {
                 mark_oom_victim(current);
+               try_oom_reaper(current);
                 goto unlock;
         }
  
@@ -1389,14 +1408,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
         mem_cgroup_may_update_nodemask(memcg);
         node = memcg->last_scanned_node;
  
-       node = next_node(node, memcg->scan_nodes);
-       if (node == MAX_NUMNODES)
-               node = first_node(memcg->scan_nodes);
+       node = next_node_in(node, memcg->scan_nodes);
         /*
-        * We call this when we hit limit, not when pages are added to LRU.
-        * No LRU may hold pages because all pages are UNEVICTABLE or
-        * memcg is too small and all pages are not on LRU. In that case,
-        * we use curret node.
+        * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
+        * last time it really checked all the LRUs due to rate limiting.
+        * Fallback to the current node in that case for simplicity.
          */
         if (unlikely(node == MAX_NUMNODES))
                 node = numa_node_id();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index aa34431c3f31769b05a43820aca0a75c6975e8a3..caf2a14c37ada40b4d690eaae18e6d72cd274d9b 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -78,9 +78,24 @@ static struct {
  #define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
  #define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)
  
+#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
  bool memhp_auto_online;
+#else
+bool memhp_auto_online = true;
+#endif
  EXPORT_SYMBOL_GPL(memhp_auto_online);
  
+static int __init setup_memhp_default_state(char *str)
+{
+       if (!strcmp(str, "online"))
+               memhp_auto_online = true;
+       else if (!strcmp(str, "offline"))
+               memhp_auto_online = false;
+
+       return 1;
+}
+__setup("memhp_default_state=", setup_memhp_default_state);
+
  void get_online_mems(void)
  {
         might_sleep();
@@ -1410,7 +1425,7 @@ static struct page *next_active_pageblock(struct page *page)
  }
  
  /* Checks if this range of memory is likely to be hot-removable. */
-int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
  {
         struct page *page = pfn_to_page(start_pfn);
         struct page *end_page = page + nr_pages;
@@ -1418,12 +1433,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
         /* Check the starting page of each pageblock within the range */
         for (; page < end_page; page = next_active_pageblock(page)) {
                 if (!is_pageblock_removable_nolock(page))
-                       return 0;
+                       return false;
                 cond_resched();
         }
  
         /* All pageblocks in the memory block are likely to be hot-removable */
-       return 1;
+       return true;
  }
  
  /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 36cc01bc950a6e5f8ef94c0d0506f463e01791d9..297d6854f84920f4420d5929fd3bc1084dd8a15e 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
  
  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
-#include <linux/random.h>
  
  #include "internal.h"
  
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
                 BUG();
  
         if (!node_isset(current->il_next, tmp)) {
-               current->il_next = next_node(current->il_next, tmp);
-               if (current->il_next >= MAX_NUMNODES)
-                       current->il_next = first_node(tmp);
+               current->il_next = next_node_in(current->il_next, tmp);
                 if (current->il_next >= MAX_NUMNODES)
                         current->il_next = numa_node_id();
         }
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
         struct task_struct *me = current;
  
         nid = me->il_next;
-       next = next_node(nid, policy->v.nodes);
-       if (next >= MAX_NUMNODES)
-               next = first_node(policy->v.nodes);
+       next = next_node_in(nid, policy->v.nodes);
         if (next < MAX_NUMNODES)
                 me->il_next = next;
         return nid;
@@ -1744,18 +1739,18 @@ unsigned int mempolicy_slab_node(void)
                 return interleave_nodes(policy);
  
         case MPOL_BIND: {
+               struct zoneref *z;
+
                 /*
                  * Follow bind policy behavior and start allocation at the
                  * first node.
                  */
                 struct zonelist *zonelist;
-               struct zone *zone;
                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
                 zonelist = &NODE_DATA(node)->node_zonelists[0];
-               (void)first_zones_zonelist(zonelist, highest_zoneidx,
-                                                       &policy->v.nodes,
-                                                       &zone);
-               return zone ? zone->node : node;
+               z = first_zones_zonelist(zonelist, highest_zoneidx,
+                                                       &policy->v.nodes);
+               return z->zone ? z->zone->node : node;
         }
  
         default:
@@ -1763,23 +1758,25 @@ unsigned int mempolicy_slab_node(void)
         }
  }
  
-/* Do static interleaving for a VMA with known offset. */
+/*
+ * Do static interleaving for a VMA with known offset @n.  Returns the n'th
+ * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * number of present nodes.
+ */
  static unsigned offset_il_node(struct mempolicy *pol,
-               struct vm_area_struct *vma, unsigned long off)
+                              struct vm_area_struct *vma, unsigned long n)
  {
         unsigned nnodes = nodes_weight(pol->v.nodes);
         unsigned target;
-       int c;
-       int nid = NUMA_NO_NODE;
+       int i;
+       int nid;
  
         if (!nnodes)
                 return numa_node_id();
-       target = (unsigned int)off % nnodes;
-       c = 0;
-       do {
+       target = (unsigned int)n % nnodes;
+       nid = first_node(pol->v.nodes);
+       for (i = 0; i < target; i++)
                 nid = next_node(nid, pol->v.nodes);
-               c++;
-       } while (c <= target);
         return nid;
  }
  
@@ -1805,21 +1802,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
                 return interleave_nodes(pol);
  }
  
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
-       int w, bit = NUMA_NO_NODE;
-
-       w = nodes_weight(*maskp);
-       if (w)
-               bit = bitmap_ord_to_pos(maskp->bits,
-                       get_random_int() % w, MAX_NUMNODES);
-       return bit;
-}
-
  #ifdef CONFIG_HUGETLBFS
  /*
   * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -2284,7 +2266,7 @@ static void sp_free(struct sp_node *n)
  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
  {
         struct mempolicy *pol;
-       struct zone *zone;
+       struct zoneref *z;
         int curnid = page_to_nid(page);
         unsigned long pgoff;
         int thiscpu = raw_smp_processor_id();
@@ -2316,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                 break;
  
         case MPOL_BIND:
+
                 /*
                  * allows binding to multiple nodes.
                  * use current page if in policy nodemask,
@@ -2324,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                  */
                 if (node_isset(curnid, pol->v.nodes))
                         goto out;
-               (void)first_zones_zonelist(
+               z = first_zones_zonelist(
                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
                                 gfp_zone(GFP_HIGHUSER),
-                               &pol->v.nodes, &zone);
-               polnid = zone->node;
+                               &pol->v.nodes);
+               polnid = z->zone->node;
                 break;
  
         default:
diff --git a/mm/migrate.c b/mm/migrate.c

index f9dfb18a4ebac9f2f36d798ce6fa6b4ecd4d3c77..53ab6398e7a2a2d3f122db43f0c36fbf62e562f2 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
                 newpage->index = page->index;
                 newpage->mapping = page->mapping;
                 if (PageSwapBacked(page))
-                       SetPageSwapBacked(newpage);
+                       __SetPageSwapBacked(newpage);
  
                 return MIGRATEPAGE_SUCCESS;
         }
@@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
         newpage->index = page->index;
         newpage->mapping = page->mapping;
         if (PageSwapBacked(page))
-               SetPageSwapBacked(newpage);
+               __SetPageSwapBacked(newpage);
  
         get_page(newpage);      /* add cache reference */
         if (PageSwapCache(page)) {
@@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
  
         /* Prepare a page as a migration target */
         __SetPageLocked(new_page);
-       SetPageSwapBacked(new_page);
+       __SetPageSwapBacked(new_page);
  
         /* anon mapping, we can simply copy page->mapping to the new page: */
         new_page->mapping = page->mapping;
diff --git a/mm/mmap.c b/mm/mmap.c

index bd2e1a533bc182a113523bd16b1af09e81a3c8ea..fba246b8f1a54bd61fa761451adb8e5fac85f421 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -55,10 +55,6 @@
  #define arch_mmap_check(addr, len, flags)      (0)
  #endif
  
-#ifndef arch_rebalance_pgtables
-#define arch_rebalance_pgtables(addr, len)             (addr)
-#endif
-
  #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
  const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
  const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
@@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
         if (offset_in_page(addr))
                 return -EINVAL;
  
-       addr = arch_rebalance_pgtables(addr, len);
         error = security_mmap_addr(addr);
         return error ? error : addr;
  }
diff --git a/mm/mmzone.c b/mm/mmzone.c

index 52687fb4de6f46ebcb095987b9474e3957b5eafd..5652be858e5e320c7a748d4e6a407c66baaa62a0 100644 (file)
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
  }
  
  /* Returns the next zone at or below highest_zoneidx in a zonelist */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                         enum zone_type highest_zoneidx,
                                         nodemask_t *nodes)
  {
diff --git a/mm/mremap.c b/mm/mremap.c

index 3fa0a467df66749e5021e29cb504a1924526bcad..9dc499977924d6ec3b86dbd59533c2c7bd11b8c1 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
         return pmd;
  }
  
+static void take_rmap_locks(struct vm_area_struct *vma)
+{
+       if (vma->vm_file)
+               i_mmap_lock_write(vma->vm_file->f_mapping);
+       if (vma->anon_vma)
+               anon_vma_lock_write(vma->anon_vma);
+}
+
+static void drop_rmap_locks(struct vm_area_struct *vma)
+{
+       if (vma->anon_vma)
+               anon_vma_unlock_write(vma->anon_vma);
+       if (vma->vm_file)
+               i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
  static pte_t move_soft_dirty_pte(pte_t pte)
  {
         /*
@@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                 struct vm_area_struct *new_vma, pmd_t *new_pmd,
                 unsigned long new_addr, bool need_rmap_locks)
  {
-       struct address_space *mapping = NULL;
-       struct anon_vma *anon_vma = NULL;
         struct mm_struct *mm = vma->vm_mm;
         pte_t *old_pte, *new_pte, pte;
         spinlock_t *old_ptl, *new_ptl;
@@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
          *   serialize access to individual ptes, but only rmap traversal
          *   order guarantees that we won't miss both the old and new ptes).
          */
-       if (need_rmap_locks) {
-               if (vma->vm_file) {
-                       mapping = vma->vm_file->f_mapping;
-                       i_mmap_lock_write(mapping);
-               }
-               if (vma->anon_vma) {
-                       anon_vma = vma->anon_vma;
-                       anon_vma_lock_write(anon_vma);
-               }
-       }
+       if (need_rmap_locks)
+               take_rmap_locks(vma);
  
         /*
          * We don't have to worry about the ordering of src and dst
@@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                 spin_unlock(new_ptl);
         pte_unmap(new_pte - 1);
         pte_unmap_unlock(old_pte - 1, old_ptl);
-       if (anon_vma)
-               anon_vma_unlock_write(anon_vma);
-       if (mapping)
-               i_mmap_unlock_write(mapping);
+       if (need_rmap_locks)
+               drop_rmap_locks(vma);
  }
  
  #define LATENCY_LIMIT  (64 * PAGE_SIZE)
@@ -193,16 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                 if (pmd_trans_huge(*old_pmd)) {
                         if (extent == HPAGE_PMD_SIZE) {
                                 bool moved;
-                               VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
-                                             vma);
                                 /* See comment in move_ptes() */
                                 if (need_rmap_locks)
-                                       anon_vma_lock_write(vma->anon_vma);
-                               moved = move_huge_pmd(vma, new_vma, old_addr,
-                                                   new_addr, old_end,
-                                                   old_pmd, new_pmd);
+                                       take_rmap_locks(vma);
+                               moved = move_huge_pmd(vma, old_addr, new_addr,
+                                                   old_end, old_pmd, new_pmd);
                                 if (need_rmap_locks)
-                                       anon_vma_unlock_write(vma->anon_vma);
+                                       drop_rmap_locks(vma);
                                 if (moved) {
                                         need_flush = true;
                                         continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 86349586eacbac5022affca24ebaa2b57d19aa15..415f7eb913fa5b6c19c89f9963df254aa2d3938d 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly;
  
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
+/*
+ * task->mm can be NULL if the task is the exited group leader.  So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+       struct task_struct *t;
+
+       for_each_thread(p, t) {
+               struct mm_struct *t_mm = READ_ONCE(t->mm);
+               if (t_mm)
+                       return t_mm == mm;
+       }
+       return false;
+}
+
+
  #ifdef CONFIG_MMU
  /*
   * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -491,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk)
         up_read(&mm->mmap_sem);
  
         /*
-        * Clear TIF_MEMDIE because the task shouldn't be sitting on a
-        * reasonably reclaimable memory anymore. OOM killer can continue
-        * by selecting other victim if unmapping hasn't led to any
-        * improvements. This also means that selecting this task doesn't
-        * make any sense.
+        * This task can be safely ignored because we cannot do much more
+        * to release its memory.
          */
         tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
-       exit_oom_victim(tsk);
  out:
         mmput(mm);
         return ret;
@@ -519,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk)
                 debug_show_all_locks();
         }
  
+       /*
+        * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+        * reasonably reclaimable memory anymore or it is not a good candidate
+        * for the oom victim right now because it cannot release its memory
+        * itself nor by the oom reaper.
+        */
+       tsk->oom_reaper_list = NULL;
+       exit_oom_victim(tsk);
+
         /* Drop a reference taken by wake_oom_reaper */
         put_task_struct(tsk);
  }
@@ -563,6 +587,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
         wake_up(&oom_reaper_wait);
  }
  
+/* Check if we can reap the given task. This has to be called with stable
+ * tsk->mm
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+       struct mm_struct *mm = tsk->mm;
+       struct task_struct *p;
+
+       if (!mm)
+               return;
+
+       /*
+        * There might be other threads/processes which are either not
+        * dying or even not killable.
+        */
+       if (atomic_read(&mm->mm_users) > 1) {
+               rcu_read_lock();
+               for_each_process(p) {
+                       bool exiting;
+
+                       if (!process_shares_mm(p, mm))
+                               continue;
+                       if (same_thread_group(p, tsk))
+                               continue;
+                       if (fatal_signal_pending(p))
+                               continue;
+
+                       /*
+                        * If the task is exiting make sure the whole thread group
+                        * is exiting and cannot access mm anymore.
+                        */
+                       spin_lock_irq(&p->sighand->siglock);
+                       exiting = signal_group_exit(p->signal);
+                       spin_unlock_irq(&p->sighand->siglock);
+                       if (exiting)
+                               continue;
+
+                       /* Give up */
+                       rcu_read_unlock();
+                       return;
+               }
+               rcu_read_unlock();
+       }
+
+       wake_oom_reaper(tsk);
+}
+
  static int __init oom_init(void)
  {
         oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -652,24 +723,6 @@ void oom_killer_enable(void)
         oom_killer_disabled = false;
  }
  
-/*
- * task->mm can be NULL if the task is the exited group leader.  So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
-       struct task_struct *t;
-
-       for_each_thread(p, t) {
-               struct mm_struct *t_mm = READ_ONCE(t->mm);
-               if (t_mm)
-                       return t_mm == mm;
-       }
-       return false;
-}
-
  /*
   * Must be called while holding a reference to p, which will be released upon
   * returning.
@@ -694,6 +747,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
         task_lock(p);
         if (p->mm && task_will_free_mem(p)) {
                 mark_oom_victim(p);
+               try_oom_reaper(p);
                 task_unlock(p);
                 put_task_struct(p);
                 return;
@@ -873,9 +927,19 @@ bool out_of_memory(struct oom_control *oc)
         if (current->mm &&
             (fatal_signal_pending(current) || task_will_free_mem(current))) {
                 mark_oom_victim(current);
+               try_oom_reaper(current);
                 return true;
         }
  
+       /*
+        * The OOM killer does not compensate for IO-less reclaim.
+        * pagefault_out_of_memory lost its gfp context so we have to
+        * make sure to exclude 0 mask - all other users should have at least
+        * ___GFP_DIRECT_RECLAIM to get here.
+        */
+       if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+               return true;
+
         /*
          * Check if there were limitations on the allocation (only relevant for
          * NUMA) that may require different handling.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index bc5149d5ec38016da91a8b1c85aeca0193143f0c..3b88795ab46e1a70752990bf4ce7da925d97908c 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
  #ifdef CONFIG_HIGHMEM
         int node;
         unsigned long x = 0;
+       int i;
  
         for_each_node_state(node, N_HIGH_MEMORY) {
-               struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+               for (i = 0; i < MAX_NR_ZONES; i++) {
+                       struct zone *z = &NODE_DATA(node)->node_zones[i];
  
-               x += zone_dirtyable_memory(z);
+                       if (is_highmem(z))
+                               x += zone_dirtyable_memory(z);
+               }
         }
         /*
          * Unreclaimable memory (kernel memory or anonymous memory
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index c1069efcc4d7477a5fc517303b67747f89b77074..5c469c1dfb8bd420c9c0b353cfb24f74a374afa4 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
  }
  #endif
  
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+                                                       unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+       return __pfn_to_section(pfn)->pageblock_flags;
+#else
+       return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+       pfn &= (PAGES_PER_SECTION-1);
+       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+       pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+                                       unsigned long pfn,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
+{
+       unsigned long *bitmap;
+       unsigned long bitidx, word_bitidx;
+       unsigned long word;
+
+       bitmap = get_pageblock_bitmap(page, pfn);
+       bitidx = pfn_to_bitidx(page, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
+
+       word = bitmap[word_bitidx];
+       bitidx += end_bitidx;
+       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
+{
+       return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+       return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+                                       unsigned long pfn,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
+{
+       unsigned long *bitmap;
+       unsigned long bitidx, word_bitidx;
+       unsigned long old_word, word;
+
+       BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+       bitmap = get_pageblock_bitmap(page, pfn);
+       bitidx = pfn_to_bitidx(page, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
+
+       VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+       bitidx += end_bitidx;
+       mask <<= (BITS_PER_LONG - bitidx - 1);
+       flags <<= (BITS_PER_LONG - bitidx - 1);
+
+       word = READ_ONCE(bitmap[word_bitidx]);
+       for (;;) {
+               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+               if (word == old_word)
+                       break;
+               word = old_word;
+       }
+}
  
  void set_pageblock_migratetype(struct page *page, int migratetype)
  {
@@ -784,17 +884,42 @@ out:
         zone->free_area[order].nr_free++;
  }
  
-static inline int free_pages_check(struct page *page)
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+                                       unsigned long check_flags)
  {
-       const char *bad_reason = NULL;
-       unsigned long bad_flags = 0;
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
+               return false;
+
+       if (unlikely((unsigned long)page->mapping |
+                       page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+                       (unsigned long)page->mem_cgroup |
+#endif
+                       (page->flags & check_flags)))
+               return false;
+
+       return true;
+}
+
+static void free_pages_check_bad(struct page *page)
+{
+       const char *bad_reason;
+       unsigned long bad_flags;
+
+       bad_reason = NULL;
+       bad_flags = 0;
  
         if (unlikely(atomic_read(&page->_mapcount) != -1))
                 bad_reason = "nonzero mapcount";
         if (unlikely(page->mapping != NULL))
                 bad_reason = "non-NULL mapping";
         if (unlikely(page_ref_count(page) != 0))
-               bad_reason = "nonzero _count";
+               bad_reason = "nonzero _refcount";
         if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
                 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
                 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -803,16 +928,146 @@ static inline int free_pages_check(struct page *page)
         if (unlikely(page->mem_cgroup))
                 bad_reason = "page still charged to cgroup";
  #endif
-       if (unlikely(bad_reason)) {
-               bad_page(page, bad_reason, bad_flags);
-               return 1;
+       bad_page(page, bad_reason, bad_flags);
+}
+
+static inline int free_pages_check(struct page *page)
+{
+       if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
+               return 0;
+
+       /* Something has gone sideways, find it */
+       free_pages_check_bad(page);
+       return 1;
+}
+
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+       int ret = 1;
+
+       /*
+        * We rely on page->lru.next never having bit 0 set, unless the page
+        * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+        */
+       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+               ret = 0;
+               goto out;
+       }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
+       if (unlikely(!PageTail(page))) {
+               bad_page(page, "PageTail not set", 0);
+               goto out;
         }
+       if (unlikely(compound_head(page) != head_page)) {
+               bad_page(page, "compound_head not consistent", 0);
+               goto out;
+       }
+       ret = 0;
+out:
+       page->mapping = NULL;
+       clear_compound_head(page);
+       return ret;
+}
+
+static __always_inline bool free_pages_prepare(struct page *page,
+                                       unsigned int order, bool check_free)
+{
+       int bad = 0;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       trace_mm_page_free(page, order);
+       kmemcheck_free_shadow(page, order);
+       kasan_free_pages(page, order);
+
+       /*
+        * Check tail pages before head page information is cleared to
+        * avoid checking PageCompound for order-0 pages.
+        */
+       if (unlikely(order)) {
+               bool compound = PageCompound(page);
+               int i;
+
+               VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+               for (i = 1; i < (1 << order); i++) {
+                       if (compound)
+                               bad += free_tail_pages_check(page, page + i);
+                       if (unlikely(free_pages_check(page + i))) {
+                               bad++;
+                               continue;
+                       }
+                       (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+               }
+       }
+       if (PageAnonHead(page))
+               page->mapping = NULL;
+       if (check_free)
+               bad += free_pages_check(page);
+       if (bad)
+               return false;
+
         page_cpupid_reset_last(page);
-       if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
-               page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-       return 0;
+       page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       reset_page_owner(page, order);
+
+       if (!PageHighMem(page)) {
+               debug_check_no_locks_freed(page_address(page),
+                                          PAGE_SIZE << order);
+               debug_check_no_obj_freed(page_address(page),
+                                          PAGE_SIZE << order);
+       }
+       arch_free_page(page, order);
+       kernel_poison_pages(page, 1 << order, 0);
+       kernel_map_pages(page, 1 << order, 0);
+
+       return true;
+}
+
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+       return free_pages_prepare(page, 0, true);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+       return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+       return free_pages_prepare(page, 0, false);
  }
  
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+       return free_pages_check(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
  /*
   * Frees a number of pages from the PCP lists
   * Assumes all pages on list are in same zone, and of same order.
@@ -829,15 +1084,16 @@ static void free_pcppages_bulk(struct zone *zone, int count,
  {
         int migratetype = 0;
         int batch_free = 0;
-       int to_free = count;
         unsigned long nr_scanned;
+       bool isolated_pageblocks;
  
         spin_lock(&zone->lock);
+       isolated_pageblocks = has_isolate_pageblock(zone);
         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
         if (nr_scanned)
                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  
-       while (to_free) {
+       while (count) {
                 struct page *page;
                 struct list_head *list;
  
@@ -857,7 +1113,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
  
                 /* This is the only non-empty list. Free them all. */
                 if (batch_free == MIGRATE_PCPTYPES)
-                       batch_free = to_free;
+                       batch_free = count;
  
                 do {
                         int mt; /* migratetype of the to-be-freed page */
@@ -870,12 +1126,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                         /* MIGRATE_ISOLATE page should not go to pcplists */
                         VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
                         /* Pageblock could have been isolated meanwhile */
-                       if (unlikely(has_isolate_pageblock(zone)))
+                       if (unlikely(isolated_pageblocks))
                                 mt = get_pageblock_migratetype(page);
  
+                       if (bulkfree_pcp_prepare(page))
+                               continue;
+
                         __free_one_page(page, page_to_pfn(page), zone, 0, mt);
                         trace_mm_page_pcpu_drain(page, 0, mt);
-               } while (--to_free && --batch_free && !list_empty(list));
+               } while (--count && --batch_free && !list_empty(list));
         }
         spin_unlock(&zone->lock);
  }
@@ -899,56 +1158,6 @@ static void free_one_page(struct zone *zone,
         spin_unlock(&zone->lock);
  }
  
-static int free_tail_pages_check(struct page *head_page, struct page *page)
-{
-       int ret = 1;
-
-       /*
-        * We rely page->lru.next never has bit 0 set, unless the page
-        * is PageTail(). Let's make sure that's true even for poisoned ->lru.
-        */
-       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
-
-       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
-               ret = 0;
-               goto out;
-       }
-       switch (page - head_page) {
-       case 1:
-               /* the first tail page: ->mapping is compound_mapcount() */
-               if (unlikely(compound_mapcount(page))) {
-                       bad_page(page, "nonzero compound_mapcount", 0);
-                       goto out;
-               }
-               break;
-       case 2:
-               /*
-                * the second tail page: ->mapping is
-                * page_deferred_list().next -- ignore value.
-                */
-               break;
-       default:
-               if (page->mapping != TAIL_MAPPING) {
-                       bad_page(page, "corrupted mapping in tail page", 0);
-                       goto out;
-               }
-               break;
-       }
-       if (unlikely(!PageTail(page))) {
-               bad_page(page, "PageTail not set", 0);
-               goto out;
-       }
-       if (unlikely(compound_head(page) != head_page)) {
-               bad_page(page, "compound_head not consistent", 0);
-               goto out;
-       }
-       ret = 0;
-out:
-       page->mapping = NULL;
-       clear_compound_head(page);
-       return ret;
-}
-
  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                 unsigned long zone, int nid)
  {
@@ -1008,56 +1217,18 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
         unsigned long start_pfn = PFN_DOWN(start);
         unsigned long end_pfn = PFN_UP(end);
  
-       for (; start_pfn < end_pfn; start_pfn++) {
-               if (pfn_valid(start_pfn)) {
-                       struct page *page = pfn_to_page(start_pfn);
-
-                       init_reserved_page(start_pfn);
-
-                       /* Avoid false-positive PageTail() */
-                       INIT_LIST_HEAD(&page->lru);
-
-                       SetPageReserved(page);
-               }
-       }
-}
-
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
-       bool compound = PageCompound(page);
-       int i, bad = 0;
-
-       VM_BUG_ON_PAGE(PageTail(page), page);
-       VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
-       trace_mm_page_free(page, order);
-       kmemcheck_free_shadow(page, order);
-       kasan_free_pages(page, order);
-
-       if (PageAnon(page))
-               page->mapping = NULL;
-       bad += free_pages_check(page);
-       for (i = 1; i < (1 << order); i++) {
-               if (compound)
-                       bad += free_tail_pages_check(page, page + i);
-               bad += free_pages_check(page + i);
-       }
-       if (bad)
-               return false;
+       for (; start_pfn < end_pfn; start_pfn++) {
+               if (pfn_valid(start_pfn)) {
+                       struct page *page = pfn_to_page(start_pfn);
  
-       reset_page_owner(page, order);
+                       init_reserved_page(start_pfn);
  
-       if (!PageHighMem(page)) {
-               debug_check_no_locks_freed(page_address(page),
-                                          PAGE_SIZE << order);
-               debug_check_no_obj_freed(page_address(page),
-                                          PAGE_SIZE << order);
-       }
-       arch_free_page(page, order);
-       kernel_poison_pages(page, 1 << order, 0);
-       kernel_map_pages(page, 1 << order, 0);
+                       /* Avoid false-positive PageTail() */
+                       INIT_LIST_HEAD(&page->lru);
  
-       return true;
+                       SetPageReserved(page);
+               }
+       }
  }
  
  static void __free_pages_ok(struct page *page, unsigned int order)
@@ -1066,7 +1237,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
         int migratetype;
         unsigned long pfn = page_to_pfn(page);
  
-       if (!free_pages_prepare(page, order))
+       if (!free_pages_prepare(page, order, true))
                 return;
  
         migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1076,8 +1247,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
         local_irq_restore(flags);
  }
  
-static void __init __free_pages_boot_core(struct page *page,
-                                       unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned int order)
  {
         unsigned int nr_pages = 1 << order;
         struct page *p = page;
@@ -1154,7 +1324,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
  {
         if (early_page_uninitialised(pfn))
                 return;
-       return __free_pages_boot_core(page, pfn, order);
+       return __free_pages_boot_core(page, order);
  }
  
  /*
@@ -1239,12 +1409,12 @@ static void __init deferred_free_range(struct page *page,
         if (nr_pages == MAX_ORDER_NR_PAGES &&
             (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-               __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+               __free_pages_boot_core(page, MAX_ORDER-1);
                 return;
         }
  
-       for (i = 0; i < nr_pages; i++, page++, pfn++)
-               __free_pages_boot_core(page, pfn, 0);
+       for (i = 0; i < nr_pages; i++, page++)
+               __free_pages_boot_core(page, 0);
  }
  
  /* Completion tracking for deferred_init_memmap() threads */
@@ -1477,10 +1647,7 @@ static inline void expand(struct zone *zone, struct page *page,
         }
  }
  
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
  {
         const char *bad_reason = NULL;
         unsigned long bad_flags = 0;
@@ -1503,11 +1670,20 @@ static inline int check_new_page(struct page *page)
         if (unlikely(page->mem_cgroup))
                 bad_reason = "page still charged to cgroup";
  #endif
-       if (unlikely(bad_reason)) {
-               bad_page(page, bad_reason, bad_flags);
-               return 1;
-       }
-       return 0;
+       bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+       if (likely(page_expected_state(page,
+                               PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+               return 0;
+
+       check_new_page_bad(page);
+       return 1;
  }
  
  static inline bool free_pages_prezeroed(bool poisoned)
@@ -1516,16 +1692,48 @@ static inline bool free_pages_prezeroed(bool poisoned)
                 page_poisoning_enabled() && poisoned;
  }
  
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-                                                               int alloc_flags)
+#ifdef CONFIG_DEBUG_VM
+static bool check_pcp_refill(struct page *page)
+{
+       return false;
+}
+
+static bool check_new_pcp(struct page *page)
+{
+       return check_new_page(page);
+}
+#else
+static bool check_pcp_refill(struct page *page)
+{
+       return check_new_page(page);
+}
+static bool check_new_pcp(struct page *page)
+{
+       return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+       int i;
+       for (i = 0; i < (1 << order); i++) {
+               struct page *p = page + i;
+
+               if (unlikely(check_new_page(p)))
+                       return true;
+       }
+
+       return false;
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+                                                       unsigned int alloc_flags)
  {
         int i;
         bool poisoned = true;
  
         for (i = 0; i < (1 << order); i++) {
                 struct page *p = page + i;
-               if (unlikely(check_new_page(p)))
-                       return 1;
                 if (poisoned)
                         poisoned &= page_is_poisoned(p);
         }
@@ -1557,8 +1765,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
                 set_page_pfmemalloc(page);
         else
                 clear_page_pfmemalloc(page);
-
-       return 0;
  }
  
  /*
@@ -1980,6 +2186,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                 if (unlikely(page == NULL))
                         break;
  
+               if (unlikely(check_pcp_refill(page)))
+                       continue;
+
                 /*
                  * Split buddy pages returned by expand() are received here
                  * in physical page order. The page is added to the callers and
@@ -2157,6 +2366,10 @@ void mark_free_pages(struct zone *zone)
         for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
                 if (pfn_valid(pfn)) {
                         page = pfn_to_page(pfn);
+
+                       if (page_zone(page) != zone)
+                               continue;
+
                         if (!swsusp_page_is_forbidden(page))
                                 swsusp_unset_page_free(page);
                 }
@@ -2187,7 +2400,7 @@ void free_hot_cold_page(struct page *page, bool cold)
         unsigned long pfn = page_to_pfn(page);
         int migratetype;
  
-       if (!free_pages_prepare(page, 0))
+       if (!free_pcp_prepare(page))
                 return;
  
         migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2342,13 +2555,45 @@ int split_free_page(struct page *page)
         return nr_pages;
  }
  
+/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+                                                               gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+       int local_nid = numa_node_id();
+       enum zone_stat_item local_stat = NUMA_LOCAL;
+
+       if (unlikely(flags & __GFP_OTHER_NODE)) {
+               local_stat = NUMA_OTHER;
+               local_nid = preferred_zone->node;
+       }
+
+       if (z->node == local_nid) {
+               __inc_zone_state(z, NUMA_HIT);
+               __inc_zone_state(z, local_stat);
+       } else {
+               __inc_zone_state(z, NUMA_MISS);
+               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+       }
+#endif
+}
+
  /*
   * Allocate a page from the given zone. Use pcplists for order-0 allocations.
   */
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
                         struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int alloc_flags, int migratetype)
+                       gfp_t gfp_flags, unsigned int alloc_flags,
+                       int migratetype)
  {
         unsigned long flags;
         struct page *page;
@@ -2359,21 +2604,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                 struct list_head *list;
  
                 local_irq_save(flags);
-               pcp = &this_cpu_ptr(zone->pageset)->pcp;
-               list = &pcp->lists[migratetype];
-               if (list_empty(list)) {
-                       pcp->count += rmqueue_bulk(zone, 0,
-                                       pcp->batch, list,
-                                       migratetype, cold);
-                       if (unlikely(list_empty(list)))
-                               goto failed;
-               }
+               do {
+                       pcp = &this_cpu_ptr(zone->pageset)->pcp;
+                       list = &pcp->lists[migratetype];
+                       if (list_empty(list)) {
+                               pcp->count += rmqueue_bulk(zone, 0,
+                                               pcp->batch, list,
+                                               migratetype, cold);
+                               if (unlikely(list_empty(list)))
+                                       goto failed;
+                       }
  
-               if (cold)
-                       page = list_last_entry(list, struct page, lru);
-               else
-                       page = list_first_entry(list, struct page, lru);
+                       if (cold)
+                               page = list_last_entry(list, struct page, lru);
+                       else
+                               page = list_first_entry(list, struct page, lru);
+               } while (page && check_new_pcp(page));
  
+               __dec_zone_state(zone, NR_ALLOC_BATCH);
                 list_del(&page->lru);
                 pcp->count--;
         } else {
@@ -2384,22 +2632,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
                 spin_lock_irqsave(&zone->lock, flags);
  
-               page = NULL;
-               if (alloc_flags & ALLOC_HARDER) {
-                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-                       if (page)
-                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
-               }
-               if (!page)
-                       page = __rmqueue(zone, order, migratetype);
+               do {
+                       page = NULL;
+                       if (alloc_flags & ALLOC_HARDER) {
+                               page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                               if (page)
+                                       trace_mm_page_alloc_zone_locked(page, order, migratetype);
+                       }
+                       if (!page)
+                               page = __rmqueue(zone, order, migratetype);
+               } while (page && check_new_pages(page, order));
                 spin_unlock(&zone->lock);
                 if (!page)
                         goto failed;
+               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
                 __mod_zone_freepage_state(zone, -(1 << order),
                                           get_pcppage_migratetype(page));
         }
  
-       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
         if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
             !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
                 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
@@ -2501,12 +2751,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
   * to check in the allocation paths if no pages are free.
   */
  static bool __zone_watermark_ok(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags,
+                       unsigned long mark, int classzone_idx,
+                       unsigned int alloc_flags,
                         long free_pages)
  {
         long min = mark;
         int o;
-       const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+       const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
  
         /* free_pages may go negative - that's OK */
         free_pages -= (1 << order) - 1;
@@ -2569,12 +2820,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
  }
  
  bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+                     int classzone_idx, unsigned int alloc_flags)
  {
         return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                                         zone_page_state(z, NR_FREE_PAGES));
  }
  
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+               unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+       long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+       /* If allocation can't use CMA areas don't use free CMA pages */
+       if (!(alloc_flags & ALLOC_CMA))
+               cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+       /*
+        * Fast check for order-0 only. If this fails then the reserves
+        * need to be calculated. There is a corner case where the check
+        * passes but only the high-order atomic reserves are free. If
+        * the caller is !atomic then it'll uselessly search the free
+        * list. That corner case is then slower but it is harmless.
+        */
+       if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+               return true;
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       free_pages);
+}
+
  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                         unsigned long mark, int classzone_idx)
  {
@@ -2630,27 +2907,24 @@ static struct page *
  get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                                                 const struct alloc_context *ac)
  {
-       struct zonelist *zonelist = ac->zonelist;
-       struct zoneref *z;
-       struct page *page = NULL;
+       struct zoneref *z = ac->preferred_zoneref;
         struct zone *zone;
-       int nr_fair_skipped = 0;
-       bool zonelist_rescan;
+       bool fair_skipped = false;
+       bool apply_fair = (alloc_flags & ALLOC_FAIR);
  
  zonelist_scan:
-       zonelist_rescan = false;
-
         /*
          * Scan zonelist, looking for a zone with enough free.
          * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
          */
-       for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+       for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                                                 ac->nodemask) {
+               struct page *page;
                 unsigned long mark;
  
                 if (cpusets_enabled() &&
                         (alloc_flags & ALLOC_CPUSET) &&
-                       !cpuset_zone_allowed(zone, gfp_mask))
+                       !__cpuset_zone_allowed(zone, gfp_mask))
                                 continue;
                 /*
                  * Distribute pages in proportion to the individual
@@ -2658,13 +2932,16 @@ zonelist_scan:
                  * page was allocated in should have no effect on the
                  * time the page has in memory before being reclaimed.
                  */
-               if (alloc_flags & ALLOC_FAIR) {
-                       if (!zone_local(ac->preferred_zone, zone))
-                               break;
+               if (apply_fair) {
                         if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
-                               nr_fair_skipped++;
+                               fair_skipped = true;
                                 continue;
                         }
+                       if (!zone_local(ac->preferred_zoneref->zone, zone)) {
+                               if (fair_skipped)
+                                       goto reset_fair;
+                               apply_fair = false;
+                       }
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2696,8 +2973,8 @@ zonelist_scan:
                         continue;
  
                 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
-               if (!zone_watermark_ok(zone, order, mark,
-                                      ac->classzone_idx, alloc_flags)) {
+               if (!zone_watermark_fast(zone, order, mark,
+                                      ac_classzone_idx(ac), alloc_flags)) {
                         int ret;
  
                         /* Checked here to keep the fast path fast */
@@ -2706,7 +2983,7 @@ zonelist_scan:
                                 goto try_this_zone;
  
                         if (zone_reclaim_mode == 0 ||
-                           !zone_allows_reclaim(ac->preferred_zone, zone))
+                           !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                                 continue;
  
                         ret = zone_reclaim(zone, gfp_mask, order);
@@ -2720,7 +2997,7 @@ zonelist_scan:
                         default:
                                 /* did we reclaim enough */
                                 if (zone_watermark_ok(zone, order, mark,
-                                               ac->classzone_idx, alloc_flags))
+                                               ac_classzone_idx(ac), alloc_flags))
                                         goto try_this_zone;
  
                                 continue;
@@ -2728,11 +3005,10 @@ zonelist_scan:
                 }
  
  try_this_zone:
-               page = buffered_rmqueue(ac->preferred_zone, zone, order,
+               page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
                                 gfp_mask, alloc_flags, ac->migratetype);
                 if (page) {
-                       if (prep_new_page(page, order, gfp_mask, alloc_flags))
-                               goto try_this_zone;
+                       prep_new_page(page, order, gfp_mask, alloc_flags);
  
                         /*
                          * If this is a high-order atomic allocation then check
@@ -2753,18 +3029,13 @@ try_this_zone:
          * include remote zones now, before entering the slowpath and waking
          * kswapd: prefer spilling to a remote zone over swapping locally.
          */
-       if (alloc_flags & ALLOC_FAIR) {
-               alloc_flags &= ~ALLOC_FAIR;
-               if (nr_fair_skipped) {
-                       zonelist_rescan = true;
-                       reset_alloc_batches(ac->preferred_zone);
-               }
-               if (nr_online_nodes > 1)
-                       zonelist_rescan = true;
-       }
-
-       if (zonelist_rescan)
+       if (fair_skipped) {
+reset_fair:
+               apply_fair = false;
+               fair_skipped = false;
+               reset_alloc_batches(ac->preferred_zoneref->zone);
                 goto zonelist_scan;
+       }
  
         return NULL;
  }
@@ -2872,22 +3143,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                 /* The OOM killer does not needlessly kill tasks for lowmem */
                 if (ac->high_zoneidx < ZONE_NORMAL)
                         goto out;
-               /* The OOM killer does not compensate for IO-less reclaim */
-               if (!(gfp_mask & __GFP_FS)) {
-                       /*
-                        * XXX: Page reclaim didn't yield anything,
-                        * and the OOM killer can't be invoked, but
-                        * keep looping as per tradition.
-                        *
-                        * But do not keep looping if oom_killer_disable()
-                        * was already called, for the system is trying to
-                        * enter a quiescent state during suspend.
-                        */
-                       *did_some_progress = !oom_killer_disabled;
-                       goto out;
-               }
                 if (pm_suspended_storage())
                         goto out;
+               /*
+                * XXX: GFP_NOFS allocations should rather fail than rely on
+                * other requests to make forward progress.
+                * We are in an unfortunate situation where out_of_memory cannot
+                * do much for this context but let's try it to at least get
+                * access to memory reserved if the current task is killed (see
+                * out_of_memory). Once filesystems are ready to handle allocation
+                * failures more gracefully we should just bail out here.
+                */
+
                 /* The OOM killer may not free memory on a specific node */
                 if (gfp_mask & __GFP_THISNODE)
                         goto out;
@@ -2917,7 +3184,7 @@ out:
  /* Try memory compaction for high-order allocations before reclaim */
  static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-               int alloc_flags, const struct alloc_context *ac,
+               unsigned int alloc_flags, const struct alloc_context *ac,
                 enum migrate_mode mode, int *contended_compaction,
                 bool *deferred_compaction)
  {
@@ -2973,7 +3240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  #else
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-               int alloc_flags, const struct alloc_context *ac,
+               unsigned int alloc_flags, const struct alloc_context *ac,
                 enum migrate_mode mode, int *contended_compaction,
                 bool *deferred_compaction)
  {
@@ -3013,7 +3280,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
  /* The really slow allocator path where we enter direct reclaim */
  static inline struct page *
  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
-               int alloc_flags, const struct alloc_context *ac,
+               unsigned int alloc_flags, const struct alloc_context *ac,
                 unsigned long *did_some_progress)
  {
         struct page *page = NULL;
@@ -3049,13 +3316,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
  
         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
                                                 ac->high_zoneidx, ac->nodemask)
-               wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+               wakeup_kswapd(zone, order, ac_classzone_idx(ac));
  }
  
-static inline int
+static inline unsigned int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
-       int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+       unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3116,7 +3383,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  {
         bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
         struct page *page = NULL;
-       int alloc_flags;
+       unsigned int alloc_flags;
         unsigned long pages_reclaimed = 0;
         unsigned long did_some_progress;
         enum migrate_mode migration_mode = MIGRATE_ASYNC;
@@ -3153,17 +3420,6 @@ retry:
          */
         alloc_flags = gfp_to_alloc_flags(gfp_mask);
  
-       /*
-        * Find the true preferred zone if the allocation is unconstrained by
-        * cpusets.
-        */
-       if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
-               struct zoneref *preferred_zoneref;
-               preferred_zoneref = first_zones_zonelist(ac->zonelist,
-                               ac->high_zoneidx, NULL, &ac->preferred_zone);
-               ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
-       }
-
         /* This is the last chance, in general, before the goto nopage. */
         page = get_page_from_freelist(gfp_mask, order,
                                 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3278,7 +3534,7 @@ retry:
         if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
             ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
                 /* Wait for some write requests to complete then retry */
-               wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
                 goto retry;
         }
  
@@ -3316,17 +3572,24 @@ struct page *
  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                         struct zonelist *zonelist, nodemask_t *nodemask)
  {
-       struct zoneref *preferred_zoneref;
-       struct page *page = NULL;
+       struct page *page;
         unsigned int cpuset_mems_cookie;
-       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
-       gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+       unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+       gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
         struct alloc_context ac = {
                 .high_zoneidx = gfp_zone(gfp_mask),
+               .zonelist = zonelist,
                 .nodemask = nodemask,
                 .migratetype = gfpflags_to_migratetype(gfp_mask),
         };
  
+       if (cpusets_enabled()) {
+               alloc_mask |= __GFP_HARDWALL;
+               alloc_flags |= ALLOC_CPUSET;
+               if (!ac.nodemask)
+                       ac.nodemask = &cpuset_current_mems_allowed;
+       }
+
         gfp_mask &= gfp_allowed_mask;
  
         lockdep_trace_alloc(gfp_mask);
@@ -3350,49 +3613,54 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  retry_cpuset:
         cpuset_mems_cookie = read_mems_allowed_begin();
  
-       /* We set it here, as __alloc_pages_slowpath might have changed it */
-       ac.zonelist = zonelist;
-
         /* Dirty zone balancing only done in the fast path */
         ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
  
         /* The preferred zone is used for statistics later */
-       preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
-                               ac.nodemask ? : &cpuset_current_mems_allowed,
-                               &ac.preferred_zone);
-       if (!ac.preferred_zone)
-               goto out;
-       ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+       ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+                                       ac.high_zoneidx, ac.nodemask);
+       if (!ac.preferred_zoneref) {
+               page = NULL;
+               goto no_zone;
+       }
  
         /* First allocation attempt */
-       alloc_mask = gfp_mask|__GFP_HARDWALL;
         page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
-       if (unlikely(!page)) {
-               /*
-                * Runtime PM, block IO and its error handling path
-                * can deadlock because I/O on the device might not
-                * complete.
-                */
-               alloc_mask = memalloc_noio_flags(gfp_mask);
-               ac.spread_dirty_pages = false;
-
-               page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-       }
+       if (likely(page))
+               goto out;
  
-       if (kmemcheck_enabled && page)
-               kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+       /*
+        * Runtime PM, block IO and its error handling path can deadlock
+        * because I/O on the device might not complete.
+        */
+       alloc_mask = memalloc_noio_flags(gfp_mask);
+       ac.spread_dirty_pages = false;
  
-       trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+       /*
+        * Restore the original nodemask if it was potentially replaced with
+        * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+        */
+       if (cpusets_enabled())
+               ac.nodemask = nodemask;
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
  
-out:
+no_zone:
         /*
          * When updating a task's mems_allowed, it is possible to race with
          * parallel threads in such a way that an allocation can fail while
          * the mask is being updated. If a page allocation is about to fail,
          * check if the cpuset changed during allocation and if so, retry.
          */
-       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+               alloc_mask = gfp_mask;
                 goto retry_cpuset;
+       }
+
+out:
+       if (kmemcheck_enabled && page)
+               kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+       trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
  
         return page;
  }
@@ -3790,6 +4058,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  {
         int zone_type;          /* needs to be signed */
         unsigned long managed_pages = 0;
+       unsigned long managed_highpages = 0;
+       unsigned long free_highpages = 0;
         pg_data_t *pgdat = NODE_DATA(nid);
  
         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
@@ -3798,12 +4068,19 @@ void si_meminfo_node(struct sysinfo *val, int nid)
         val->sharedram = node_page_state(nid, NR_SHMEM);
         val->freeram = node_page_state(nid, NR_FREE_PAGES);
  #ifdef CONFIG_HIGHMEM
-       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
-       val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
-                       NR_FREE_PAGES);
+       for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+               struct zone *zone = &pgdat->node_zones[zone_type];
+
+               if (is_highmem(zone)) {
+                       managed_highpages += zone->managed_pages;
+                       free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+               }
+       }
+       val->totalhigh = managed_highpages;
+       val->freehigh = free_highpages;
  #else
-       val->totalhigh = 0;
-       val->freehigh = 0;
+       val->totalhigh = managed_highpages;
+       val->freehigh = free_highpages;
  #endif
         val->mem_unit = PAGE_SIZE;
  }
@@ -4390,13 +4667,12 @@ static void build_zonelists(pg_data_t *pgdat)
   */
  int local_memory_node(int node)
  {
-       struct zone *zone;
+       struct zoneref *z;
  
-       (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+       z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
                                    gfp_zone(GFP_KERNEL),
-                                  NULL,
-                                  &zone);
-       return zone->node;
+                                  NULL);
+       return z->zone->node;
  }
  #endif
  
@@ -6725,98 +7001,6 @@ void *__init alloc_large_system_hash(const char *tablename,
         return table;
  }
  
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
-                                                       unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-       return __pfn_to_section(pfn)->pageblock_flags;
-#else
-       return zone->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-       pfn &= (PAGES_PER_SECTION-1);
-       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
-       pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
-       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-                                       unsigned long end_bitidx,
-                                       unsigned long mask)
-{
-       struct zone *zone;
-       unsigned long *bitmap;
-       unsigned long bitidx, word_bitidx;
-       unsigned long word;
-
-       zone = page_zone(page);
-       bitmap = get_pageblock_bitmap(zone, pfn);
-       bitidx = pfn_to_bitidx(zone, pfn);
-       word_bitidx = bitidx / BITS_PER_LONG;
-       bitidx &= (BITS_PER_LONG-1);
-
-       word = bitmap[word_bitidx];
-       bitidx += end_bitidx;
-       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
-                                       unsigned long pfn,
-                                       unsigned long end_bitidx,
-                                       unsigned long mask)
-{
-       struct zone *zone;
-       unsigned long *bitmap;
-       unsigned long bitidx, word_bitidx;
-       unsigned long old_word, word;
-
-       BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
-       zone = page_zone(page);
-       bitmap = get_pageblock_bitmap(zone, pfn);
-       bitidx = pfn_to_bitidx(zone, pfn);
-       word_bitidx = bitidx / BITS_PER_LONG;
-       bitidx &= (BITS_PER_LONG-1);
-
-       VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
-
-       bitidx += end_bitidx;
-       mask <<= (BITS_PER_LONG - bitidx - 1);
-       flags <<= (BITS_PER_LONG - bitidx - 1);
-
-       word = READ_ONCE(bitmap[word_bitidx]);
-       for (;;) {
-               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
-               if (word == old_word)
-                       break;
-               word = old_word;
-       }
-}
-
  /*
   * This function checks whether pageblock includes unmovable pages or not.
   * If @count is not zero, it is okay to include less @count unmovable pages
@@ -6864,7 +7048,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                  * We can't use page_count without pin a page
                  * because another CPU can free compound page.
                  * This check already skips compound tails of THP
-                * because their page->_count is zero at all time.
+                * because their page->_refcount is zero at all times.
                  */
                 if (!page_ref_count(page)) {
                         if (PageBuddy(page))
@@ -7177,7 +7361,8 @@ void zone_pcp_reset(struct zone *zone)
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
  /*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
   */
  void
  __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c

index c4f568206544b616969d59631017d51da0d46a85..612122bf6a4236ff57a8bacf1d3dcd600c02c821 100644 (file)
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
         return pfn;
  }
  
+/* Caller should ensure that requested range is in a single zone */
  int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
                         bool skip_hwpoisoned_pages)
  {
@@ -288,13 +289,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
          * accordance with memory policy of the user process if possible. For
          * now as a simple work-around, we use the next node for destination.
          */
-       if (PageHuge(page)) {
-               int node = next_online_node(page_to_nid(page));
-               if (node == MAX_NUMNODES)
-                       node = first_online_node;
+       if (PageHuge(page))
                 return alloc_huge_page_node(page_hstate(compound_head(page)),
-                                           node);
-       }
+                                           next_node_in(page_to_nid(page),
+                                                        node_online_map));
  
         if (PageHighMem(page))
                 gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/page_owner.c b/mm/page_owner.c

index ac3d8d129974398cb98bf0bbf2b56207fdc18642..792b56da13d8564f4f6fbab68dcf46297bd13c30 100644 (file)
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
                 goto err;
  
         /* Print information relevant to grouping pages by mobility */
-       pageblock_mt = get_pfnblock_migratetype(page, pfn);
+       pageblock_mt = get_pageblock_migratetype(page);
         page_mt  = gfpflags_to_migratetype(page_ext->gfp_mask);
         ret += snprintf(kbuf + ret, count - ret,
                         "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
@@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
  
                         page = pfn_to_page(pfn);
  
+                       if (page_zone(page) != zone)
+                               continue;
+
                         /*
                          * We are safe to check buddy flag and order, because
                          * this is init stage and only single thread runs.
diff --git a/mm/rmap.c b/mm/rmap.c

index 307b555024efb6787cca6030d1f58060ab4031fe..8a839935b18c000ecaa8411f0ffaf6c5ee81b998 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                 struct anon_vma *anon_vma = avc->anon_vma;
  
-               BUG_ON(anon_vma->degree);
+               VM_WARN_ON(anon_vma->degree);
                 put_anon_vma(anon_vma);
  
                 list_del(&avc->same_vma);
@@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page,
         int nr = compound ? hpage_nr_pages(page) : 1;
  
         VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-       SetPageSwapBacked(page);
+       __SetPageSwapBacked(page);
         if (compound) {
                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                 /* increment count (starts at -1) */
diff --git a/mm/shmem.c b/mm/shmem.c

index e684a914022805cce8dddf3fb84ea66359f4b315..e418a995427d67b4e386c366da4dd427dc27c889 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -101,7 +101,6 @@ struct shmem_falloc {
  enum sgp_type {
         SGP_READ,       /* don't exceed i_size, don't allocate page */
         SGP_CACHE,      /* don't exceed i_size, may allocate page */
-       SGP_DIRTY,      /* like SGP_CACHE, but set new page dirty */
         SGP_WRITE,      /* may exceed i_size, may allocate !Uptodate page */
         SGP_FALLOC,     /* like SGP_WRITE, but make existing page Uptodate */
  };
@@ -122,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
  static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                                 struct shmem_inode_info *info, pgoff_t index);
  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
-       struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+               struct page **pagep, enum sgp_type sgp,
+               gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
  
  static inline int shmem_getpage(struct inode *inode, pgoff_t index,
-       struct page **pagep, enum sgp_type sgp, int *fault_type)
+               struct page **pagep, enum sgp_type sgp)
  {
         return shmem_getpage_gfp(inode, index, pagep, sgp,
-                       mapping_gfp_mask(inode->i_mapping), fault_type);
+               mapping_gfp_mask(inode->i_mapping), NULL, NULL);
  }
  
  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -169,7 +169,7 @@ static inline int shmem_reacct_size(unsigned long flags,
  
  /*
   * ... whereas tmpfs objects are accounted incrementally as
- * pages are allocated, in order to allow huge sparse files.
+ * pages are allocated, in order to allow large sparse files.
   * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
   * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
   */
@@ -528,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
  
         if (partial_start) {
                 struct page *page = NULL;
-               shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+               shmem_getpage(inode, start - 1, &page, SGP_READ);
                 if (page) {
                         unsigned int top = PAGE_SIZE;
                         if (start > end) {
@@ -543,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
         }
         if (partial_end) {
                 struct page *page = NULL;
-               shmem_getpage(inode, end, &page, SGP_READ, NULL);
+               shmem_getpage(inode, end, &page, SGP_READ);
                 if (page) {
                         zero_user_segment(page, 0, partial_end);
                         set_page_dirty(page);
@@ -947,8 +947,7 @@ redirty:
         return 0;
  }
  
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
  static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
  {
         char buffer[64];
@@ -972,7 +971,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
         }
         return mpol;
  }
-#endif /* CONFIG_TMPFS */
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+       return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
  
  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
                         struct shmem_inode_info *info, pgoff_t index)
@@ -1008,39 +1018,17 @@ static struct page *shmem_alloc_page(gfp_t gfp,
         pvma.vm_ops = NULL;
         pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
  
-       page = alloc_page_vma(gfp, &pvma, 0);
+       page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+       if (page) {
+               __SetPageLocked(page);
+               __SetPageSwapBacked(page);
+       }
  
         /* Drop reference taken by mpol_shared_policy_lookup() */
         mpol_cond_put(pvma.vm_policy);
  
         return page;
  }
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
-{
-}
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
-                       struct shmem_inode_info *info, pgoff_t index)
-{
-       return swapin_readahead(swap, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
-                       struct shmem_inode_info *info, pgoff_t index)
-{
-       return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
-
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
-static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
-{
-       return NULL;
-}
-#endif
  
  /*
   * When a page is moved from swapcache to shmem filecache (either by the
@@ -1084,9 +1072,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
         copy_highpage(newpage, oldpage);
         flush_dcache_page(newpage);
  
-       __SetPageLocked(newpage);
         SetPageUptodate(newpage);
-       SetPageSwapBacked(newpage);
         set_page_private(newpage, swap_index);
         SetPageSwapCache(newpage);
  
@@ -1130,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
   *
   * If we allocate a new one we do not mark it dirty. That's up to the
   * vm. If we swap it in we mark it dirty since we also free the swap
- * entry since a page cannot live in both the swap and page cache
+ * entry since a page cannot live in both the swap and page cache.
+ *
+ * fault_mm and fault_type are only supplied by shmem_fault:
+ * otherwise they are NULL.
   */
  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
-       struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
+       struct page **pagep, enum sgp_type sgp, gfp_t gfp,
+       struct mm_struct *fault_mm, int *fault_type)
  {
         struct address_space *mapping = inode->i_mapping;
         struct shmem_inode_info *info;
         struct shmem_sb_info *sbinfo;
+       struct mm_struct *charge_mm;
         struct mem_cgroup *memcg;
         struct page *page;
         swp_entry_t swap;
@@ -1155,7 +1146,7 @@ repeat:
                 page = NULL;
         }
  
-       if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+       if (sgp <= SGP_CACHE &&
             ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
                 error = -EINVAL;
                 goto unlock;
@@ -1183,14 +1174,19 @@ repeat:
          */
         info = SHMEM_I(inode);
         sbinfo = SHMEM_SB(inode->i_sb);
+       charge_mm = fault_mm ? : current->mm;
  
         if (swap.val) {
                 /* Look it up and read it in.. */
                 page = lookup_swap_cache(swap);
                 if (!page) {
-                       /* here we actually do the io */
-                       if (fault_type)
+                       /* Or update major stats only when swapin succeeds?? */
+                       if (fault_type) {
                                 *fault_type |= VM_FAULT_MAJOR;
+                               count_vm_event(PGMAJFAULT);
+                               mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+                       }
+                       /* Here we actually start the io */
                         page = shmem_swapin(swap, gfp, info, index);
                         if (!page) {
                                 error = -ENOMEM;
@@ -1217,7 +1213,7 @@ repeat:
                                 goto failed;
                 }
  
-               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
                                 false);
                 if (!error) {
                         error = shmem_add_to_page_cache(page, mapping, index,
@@ -1275,13 +1271,10 @@ repeat:
                         error = -ENOMEM;
                         goto decused;
                 }
-
-               __SetPageSwapBacked(page);
-               __SetPageLocked(page);
                 if (sgp == SGP_WRITE)
                         __SetPageReferenced(page);
  
-               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
                                 false);
                 if (error)
                         goto decused;
@@ -1321,12 +1314,10 @@ clear:
                         flush_dcache_page(page);
                         SetPageUptodate(page);
                 }
-               if (sgp == SGP_DIRTY)
-                       set_page_dirty(page);
         }
  
         /* Perhaps the file has been truncated since we checked */
-       if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+       if (sgp <= SGP_CACHE &&
             ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
                 if (alloced) {
                         ClearPageDirty(page);
@@ -1372,6 +1363,7 @@ unlock:
  static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
         struct inode *inode = file_inode(vma->vm_file);
+       gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
         int error;
         int ret = VM_FAULT_LOCKED;
  
@@ -1433,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                 spin_unlock(&inode->i_lock);
         }
  
-       error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+       error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+                                 gfp, vma->vm_mm, &ret);
         if (error)
                 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
-
-       if (ret & VM_FAULT_MAJOR) {
-               count_vm_event(PGMAJFAULT);
-               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-       }
         return ret;
  }
  
@@ -1587,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
                         return -EPERM;
         }
  
-       return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+       return shmem_getpage(inode, index, pagep, SGP_WRITE);
  }
  
  static int
@@ -1633,7 +1621,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
          * and even mark them dirty, so it cannot exceed the max_blocks limit.
          */
         if (!iter_is_iovec(to))
-               sgp = SGP_DIRTY;
+               sgp = SGP_CACHE;
  
         index = *ppos >> PAGE_SHIFT;
         offset = *ppos & ~PAGE_MASK;
@@ -1653,14 +1641,17 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
                                 break;
                 }
  
-               error = shmem_getpage(inode, index, &page, sgp, NULL);
+               error = shmem_getpage(inode, index, &page, sgp);
                 if (error) {
                         if (error == -EINVAL)
                                 error = 0;
                         break;
                 }
-               if (page)
+               if (page) {
+                       if (sgp == SGP_CACHE)
+                               set_page_dirty(page);
                         unlock_page(page);
+               }
  
                 /*
                  * We must evaluate after, since reads (unlike writes)
@@ -1766,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
         error = 0;
  
         while (spd.nr_pages < nr_pages) {
-               error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+               error = shmem_getpage(inode, index, &page, SGP_CACHE);
                 if (error)
                         break;
                 unlock_page(page);
@@ -1788,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
                 page = spd.pages[page_nr];
  
                 if (!PageUptodate(page) || page->mapping != mapping) {
-                       error = shmem_getpage(inode, index, &page,
-                                                       SGP_CACHE, NULL);
+                       error = shmem_getpage(inode, index, &page, SGP_CACHE);
                         if (error)
                                 break;
                         unlock_page(page);
@@ -2232,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
                         error = -ENOMEM;
                 else
-                       error = shmem_getpage(inode, index, &page, SGP_FALLOC,
-                                                                       NULL);
+                       error = shmem_getpage(inode, index, &page, SGP_FALLOC);
                 if (error) {
                         /* Remove the !PageUptodate pages we added */
                         shmem_undo_range(inode,
@@ -2551,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                 inode->i_op = &shmem_short_symlink_operations;
         } else {
                 inode_nohighmem(inode);
-               error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+               error = shmem_getpage(inode, 0, &page, SGP_WRITE);
                 if (error) {
                         iput(inode);
                         return error;
@@ -2592,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry,
                         return ERR_PTR(-ECHILD);
                 }
         } else {
-               error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+               error = shmem_getpage(inode, 0, &page, SGP_READ);
                 if (error)
                         return ERR_PTR(error);
                 unlock_page(page);
@@ -3496,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
         int error;
  
         BUG_ON(mapping->a_ops != &shmem_aops);
-       error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+       error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+                                 gfp, NULL, NULL);
         if (error)
                 page = ERR_PTR(error);
         else
diff --git a/mm/slab.c b/mm/slab.c

index 17e2848979c53a369ad9d7d766d9872173c1787f..c11bf50079522f7ef60170301f629ac74c0aad0d 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
  static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
  static void cache_reap(struct work_struct *unused);
  
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+                                               void **list);
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+                               struct kmem_cache_node *n, struct page *page,
+                               void **list);
  static int slab_early_init = 1;
  
  #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -421,8 +426,6 @@ static struct kmem_cache kmem_cache_boot = {
         .name = "kmem_cache",
  };
  
-#define BAD_ALIEN_MAGIC 0x01020304ul
-
  static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
  
  static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -519,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
  
  static void init_reap_node(int cpu)
  {
-       int node;
-
-       node = next_node(cpu_to_mem(cpu), node_online_map);
-       if (node == MAX_NUMNODES)
-               node = first_node(node_online_map);
-
-       per_cpu(slab_reap_node, cpu) = node;
+       per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+                                                   node_online_map);
  }
  
  static void next_reap_node(void)
  {
         int node = __this_cpu_read(slab_reap_node);
  
-       node = next_node(node, node_online_map);
-       if (unlikely(node >= MAX_NUMNODES))
-               node = first_node(node_online_map);
+       node = next_node_in(node, node_online_map);
         __this_cpu_write(slab_reap_node, node);
  }
  
@@ -644,7 +640,7 @@ static int transfer_objects(struct array_cache *to,
  static inline struct alien_cache **alloc_alien_cache(int node,
                                                 int limit, gfp_t gfp)
  {
-       return (struct alien_cache **)BAD_ALIEN_MAGIC;
+       return NULL;
  }
  
  static inline void free_alien_cache(struct alien_cache **ac_ptr)
@@ -850,6 +846,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
  }
  #endif
  
+static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
+{
+       struct kmem_cache_node *n;
+
+       /*
+        * Set up the kmem_cache_node for cpu before we can
+        * begin anything. Make sure some other cpu on this
+        * node has not already allocated this
+        */
+       n = get_node(cachep, node);
+       if (n) {
+               spin_lock_irq(&n->list_lock);
+               n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
+                               cachep->num;
+               spin_unlock_irq(&n->list_lock);
+
+               return 0;
+       }
+
+       n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+       if (!n)
+               return -ENOMEM;
+
+       kmem_cache_node_init(n);
+       n->next_reap = jiffies + REAPTIMEOUT_NODE +
+                   ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+       n->free_limit =
+               (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
+
+       /*
+        * The kmem_cache_nodes don't come and go as CPUs
+        * come and go.  slab_mutex is sufficient
+        * protection here.
+        */
+       cachep->node[node] = n;
+
+       return 0;
+}
+
  /*
   * Allocates and initializes node for a node on each slab cache, used for
   * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
@@ -861,46 +897,82 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
   */
  static int init_cache_node_node(int node)
  {
+       int ret;
         struct kmem_cache *cachep;
-       struct kmem_cache_node *n;
-       const size_t memsize = sizeof(struct kmem_cache_node);
  
         list_for_each_entry(cachep, &slab_caches, list) {
-               /*
-                * Set up the kmem_cache_node for cpu before we can
-                * begin anything. Make sure some other cpu on this
-                * node has not already allocated this
-                */
-               n = get_node(cachep, node);
-               if (!n) {
-                       n = kmalloc_node(memsize, GFP_KERNEL, node);
-                       if (!n)
-                               return -ENOMEM;
-                       kmem_cache_node_init(n);
-                       n->next_reap = jiffies + REAPTIMEOUT_NODE +
-                           ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
-                       /*
-                        * The kmem_cache_nodes don't come and go as CPUs
-                        * come and go.  slab_mutex is sufficient
-                        * protection here.
-                        */
-                       cachep->node[node] = n;
-               }
-
-               spin_lock_irq(&n->list_lock);
-               n->free_limit =
-                       (1 + nr_cpus_node(node)) *
-                       cachep->batchcount + cachep->num;
-               spin_unlock_irq(&n->list_lock);
+               ret = init_cache_node(cachep, node, GFP_KERNEL);
+               if (ret)
+                       return ret;
         }
+
         return 0;
  }
  
-static inline int slabs_tofree(struct kmem_cache *cachep,
-                                               struct kmem_cache_node *n)
+static int setup_kmem_cache_node(struct kmem_cache *cachep,
+                               int node, gfp_t gfp, bool force_change)
  {
-       return (n->free_objects + cachep->num - 1) / cachep->num;
+       int ret = -ENOMEM;
+       struct kmem_cache_node *n;
+       struct array_cache *old_shared = NULL;
+       struct array_cache *new_shared = NULL;
+       struct alien_cache **new_alien = NULL;
+       LIST_HEAD(list);
+
+       if (use_alien_caches) {
+               new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+               if (!new_alien)
+                       goto fail;
+       }
+
+       if (cachep->shared) {
+               new_shared = alloc_arraycache(node,
+                       cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
+               if (!new_shared)
+                       goto fail;
+       }
+
+       ret = init_cache_node(cachep, node, gfp);
+       if (ret)
+               goto fail;
+
+       n = get_node(cachep, node);
+       spin_lock_irq(&n->list_lock);
+       if (n->shared && force_change) {
+               free_block(cachep, n->shared->entry,
+                               n->shared->avail, node, &list);
+               n->shared->avail = 0;
+       }
+
+       if (!n->shared || force_change) {
+               old_shared = n->shared;
+               n->shared = new_shared;
+               new_shared = NULL;
+       }
+
+       if (!n->alien) {
+               n->alien = new_alien;
+               new_alien = NULL;
+       }
+
+       spin_unlock_irq(&n->list_lock);
+       slabs_destroy(cachep, &list);
+
+       /*
+        * To protect lockless access to n->shared during irq disabled context.
+        * If n->shared isn't NULL in irq disabled context, access to it is
+        * guaranteed to be valid until irq is re-enabled, because it will be
+        * freed after synchronize_sched().
+        */
+       if (force_change)
+               synchronize_sched();
+
+fail:
+       kfree(old_shared);
+       kfree(new_shared);
+       free_alien_cache(new_alien);
+
+       return ret;
  }
  
  static void cpuup_canceled(long cpu)
@@ -967,14 +1039,13 @@ free_slab:
                 n = get_node(cachep, node);
                 if (!n)
                         continue;
-               drain_freelist(cachep, n, slabs_tofree(cachep, n));
+               drain_freelist(cachep, n, INT_MAX);
         }
  }
  
  static int cpuup_prepare(long cpu)
  {
         struct kmem_cache *cachep;
-       struct kmem_cache_node *n = NULL;
         int node = cpu_to_mem(cpu);
         int err;
  
@@ -993,44 +1064,9 @@ static int cpuup_prepare(long cpu)
          * array caches
          */
         list_for_each_entry(cachep, &slab_caches, list) {
-               struct array_cache *shared = NULL;
-               struct alien_cache **alien = NULL;
-
-               if (cachep->shared) {
-                       shared = alloc_arraycache(node,
-                               cachep->shared * cachep->batchcount,
-                               0xbaadf00d, GFP_KERNEL);
-                       if (!shared)
-                               goto bad;
-               }
-               if (use_alien_caches) {
-                       alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
-                       if (!alien) {
-                               kfree(shared);
-                               goto bad;
-                       }
-               }
-               n = get_node(cachep, node);
-               BUG_ON(!n);
-
-               spin_lock_irq(&n->list_lock);
-               if (!n->shared) {
-                       /*
-                        * We are serialised from CPU_DEAD or
-                        * CPU_UP_CANCELLED by the cpucontrol lock
-                        */
-                       n->shared = shared;
-                       shared = NULL;
-               }
-#ifdef CONFIG_NUMA
-               if (!n->alien) {
-                       n->alien = alien;
-                       alien = NULL;
-               }
-#endif
-               spin_unlock_irq(&n->list_lock);
-               kfree(shared);
-               free_alien_cache(alien);
+               err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
+               if (err)
+                       goto bad;
         }
  
         return 0;
@@ -1119,7 +1155,7 @@ static int __meminit drain_cache_node_node(int node)
                 if (!n)
                         continue;
  
-               drain_freelist(cachep, n, slabs_tofree(cachep, n));
+               drain_freelist(cachep, n, INT_MAX);
  
                 if (!list_empty(&n->slabs_full) ||
                     !list_empty(&n->slabs_partial)) {
@@ -1200,6 +1236,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
         }
  }
  
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
+                       size_t count)
+{
+       size_t i;
+       unsigned int rand;
+
+       for (i = 0; i < count; i++)
+               list[i] = i;
+
+       /* Fisher-Yates shuffle */
+       for (i = count - 1; i > 0; i--) {
+               rand = prandom_u32_state(state);
+               rand %= (i + 1);
+               swap(list[i], list[rand]);
+       }
+}
+
+/* Create a random sequence per cache */
+static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+       unsigned int seed, count = cachep->num;
+       struct rnd_state state;
+
+       if (count < 2)
+               return 0;
+
+       /* If it fails, we will just use the global lists */
+       cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
+       if (!cachep->random_seq)
+               return -ENOMEM;
+
+       /* Get best entropy at this stage */
+       get_random_bytes_arch(&seed, sizeof(seed));
+       prandom_seed_state(&state, seed);
+
+       freelist_randomize(&state, cachep->random_seq, count);
+       return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+static void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+       kfree(cachep->random_seq);
+       cachep->random_seq = NULL;
+}
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+       return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+
  /*
   * Initialisation.  Called after the page allocator have been initialised and
   * before smp_init().
@@ -1212,7 +1303,7 @@ void __init kmem_cache_init(void)
                                         sizeof(struct rcu_head));
         kmem_cache = &kmem_cache_boot;
  
-       if (num_possible_nodes() == 1)
+       if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
                 use_alien_caches = 0;
  
         for (i = 0; i < NUM_INIT_LISTS; i++)
@@ -1781,7 +1872,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
  
                         /*
                          * Needed to avoid possible looping condition
-                        * in cache_grow()
+                        * in cache_grow_begin()
                          */
                         if (OFF_SLAB(freelist_cache))
                                 continue;
@@ -2138,7 +2229,7 @@ done:
         cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
         cachep->flags = flags;
         cachep->allocflags = __GFP_COMP;
-       if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+       if (flags & SLAB_CACHE_DMA)
                 cachep->allocflags |= GFP_DMA;
         cachep->size = size;
         cachep->reciprocal_buffer_size = reciprocal_value(size);
@@ -2180,6 +2271,11 @@ static void check_irq_on(void)
         BUG_ON(irqs_disabled());
  }
  
+static void check_mutex_acquired(void)
+{
+       BUG_ON(!mutex_is_locked(&slab_mutex));
+}
+
  static void check_spinlock_acquired(struct kmem_cache *cachep)
  {
  #ifdef CONFIG_SMP
@@ -2199,13 +2295,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
  #else
  #define check_irq_off()        do { } while(0)
  #define check_irq_on() do { } while(0)
+#define check_mutex_acquired() do { } while(0)
  #define check_spinlock_acquired(x) do { } while(0)
  #define check_spinlock_acquired_node(x, y) do { } while(0)
  #endif
  
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
-                       struct array_cache *ac,
-                       int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+                               int node, bool free_all, struct list_head *list)
+{
+       int tofree;
+
+       if (!ac || !ac->avail)
+               return;
+
+       tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
+       if (tofree > ac->avail)
+               tofree = (ac->avail + 1) / 2;
+
+       free_block(cachep, ac->entry, tofree, node, list);
+       ac->avail -= tofree;
+       memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
+}
  
  static void do_drain(void *arg)
  {
@@ -2229,6 +2339,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
  {
         struct kmem_cache_node *n;
         int node;
+       LIST_HEAD(list);
  
         on_each_cpu(do_drain, cachep, 1);
         check_irq_on();
@@ -2236,8 +2347,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
                 if (n->alien)
                         drain_alien_cache(cachep, n->alien);
  
-       for_each_kmem_cache_node(cachep, node, n)
-               drain_array(cachep, n, n->shared, 1, node);
+       for_each_kmem_cache_node(cachep, node, n) {
+               spin_lock_irq(&n->list_lock);
+               drain_array_locked(cachep, n->shared, node, true, &list);
+               spin_unlock_irq(&n->list_lock);
+
+               slabs_destroy(cachep, &list);
+       }
  }
  
  /*
@@ -2288,7 +2404,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
  
         check_irq_on();
         for_each_kmem_cache_node(cachep, node, n) {
-               drain_freelist(cachep, n, slabs_tofree(cachep, n));
+               drain_freelist(cachep, n, INT_MAX);
  
                 ret += !list_empty(&n->slabs_full) ||
                         !list_empty(&n->slabs_partial);
@@ -2306,6 +2422,8 @@ void __kmem_cache_release(struct kmem_cache *cachep)
         int i;
         struct kmem_cache_node *n;
  
+       cache_random_seq_destroy(cachep);
+
         free_percpu(cachep->cpu_cache);
  
         /* NUMA: free the node structures */
@@ -2412,15 +2530,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
  #endif
  }
  
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+       struct {
+               unsigned int pos;
+               freelist_idx_t *list;
+               unsigned int count;
+               unsigned int rand;
+       };
+       struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization method available.
+ * Return true if the pre-computed list is available, false otherwise.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+                               struct kmem_cache *cachep,
+                               unsigned int count)
+{
+       bool ret;
+       unsigned int rand;
+
+       /* Use best entropy available to define a random shift */
+       get_random_bytes_arch(&rand, sizeof(rand));
+
+       /* Use a random state if the pre-computed list is not available */
+       if (!cachep->random_seq) {
+               prandom_seed_state(&state->rnd_state, rand);
+               ret = false;
+       } else {
+               state->list = cachep->random_seq;
+               state->count = count;
+               state->pos = 0;
+               state->rand = rand;
+               ret = true;
+       }
+       return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+       return (state->list[state->pos++] + state->rand) % state->count;
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+       unsigned int objfreelist = 0, i, count = cachep->num;
+       union freelist_init_state state;
+       bool precomputed;
+
+       if (count < 2)
+               return false;
+
+       precomputed = freelist_state_initialize(&state, cachep, count);
+
+       /* Take a random entry as the objfreelist */
+       if (OBJFREELIST_SLAB(cachep)) {
+               if (!precomputed)
+                       objfreelist = count - 1;
+               else
+                       objfreelist = next_random_slot(&state);
+               page->freelist = index_to_obj(cachep, page, objfreelist) +
+                                               obj_offset(cachep);
+               count--;
+       }
+
+       /*
+        * On early boot, generate the list dynamically.
+        * Later use a pre-computed list for speed.
+        */
+       if (!precomputed) {
+               freelist_randomize(&state.rnd_state, page->freelist, count);
+       } else {
+               for (i = 0; i < count; i++)
+                       set_free_obj(page, i, next_random_slot(&state));
+       }
+
+       if (OBJFREELIST_SLAB(cachep))
+               set_free_obj(page, cachep->num - 1, objfreelist);
+
+       return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+                               struct page *page)
+{
+       return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
  static void cache_init_objs(struct kmem_cache *cachep,
                             struct page *page)
  {
         int i;
         void *objp;
+       bool shuffled;
  
         cache_init_objs_debug(cachep, page);
  
-       if (OBJFREELIST_SLAB(cachep)) {
+       /* Try to randomize the freelist if enabled */
+       shuffled = shuffle_freelist(cachep, page);
+
+       if (!shuffled && OBJFREELIST_SLAB(cachep)) {
                 page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
                                                 obj_offset(cachep);
         }
@@ -2434,17 +2652,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
                         kasan_poison_object_data(cachep, objp);
                 }
  
-               set_free_obj(page, i, i);
-       }
-}
-
-static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
-{
-       if (CONFIG_ZONE_DMA_FLAG) {
-               if (flags & GFP_DMA)
-                       BUG_ON(!(cachep->allocflags & GFP_DMA));
-               else
-                       BUG_ON(cachep->allocflags & GFP_DMA);
+               if (!shuffled)
+                       set_free_obj(page, i, i);
         }
  }
  
@@ -2502,13 +2711,15 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
   * Grow (by 1) the number of slabs within a cache.  This is called by
   * kmem_cache_alloc() when there are no active objs left in a cache.
   */
-static int cache_grow(struct kmem_cache *cachep,
-               gfp_t flags, int nodeid, struct page *page)
+static struct page *cache_grow_begin(struct kmem_cache *cachep,
+                               gfp_t flags, int nodeid)
  {
         void *freelist;
         size_t offset;
         gfp_t local_flags;
+       int page_node;
         struct kmem_cache_node *n;
+       struct page *page;
  
         /*
          * Be lazy and only check for valid flags here,  keeping it out of the
@@ -2520,43 +2731,35 @@ static int cache_grow(struct kmem_cache *cachep,
         }
         local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
  
-       /* Take the node list lock to change the colour_next on this node */
         check_irq_off();
-       n = get_node(cachep, nodeid);
-       spin_lock(&n->list_lock);
-
-       /* Get colour for the slab, and cal the next value. */
-       offset = n->colour_next;
-       n->colour_next++;
-       if (n->colour_next >= cachep->colour)
-               n->colour_next = 0;
-       spin_unlock(&n->list_lock);
-
-       offset *= cachep->colour_off;
-
         if (gfpflags_allow_blocking(local_flags))
                 local_irq_enable();
  
-       /*
-        * The test for missing atomic flag is performed here, rather than
-        * the more obvious place, simply to reduce the critical path length
-        * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
-        * will eventually be caught here (where it matters).
-        */
-       kmem_flagcheck(cachep, flags);
-
         /*
          * Get mem for the objs.  Attempt to allocate a physical page from
          * 'nodeid'.
          */
-       if (!page)
-               page = kmem_getpages(cachep, local_flags, nodeid);
+       page = kmem_getpages(cachep, local_flags, nodeid);
         if (!page)
                 goto failed;
  
+       page_node = page_to_nid(page);
+       n = get_node(cachep, page_node);
+
+       /* Get colour for the slab, and calculate the next value. */
+       n->colour_next++;
+       if (n->colour_next >= cachep->colour)
+               n->colour_next = 0;
+
+       offset = n->colour_next;
+       if (offset >= cachep->colour)
+               offset = 0;
+
+       offset *= cachep->colour_off;
+
         /* Get slab management. */
         freelist = alloc_slabmgmt(cachep, page, offset,
-                       local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+                       local_flags & ~GFP_CONSTRAINT_MASK, page_node);
         if (OFF_SLAB(cachep) && !freelist)
                 goto opps1;
  
@@ -2567,21 +2770,40 @@ static int cache_grow(struct kmem_cache *cachep,
  
         if (gfpflags_allow_blocking(local_flags))
                 local_irq_disable();
-       check_irq_off();
-       spin_lock(&n->list_lock);
  
-       /* Make slab active. */
-       list_add_tail(&page->lru, &(n->slabs_free));
-       STATS_INC_GROWN(cachep);
-       n->free_objects += cachep->num;
-       spin_unlock(&n->list_lock);
-       return 1;
+       return page;
+
  opps1:
         kmem_freepages(cachep, page);
  failed:
         if (gfpflags_allow_blocking(local_flags))
                 local_irq_disable();
-       return 0;
+       return NULL;
+}
+
+static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+{
+       struct kmem_cache_node *n;
+       void *list = NULL;
+
+       check_irq_off();
+
+       if (!page)
+               return;
+
+       INIT_LIST_HEAD(&page->lru);
+       n = get_node(cachep, page_to_nid(page));
+
+       spin_lock(&n->list_lock);
+       if (!page->active)
+               list_add_tail(&page->lru, &(n->slabs_free));
+       else
+               fixup_slab_list(cachep, n, page, &list);
+       STATS_INC_GROWN(cachep);
+       n->free_objects += cachep->num - page->active;
+       spin_unlock(&n->list_lock);
+
+       fixup_objfreelist_debug(cachep, &list);
  }
  
  #if DEBUG
@@ -2785,18 +3007,42 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
         return obj;
  }
  
+/*
+ * The slab list should be fixed up by fixup_slab_list() for an existing
+ * slab or by cache_grow_end() for a new slab.
+ */
+static __always_inline int alloc_block(struct kmem_cache *cachep,
+               struct array_cache *ac, struct page *page, int batchcount)
+{
+       /*
+        * There must be at least one object available for
+        * allocation.
+        */
+       BUG_ON(page->active >= cachep->num);
+
+       while (page->active < cachep->num && batchcount--) {
+               STATS_INC_ALLOCED(cachep);
+               STATS_INC_ACTIVE(cachep);
+               STATS_SET_HIGH(cachep);
+
+               ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+       }
+
+       return batchcount;
+}
+
  static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
  {
         int batchcount;
         struct kmem_cache_node *n;
-       struct array_cache *ac;
+       struct array_cache *ac, *shared;
         int node;
         void *list = NULL;
+       struct page *page;
  
         check_irq_off();
         node = numa_mem_id();
  
-retry:
         ac = cpu_cache_get(cachep);
         batchcount = ac->batchcount;
         if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2810,16 +3056,20 @@ retry:
         n = get_node(cachep, node);
  
         BUG_ON(ac->avail > 0 || !n);
+       shared = READ_ONCE(n->shared);
+       if (!n->free_objects && (!shared || !shared->avail))
+               goto direct_grow;
+
         spin_lock(&n->list_lock);
+       shared = READ_ONCE(n->shared);
  
         /* See if we can refill from the shared array */
-       if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
-               n->shared->touched = 1;
+       if (shared && transfer_objects(ac, shared, batchcount)) {
+               shared->touched = 1;
                 goto alloc_done;
         }
  
         while (batchcount > 0) {
-               struct page *page;
                 /* Get slab alloc is to come from. */
                 page = get_first_slab(n, false);
                 if (!page)
@@ -2827,21 +3077,7 @@ retry:
  
                 check_spinlock_acquired(cachep);
  
-               /*
-                * The slab was either on partial or free list so
-                * there must be at least one object available for
-                * allocation.
-                */
-               BUG_ON(page->active >= cachep->num);
-
-               while (page->active < cachep->num && batchcount--) {
-                       STATS_INC_ALLOCED(cachep);
-                       STATS_INC_ACTIVE(cachep);
-                       STATS_SET_HIGH(cachep);
-
-                       ac->entry[ac->avail++] = slab_get_obj(cachep, page);
-               }
-
+               batchcount = alloc_block(cachep, ac, page, batchcount);
                 fixup_slab_list(cachep, n, page, &list);
         }
  
@@ -2851,9 +3087,8 @@ alloc_done:
         spin_unlock(&n->list_lock);
         fixup_objfreelist_debug(cachep, &list);
  
+direct_grow:
         if (unlikely(!ac->avail)) {
-               int x;
-
                 /* Check if we can use obj in pfmemalloc slab */
                 if (sk_memalloc_socks()) {
                         void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
@@ -2862,18 +3097,19 @@ alloc_done:
                                 return obj;
                 }
  
-               x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+               page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
  
-               /* cache_grow can reenable interrupts, then ac could change. */
+               /*
+                * cache_grow_begin() can reenable interrupts,
+                * then ac could change.
+                */
                 ac = cpu_cache_get(cachep);
-               node = numa_mem_id();
+               if (!ac->avail && page)
+                       alloc_block(cachep, ac, page, batchcount);
+               cache_grow_end(cachep, page);
  
-               /* no objects in sight? abort */
-               if (!x && ac->avail == 0)
+               if (!ac->avail)
                         return NULL;
-
-               if (!ac->avail)         /* objects refilled by interrupt? */
-                       goto retry;
         }
         ac->touched = 1;
  
@@ -2884,9 +3120,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
                                                 gfp_t flags)
  {
         might_sleep_if(gfpflags_allow_blocking(flags));
-#if DEBUG
-       kmem_flagcheck(cachep, flags);
-#endif
  }
  
  #if DEBUG
@@ -2998,19 +3231,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
  static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
  {
         struct zonelist *zonelist;
-       gfp_t local_flags;
         struct zoneref *z;
         struct zone *zone;
         enum zone_type high_zoneidx = gfp_zone(flags);
         void *obj = NULL;
+       struct page *page;
         int nid;
         unsigned int cpuset_mems_cookie;
  
         if (flags & __GFP_THISNODE)
                 return NULL;
  
-       local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
  retry_cpuset:
         cpuset_mems_cookie = read_mems_allowed_begin();
         zonelist = node_zonelist(mempolicy_slab_node(), flags);
@@ -3040,33 +3271,19 @@ retry:
                  * We may trigger various forms of reclaim on the allowed
                  * set and go into memory reserves if necessary.
                  */
-               struct page *page;
-
-               if (gfpflags_allow_blocking(local_flags))
-                       local_irq_enable();
-               kmem_flagcheck(cache, flags);
-               page = kmem_getpages(cache, local_flags, numa_mem_id());
-               if (gfpflags_allow_blocking(local_flags))
-                       local_irq_disable();
+               page = cache_grow_begin(cache, flags, numa_mem_id());
+               cache_grow_end(cache, page);
                 if (page) {
+                       nid = page_to_nid(page);
+                       obj = ____cache_alloc_node(cache,
+                               gfp_exact_node(flags), nid);
+
                         /*
-                        * Insert into the appropriate per node queues
+                        * Another processor may allocate the objects in
+                        * the slab since we are not holding any locks.
                          */
-                       nid = page_to_nid(page);
-                       if (cache_grow(cache, flags, nid, page)) {
-                               obj = ____cache_alloc_node(cache,
-                                       gfp_exact_node(flags), nid);
-                               if (!obj)
-                                       /*
-                                        * Another processor may allocate the
-                                        * objects in the slab since we are
-                                        * not holding any locks.
-                                        */
-                                       goto retry;
-                       } else {
-                               /* cache_grow already freed obj */
-                               obj = NULL;
-                       }
+                       if (!obj)
+                               goto retry;
                 }
         }
  
@@ -3083,15 +3300,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
  {
         struct page *page;
         struct kmem_cache_node *n;
-       void *obj;
+       void *obj = NULL;
         void *list = NULL;
-       int x;
  
         VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
         n = get_node(cachep, nodeid);
         BUG_ON(!n);
  
-retry:
         check_irq_off();
         spin_lock(&n->list_lock);
         page = get_first_slab(n, false);
@@ -3113,18 +3328,18 @@ retry:
  
         spin_unlock(&n->list_lock);
         fixup_objfreelist_debug(cachep, &list);
-       goto done;
+       return obj;
  
  must_grow:
         spin_unlock(&n->list_lock);
-       x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
-       if (x)
-               goto retry;
-
-       return fallback_alloc(cachep, flags);
+       page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+       if (page) {
+               /* This slab isn't counted yet so don't update free_objects */
+               obj = slab_get_obj(cachep, page);
+       }
+       cache_grow_end(cachep, page);
  
-done:
-       return obj;
+       return obj ? obj : fallback_alloc(cachep, flags);
  }
  
  static __always_inline void *
@@ -3242,6 +3457,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
  {
         int i;
         struct kmem_cache_node *n = get_node(cachep, node);
+       struct page *page;
+
+       n->free_objects += nr_objects;
  
         for (i = 0; i < nr_objects; i++) {
                 void *objp;
@@ -3254,17 +3472,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
                 check_spinlock_acquired_node(cachep, node);
                 slab_put_obj(cachep, page, objp);
                 STATS_DEC_ACTIVE(cachep);
-               n->free_objects++;
  
                 /* fixup slab chains */
-               if (page->active == 0) {
-                       if (n->free_objects > n->free_limit) {
-                               n->free_objects -= cachep->num;
-                               list_add_tail(&page->lru, list);
-                       } else {
-                               list_add(&page->lru, &n->slabs_free);
-                       }
-               } else {
+               if (page->active == 0)
+                       list_add(&page->lru, &n->slabs_free);
+               else {
                         /* Unconditionally move a slab to the end of the
                          * partial list on free - maximum time for the
                          * other objects to be freed, too.
@@ -3272,6 +3484,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
                         list_add_tail(&page->lru, &n->slabs_partial);
                 }
         }
+
+       while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
+               n->free_objects -= cachep->num;
+
+               page = list_last_entry(&n->slabs_free, struct page, lru);
+               list_del(&page->lru);
+               list_add(&page->lru, list);
+       }
  }
  
  static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
@@ -3645,72 +3865,19 @@ EXPORT_SYMBOL(kfree);
  /*
   * This initializes kmem_cache_node or resizes various caches for all nodes.
   */
-static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
+static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
  {
+       int ret;
         int node;
         struct kmem_cache_node *n;
-       struct array_cache *new_shared;
-       struct alien_cache **new_alien = NULL;
  
         for_each_online_node(node) {
-
-               if (use_alien_caches) {
-                       new_alien = alloc_alien_cache(node, cachep->limit, gfp);
-                       if (!new_alien)
-                               goto fail;
-               }
-
-               new_shared = NULL;
-               if (cachep->shared) {
-                       new_shared = alloc_arraycache(node,
-                               cachep->shared*cachep->batchcount,
-                                       0xbaadf00d, gfp);
-                       if (!new_shared) {
-                               free_alien_cache(new_alien);
-                               goto fail;
-                       }
-               }
-
-               n = get_node(cachep, node);
-               if (n) {
-                       struct array_cache *shared = n->shared;
-                       LIST_HEAD(list);
-
-                       spin_lock_irq(&n->list_lock);
-
-                       if (shared)
-                               free_block(cachep, shared->entry,
-                                               shared->avail, node, &list);
-
-                       n->shared = new_shared;
-                       if (!n->alien) {
-                               n->alien = new_alien;
-                               new_alien = NULL;
-                       }
-                       n->free_limit = (1 + nr_cpus_node(node)) *
-                                       cachep->batchcount + cachep->num;
-                       spin_unlock_irq(&n->list_lock);
-                       slabs_destroy(cachep, &list);
-                       kfree(shared);
-                       free_alien_cache(new_alien);
-                       continue;
-               }
-               n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
-               if (!n) {
-                       free_alien_cache(new_alien);
-                       kfree(new_shared);
+               ret = setup_kmem_cache_node(cachep, node, gfp, true);
+               if (ret)
                         goto fail;
-               }
  
-               kmem_cache_node_init(n);
-               n->next_reap = jiffies + REAPTIMEOUT_NODE +
-                               ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-               n->shared = new_shared;
-               n->alien = new_alien;
-               n->free_limit = (1 + nr_cpus_node(node)) *
-                                       cachep->batchcount + cachep->num;
-               cachep->node[node] = n;
         }
+
         return 0;
  
  fail:
@@ -3752,7 +3919,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
         cachep->shared = shared;
  
         if (!prev)
-               goto alloc_node;
+               goto setup_node;
  
         for_each_online_cpu(cpu) {
                 LIST_HEAD(list);
@@ -3769,8 +3936,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
         }
         free_percpu(prev);
  
-alloc_node:
-       return alloc_kmem_cache_node(cachep, gfp);
+setup_node:
+       return setup_kmem_cache_nodes(cachep, gfp);
  }
  
  static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3804,6 +3971,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
         int shared = 0;
         int batchcount = 0;
  
+       err = cache_random_seq_create(cachep, gfp);
+       if (err)
+               goto end;
+
         if (!is_root_cache(cachep)) {
                 struct kmem_cache *root = memcg_root_cache(cachep);
                 limit = root->limit;
@@ -3857,6 +4028,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
         batchcount = (limit + 1) / 2;
  skip_setup:
         err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
         if (err)
                 pr_err("enable_cpucache failed for %s, error %d\n",
                        cachep->name, -err);
@@ -3869,29 +4041,26 @@ skip_setup:
   * if drain_array() is used on the shared array.
   */
  static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
-                        struct array_cache *ac, int force, int node)
+                        struct array_cache *ac, int node)
  {
         LIST_HEAD(list);
-       int tofree;
+
+       /* ac from n->shared can be freed if we don't hold the slab_mutex. */
+       check_mutex_acquired();
  
         if (!ac || !ac->avail)
                 return;
-       if (ac->touched && !force) {
+
+       if (ac->touched) {
                 ac->touched = 0;
-       } else {
-               spin_lock_irq(&n->list_lock);
-               if (ac->avail) {
-                       tofree = force ? ac->avail : (ac->limit + 4) / 5;
-                       if (tofree > ac->avail)
-                               tofree = (ac->avail + 1) / 2;
-                       free_block(cachep, ac->entry, tofree, node, &list);
-                       ac->avail -= tofree;
-                       memmove(ac->entry, &(ac->entry[tofree]),
-                               sizeof(void *) * ac->avail);
-               }
-               spin_unlock_irq(&n->list_lock);
-               slabs_destroy(cachep, &list);
+               return;
         }
+
+       spin_lock_irq(&n->list_lock);
+       drain_array_locked(cachep, ac, node, false, &list);
+       spin_unlock_irq(&n->list_lock);
+
+       slabs_destroy(cachep, &list);
  }
  
  /**
@@ -3929,7 +4098,7 @@ static void cache_reap(struct work_struct *w)
  
                 reap_alien(searchp, n);
  
-               drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
+               drain_array(searchp, n, cpu_cache_get(searchp), node);
  
                 /*
                  * These are racy checks but it does not matter
@@ -3940,7 +4109,7 @@ static void cache_reap(struct work_struct *w)
  
                 n->next_reap = jiffies + REAPTIMEOUT_NODE;
  
-               drain_array(searchp, n, n->shared, 0, node);
+               drain_array(searchp, n, n->shared, node);
  
                 if (n->free_touched)
                         n->free_touched = 0;
diff --git a/mm/slub.c b/mm/slub.c

index 4dbb109eb8cd1f5eb03bd82e90285be1090eacd7..cf1faa4d3992520ead5dd61deed5e7c41ef9175a 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
         tmp.counters = counters_new;
         /*
          * page->counters can cover frozen/inuse/objects as well
-        * as page->_count.  If we assign to ->counters directly
-        * we run the risk of losing updates to page->_count, so
+        * as page->_refcount.  If we assign to ->counters directly
+        * we run the risk of losing updates to page->_refcount, so
          * be careful and only assign to the fields we need.
          */
         page->frozen  = tmp.frozen;
@@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
          * may return off node objects because partial slabs are obtained
          * from other nodes and filled up.
          *
-        * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
-        * defrag_ratio = 1000) then every (well almost) allocation will
-        * first attempt to defrag slab caches on other nodes. This means
-        * scanning over all nodes to look for partial slabs which may be
-        * expensive if we do it every time we are trying to find a slab
+        * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
+        * (which makes defrag_ratio = 1000) then every (well almost)
+        * allocation will first attempt to defrag slab caches on other nodes.
+        * This means scanning over all nodes to look for partial slabs which
+        * may be expensive if we do it every time we are trying to find a slab
          * with available objects.
          */
         if (!s->remote_node_defrag_ratio ||
@@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
                  * s->cpu_partial is checked locklessly (see put_cpu_partial),
                  * so we have to make sure the change is visible.
                  */
-               kick_all_cpus_sync();
+               synchronize_sched();
         }
  
         flush_all(s);
diff --git a/mm/swap_state.c b/mm/swap_state.c

index 366ce3518703ecb7cd780b43acd7154adece2b9d..0d457e7db8d6df5e3bd185ac2b2333830bccf7b4 100644 (file)
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  
                 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
                 __SetPageLocked(new_page);
-               SetPageSwapBacked(new_page);
+               __SetPageSwapBacked(new_page);
                 err = __add_to_swap_cache(new_page, entry);
                 if (likely(!err)) {
                         radix_tree_preload_end();
@@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                         return new_page;
                 }
                 radix_tree_preload_end();
-               ClearPageSwapBacked(new_page);
                 __ClearPageLocked(new_page);
                 /*
                  * add_to_swap_cache() doesn't return -EEXIST, so we can safely
diff --git a/mm/util.c b/mm/util.c

index 6cc81e7b870523342cb8882ba222149644d894ac..8a1b3a1fb595878dba5032f4ac31aa37e2b5d1e8 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -346,6 +346,29 @@ void *page_rmapping(struct page *page)
         return __page_rmapping(page);
  }
  
+/*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+       int i;
+
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) >= 0;
+       page = compound_head(page);
+       if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+               return true;
+       if (PageHuge(page))
+               return false;
+       for (i = 0; i < hpage_nr_pages(page); i++) {
+               if (atomic_read(&page[i]._mapcount) >= 0)
+                       return true;
+       }
+       return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
  struct anon_vma *page_anon_vma(struct page *page)
  {
         unsigned long mapping;
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 142cb61f4822454bf3819a4c10c4e8b479a70ec8..dcfdfc1a09420714e9f4dcff8fd3b66de807455e 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
          *
          * Reversing the order of the tests ensures such a situation cannot
          * escape unnoticed. The smp_rmb is needed to ensure the page->flags
-        * load is not satisfied before that of page->_count.
+        * load is not satisfied before that of page->_refcount.
          *
          * Note that if SetPageDirty is always performed via set_page_dirty,
          * and thus under tree_lock, then this ordering is not required.
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
                                         !list_empty(src); scan++) {
                 struct page *page;
-               int nr_pages;
  
                 page = lru_to_page(src);
                 prefetchw_prev_lru_page(page, src, flags);
@@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  
                 switch (__isolate_lru_page(page, mode)) {
                 case 0:
-                       nr_pages = hpage_nr_pages(page);
-                       mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+                       nr_taken += hpage_nr_pages(page);
                         list_move(&page->lru, dst);
-                       nr_taken += nr_pages;
                         break;
  
                 case -EBUSY:
@@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                      &nr_scanned, sc, isolate_mode, lru);
  
-       __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+       update_lru_size(lruvec, lru, -nr_taken);
         __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+       reclaim_stat->recent_scanned[file] += nr_taken;
  
         if (global_reclaim(sc)) {
                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
@@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  
         spin_lock_irq(&zone->lru_lock);
  
-       reclaim_stat->recent_scanned[file] += nr_taken;
-
         if (global_reclaim(sc)) {
                 if (current_is_kswapd())
                         __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
@@ -1720,7 +1716,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
   * It is safe to rely on PG_active against the non-LRU pages in here because
   * nobody will play with that bit on a non-LRU page.
   *
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
   * But we had to alter page->flags anyway.
   */
  
@@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                 SetPageLRU(page);
  
                 nr_pages = hpage_nr_pages(page);
-               mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+               update_lru_size(lruvec, lru, nr_pages);
                 list_move(&page->lru, &lruvec->lists[lru]);
                 pgmoved += nr_pages;
  
@@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                                 list_add(&page->lru, pages_to_free);
                 }
         }
-       __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
         if (!is_active_lru(lru))
                 __count_vm_events(PGDEACTIVATE, pgmoved);
  }
@@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  
         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                      &nr_scanned, sc, isolate_mode, lru);
-       if (global_reclaim(sc))
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
  
+       update_lru_size(lruvec, lru, -nr_taken);
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
         reclaim_stat->recent_scanned[file] += nr_taken;
  
+       if (global_reclaim(sc))
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
         __count_zone_vm_events(PGREFILL, zone, nr_scanned);
-       __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
         spin_unlock_irq(&zone->lru_lock);
  
         while (!list_empty(&l_hold)) {
diff --git a/mm/vmstat.c b/mm/vmstat.c

index 5e43004828971e778a185ec69f2ef9710f085fb4..5b72a8ad281353223fe678e0ae8df354453303ca 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -569,50 +569,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
  #endif
  
  #ifdef CONFIG_NUMA
-/*
- * zonelist = the list of zones passed to the allocator
- * z       = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
-       if (z->zone_pgdat == preferred_zone->zone_pgdat) {
-               __inc_zone_state(z, NUMA_HIT);
-       } else {
-               __inc_zone_state(z, NUMA_MISS);
-               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
-       }
-       if (z->node == ((flags & __GFP_OTHER_NODE) ?
-                       preferred_zone->node : numa_node_id()))
-               __inc_zone_state(z, NUMA_LOCAL);
-       else
-               __inc_zone_state(z, NUMA_OTHER);
-}
-
  /*
   * Determine the per node value of a stat item.
   */
  unsigned long node_page_state(int node, enum zone_stat_item item)
  {
         struct zone *zones = NODE_DATA(node)->node_zones;
+       int i;
+       unsigned long count = 0;
  
-       return
-#ifdef CONFIG_ZONE_DMA
-               zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
-               zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
-               zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
-               zone_page_state(&zones[ZONE_NORMAL], item) +
-               zone_page_state(&zones[ZONE_MOVABLE], item);
+       for (i = 0; i < MAX_NR_ZONES; i++)
+               count += zone_page_state(zones + i, item);
+
+       return count;
  }
  
  #endif
@@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
                 if (!memmap_valid_within(pfn, page, zone))
                         continue;
  
+               if (page_zone(page) != zone)
+                       continue;
+
                 mtype = get_pageblock_migratetype(page);
  
                 if (mtype < MIGRATE_TYPES)
@@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                 block_end_pfn = min(block_end_pfn, end_pfn);
  
                 page = pfn_to_page(pfn);
-               pageblock_mt = get_pfnblock_migratetype(page, pfn);
+               pageblock_mt = get_pageblock_migratetype(page);
  
                 for (; pfn < block_end_pfn; pfn++) {
                         if (!pfn_valid_within(pfn))
                                 continue;
  
                         page = pfn_to_page(pfn);
+
+                       if (page_zone(page) != zone)
+                               continue;
+
                         if (PageBuddy(page)) {
                                 pfn += (1UL << page_order(page)) - 1;
                                 continue;
@@ -1378,6 +1354,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
  int sysctl_stat_interval __read_mostly = HZ;
  static cpumask_var_t cpu_stat_off;
  
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+       refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       long val;
+       int err;
+       int i;
+
+       /*
+        * The regular update, every sysctl_stat_interval, may come later
+        * than expected: leaving a significant amount in per_cpu buckets.
+        * This is particularly misleading when checking a quantity of HUGE
+        * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+        * which can equally be echo'ed to or cat'ted from (by root),
+        * can be used to update the stats just before reading them.
+        *
+        * Oh, and since global_page_state() etc. are so careful to hide
+        * transiently negative values, report an error here if any of
+        * the stats is negative, so we know to go looking for imbalance.
+        */
+       err = schedule_on_each_cpu(refresh_vm_stats);
+       if (err)
+               return err;
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+               val = atomic_long_read(&vm_stat[i]);
+               if (val < 0) {
+                       switch (i) {
+                       case NR_ALLOC_BATCH:
+                       case NR_PAGES_SCANNED:
+                               /*
+                                * These are often seen to go negative in
+                                * recent kernels, but not to go permanently
+                                * negative.  Whilst it would be nicer not to
+                                * have exceptions, rooting them out would be
+                                * another task, of rather low priority.
+                                */
+                               break;
+                       default:
+                               pr_warn("%s: %s %ld\n",
+                                       __func__, vmstat_text[i], val);
+                               err = -EINVAL;
+                               break;
+                       }
+               }
+       }
+       if (err)
+               return err;
+       if (write)
+               *ppos += *lenp;
+       else
+               *lenp = 0;
+       return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
  static void vmstat_update(struct work_struct *w)
  {
         if (refresh_cpu_vm_stats(true)) {
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c

index 93ff038ea9d1ce1cde783a877d629216babab8a6..d921adc62765dba0bbaec9a7489227348405082e 100644 (file)
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -111,7 +111,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
                 cpu_relax();
         }
  
-       ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+       ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
         if (unlikely(ret != ibmr->sg_len))
                 return ret < 0 ? ret : -EINVAL;
  
diff --git a/net/socket.c b/net/socket.c

index e7793f5601ae8c6d2a0f6706cc32c96073619f3b..a1bd16106625a14ef95f1da0e560a5638ffa4774 100644 (file)
--- a/net/socket.c
+++ b/net/socket.c
@@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
         struct mmsghdr __user *entry;
         struct compat_mmsghdr __user *compat_entry;
         struct msghdr msg_sys;
-       struct timespec end_time;
+       struct timespec64 end_time;
+       struct timespec64 timeout64;
  
         if (timeout &&
             poll_select_set_timeout(&end_time, timeout->tv_sec,
@@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                         flags |= MSG_DONTWAIT;
  
                 if (timeout) {
-                       ktime_get_ts(timeout);
-                       *timeout = timespec_sub(end_time, *timeout);
+                       ktime_get_ts64(&timeout64);
+                       *timeout = timespec64_to_timespec(
+                                       timespec64_sub(end_time, timeout64));
                         if (timeout->tv_sec < 0) {
                                 timeout->tv_sec = timeout->tv_nsec = 0;
                                 break;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c

index c250924a9fd3c6489a123a46ee9f5dc6ae67366e..94c3fa910b85e5e4a53109a2121dd9be51067c49 100644 (file)
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -421,7 +421,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                 return -ENOMEM;
         }
  
-       n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+       n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
         if (unlikely(n != frmr->sg_nents)) {
                 pr_err("RPC:       %s: failed to map mr %p (%u/%u)\n",
                        __func__, frmr->fr_mr, n, frmr->sg_nents);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c

index 3b24a646eb46725219011ffe859d998a0af06bb3..fbe7444e7de6ab05ee1c6e91d09b03eca7ba1f3f 100644 (file)
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -281,7 +281,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
         }
         atomic_inc(&xprt->sc_dma_used);
  
-       n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+       n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
         if (unlikely(n != frmr->sg_nents)) {
                 pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
                        frmr->mr, n, frmr->sg_nents);
diff --git a/net/wireless/util.c b/net/wireless/util.c

index 219bd197039e98b7cb1722b211722732b74abea8..4e809e978b7d274f3967349c87acda43aac57370 100644 (file)
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
         struct skb_shared_info *sh = skb_shinfo(skb);
         int page_offset;
  
-       atomic_inc(&page->_count);
+       page_ref_inc(page);
         page_offset = ptr - page_address(page);
         skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
  }
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib

index ddf83d0181e73d30fb86f68c0fa0a3f888335c0c..ed1b7c4fb674a92c0c566a47796d013abd1c5275 100644 (file)
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -277,6 +277,11 @@ cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -n -f -9 > $@) || \
  # ---------------------------------------------------------------------------
  DTC ?= $(objtree)/scripts/dtc/dtc
  
+# Disable noisy checks by default
+ifeq ($(KBUILD_ENABLE_EXTRA_GCC_CHECKS),)
+DTC_FLAGS += -Wno-unit_address_vs_reg
+endif
+
  # Generate an assembly file to wrap the output of the device tree compiler
  quiet_cmd_dt_S_dtb= DTB     $@
  cmd_dt_S_dtb=                                          \
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter

index 38b64f4873152bc9012663470416b13614fa7570..0254f3ba0dbab0c55c2c18ccebc037c357fac1ab 100755 (executable)
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -32,18 +32,21 @@ old = getsizes(sys.argv[1])
  new = getsizes(sys.argv[2])
  grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0
  delta, common = [], {}
+otot, ntot = 0, 0
  
  for a in old:
      if a in new:
          common[a] = 1
  
  for name in old:
+    otot += old[name]
      if name not in common:
          remove += 1
          down += old[name]
          delta.append((-old[name], name))
  
  for name in new:
+    ntot += new[name]
      if name not in common:
          add += 1
          up += new[name]
@@ -63,3 +66,6 @@ print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \
  print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta"))
  for d, n in delta:
      if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
+
+print("Total: Before=%d, After=%d, chg %f%%" % \
+    (otot, ntot, ((ntot - otot)*100.0/otot) if otot else 0.0))  # float math; report 0% when old total is 0
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh

index 00d6d53c2681dac3ce2f736baff7cbc177825b0e..c332684e1b5ace1e4ea8a357ed83b5931e04cdfd 100755 (executable)
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -2,15 +2,17 @@
  # (c) 2014, Sasha Levin <sasha.levin@oracle.com>
  #set -x
  
-if [[ $# != 2 ]]; then
+if [[ $# -lt 2 ]]; then  # numeric test: '<' inside [[ ]] compares strings
         echo "Usage:"
-       echo "  $0 [vmlinux] [base path]"
+       echo "  $0 [vmlinux] [base path] [modules path]"
         exit 1
  fi
  
  vmlinux=$1
  basepath=$2
+modpath=$3
  declare -A cache
+declare -A modcache
  
  parse_symbol() {
         # The structure of symbol at this point is:
@@ -19,6 +21,17 @@ parse_symbol() {
         # For example:
         #   do_basic_setup+0x9c/0xbf
  
+       if [[ $module == "" ]] ; then
+               local objfile=$vmlinux
+       elif [[ "${modcache[$module]+isset}" == "isset" ]]; then
+               local objfile=${modcache[$module]}
+       else    # cache miss: traces show '_' where the .ko name may use '-', so accept either
+               [[ $modpath == "" ]] && return
+               local objfile=$(find "$modpath" -name "${module//_/[-_]}.ko" -print -quit)
+               [[ $objfile == "" ]] && return
+               modcache[$module]=$objfile
+       fi
+
         # Remove the englobing parenthesis
         symbol=${symbol#\(}
         symbol=${symbol%\)}
@@ -29,11 +42,11 @@ parse_symbol() {
         # Use 'nm vmlinux' to figure out the base address of said symbol.
         # It's actually faster to call it every time than to load it
         # all into bash.
-       if [[ "${cache[$name]+isset}" == "isset" ]]; then
-               local base_addr=${cache[$name]}
+       if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then
+               local base_addr=${cache[$module,$name]}
         else
-               local base_addr=$(nm "$vmlinux" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
-               cache["$name"]="$base_addr"
+               local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
+               cache[$module,$name]="$base_addr"
         fi
         # Let's start doing the math to get the exact address into the
         # symbol. First, strip out the symbol total length.
@@ -48,12 +61,12 @@ parse_symbol() {
         local address=$(printf "%x\n" "$expr")
  
         # Pass it to addr2line to get filename and line number
-        # Could get more than one result
-       if [[ "${cache[$address]+isset}" == "isset" ]]; then
-               local code=${cache[$address]}
+       # Could get more than one result
+       if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then
+               local code=${cache[$module,$address]}
         else
-               local code=$(addr2line -i -e "$vmlinux" "$address")
-               cache[$address]=$code
+               local code=$(addr2line -i -e "$objfile" "$address")
+               cache[$module,$address]=$code
         fi
  
         # addr2line doesn't return a proper error code if it fails, so
@@ -105,13 +118,23 @@ handle_line() {
                 fi
         done
  
-       # The symbol is the last element, process it
-       symbol=${words[$last]}
+       if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then
+               module=${words[$last]}
+               module=${module#\[}
+               module=${module%\]}
+               symbol=${words[$last-1]}
+               unset words[$last-1]
+       else
+               # The symbol is the last element, process it
+               symbol=${words[$last]}
+               module=
+       fi
+
         unset words[$last]
         parse_symbol # modifies $symbol
  
         # Add up the line number to the symbol
-       echo "${words[@]}" "$symbol"
+       echo "${words[@]}" "$symbol $module"
  }
  
  while read line; do
@@ -121,8 +144,8 @@ while read line; do
                 handle_line "$line"
         # Is it a code line?
         elif [[ $line == *Code:* ]]; then
-                decode_code "$line"
-        else
+               decode_code "$line"
+       else
                 # Nothing special in this line, show it as is
                 echo "$line"
         fi
diff --git a/scripts/dtc/checks.c b/scripts/dtc/checks.c

index 0c03ac9159c10a1e5eef0eca7136d246353aed16..386f9563313f713f896236814de35811d145d121 100644 (file)
--- a/scripts/dtc/checks.c
+++ b/scripts/dtc/checks.c
@@ -294,6 +294,30 @@ static void check_node_name_format(struct check *c, struct node *dt,
  }
  NODE_ERROR(node_name_format, NULL, &node_name_chars);
  
+static void check_unit_address_vs_reg(struct check *c, struct node *dt,
+                            struct node *node)
+{
+       const char *unitname = get_unitname(node);
+       struct property *prop = get_property(node, "reg");
+
+       if (!prop) {
+               prop = get_property(node, "ranges");
+               if (prop && !prop->val.len)
+                       prop = NULL;
+       }
+
+       if (prop) {
+               if (!unitname[0])
+                       FAIL(c, "Node %s has a reg or ranges property, but no unit name",
+                           node->fullpath);
+       } else {
+               if (unitname[0])
+                       FAIL(c, "Node %s has a unit name, but no reg property",
+                           node->fullpath);
+       }
+}
+NODE_WARNING(unit_address_vs_reg, NULL);
+
  static void check_property_name_chars(struct check *c, struct node *dt,
                                       struct node *node, struct property *prop)
  {
@@ -667,6 +691,8 @@ static struct check *check_table[] = {
  
         &addr_size_cells, &reg_format, &ranges_format,
  
+       &unit_address_vs_reg,
+
         &avoid_default_addr_size,
         &obsolete_chosen_interrupt_controller,
  
diff --git a/scripts/dtc/flattree.c b/scripts/dtc/flattree.c

index bd99fa2d33b85e873bd00178d6390d70f4afaa0d..ec14954f5810de3ad7262d5ee60e21b23565c155 100644 (file)
--- a/scripts/dtc/flattree.c
+++ b/scripts/dtc/flattree.c
@@ -889,7 +889,7 @@ struct boot_info *dt_from_blob(const char *fname)
  
         if (version >= 3) {
                 uint32_t size_str = fdt32_to_cpu(fdt->size_dt_strings);
-               if (off_str+size_str > totalsize)
+               if ((off_str+size_str < off_str) || (off_str+size_str > totalsize))
                         die("String table extends past total size\n");
                 inbuf_init(&strbuf, blob + off_str, blob + off_str + size_str);
         } else {
@@ -898,7 +898,7 @@ struct boot_info *dt_from_blob(const char *fname)
  
         if (version >= 17) {
                 size_dt = fdt32_to_cpu(fdt->size_dt_struct);
-               if (off_dt+size_dt > totalsize)
+               if ((off_dt+size_dt < off_dt) || (off_dt+size_dt > totalsize))
                         die("Structure block extends past total size\n");
         }
  
diff --git a/scripts/dtc/libfdt/fdt_ro.c b/scripts/dtc/libfdt/fdt_ro.c

index e5b313682007265ed67036b64c39890c14c830f8..50cce864283c4dd2818c3d5f2666b7b23f54086a 100644 (file)
--- a/scripts/dtc/libfdt/fdt_ro.c
+++ b/scripts/dtc/libfdt/fdt_ro.c
@@ -647,10 +647,8 @@ int fdt_node_check_compatible(const void *fdt, int nodeoffset,
         prop = fdt_getprop(fdt, nodeoffset, "compatible", &len);
         if (!prop)
                 return len;
-       if (fdt_stringlist_contains(prop, len, compatible))
-               return 0;
-       else
-               return 1;
+
+       return !fdt_stringlist_contains(prop, len, compatible);
  }
  
  int fdt_node_offset_by_compatible(const void *fdt, int startoffset,
diff --git a/scripts/dtc/version_gen.h b/scripts/dtc/version_gen.h

index 11d93e6d8220c45d737175d22af4ab0ccaf871a1..ad9b05ae698b0495ecbda42ffcf4743555313a27 100644 (file)
--- a/scripts/dtc/version_gen.h
+++ b/scripts/dtc/version_gen.h
@@ -1 +1 @@
-#define DTC_VERSION "DTC 1.4.1-gb06e55c8"
+#define DTC_VERSION "DTC 1.4.1-g53bf130b"
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c

index 638b143ee60f4246cc16d67936bc62095a66c646..1f22a186c18cb56a2c19a4b38bf90f41f1b171d8 100644 (file)
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -63,7 +63,6 @@ static unsigned int table_size, table_cnt;
  static int all_symbols = 0;
  static int absolute_percpu = 0;
  static char symbol_prefix_char = '\0';
-static unsigned long long kernel_start_addr = 0;
  static int base_relative = 0;
  
  int token_profit[0x10000];
@@ -223,15 +222,13 @@ static int symbol_valid(struct sym_entry *s)
  
         static char *special_suffixes[] = {
                 "_veneer",              /* arm */
+               "_from_arm",            /* arm */
+               "_from_thumb",          /* arm */
                 NULL };
  
         int i;
         char *sym_name = (char *)s->sym + 1;
  
-
-       if (s->addr < kernel_start_addr)
-               return 0;
-
         /* skip prefix char */
         if (symbol_prefix_char && *sym_name == symbol_prefix_char)
                 sym_name++;
@@ -765,9 +762,6 @@ int main(int argc, char **argv)
                                 if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\''))
                                         p++;
                                 symbol_prefix_char = *p;
-                       } else if (strncmp(argv[i], "--page-offset=", 14) == 0) {
-                               const char *p = &argv[i][14];
-                               kernel_start_addr = strtoull(p, NULL, 16);
                         } else if (strcmp(argv[i], "--base-relative") == 0)
                                 base_relative = 1;
                         else
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh

index 49d61ade942563eb47f7612b8b7d427c9595e940..f0f6d9d75435f4a8311ab9c2efc894b1c9670b43 100755 (executable)
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -82,10 +82,6 @@ kallsyms()
                 kallsymopt="${kallsymopt} --all-symbols"
         fi
  
-       if [ -n "${CONFIG_ARM}" ] && [ -z "${CONFIG_XIP_KERNEL}" ] && [ -n "${CONFIG_PAGE_OFFSET}" ]; then
-               kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET"
-       fi
-
         if [ -n "${CONFIG_KALLSYMS_ABSOLUTE_PERCPU}" ]; then
                 kallsymopt="${kallsymopt} --absolute-percpu"
         fi
diff --git a/scripts/spelling.txt b/scripts/spelling.txt

index 946caf3bd694ea4f41c92d8b0e51ac996f8b98e7..fa79c6d2a5b88975d4504de87234aa33c64b1938 100644 (file)
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -428,6 +428,7 @@ feautures||features
  fetaure||feature
  fetaures||features
  fileystem||filesystem
+fimware||firmware
  finanize||finalize
  findn||find
  finilizes||finalizes
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h

new file mode 100644 (file)

index 0000000..75de0e9
--- /dev/null
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -0,0 +1,69 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK  ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+#define PERF_REGS_MAX   PERF_REG_POWERPC_MAX
+#ifdef __powerpc64__
+       #define PERF_SAMPLE_REGS_ABI    PERF_SAMPLE_REGS_ABI_64
+#else
+       #define PERF_SAMPLE_REGS_ABI    PERF_SAMPLE_REGS_ABI_32
+#endif
+
+#define PERF_REG_IP     PERF_REG_POWERPC_NIP
+#define PERF_REG_SP     PERF_REG_POWERPC_R1
+
+static const char *reg_names[] = {
+       [PERF_REG_POWERPC_R0] = "r0",
+       [PERF_REG_POWERPC_R1] = "r1",
+       [PERF_REG_POWERPC_R2] = "r2",
+       [PERF_REG_POWERPC_R3] = "r3",
+       [PERF_REG_POWERPC_R4] = "r4",
+       [PERF_REG_POWERPC_R5] = "r5",
+       [PERF_REG_POWERPC_R6] = "r6",
+       [PERF_REG_POWERPC_R7] = "r7",
+       [PERF_REG_POWERPC_R8] = "r8",
+       [PERF_REG_POWERPC_R9] = "r9",
+       [PERF_REG_POWERPC_R10] = "r10",
+       [PERF_REG_POWERPC_R11] = "r11",
+       [PERF_REG_POWERPC_R12] = "r12",
+       [PERF_REG_POWERPC_R13] = "r13",
+       [PERF_REG_POWERPC_R14] = "r14",
+       [PERF_REG_POWERPC_R15] = "r15",
+       [PERF_REG_POWERPC_R16] = "r16",
+       [PERF_REG_POWERPC_R17] = "r17",
+       [PERF_REG_POWERPC_R18] = "r18",
+       [PERF_REG_POWERPC_R19] = "r19",
+       [PERF_REG_POWERPC_R20] = "r20",
+       [PERF_REG_POWERPC_R21] = "r21",
+       [PERF_REG_POWERPC_R22] = "r22",
+       [PERF_REG_POWERPC_R23] = "r23",
+       [PERF_REG_POWERPC_R24] = "r24",
+       [PERF_REG_POWERPC_R25] = "r25",
+       [PERF_REG_POWERPC_R26] = "r26",
+       [PERF_REG_POWERPC_R27] = "r27",
+       [PERF_REG_POWERPC_R28] = "r28",
+       [PERF_REG_POWERPC_R29] = "r29",
+       [PERF_REG_POWERPC_R30] = "r30",
+       [PERF_REG_POWERPC_R31] = "r31",
+       [PERF_REG_POWERPC_NIP] = "nip",
+       [PERF_REG_POWERPC_MSR] = "msr",
+       [PERF_REG_POWERPC_ORIG_R3] = "orig_r3",
+       [PERF_REG_POWERPC_CTR] = "ctr",
+       [PERF_REG_POWERPC_LINK] = "link",
+       [PERF_REG_POWERPC_XER] = "xer",
+       [PERF_REG_POWERPC_CCR] = "ccr",
+       [PERF_REG_POWERPC_SOFTE] = "softe",
+       [PERF_REG_POWERPC_TRAP] = "trap",
+       [PERF_REG_POWERPC_DAR] = "dar",
+       [PERF_REG_POWERPC_DSISR] = "dsisr"
+};
+
+static inline const char *perf_reg_name(int id)
+{
+       return reg_names[id];
+}
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build

index c8fe2074d2177f0709f0e1d54f63cd77f84f2233..90ad64b231cd821abe8756ed92d68d7c4c367804 100644 (file)
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,6 +1,8 @@
  libperf-y += header.o
  libperf-y += sym-handling.o
  libperf-y += kvm-stat.o
+libperf-y += perf_regs.o
  
  libperf-$(CONFIG_DWARF) += dwarf-regs.o
  libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
+libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c

new file mode 100644 (file)

index 0000000..a3c3e1c
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -0,0 +1,49 @@
+#include "../../perf.h"
+#include "../../util/perf_regs.h"
+
+const struct sample_reg sample_reg_masks[] = {
+       SMPL_REG(r0, PERF_REG_POWERPC_R0),
+       SMPL_REG(r1, PERF_REG_POWERPC_R1),
+       SMPL_REG(r2, PERF_REG_POWERPC_R2),
+       SMPL_REG(r3, PERF_REG_POWERPC_R3),
+       SMPL_REG(r4, PERF_REG_POWERPC_R4),
+       SMPL_REG(r5, PERF_REG_POWERPC_R5),
+       SMPL_REG(r6, PERF_REG_POWERPC_R6),
+       SMPL_REG(r7, PERF_REG_POWERPC_R7),
+       SMPL_REG(r8, PERF_REG_POWERPC_R8),
+       SMPL_REG(r9, PERF_REG_POWERPC_R9),
+       SMPL_REG(r10, PERF_REG_POWERPC_R10),
+       SMPL_REG(r11, PERF_REG_POWERPC_R11),
+       SMPL_REG(r12, PERF_REG_POWERPC_R12),
+       SMPL_REG(r13, PERF_REG_POWERPC_R13),
+       SMPL_REG(r14, PERF_REG_POWERPC_R14),
+       SMPL_REG(r15, PERF_REG_POWERPC_R15),
+       SMPL_REG(r16, PERF_REG_POWERPC_R16),
+       SMPL_REG(r17, PERF_REG_POWERPC_R17),
+       SMPL_REG(r18, PERF_REG_POWERPC_R18),
+       SMPL_REG(r19, PERF_REG_POWERPC_R19),
+       SMPL_REG(r20, PERF_REG_POWERPC_R20),
+       SMPL_REG(r21, PERF_REG_POWERPC_R21),
+       SMPL_REG(r22, PERF_REG_POWERPC_R22),
+       SMPL_REG(r23, PERF_REG_POWERPC_R23),
+       SMPL_REG(r24, PERF_REG_POWERPC_R24),
+       SMPL_REG(r25, PERF_REG_POWERPC_R25),
+       SMPL_REG(r26, PERF_REG_POWERPC_R26),
+       SMPL_REG(r27, PERF_REG_POWERPC_R27),
+       SMPL_REG(r28, PERF_REG_POWERPC_R28),
+       SMPL_REG(r29, PERF_REG_POWERPC_R29),
+       SMPL_REG(r30, PERF_REG_POWERPC_R30),
+       SMPL_REG(r31, PERF_REG_POWERPC_R31),
+       SMPL_REG(nip, PERF_REG_POWERPC_NIP),
+       SMPL_REG(msr, PERF_REG_POWERPC_MSR),
+       SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3),
+       SMPL_REG(ctr, PERF_REG_POWERPC_CTR),
+       SMPL_REG(link, PERF_REG_POWERPC_LINK),
+       SMPL_REG(xer, PERF_REG_POWERPC_XER),
+       SMPL_REG(ccr, PERF_REG_POWERPC_CCR),
+       SMPL_REG(softe, PERF_REG_POWERPC_SOFTE),
+       SMPL_REG(trap, PERF_REG_POWERPC_TRAP),
+       SMPL_REG(dar, PERF_REG_POWERPC_DAR),
+       SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
+       SMPL_REG_END
+};
diff --git a/tools/perf/arch/powerpc/util/unwind-libunwind.c b/tools/perf/arch/powerpc/util/unwind-libunwind.c

new file mode 100644 (file)

index 0000000..9e15f92
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/unwind-libunwind.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2016 Chandan Kumar, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <errno.h>
+#include <libunwind.h>
+#include <asm/perf_regs.h>
+#include "../../util/unwind.h"
+#include "../../util/debug.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+       switch (regnum) {
+       case UNW_PPC64_R0:
+               return PERF_REG_POWERPC_R0;
+       case UNW_PPC64_R1:
+               return PERF_REG_POWERPC_R1;
+       case UNW_PPC64_R2:
+               return PERF_REG_POWERPC_R2;
+       case UNW_PPC64_R3:
+               return PERF_REG_POWERPC_R3;
+       case UNW_PPC64_R4:
+               return PERF_REG_POWERPC_R4;
+       case UNW_PPC64_R5:
+               return PERF_REG_POWERPC_R5;
+       case UNW_PPC64_R6:
+               return PERF_REG_POWERPC_R6;
+       case UNW_PPC64_R7:
+               return PERF_REG_POWERPC_R7;
+       case UNW_PPC64_R8:
+               return PERF_REG_POWERPC_R8;
+       case UNW_PPC64_R9:
+               return PERF_REG_POWERPC_R9;
+       case UNW_PPC64_R10:
+               return PERF_REG_POWERPC_R10;
+       case UNW_PPC64_R11:
+               return PERF_REG_POWERPC_R11;
+       case UNW_PPC64_R12:
+               return PERF_REG_POWERPC_R12;
+       case UNW_PPC64_R13:
+               return PERF_REG_POWERPC_R13;
+       case UNW_PPC64_R14:
+               return PERF_REG_POWERPC_R14;
+       case UNW_PPC64_R15:
+               return PERF_REG_POWERPC_R15;
+       case UNW_PPC64_R16:
+               return PERF_REG_POWERPC_R16;
+       case UNW_PPC64_R17:
+               return PERF_REG_POWERPC_R17;
+       case UNW_PPC64_R18:
+               return PERF_REG_POWERPC_R18;
+       case UNW_PPC64_R19:
+               return PERF_REG_POWERPC_R19;
+       case UNW_PPC64_R20:
+               return PERF_REG_POWERPC_R20;
+       case UNW_PPC64_R21:
+               return PERF_REG_POWERPC_R21;
+       case UNW_PPC64_R22:
+               return PERF_REG_POWERPC_R22;
+       case UNW_PPC64_R23:
+               return PERF_REG_POWERPC_R23;
+       case UNW_PPC64_R24:
+               return PERF_REG_POWERPC_R24;
+       case UNW_PPC64_R25:
+               return PERF_REG_POWERPC_R25;
+       case UNW_PPC64_R26:
+               return PERF_REG_POWERPC_R26;
+       case UNW_PPC64_R27:
+               return PERF_REG_POWERPC_R27;
+       case UNW_PPC64_R28:
+               return PERF_REG_POWERPC_R28;
+       case UNW_PPC64_R29:
+               return PERF_REG_POWERPC_R29;
+       case UNW_PPC64_R30:
+               return PERF_REG_POWERPC_R30;
+       case UNW_PPC64_R31:
+               return PERF_REG_POWERPC_R31;
+       case UNW_PPC64_LR:
+               return PERF_REG_POWERPC_LINK;
+       case UNW_PPC64_CTR:
+               return PERF_REG_POWERPC_CTR;
+       case UNW_PPC64_XER:
+               return PERF_REG_POWERPC_XER;
+       case UNW_PPC64_NIP:
+               return PERF_REG_POWERPC_NIP;
+       default:
+               pr_err("unwind: invalid reg id %d\n", regnum);
+               return -EINVAL;
+       }
+       return -EINVAL;
+}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile

index 1e46277286c2e352417bba9c4b6f358f447a2b23..5ad0255f8756e20128b6155f174ab85280835e80 100644 (file)
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -23,6 +23,12 @@ $(call detected_var,ARCH)
  
  NO_PERF_REGS := 1
  
+# Additional ARCH settings for ppc
+ifeq ($(ARCH),powerpc)
+  NO_PERF_REGS := 0
+  LIBUNWIND_LIBS := -lunwind -lunwind-ppc64
+endif
+
  # Additional ARCH settings for x86
  ifeq ($(ARCH),x86)
    $(call detected,CONFIG_X86)
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c

index 6b8eb13e14e4d5897fca71c41634a07cc9ec5d16..c4023f22f287dd7fb6cdf4a395af2f0d631665fb 100644 (file)
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -12,18 +12,18 @@ int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
         int i, idx = 0;
         u64 mask = regs->mask;
  
-       if (regs->cache_mask & (1 << id))
+       if (regs->cache_mask & (1ULL << id))
                 goto out;
  
-       if (!(mask & (1 << id)))
+       if (!(mask & (1ULL << id)))
                 return -EINVAL;
  
         for (i = 0; i < id; i++) {
-               if (mask & (1 << i))
+               if (mask & (1ULL << i))
                         idx++;
         }
  
-       regs->cache_mask |= (1 << id);
+       regs->cache_mask |= (1ULL << id);
         regs->cache_regs[id] = regs->regs[idx];
  
  out:
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile

index b08f77cbe31ba72baf5e773866dc8d488622a0b7..4ca83fe80654ce44472fb9ff128eb00df8dbb896 100644 (file)
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -14,6 +14,7 @@ export CFLAGS
  
  SUB_DIRS = benchmarks          \
            copyloops            \
+          context_switch       \
            dscr                 \
            mm                   \
            pmu                  \
diff --git a/tools/testing/selftests/powerpc/context_switch/.gitignore b/tools/testing/selftests/powerpc/context_switch/.gitignore

new file mode 100644 (file)

index 0000000..c1431af
--- /dev/null
+++ b/tools/testing/selftests/powerpc/context_switch/.gitignore
@@ -0,0 +1 @@
+cp_abort
diff --git a/tools/testing/selftests/powerpc/context_switch/Makefile b/tools/testing/selftests/powerpc/context_switch/Makefile

new file mode 100644 (file)

index 0000000..e164d14
--- /dev/null
+++ b/tools/testing/selftests/powerpc/context_switch/Makefile
@@ -0,0 +1,10 @@
+TEST_PROGS := cp_abort
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c ../utils.c
+
+include ../../lib.mk
+
+clean:
+       rm -f $(TEST_PROGS)
diff --git a/tools/testing/selftests/powerpc/context_switch/cp_abort.c b/tools/testing/selftests/powerpc/context_switch/cp_abort.c

new file mode 100644 (file)

index 0000000..5a5b55a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/context_switch/cp_abort.c
@@ -0,0 +1,110 @@
+/*
+ * Adapted from Anton Blanchard's context switch microbenchmark.
+ *
+ * Copyright 2009, Anton Blanchard, IBM Corporation.
+ * Copyright 2016, Mikey Neuling, Chris Smart, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This program tests the copy paste abort functionality of a P9
+ * (or later) by setting up two processes on the same CPU, one
+ * which executes the copy instruction and the other which
+ * executes paste.
+ *
+ * The paste instruction should never succeed, as the cp_abort
+ * instruction is called by the kernel during a context switch.
+ *
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include "utils.h"
+#include <sched.h>
+
+#define READ_FD 0
+#define WRITE_FD 1
+
+#define NUM_LOOPS 1000
+
+/* This defines the "paste" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define PASTE(RA, RB, L, RC) \
+       .long (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10) | (RC) << (31-31))
+
+/* Issue "paste." on address i and return the CR image that mfcr reads back. */
+int paste(void *i)
+{
+       int cr;
+
+       asm volatile(str(PASTE(0, %1, 1, 1))";"
+                       "mfcr %0;"
+                       : "=r" (cr)
+                       : "b" (i)
+                       : "memory", "cr0");     /* RC=1 form writes CR0, so declare it */
+       return cr;
+}
+
+/* This defines the "copy" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define COPY(RA, RB, L) \
+       .long (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10))
+
+void copy(void *i)
+{
+       asm volatile(str(COPY(0, %0, 1))";"     /* RA=0, RB=register holding i, L=1 */
+                       :                       /* no outputs */
+                       : "b" (i)
+                       : "memory"
+                   );
+}
+
+int test_cp_abort(void)
+{
+       /* 128 bytes for a full cache line */
+       char buf[128] __cacheline_aligned;
+       cpu_set_t cpuset;
+       int fd1[2], fd2[2], pid;        /* fd1: child -> parent, fd2: parent -> child */
+       char c;                         /* token byte; sent uninitialised, value never inspected */
+
+       /* only run this test on a P9 or later */
+       SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00));
+
+       /*
+        * Run both processes on the same CPU, so that copy is more likely
+        * to leak into a paste.
+        */
+       CPU_ZERO(&cpuset);
+       CPU_SET(pick_online_cpu(), &cpuset);
+       FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset));
+
+       FAIL_IF(pipe(fd1) || pipe(fd2));
+
+       pid = fork();
+       FAIL_IF(pid < 0);
+
+       if (!pid) {     /* child: wake the parent, wait for its copy, then try the paste */
+               for (int i = 0; i < NUM_LOOPS; i++) {
+                       FAIL_IF((write(fd1[WRITE_FD], &c, 1)) != 1);
+                       FAIL_IF((read(fd2[READ_FD], &c, 1)) != 1);
+                       /* A paste succeeds if CR0 EQ bit is set */
+                       FAIL_IF(paste(buf) & 0x20000000);
+               }
+       } else {        /* parent: wait for the child, do the copy, then release the child */
+               for (int i = 0; i < NUM_LOOPS; i++) {
+                       FAIL_IF((read(fd1[READ_FD], &c, 1)) != 1);
+                       copy(buf);
+                       FAIL_IF((write(fd2[WRITE_FD], &c, 1) != 1));
+               }
+       }
+       return 0;       /* NOTE(review): reached by parent and child; parent never waits for the child */
+
+}
+
+int main(int argc, char *argv[])
+{
+       return test_harness(test_cp_abort, "cp_abort"); /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c

index 440180ff8089d5727bef2f231379879f07054be6..35ade7406dcdbbc778dbf7f39d46fafee0b0148f 100644 (file)
--- a/tools/testing/selftests/powerpc/mm/subpage_prot.c
+++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c
@@ -73,7 +73,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write)
                 want_fault |= (subpage == ((page + 1) % 16));
  
         if (faulted != want_fault) {
-               printf("Failed at 0x%p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
+               printf("Failed at %p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
                        addr, page, subpage, write,
                        want_fault ? "fault" : "pass",
                        faulted ? "fault" : "pass");
@@ -82,7 +82,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write)
  
         if (faulted) {
                 if (dar != addr) {
-                       printf("Fault expected at 0x%p and happened at 0x%p !\n",
+                       printf("Fault expected at %p and happened at %p !\n",
                                addr, dar);
                 }
                 faulted = 0;
@@ -162,7 +162,7 @@ int test_anon(void)
  
         mallocblock = (void *)align;
  
-       printf("allocated malloc block of 0x%lx bytes at 0x%p\n",
+       printf("allocated malloc block of 0x%lx bytes at %p\n",
                mallocsize, mallocblock);
  
         printf("testing malloc block...\n");
@@ -197,7 +197,7 @@ int test_file(void)
                 perror("failed to map file");
                 return 1;
         }
-       printf("allocated %s for 0x%lx bytes at 0x%p\n",
+       printf("allocated %s for 0x%lx bytes at %p\n",
                file_name, filesize, fileblock);
  
         printf("testing file map...\n");
@@ -207,14 +207,16 @@ int test_file(void)
  
  int main(int argc, char *argv[])
  {
-       test_harness(test_anon, "subpage_prot_anon");
+       int rc;
+
+       rc = test_harness(test_anon, "subpage_prot_anon");
+       if (rc)
+               return rc;
  
         if (argc > 1)
                 file_name = argv[1];
         else
                 file_name = "tempfile";
  
-       test_harness(test_file, "subpage_prot_file");
-
-       return 0;
+       return test_harness(test_file, "subpage_prot_file");
  }
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c

index e67452f1bcffd47141adf60187b052cdf3ae575e..46681fec549b809e2973a23ed10c5014f28365de 100644 (file)
--- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
@@ -15,7 +15,6 @@
  #include <sys/ioctl.h>
  
  #include "trace.h"
-#include "reg.h"
  #include "ebb.h"
  
  
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg.h b/tools/testing/selftests/powerpc/pmu/ebb/reg.h

deleted file mode 100644 (file)

index 5921b0d..0000000
--- a/tools/testing/selftests/powerpc/pmu/ebb/reg.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright 2014, Michael Ellerman, IBM Corp.
- * Licensed under GPLv2.
- */
-
-#ifndef _SELFTESTS_POWERPC_REG_H
-#define _SELFTESTS_POWERPC_REG_H
-
-#define __stringify_1(x)        #x
-#define __stringify(x)          __stringify_1(x)
-
-#define mfspr(rn)       ({unsigned long rval; \
-                         asm volatile("mfspr %0," __stringify(rn) \
-                                 : "=r" (rval)); rval; })
-#define mtspr(rn, v)    asm volatile("mtspr " __stringify(rn) ",%0" : \
-                                    : "r" ((unsigned long)(v)) \
-                                    : "memory")
-
-#define mb()           asm volatile("sync" : : : "memory");
-
-#define SPRN_MMCR2     769
-#define SPRN_MMCRA     770
-#define SPRN_MMCR0     779
-#define   MMCR0_PMAO   0x00000080
-#define   MMCR0_PMAE   0x04000000
-#define   MMCR0_FC     0x80000000
-#define SPRN_EBBHR     804
-#define SPRN_EBBRR     805
-#define SPRN_BESCR     806     /* Branch event status & control register */
-#define SPRN_BESCRS    800     /* Branch event status & control set (1 bits set to 1) */
-#define SPRN_BESCRSU   801     /* Branch event status & control set upper */
-#define SPRN_BESCRR    802     /* Branch event status & control REset (1 bits set to 0) */
-#define SPRN_BESCRRU   803     /* Branch event status & control REset upper */
-
-#define BESCR_PMEO     0x1     /* PMU Event-based exception Occurred */
-#define BESCR_PME      (0x1ul << 32) /* PMU Event-based exception Enable */
-
-#define SPRN_PMC1      771
-#define SPRN_PMC2      772
-#define SPRN_PMC3      773
-#define SPRN_PMC4      774
-#define SPRN_PMC5      775
-#define SPRN_PMC6      776
-
-#define SPRN_SIAR      780
-#define SPRN_SDAR      781
-#define SPRN_SIER      768
-
-#endif /* _SELFTESTS_POWERPC_REG_H */
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c

index 5b1188f10c1598450fff3f62cd9be107d55cccee..f923228bca224939c467ee5b50280d3813524238 100644 (file)
--- a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
@@ -7,7 +7,6 @@
  #include <stdlib.h>
  
  #include "ebb.h"
-#include "reg.h"
  
  
  /*
diff --git a/tools/testing/selftests/powerpc/reg.h b/tools/testing/selftests/powerpc/reg.h

new file mode 100644 (file)

index 0000000..65bfdee
--- /dev/null
+++ b/tools/testing/selftests/powerpc/reg.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ * Licensed under GPLv2.
+ */
+
+#ifndef _SELFTESTS_POWERPC_REG_H
+#define _SELFTESTS_POWERPC_REG_H
+
+#define __stringify_1(x)        #x
+#define __stringify(x)          __stringify_1(x)
+
+#define mfspr(rn)      ({unsigned long rval; /* evaluates to the value of SPR rn */ \
+                        asm volatile("mfspr %0," __stringify(rn) \
+                                   : "=r" (rval)); rval; })
+#define mtspr(rn, v)   asm volatile("mtspr " __stringify(rn) ",%0" : /* SPR rn = v */ \
+                                   : "r" ((unsigned long)(v)) \
+                                   : "memory")
+
+#define mb()           asm volatile("sync" : : : "memory");
+
+#define SPRN_MMCR2     769
+#define SPRN_MMCRA     770
+#define SPRN_MMCR0     779
+#define   MMCR0_PMAO   0x00000080
+#define   MMCR0_PMAE   0x04000000
+#define   MMCR0_FC     0x80000000
+#define SPRN_EBBHR     804
+#define SPRN_EBBRR     805
+#define SPRN_BESCR     806     /* Branch event status & control register */
+#define SPRN_BESCRS    800     /* Branch event status & control set (1 bits set to 1) */
+#define SPRN_BESCRSU   801     /* Branch event status & control set upper */
+#define SPRN_BESCRR    802     /* Branch event status & control REset (1 bits set to 0) */
+#define SPRN_BESCRRU   803     /* Branch event status & control REset upper */
+
+#define BESCR_PMEO     0x1     /* PMU Event-based exception Occurred */
+#define BESCR_PME      (0x1ul << 32) /* PMU Event-based exception Enable */
+
+#define SPRN_PMC1      771
+#define SPRN_PMC2      772
+#define SPRN_PMC3      773
+#define SPRN_PMC4      774
+#define SPRN_PMC5      775
+#define SPRN_PMC6      776
+
+#define SPRN_SIAR      780
+#define SPRN_SDAR      781
+#define SPRN_SIER      768
+
+#define SPRN_TEXASR     0x82
+#define SPRN_TFIAR      0x81    /* Transaction Failure Inst Addr    */
+#define SPRN_TFHAR      0x80    /* Transaction Failure Handler Addr */
+#define TEXASR_FS       0x08000000
+#define SPRN_TAR        0x32f
+
+#endif /* _SELFTESTS_POWERPC_REG_H */
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore

index 7d0f14b8cb2e465657e509e6e7875627859d348b..bb942db845bfa4a235d1bde46cd3cc5c6d43d934 100644 (file)
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -3,3 +3,6 @@ tm-syscall
  tm-signal-msr-resv
  tm-signal-stack
  tm-vmxcopy
+tm-fork
+tm-tar
+tm-tmspr
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile

index 737f72c964e65c22ea2923dfb6843f522074e5bc..d0505dbd22d5968749a084ab7c241f4891dc63ce 100644 (file)
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy
+TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar tm-tmspr
  
  all: $(TEST_PROGS)
  
@@ -6,6 +6,7 @@ $(TEST_PROGS): ../harness.c ../utils.c
  
  tm-syscall: tm-syscall-asm.S
  tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include
+tm-tmspr: CFLAGS += -pthread
  
  include ../../lib.mk
  
diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c

new file mode 100644 (file)

index 0000000..8d48579
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-fork.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Edited: Rashmica Gupta, Nov 2015
+ *
+ * This test does a fork syscall inside a transaction. Basic sniff test
+ * to see if we can enter the kernel during a transaction.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int test_fork(void)
+{
+       SKIP_IF(!have_htm());
+
+       asm __volatile__(
+               "tbegin.;"      /* open the transaction the syscall is issued in */
+               "blt    1f; "   /* NOTE(review): the other TM tests branch with beq here - confirm blt */
+               "li     0, 2;"  /* fork syscall */
+               "sc  ;"
+               "tend.;"
+               "1: ;"
+               : : : "memory", "r0");  /* NOTE(review): only memory and r0 are declared clobbered */
+       /* If we reach here, we've passed.  Otherwise we've probably crashed
+        * the kernel */
+
+       return 0;
+}
+
+int main(int argc, char *argv[])
+{
+       return test_harness(test_fork, "tm_fork");      /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c

index 8fde93d6021f67986a931a30e0409f3bb95f5b70..d9c49f41515e704994ed1669dbe0a859a6dadd79 100644 (file)
--- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
@@ -31,12 +31,6 @@
  #include "utils.h"
  #include "tm.h"
  
-#define TBEGIN          ".long 0x7C00051D ;"
-#define TEND            ".long 0x7C00055D ;"
-#define TCHECK          ".long 0x7C00059C ;"
-#define TSUSPEND        ".long 0x7C0005DD ;"
-#define TRESUME         ".long 0x7C2005DD ;"
-#define SPRN_TEXASR     0x82
  #define SPRN_DSCR       0x03
  
  int test_body(void)
@@ -55,13 +49,13 @@ int test_body(void)
                         "mtspr   %[sprn_dscr], 3;"
  
                         /* start and suspend a transaction */
-                       TBEGIN
+                       "tbegin.;"
                         "beq     1f;"
-                       TSUSPEND
+                       "tsuspend.;"
  
                         /* hard loop until the transaction becomes doomed */
                         "2: ;"
-                       TCHECK
+                       "tcheck 0;"
                         "bc      4, 0, 2b;"
  
                         /* record DSCR and TEXASR */
@@ -70,8 +64,8 @@ int test_body(void)
                         "mfspr   3, %[sprn_texasr];"
                         "std     3, %[texasr];"
  
-                       TRESUME
-                       TEND
+                       "tresume.;"
+                       "tend.;"
                         "li      %[rv], 0;"
                         "1: ;"
                         : [rv]"=r"(rv), [dscr2]"=m"(dscr2), [texasr]"=m"(texasr)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c

index e44a238c1d77800d2753934cac00c48208922da6..1f0eb567438da09ba6b23eca5d21e863ae2a3019 100644 (file)
--- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
@@ -60,9 +60,9 @@ int tm_signal_stack()
                 exit(1);
         asm volatile("li 1, 0 ;"                /* stack ptr == NULL */
                      "1:"
-                    ".long 0x7C00051D ;"       /* tbegin */
+                    "tbegin.;"
                      "beq 1b ;"                 /* retry forever */
-                    ".long 0x7C0005DD ; ;"     /* tsuspend */
+                    "tsuspend.;"
                      "ld 2, 0(1) ;"             /* trigger segv" */
                      : : : "memory");
  
diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c

new file mode 100644 (file)

index 0000000..2d2fcc2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-tar.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ * Original: Michael Neuling 19/7/2013
+ * Edited: Rashmica Gupta 01/12/2015
+ *
+ * Do some transactions, see if the tar is corrupted.
+ * If the transaction is aborted, the TAR should be rolled back to the
+ * checkpointed value before the transaction began. The value written to
+ * TAR in suspended mode should only remain in TAR if the transaction
+ * completes.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tm.h"
+#include "utils.h"
+
+int    num_loops       = 10000;
+
+/* Run num_loops transactions that rewrite TAR; return 1 if TAR ends up neither 3 nor 1. */
+int test_tar(void)
+{
+       int i;
+
+       SKIP_IF(!have_htm());
+
+       for (i = 0; i < num_loops; i++) {
+               uint64_t result = 0;
+               asm __volatile__(
+                       "li     7, 1;"
+                       "mtspr  %[tar], 7;"     /* tar = 1 */
+                       "tbegin.;"
+                       "beq    3f;"
+                       "li     4, 0x7000;"     /* Loop lots, to use time */
+                       "2:;"                   /* Start loop */
+                       "li     7, 2;"
+                       "mtspr  %[tar], 7;"     /* tar = 2 */
+                       "tsuspend.;"
+                       "li     7, 3;"
+                       "mtspr  %[tar], 7;"     /* tar = 3 */
+                       "tresume.;"
+                       "subi   4, 4, 1;"
+                       "cmpdi  4, 0;"
+                       "bne    2b;"
+                       "tend.;"
+
+                       /* Transaction success! TAR should be 3 */
+                       "mfspr  7, %[tar];"
+                       "ori    %[res], 7, 4;"  // res = 3|4 = 7
+                       "b      4f;"
+
+                       /* Abort handler. TAR should be rolled back to 1 */
+                       "3:;"
+                       "mfspr  7, %[tar];"
+                       "ori    %[res], 7, 8;"  // res = 1|8 = 9
+                       "4:;"
+
+                       : [res]"=r"(result)
+                       : [tar]"i"(SPRN_TAR)
+                       : "memory", "cr0", "r0", "r4", "r7");   /* cr0: written by tbegin. and cmpdi */
+
+               /* If result is anything else other than 7 or 9, the tar
+                * value must have been corrupted. */
+               if ((result != 7) && (result != 9))
+                       return 1;
+       }
+       return 0;
+}
+
+int main(int argc, char *argv[])
+{
+       const char *arg = (argc > 1) ? argv[1] : NULL;
+
+       /* "-h" prints the usage line and exits with status 1 */
+       if (arg && strcmp(arg, "-h") == 0) {
+               printf("Syntax:\n\t%s [<num loops>]\n", argv[0]);
+               return 1;
+       }
+
+       /* A low number of iterations (eg 100) can cause a false pass */
+       if (arg)
+               num_loops = atoi(arg);
+
+       printf("Starting, %d loops\n", num_loops);
+       return test_harness(test_tar, "tm_tar");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c

new file mode 100644 (file)

index 0000000..2bda81c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Original: Michael Neuling 3/4/2014
+ * Modified: Rashmica Gupta 8/12/2015
+ *
+ * Check if any of the Transaction Memory SPRs get corrupted.
+ * - TFIAR  - stores address of location of transaction failure
+ * - TFHAR  - stores address of software failure handler (if transaction
+ *   fails)
+ * - TEXASR - lots of info about the transaction(s)
+ *
+ * (1) create more threads than cpus
+ * (2) in each thread:
+ *     (a) set TFIAR and TFHAR to a unique value
+ *     (b) loop for awhile, continually checking to see if
+ *     either register has been corrupted.
+ *
+ * (3) Loop:
+ *     (a) begin transaction
+ *     (b) abort transaction
+ *     (c) check TEXASR to see if FS has been corrupted
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int    num_loops       = 10000;
+int    passed = 1;
+
+/* Thread body: load TFIAR/TFHAR with values derived from in, then re-read them num_loops times. */
+void *tfiar_tfhar(void *in)
+{
+       int i, cpu;
+       unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd;
+       cpu_set_t cpuset;
+
+       /* NOTE(review): the sched_setaffinity() result is ignored; confirm in / 2 is always a usable CPU */
+       CPU_ZERO(&cpuset);
+       cpu = (unsigned long)in >> 1;
+       CPU_SET(cpu, &cpuset);
+       sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+       /* TFIAR: Last bit has to be high so userspace can read register */
+       tfiar = ((unsigned long)in) + 3;
+       mtspr(SPRN_TFIAR, tfiar);
+
+       /* TFHAR: Last two bits are reserved */
+       tfhar = (((unsigned long)in) & ~0x3UL) + 4;
+       mtspr(SPRN_TFHAR, tfhar);
+
+       for (i = 0; i < num_loops; i++) {
+               tfhar_rd = mfspr(SPRN_TFHAR);
+               tfiar_rd = mfspr(SPRN_TFIAR);
+               if ((tfhar != tfhar_rd) || (tfiar != tfiar_rd)) {
+                       passed = 0;     /* a register no longer holds what was written */
+                       break;
+               }
+       }
+       /* pthread start routines must return void *; the value itself is unused */
+       return NULL;
+}
+
+/* Thread body: abort a transaction num_loops times, checking TEXASR_FS is set after each abort. */
+void *texasr(void *in)
+{
+       unsigned long i;
+       uint64_t result = 0;
+
+       for (i = 0; i < num_loops; i++) {
+               asm __volatile__(
+                       "tbegin.;"
+                       "beq    3f ;"
+                       "tabort. 0 ;"
+                       "tend.;"
+                       /* Abort handler */
+                       "3: ;"
+                       ::: "memory", "cr0");   /* the dot forms above write CR0 */
+
+               /* Check the TEXASR */
+               result = mfspr(SPRN_TEXASR);
+               if ((result & TEXASR_FS) == 0) {
+                       passed = 0;
+                       break;
+               }
+       }
+       return NULL;    /* pthread start routines must return void *; value unused */
+}
+
+int test_tmspr()
+{
+       pthread_t       *thread;
+       unsigned long   i, n = 0, thread_num;
+
+       SKIP_IF(!have_htm());
+
+       /* To cause some context switching; one handle per thread so every thread is joined */
+       thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN);
+       thread = calloc(thread_num, sizeof(*thread));
+       if (!thread)
+               return EXIT_FAILURE;
+
+       /* Test TFIAR and TFHAR */
+       for (i = 0; i < thread_num; i += 2, n++)
+               if (pthread_create(&thread[n], NULL, (void*)tfiar_tfhar, (void *)i))
+                       return EXIT_FAILURE;    /* error paths bail out without cleanup */
+       while (n > 0)
+               if (pthread_join(thread[--n], NULL) != 0)
+                       return EXIT_FAILURE;
+
+       /* Test TEXASR */
+       for (i = 0; i < thread_num; i++, n++)
+               if (pthread_create(&thread[n], NULL, (void*)texasr, (void *)i))
+                       return EXIT_FAILURE;
+       while (n > 0)
+               if (pthread_join(thread[--n], NULL) != 0)
+                       return EXIT_FAILURE;
+
+       free(thread);
+       return passed ? 0 : 1;  /* read only after every worker has been joined */
+}
+
+/* argv[1], if given: "-h" prints the usage line (naming the program), else it is the loop count. */
+int main(int argc, char *argv[])
+{
+       if (argc > 1) {
+               if (strcmp(argv[1], "-h") == 0) {
+                       printf("Syntax:\n\t%s [<num loops>]\n", argv[0]);
+                       return 0;
+               }
+               num_loops = atoi(argv[1]);
+       }
+       return test_harness(test_tmspr, "tm_tmspr");
+}
diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h

index 175ac6ad10dde4cc8b753a063d456fe3b8be1269..a985cfaa535e4cd3b0426211e2dc8dec459c0b3b 100644 (file)
--- a/tools/testing/selftests/powerpc/utils.h
+++ b/tools/testing/selftests/powerpc/utils.h
@@ -6,9 +6,12 @@
  #ifndef _SELFTESTS_POWERPC_UTILS_H
  #define _SELFTESTS_POWERPC_UTILS_H
  
+#define __cacheline_aligned __attribute__((aligned(128)))
+
  #include <stdint.h>
  #include <stdbool.h>
  #include <linux/auxvec.h>
+#include "reg.h"
  
  /* Avoid headaches with PRI?64 - just use %ll? always */
  typedef unsigned long long u64;
@@ -54,4 +57,9 @@ do {                                                          \
  #define _str(s) #s
  #define str(s) _str(s)
  
+/* POWER9 feature */
+#ifndef PPC_FEATURE2_ARCH_3_00
+#define PPC_FEATURE2_ARCH_3_00 0x00800000
+#endif
+
  #endif /* _SELFTESTS_POWERPC_UTILS_H */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 21 May 2016 02:16:12 +0000 (19:16 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 21 May 2016 02:16:12 +0000 (19:16 -0700)